Repository: haomo-ai/Cam4DOcc
Branch: main
Commit: 542f14a9d9e1
Files: 110
Total size: 543.7 KB
Directory structure:
gitextract_hgf40bk9/
├── LICENSE
├── README.md
├── data/
│ ├── README.md
│ ├── cam4docc/
│ │ ├── .gitkeep
│ │ ├── GMO/
│ │ │ └── .gitkeep
│ │ ├── GMO_lyft/
│ │ │ └── .gitkeep
│ │ ├── MMO/
│ │ │ └── .gitkeep
│ │ └── MMO_lyft/
│ │ └── .gitkeep
│ └── nuscenes/
│ └── .gitkeep
├── other_baselines/
│ ├── README.md
│ ├── lifted_2d/
│ │ └── eval_lifted_2d.py
│ ├── static_world/
│ │ └── eval_static_world.py
│ └── voxel_pcp/
│ └── eval_voxel_pcp.py
├── projects/
│ ├── __init__.py
│ ├── configs/
│ │ ├── _base_/
│ │ │ ├── datasets/
│ │ │ │ ├── custom_lyft-3d.py
│ │ │ │ ├── custom_nus-3d.py
│ │ │ │ └── custom_waymo-3d.py
│ │ │ ├── default_runtime.py
│ │ │ └── schedules/
│ │ │ ├── cosine.py
│ │ │ ├── cyclic_20e.py
│ │ │ ├── cyclic_40e.py
│ │ │ ├── mmdet_schedule_1x.py
│ │ │ ├── schedule_2x.py
│ │ │ ├── schedule_3x.py
│ │ │ ├── seg_cosine_150e.py
│ │ │ ├── seg_cosine_200e.py
│ │ │ └── seg_cosine_50e.py
│ │ ├── baselines/
│ │ │ ├── OCFNet_in_Cam4DOcc_V1.1.py
│ │ │ ├── OCFNet_in_Cam4DOcc_V1.1_lyft.py
│ │ │ ├── OCFNet_in_Cam4DOcc_V1.2.py
│ │ │ └── OCFNet_in_Cam4DOcc_V1.2_lyft.py
│ │ └── datasets/
│ │ └── custom_nus-3d.py
│ └── occ_plugin/
│ ├── __init__.py
│ ├── core/
│ │ ├── __init__.py
│ │ ├── evaluation/
│ │ │ ├── __init__.py
│ │ │ ├── efficiency_hooks.py
│ │ │ └── eval_hooks.py
│ │ └── visualizer/
│ │ ├── __init__.py
│ │ └── show_occ.py
│ ├── datasets/
│ │ ├── __init__.py
│ │ ├── builder.py
│ │ ├── cam4docc_dataset.py
│ │ ├── cam4docc_lyft_dataset.py
│ │ ├── nuscenes_dataset.py
│ │ ├── pipelines/
│ │ │ ├── __init__.py
│ │ │ ├── formating.py
│ │ │ ├── loading_bevdet.py
│ │ │ ├── loading_instance.py
│ │ │ ├── loading_occupancy.py
│ │ │ └── transform_3d.py
│ │ └── samplers/
│ │ ├── __init__.py
│ │ ├── distributed_sampler.py
│ │ ├── group_sampler.py
│ │ └── sampler.py
│ ├── occupancy/
│ │ ├── __init__.py
│ │ ├── apis/
│ │ │ ├── __init__.py
│ │ │ ├── mmdet_train.py
│ │ │ ├── test.py
│ │ │ └── train.py
│ │ ├── backbones/
│ │ │ ├── __init__.py
│ │ │ ├── pred_block.py
│ │ │ └── resnet3d.py
│ │ ├── dense_heads/
│ │ │ ├── __init__.py
│ │ │ ├── flow_head.py
│ │ │ ├── lovasz_softmax.py
│ │ │ ├── occ_head.py
│ │ │ └── utils.py
│ │ ├── detectors/
│ │ │ ├── __init__.py
│ │ │ ├── bevdepth.py
│ │ │ └── ocfnet.py
│ │ ├── fuser/
│ │ │ ├── __init__.py
│ │ │ ├── addfuse.py
│ │ │ ├── convfuse.py
│ │ │ └── visfuse.py
│ │ ├── image2bev/
│ │ │ ├── ViewTransformerLSSBEVDepth.py
│ │ │ ├── ViewTransformerLSSVoxel.py
│ │ │ └── __init__.py
│ │ ├── necks/
│ │ │ ├── __init__.py
│ │ │ ├── fpn3d.py
│ │ │ └── second_fpn_3d.py
│ │ └── voxel_encoder/
│ │ ├── __init__.py
│ │ └── sparse_lidar_enc.py
│ ├── ops/
│ │ ├── __init__.py
│ │ └── occ_pooling/
│ │ ├── OCC_Pool.py
│ │ ├── __init__.py
│ │ └── src/
│ │ ├── occ_pool.cpp
│ │ └── occ_pool_cuda.cu
│ └── utils/
│ ├── __init__.py
│ ├── coordinate_transform.py
│ ├── formating.py
│ ├── gaussian.py
│ ├── geometry.py
│ ├── metric_util.py
│ ├── nusc_param.py
│ ├── semkitti.py
│ └── voxel_to_points.py
├── run.sh
├── run_eval.sh
├── setup.py
├── tools/
│ ├── dist_test.sh
│ ├── dist_train.sh
│ ├── gen_data/
│ │ └── gen_depth_gt.py
│ ├── misc/
│ │ ├── browse_dataset.py
│ │ ├── fuse_conv_bn.py
│ │ ├── print_config.py
│ │ └── visualize_results.py
│ ├── test.py
│ └── train.py
└── viz/
├── viz_gt.py
└── viz_pred.py
================================================
FILE CONTENTS
================================================
================================================
FILE: LICENSE
================================================
MIT License
Copyright (c) 2023 HAOMO.AI
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: README.md
================================================
# Cam4DOcc
The official code and data for the benchmark and baselines of our paper: [Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications](https://arxiv.org/abs/2311.17663)
This work has been accepted by CVPR 2024 :tada:
[Junyi Ma#](https://github.com/BIT-MJY), [Xieyuanli Chen#](https://github.com/Chen-Xieyuanli), Jiawei Huang, [Jingyi Xu](https://github.com/BIT-XJY), [Zhen Luo](https://github.com/Blurryface0814), Jintao Xu, Weihao Gu, Rui Ai, [Hesheng Wang*](https://scholar.google.com/citations?hl=en&user=q6AY9XsAAAAJ)
## Citation
If you use Cam4DOcc in an academic work, please cite our paper:
```
@inproceedings{ma2024cvpr,
  author = {Junyi Ma and Xieyuanli Chen and Jiawei Huang and Jingyi Xu and Zhen Luo and Jintao Xu and Weihao Gu and Rui Ai and Hesheng Wang},
  title = {{Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications}},
  booktitle = {Proc.~of the IEEE/CVF Conf.~on Computer Vision and Pattern Recognition (CVPR)},
  year = 2024
}
```
## Installation
We follow the installation instructions of OpenOccupancy, on which our codebase is built; they are also posted here:
* Create a conda virtual environment and activate it
```bash
conda create -n cam4docc python=3.7 -y
conda activate cam4docc
```
* Install PyTorch and torchvision (tested on torch==1.10.1 & cuda=11.3)
```bash
conda install pytorch==1.10.1 torchvision==0.11.2 torchaudio==0.10.1 cudatoolkit=11.3 -c pytorch -c conda-forge
```
* Install gcc>=5 in conda env
```bash
conda install -c omgarcia gcc-6
```
* Install mmcv, mmdet, and mmseg
```bash
pip install mmcv-full==1.4.0
pip install mmdet==2.14.0
pip install mmsegmentation==0.14.1
```
* Install mmdet3d from the source code
```bash
git clone https://github.com/open-mmlab/mmdetection3d.git
cd mmdetection3d
git checkout v0.17.1 # Other versions may not be compatible.
python setup.py install
```
* Install other dependencies
```bash
pip install timm
pip install open3d-python
pip install PyMCubes
pip install spconv-cu113
pip install fvcore
pip install setuptools==59.5.0
pip install lyft_dataset_sdk # for lyft dataset
```
* Install occupancy pooling
```bash
git clone git@github.com:haomo-ai/Cam4DOcc.git
cd Cam4DOcc
export PYTHONPATH="."
python setup.py develop
```
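To verify that the environment matches the tested versions above, a quick check like this can help (a minimal sketch, not part of the repo):
```python
# environment sanity check for the tested torch 1.10.1 + CUDA 11.3 setup
import torch

print(torch.__version__)          # expected: 1.10.1
print(torch.version.cuda)         # expected: 11.3
print(torch.cuda.is_available())  # should be True before building occ_pooling
```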
## Data Structure
### nuScenes dataset
* Please link your [nuScenes V1.0 full dataset](https://www.nuscenes.org/nuscenes#download) to the data folder.
* [nuScenes-Occupancy](https://drive.google.com/file/d/1vTbgddMzUN6nLyWSsCZMb9KwihS7nPoH/view?usp=sharing), [nuscenes_occ_infos_train.pkl](https://github.com/JeffWang987/OpenOccupancy/releases/tag/train_pkl), and [nuscenes_occ_infos_val.pkl](https://github.com/JeffWang987/OpenOccupancy/releases/tag/val_pkl) are also provided by previous work. If you only want to reproduce the forecasting results in the "inflated" form, the nuScenes dataset and Cam4DOcc are all you need.
### Lyft dataset
* Please link your [Lyft dataset](https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles/data) to the data folder.
* The required folders are listed below.
Note that the folders under `cam4docc` will be generated automatically once you first run our training or evaluation scripts.
```bash
Cam4DOcc
├── data/
│ ├── nuscenes/
│ │ ├── maps/
│ │ ├── samples/
│ │ ├── sweeps/
│ │ ├── lidarseg/
│ │ ├── v1.0-test/
│ │ ├── v1.0-trainval/
│ │ ├── nuscenes_occ_infos_train.pkl
│ │ ├── nuscenes_occ_infos_val.pkl
│ ├── nuScenes-Occupancy/
│ ├── lyft/
│ │ ├── maps/
│ │ ├── train_data/
│ │ ├── images/ # from train images, containing xxx.jpeg
│ ├── cam4docc
│ │ ├── GMO/
│ │ │ ├── segmentation/
│ │ │ ├── instance/
│ │ │ ├── flow/
│ │ ├── MMO/
│ │ │ ├── segmentation/
│ │ │ ├── instance/
│ │ │ ├── flow/
│ │ ├── GMO_lyft/
│ │ │ ├── ...
│ │ ├── MMO_lyft/
│ │ │ ├── ...
```
Alternatively, you can manually modify the path parameters in the [config files](https://github.com/haomo-ai/Cam4DOcc/tree/main/projects/configs/baselines) instead of using the default data structure. The defaults are:
```
occ_path = "./data/nuScenes-Occupancy"
depth_gt_path = './data/depth_gt'
train_ann_file = "./data/nuscenes/nuscenes_occ_infos_train.pkl"
val_ann_file = "./data/nuscenes/nuscenes_occ_infos_val.pkl"
cam4docc_dataset_path = "./data/cam4docc/"
nusc_root = './data/nuscenes/'
```
## Training and Evaluation
We directly integrate the Cam4DOcc dataset generation pipeline into the dataloader, so you can directly run the training or evaluation scripts and just wait :smirk:
Optionally, you can set `only_generate_dataset=True` in the [config files](https://github.com/haomo-ai/Cam4DOcc/tree/main/projects/configs/baselines) to only generate the Cam4DOcc data without model training and inference.
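The flag sits near the top of each baseline config; a minimal sketch of the toggle:
```python
# in projects/configs/baselines/OCFNet_in_Cam4DOcc_V1.1.py (and its siblings)
only_generate_dataset = False  # set True to generate Cam4DOcc data only,
                               # skipping model training and inference
```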
### Train OCFNetV1.1 with 8 GPUs
OCFNetV1.1 can forecast inflated GMO and others. In this case, _vehicle_ and _human_ are merged into one unified category.
For the nuScenes dataset, please run
```bash
bash run.sh ./projects/configs/baselines/OCFNet_in_Cam4DOcc_V1.1.py 8
```
For the Lyft dataset, please run
```bash
bash run.sh ./projects/configs/baselines/OCFNet_in_Cam4DOcc_V1.1_lyft.py 8
```
### Train OCFNetV1.2 with 8 GPUs
OCFNetV1.2 can forecast inflated GMO subdivided into _bicycle_, _bus_, _car_, _construction_, _motorcycle_, _trailer_, _truck_, _pedestrian_, and others. In this case, _vehicle_ and _human_ are divided into multiple categories for a clearer evaluation of forecasting performance.
For the nuScenes dataset, please run
```bash
bash run.sh ./projects/configs/baselines/OCFNet_in_Cam4DOcc_V1.2.py 8
```
For the Lyft dataset, please run
```bash
bash run.sh ./projects/configs/baselines/OCFNet_in_Cam4DOcc_V1.2_lyft.py 8
```
* Training and testing are accelerated several-fold once the dataset has been generated during the first epoch.
### Test OCFNet for different tasks
If you only want to test the performance of occupancy prediction for the present frame (current observation), please set `test_present=True` in the [config files](https://github.com/haomo-ai/Cam4DOcc/tree/main/projects/configs/baselines). Otherwise, forecasting performance on the future interval is evaluated.
```bash
bash run_eval.sh $PATH_TO_CFG $PATH_TO_CKPT $GPU_NUM
# e.g. bash run_eval.sh ./projects/configs/baselines/OCFNet_in_Cam4DOcc_V1.1.py ./work_dirs/OCFNet_in_Cam4DOcc_V1.1/epoch_20.pth 8
```
Please set `save_pred` and `save_path` in the config files if you need to save prediction results.
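For reference, the related entries with their released defaults look like this in the configs:
```python
test_present = False  # True: evaluate occupancy prediction for the present frame only
save_pred = False     # True: save prediction results
save_path = "./data/cam4docc/results"
```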
`VPQ` evaluation of 3D instance prediction will be refined in the future.
### Visualization
Please install the dependencies as follows:
```bash
sudo apt-get install Xvfb
pip install xvfbwrapper
pip install mayavi
```
where `Xvfb` may be needed for off-screen rendering on a server.
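On a headless server, a virtual display can be wrapped around the viz scripts, e.g. (a minimal sketch using `xvfbwrapper`):
```python
from xvfbwrapper import Xvfb

vdisplay = Xvfb(width=1280, height=720)
vdisplay.start()
# ... run the mayavi-based viz scripts here ...
vdisplay.stop()
```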
**Visualize ground-truth occupancy labels**. Set `show_time_change = True` if you want to show the changing state of occupancy in time intervals.
```bash
cd viz
python viz_gt.py
```
**Visualize occupancy forecasting results**. Set `show_time_change = True` if you want to show the changing state of occupancy in time intervals.
```bash
cd viz
python viz_pred.py
```
There is still room for improvement. Camera-only 4D occupancy forecasting remains challenging, especially for predicting over longer time intervals with many moving objects. We envision this benchmark as a valuable evaluation tool, and our OCFNet can serve as a foundational codebase for future research on 4D occupancy forecasting.
## Basic Information
Some basic information and key parameters of our current version:
| Type | Info | Parameter |
| :----: | :----: | :----: |
| train | 23,930 sequences | train_capacity |
| val | 5,119 sequences | test_capacity |
| voxel size | 0.2m | voxel_x/y/z |
| range | [-51.2m, -51.2m, -5m, 51.2m, 51.2m, 3m]| point_cloud_range |
| volume size | [512, 512, 40]| occ_size |
| classes | 2 for V1.1 / 9 for V1.2 | num_cls |
| observation frames | 3 | time_receptive_field |
| future frames | 4 | n_future_frames |
| extension frames | 6 | n_future_frames_plus |
Our proposed OCFNet still performs well when trained on partial data. Feel free to decrease `train_capacity` to explore training with sparser supervision signals.
In addition, please make sure that `n_future_frames_plus <= time_receptive_field + n_future_frames`, because `n_future_frames_plus` is the number of frames the network actually predicts: it covers past frames as well, rather than only the `n_future_frames` future ones.
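With the default values, the constraint reads as follows (a minimal sanity check):
```python
time_receptive_field = 3  # observed frames (past + present)
n_future_frames = 4       # future frames used for evaluation
n_future_frames_plus = 6  # frames the network actually predicts

# predicted frames may include past ones, but cannot exceed the sequence length
assert n_future_frames_plus <= time_receptive_field + n_future_frames
```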
## Pretrained Models
We will provide pretrained models for the erratum version. Your patience is appreciated.
**Deprecated:**
~~Please download our pretrained models (for epoch=20) to resume training or reproduce results.~~
| Version | Google Drive | Baidu Cloud | Config |
| :---: | :---: | :---: | :---: |
| ~~V1.0~~ | ~~link~~ | ~~link~~ | ~~only vehicle~~ |
| V1.1 | [link](https://drive.google.com/file/d/1IXRqOQk3RKpIjGgBBqV9D9vgSt58QDr8/view?usp=sharing) | [link](https://pan.baidu.com/s/18gODsVnBAXEJ4pzv2-LqGA?pwd=m99b) | [OCFNet_in_Cam4DOcc_V1.1.py](https://github.com/haomo-ai/Cam4DOcc/blob/main/projects/configs/baselines/OCFNet_in_Cam4DOcc_V1.1.py) |
| V1.2 | [link](https://drive.google.com/file/d/1q1XnRt0wYE3oq6YBMBnagpGL7h2I46uN/view?usp=sharing) | [link](https://pan.baidu.com/s/1OPc1-a2McOO_0QPX63J7WQ?pwd=adic) | [OCFNet_in_Cam4DOcc_V1.2.py](https://github.com/haomo-ai/Cam4DOcc/blob/main/projects/configs/baselines/OCFNet_in_Cam4DOcc_V1.2.py) |
## Other Baselines
We also provide evaluation of the forecasting performance of [other baselines](https://github.com/haomo-ai/Cam4DOcc/tree/main/other_baselines) in Cam4DOcc.
## TODO
The tutorial is being updated ...
We will release our pretrained models as soon as possible. OCFNetV1.3 and OCFNetV2 are on their way ...
### Acknowledgement
We thank the fantastic works [OpenOccupancy](https://github.com/JeffWang987/OpenOccupancy), [PowerBEV](https://github.com/EdwardLeeLPZ/PowerBEV), and [FIERY](https://anthonyhu.github.io/fiery) for their pioneering code releases, which provide the codebase for this benchmark.
================================================
FILE: data/README.md
================================================
### Data Structure
Please link your [nuScenes V1.0 full dataset](https://www.nuscenes.org/nuscenes#download) to the data folder.
[nuScenes-Occupancy](https://drive.google.com/file/d/1vTbgddMzUN6nLyWSsCZMb9KwihS7nPoH/view?usp=sharing), [nuscenes_occ_infos_train.pkl](https://github.com/JeffWang987/OpenOccupancy/releases/tag/train_pkl), and [nuscenes_occ_infos_val.pkl](https://github.com/JeffWang987/OpenOccupancy/releases/tag/val_pkl) are also provided by previous work. If you only want to reproduce the forecasting results in the "inflated" form, the nuScenes dataset and Cam4DOcc are all you need.
Note that the folders under `cam4docc` will be generated automatically once you first run our training or evaluation scripts.
```bash
Cam4DOcc
├── data/
│ ├── nuscenes/
│ │ ├── maps/
│ │ ├── samples/
│ │ ├── sweeps/
│ │ ├── lidarseg/
│ │ ├── v1.0-test/
│ │ ├── v1.0-trainval/
│ │ ├── nuscenes_occ_infos_train.pkl
│ │ ├── nuscenes_occ_infos_val.pkl
│ ├── nuScenes-Occupancy/
│ ├── cam4docc
│ │ ├── GMO/
│ │ │ ├── segmentation/
│ │ │ ├── instance/
│ │ │ ├── flow/
│ │ ├── MMO/
│ │ │ ├── segmentation/
│ │ │ ├── instance/
│ │ │ ├── flow/
```
The GMO folder contains the data where vehicles and humans are merged into one unified category.
The MMO folder contains the data where vehicles and humans are divided into multiple categories for a clearer evaluation of forecasting performance.
In the near future, we will unify GMO and MMO for easier usage.
================================================
FILE: data/cam4docc/.gitkeep
================================================
================================================
FILE: data/cam4docc/GMO/.gitkeep
================================================
================================================
FILE: data/cam4docc/GMO_lyft/.gitkeep
================================================
================================================
FILE: data/cam4docc/MMO/.gitkeep
================================================
================================================
FILE: data/cam4docc/MMO_lyft/.gitkeep
================================================
================================================
FILE: data/nuscenes/.gitkeep
================================================
================================================
FILE: other_baselines/README.md
================================================
## I. Static World
The static-world baseline is built on the identity assumption: the present occupancy is simply carried forward to all future steps (a minimal sketch follows the parameter list below).
```bash
cd other_baselines/static_world
python ./eval_static_world.py
```
#### Parameters:
* **test_idx_dir**: Path of the test indexes generated by the standard OCFNet evaluation process.
* **test_results_dir**: Path of occupancy prediction results. Here we simply set it to the path of OCFNet forecasting results and use the present occupancy prediction results for evaluation. You can also replace them with [OpenOccupancy](https://github.com/JeffWang987/OpenOccupancy) estimation results.
* **gt_dir**: Path of ground-truth segmentations.
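A minimal sketch of the identity assumption behind this baseline (grid shape taken from the benchmark's `occ_size`; variable names are illustrative):
```python
import numpy as np

present_occ = np.zeros((512, 512, 40), dtype=np.uint8)  # present-frame prediction
n_future_frames = 4

# "static world": every future step is predicted to equal the present frame
forecast = [present_occ.copy() for _ in range(n_future_frames)]
```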
## II. Voxelization of PCP
Voxelization of point cloud prediction requires the outputs of [PCPNet](https://github.com/Blurryface0814/PCPNet). Here we use nuScenes-Occupancy as ground-truth since predicted points are limited by sparsity.
```bash
cd other_baselines/voxel_pcp
python ./eval_voxel_pcp.py
```
#### Parameters:
* **test_idx_dir**: Path of the test indexes generated by the standard OCFNet evaluation process.
* **occ_path**: Path of nuScenes-Occupancy.
* **test_results_dir**: Path of point cloud prediction results. The data is organized as follows:
```bash
Cam4DOcc
├── data/
│ ├── cam4docc/
│ │ ├── pcpnet_results/
│ │ │ ├── point_clouds/
│ │ │ │ ├── past/
│ │ │ │ │ ├── 000000.ply
│ │ │ │ │ ├── 000001.ply
│ │ │ │ │ ├── 000002.ply
│ │ │ │ │ ├── 000003.ply
│ │ │ │ ├── pred/
│ │ │ │ │ ├── 000000.ply
│ │ │ │ │ ├── ...
│ │ │ ├── saved_labels/
│ │ │ │ ├── past/
│ │ │ │ │ ├── 000000.label
│ │ │ │ │ ├── 000001.label
│ │ │ │ │ ├── 000002.label
│ │ │ │ │ ├── 000003.label
│ │ │ │ ├── pred/
│ │ │ │ │ ├── 000000.label
│ │ │ │ │ ├── ...
```
We will provide our PCPNet predictions soon; please open an issue [here](https://github.com/Blurryface0814/PCPNet) if you have questions about how PCPNet is used for point cloud forecasting.
## III. 2D-3D Lifted Prediction
2D-3D lifted prediction requires the outputs of [PowerBEV](https://github.com/EdwardLeeLPZ/PowerBEV).
```bash
cd other_baselines/lifted_2d
python ./eval_lifted_2d.py
```
#### Parameters:
* **test_idx_dir**: Path of the test indexes generated by the standard OCFNet evaluation process.
* **gt_dir**: Path of ground-truth segmentations.
* **hmin**: minimum height for the lifting operation (see the sketch after the directory layout below).
* **hmax**: maximum height for the lifting operation.
* **test_results_dir**: Path of BEV instance prediction results. The data is organized as follows:
```bash
Cam4DOcc
├── data/
│ ├── cam4docc/
│ │ ├── powerbev_results/
│ │ │ ├── {scene_token}_{lidar_token}.npz
│ │ │ ├── ...
```
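A minimal sketch of the lifting idea (shapes follow the script's defaults; the actual script additionally transforms the lifted points into the OCFNet grid):
```python
import numpy as np

bev_mask = np.zeros((500, 500), dtype=bool)  # PowerBEV-style BEV instance mask
hmin, hmax = -1, 9                           # height slices, 0.2 m per voxel
z0 = 25                                      # index of z = 0 m in the [-5 m, 3 m] grid

lifted = np.zeros((500, 500, 40), dtype=bool)
for h in range(hmin, hmax):
    lifted[:, :, z0 + h] = bev_mask          # replicate the BEV footprint per slice
```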
We have provided our [PowerBEV predictions](https://drive.google.com/file/d/1X_N-GwU2ZB65UI9-EYpeQrb2BzS44VVX/view?usp=sharing); please open an issue [here](https://github.com/EdwardLeeLPZ/PowerBEV) if you have questions about how PowerBEV is used for BEV-based instance prediction.
More refinement strategies for the baselines will be released ... Before that, please simply use the scripts here for fast evaluation.
## Publications
If you use our proposed baselines in your work, please cite as:
* Cam4DOcc
```
@inproceedings{ma2024cvpr,
author = {Junyi Ma and Xieyuanli Chen and Jiawei Huang and Jingyi Xu and Zhen Luo and Jintao Xu and Weihao Gu and Rui Ai and Hesheng Wang},
title = {{Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications}},
booktitle = {Proc.~of the IEEE/CVF Conf.~on Computer Vision and Pattern Recognition (CVPR)},
year = 2024
}
```
* OpenOccupancy
```
@article{wang2023openoccupancy,
title={Openoccupancy: A large scale benchmark for surrounding semantic occupancy perception},
author={Wang, Xiaofeng and Zhu, Zheng and Xu, Wenbo and Zhang, Yunpeng and Wei, Yi and Chi, Xu and Ye, Yun and Du, Dalong and Lu, Jiwen and Wang, Xingang},
journal={arXiv preprint arXiv:2303.03991},
year={2023}
}
```
* PCPNet
```
@ARTICLE{10141631,
author={Luo, Zhen and Ma, Junyi and Zhou, Zijie and Xiong, Guangming},
journal={IEEE Robotics and Automation Letters},
title={PCPNet: An Efficient and Semantic-Enhanced Transformer Network for Point Cloud Prediction},
year={2023},
volume={8},
number={7},
pages={4267-4274},
doi={10.1109/LRA.2023.3281937}}
```
* PowerBEV
```
@inproceedings{ijcai2023p120,
title = {PowerBEV: A Powerful Yet Lightweight Framework for Instance Prediction in Bird’s-Eye View},
author = {Li, Peizheng and Ding, Shuxiao and Chen, Xieyuanli and Hanselmann, Niklas and Cordts, Marius and Gall, Juergen},
booktitle = {Proceedings of the Thirty-Second International Joint Conference on
Artificial Intelligence, {IJCAI-23}},
pages = {1080--1088},
year = {2023},
month = {8},
doi = {10.24963/ijcai.2023/120},
}
```
================================================
FILE: other_baselines/lifted_2d/eval_lifted_2d.py
================================================
# Developed by Junyi Ma
# Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications
# https://github.com/haomo-ai/Cam4DOcc
from tqdm import trange
import numpy as np
from nuscenes import NuScenes
import os
import torch
import torch.nn.functional as F
import copy
from pyquaternion import Quaternion
# Setups =================================================================================================
test_idx_dir = "../../data/cam4docc/test_ids/"
test_results_dir = "../../data/cam4docc/powerbev_results/"
gt_dir = "../../data/cam4docc/MMO/segmentation/"
test_seqs = os.listdir(test_idx_dir)
test_segmentations = os.listdir(test_results_dir)
dimension = [512, 512, 40]
future_ious = [0, 0, 0, 0]
voxel_size = np.array([0.2,0.2,0.2])
pc_range = np.array([-50, -50, 0, 50, 50, 0])
voxel_size_new = np.array([0.2,0.2,0.2])
pc_range_new = np.array([-51.2, -51.2, -5, 51.2, 51.2, 3])
# lifting band: (hmax - hmin) voxels * 0.2 m/voxel = 10 * 0.2 = 2 m
# You can modify these parameters to show how results change with the lifting height
hmin = -1
hmax = 9
nusc = NuScenes(version='v1.0-trainval', dataroot="../../data/nuscenes", verbose=False)
# ========================================================================================================
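# Helper semantics (shared across the baseline eval scripts):
# - cm_to_ious turns a confusion matrix into per-class IoU,
#   IoU_i = TP_i / (TP_i + FP_i + FN_i), from row/column sums
# - fast_hist builds the max_label x max_label confusion matrix with a single
#   np.bincount over combined gt/pred indices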
def cm_to_ious(cm):
mean_ious = []
cls_num = len(cm)
for i in range(cls_num):
tp = cm[i, i]
p = cm[:, i].sum()
g = cm[i, :].sum()
union = p + g - tp
mean_ious.append(tp / union)
return mean_ious
def fast_hist(pred, label, max_label=18):
pred = copy.deepcopy(pred.flatten())
label = copy.deepcopy(label.flatten())
bin_count = np.bincount(max_label * label.astype(int) + pred, minlength=max_label ** 2)
iou_per_pred = (bin_count[-1]/(bin_count[-1]+bin_count[1]+bin_count[2]))
return bin_count[:max_label ** 2].reshape(max_label, max_label),iou_per_pred
for i in trange(len(test_seqs)):
segmentation_file = test_results_dir + test_seqs[i]
instance_seq = np.load(segmentation_file)['arr_0']
instance_seq = torch.from_numpy(instance_seq)
test_seqs_idxs = np.load(test_idx_dir+test_seqs[i])["arr_0"]
gt_segmentation_file = os.path.join(gt_dir, test_seqs[i])
gt_segmentation_seqs = np.load(gt_segmentation_file, allow_pickle=True)['arr_0']
for t in range(3, 7):
scene_token_cur = test_seqs_idxs[t].split("_")[0]
lidar_token_cur = test_seqs_idxs[t].split("_")[1]
instance_ = instance_seq[0,(t-1)].unsqueeze(0) # t-1 -> t
instance_ = instance_.unsqueeze(0)
instance_ = F.interpolate(instance_.float(), size=[500, 500], mode='nearest').contiguous() # Note: default PowerBEV uses different ranges from OCFNet
instance_ = instance_.squeeze(0)
x_grid = torch.linspace(0, 500-1, 500, dtype=torch.float)
x_grid = x_grid.view(500, 1).expand(500,500)
y_grid = torch.linspace(0, 500-1,500, dtype=torch.float)
y_grid = y_grid.view(1, 500).expand(500,500)
mesh_grid_2d = torch.stack((x_grid, y_grid), -1)
mesh_grid_2d = mesh_grid_2d.view(-1, 2)
instance_ = instance_.view(-1, 1)
semantics_lifted = []
for ii in range(hmin, hmax):
semantics_lifted_ = torch.cat((mesh_grid_2d, ii*torch.ones_like(mesh_grid_2d[:,0:1])),dim=-1)
semantics_lifted_ = torch.cat((semantics_lifted_, instance_),dim=-1)
semantics_lifted.append(semantics_lifted_)
semantics_lifted = np.array(torch.cat(semantics_lifted, dim=0))
kept = semantics_lifted[:,-1]!=0
semantics_lifted = semantics_lifted[kept]
if semantics_lifted.shape[0] == 0:
semantics_lifted = np.zeros((1,4))
lidar_sample = nusc.get('sample_data', lidar_token_cur)
lidar_sample_calib = nusc.get('calibrated_sensor', lidar_sample['calibrated_sensor_token'])
lidar_sensor_rotation = Quaternion(lidar_sample_calib['rotation'])
lidar_sensor_translation = np.array(lidar_sample_calib['translation'])[:, None]
lidar_to_lidarego = np.vstack([
np.hstack((lidar_sensor_rotation.rotation_matrix, lidar_sensor_translation)),
np.array([0, 0, 0, 1])
])
lidarego_to_lidar = np.linalg.inv(lidar_to_lidarego)
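# convert lifted voxel indices to metric coordinates in the ego frame, map
# them into the lidar frame, and re-voxelize on the OCFNet grid below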
points = np.ones_like(semantics_lifted)
points[:,:3] = semantics_lifted[:,:3]
points[:,:3] = points[:,:3] * voxel_size[None, :] + pc_range[:3][None, :]
points = lidarego_to_lidar @ points.T
semantics_lifted_transformed = np.ones_like(semantics_lifted)
semantics_lifted_transformed[:,:3] = (points.T)[:,:3]
semantics_lifted_transformed[:,-1] = semantics_lifted[:,-1]
semantics_lifted_transformed[:,:3] = (semantics_lifted_transformed[:,:3] - pc_range_new[:3][None, :]) / voxel_size_new[None, :]
pred_segmentation = np.zeros((dimension[0], dimension[1], dimension[2]))
for j in range(semantics_lifted_transformed.shape[0]):
cur_ind = semantics_lifted_transformed[j, :3].astype(int)
cur_label = semantics_lifted_transformed[j, -1]
if cur_label != 0:
pred_segmentation[cur_ind[0],cur_ind[1],cur_ind[2]] = 1
gt_segmentation = np.zeros((dimension[0], dimension[1], dimension[2]))
gt_segmentation_raw = gt_segmentation_seqs[t].cpu().numpy()
gt_segmentation[gt_segmentation_raw[:,0].astype(int),gt_segmentation_raw[:,1].astype(int),gt_segmentation_raw[:,2].astype(int)] = gt_segmentation_raw[:, -1]
hist_cur, iou_per_pred = fast_hist(pred_segmentation.astype(int), gt_segmentation.astype(int), max_label=2)
if t <= 3:
future_ious[0] = future_ious[0] + hist_cur
if t <= 4:
future_ious[1] = future_ious[1] + hist_cur
if t <= 5:
future_ious[2] = future_ious[2] + hist_cur
if t <= 6:
future_ious[3] = future_ious[3] + hist_cur
for t in range(len(future_ious)):
print("iou for step "+str(t), cm_to_ious(future_ious[t]))
================================================
FILE: other_baselines/static_world/eval_static_world.py
================================================
# Developed by Junyi Ma
# Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications
# https://github.com/haomo-ai/Cam4DOcc
import numpy as np
import os
import copy
from tqdm import trange
# Setups =================================================================================================
test_idx_dir = "../../data/cam4docc/test_ids/"
test_results_dir = "../../data/cam4docc/results/"
gt_dir = "../../data/cam4docc/MMO/segmentation/"
objects_max_label = 9
test_seqs = os.listdir(test_idx_dir)
test_segmentations = os.listdir(test_results_dir)
dimension = [512, 512, 40]
future_ious = [0, 0, 0, 0]
# ========================================================================================================
def cm_to_ious(cm):
mean_ious = []
cls_num = len(cm)
for i in range(cls_num):
tp = cm[i, i]
p = cm[:, i].sum()
g = cm[i, :].sum()
union = p + g - tp
mean_ious.append(tp / union)
return mean_ious
def fast_hist(pred, label, max_label=18):
pred = copy.deepcopy(pred.flatten())
label = copy.deepcopy(label.flatten())
bin_count = np.bincount(max_label * label.astype(int) + pred, minlength=max_label ** 2)
iou_per_pred = (bin_count[-1]/(bin_count[-1]+bin_count[1]+bin_count[2]))
return bin_count[:max_label ** 2].reshape(max_label, max_label),iou_per_pred
for i in trange(len(test_seqs)):
segmentation_file = test_results_dir + test_seqs[i]
if not os.path.exists(segmentation_file):
continue
segmentation = np.load(segmentation_file,allow_pickle=True)['arr_0']
test_seqs_idxs = np.load(os.path.join(test_idx_dir, test_seqs[i]))["arr_0"]
gt_segmentation_file = os.path.join(gt_dir, test_seqs[i])
gt_segmentation_seqs = np.load(gt_segmentation_file,allow_pickle=True)['arr_0']
# hard-coded for 3 input frames and 4 output frames
for t in range(3,7):
# static world using present predictions
segmentation_t = segmentation[0]
pred_segmentation = np.zeros((dimension[0], dimension[1], dimension[2]))
for j in range(segmentation_t.shape[0]):
cur_ind = segmentation_t[j, :3].astype(int)
cur_label = segmentation_t[j, -1]
pred_segmentation[cur_ind[0],cur_ind[1],cur_ind[2]] = cur_label
gt_segmentation = np.zeros((dimension[0], dimension[1], dimension[2]))
gt_segmentation_raw = gt_segmentation_seqs[t]
for k in range(gt_segmentation_raw.shape[0]):
cur_ind = gt_segmentation_raw[k, :3].astype(int)
cur_label = gt_segmentation_raw[k, -1]
gt_segmentation[cur_ind[0],cur_ind[1],cur_ind[2]] = cur_label
hist_cur, iou_per_pred = fast_hist(pred_segmentation.astype(int), gt_segmentation.astype(int), max_label=objects_max_label)
if t <= 3:
future_ious[0] = future_ious[0] + hist_cur
if t <= 4:
future_ious[1] = future_ious[1] + hist_cur
if t <= 5:
future_ious[2] = future_ious[2] + hist_cur
if t <= 6:
future_ious[3] = future_ious[3] + hist_cur
for t in range(len(future_ious)):
print("iou for step "+str(t), cm_to_ious(future_ious[t]))
================================================
FILE: other_baselines/voxel_pcp/eval_voxel_pcp.py
================================================
# Developed by Junyi Ma
# Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications
# https://github.com/haomo-ai/Cam4DOcc
import numpy as np
import os
import copy
from tqdm import trange
import open3d as o3d
from nuscenes import NuScenes
from pyquaternion import Quaternion
# Setups =================================================================================================
test_idx_dir = "../../data/cam4docc/test_ids/"
test_results_dir = "../../data/cam4docc/pcpnet_results/"
occ_path = "../../data/nuScenes-Occupancy"
test_seqs = os.listdir(test_idx_dir)
test_segmentations = os.listdir(test_results_dir)
pc_range= np.array([-51.2, -51.2, -5.0, 51.2, 51.2, 3.0])
dimension = [512, 512, 40]
grid_size= np.array(dimension)
voxel_size = (pc_range[3:] -pc_range[:3]) / grid_size
future_ious = [0, 0, 0, 0]
nusc = NuScenes(version='v1.0-trainval', dataroot="../../data/nuscenes", verbose=False)
# ========================================================================================================
lidar_token2sample_token = {}
for i in range(len(nusc.sample)):
my_sample = nusc.sample[i]
frame_token = my_sample['token']
lidar_token = my_sample['data']['LIDAR_TOP']
lidar_token2sample_token[lidar_token] = frame_token
def voxel2world(voxel):
"""
voxel: [N, 3]
"""
return voxel *voxel_size[None, :] + pc_range[:3][None, :]
def world2voxel(world):
"""
world: [N, 3]
"""
return (world - pc_range[:3][None, :]) / voxel_size[None, :]
def cm_to_ious(cm):
mean_ious = []
cls_num = len(cm)
for i in range(cls_num):
tp = cm[i, i]
p = cm[:, i].sum()
g = cm[i, :].sum()
union = p + g - tp
mean_ious.append(tp / union)
return mean_ious
def fast_hist(pred, label, max_label=18):
pred = copy.deepcopy(pred.flatten())
label = copy.deepcopy(label.flatten())
bin_count = np.bincount(max_label * label.astype(int) + pred, minlength=max_label ** 2)
iou_per_pred = (bin_count[-1]/(bin_count[-1]+bin_count[1]+bin_count[2]))
return bin_count[:max_label ** 2].reshape(max_label, max_label),iou_per_pred
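# nb_process_label assigns each voxel the majority label among the points that
# fall into it (sorted_label_voxel_pair must be sorted by voxel index)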
def nb_process_label(processed_label, sorted_label_voxel_pair):
label_size = 256
counter = np.zeros((label_size,), dtype=np.uint16)
counter[sorted_label_voxel_pair[0, 3]] = 1
cur_sear_ind = sorted_label_voxel_pair[0, :3]
for i in range(1, sorted_label_voxel_pair.shape[0]):
cur_ind = sorted_label_voxel_pair[i, :3]
if not np.all(np.equal(cur_ind, cur_sear_ind)):
processed_label[cur_sear_ind[0], cur_sear_ind[1], cur_sear_ind[2]] = np.argmax(counter)
counter = np.zeros((label_size,), dtype=np.uint16)
cur_sear_ind = cur_ind
counter[sorted_label_voxel_pair[i, 3]] += 1
processed_label[cur_sear_ind[0], cur_sear_ind[1], cur_sear_ind[2]] = np.argmax(counter)
return processed_label
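# Both helpers below return inverse transforms (negated translation, inverse
# rotation): get_ego2lidar_pose yields ego -> lidar, get_lidar_pose yields
# global -> ego; the alignment loop below composes them with .inverse as needed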
def get_ego2lidar_pose(rec):
lidar_top_data = nusc.get('sample_data', rec['data']['LIDAR_TOP'])
lidar2ego_translation = nusc.get('calibrated_sensor', lidar_top_data['calibrated_sensor_token'])['translation']
lidar2ego_rotation = nusc.get('calibrated_sensor', lidar_top_data['calibrated_sensor_token'])['rotation']
trans = -np.array(lidar2ego_translation)
rot = Quaternion(lidar2ego_rotation).inverse
return trans, rot
def get_lidar_pose(rec):
current_sample = nusc.get('sample', rec['token'])
egopose = nusc.get('ego_pose', nusc.get('sample_data', current_sample['data']['LIDAR_TOP'])['ego_pose_token'])
trans = -np.array(egopose['translation'])
rot = Quaternion(egopose['rotation']).inverse
return trans, rot
for i in trange(len(test_seqs)):
test_seqs_idxs = np.load(os.path.join(test_idx_dir, test_seqs[i]))['arr_0']
scene_token_present = test_seqs[i].split("_")[0]
lidar_token_present = test_seqs[i].split("_")[1][:-4]
# transform past point clouds to the present frame
# the point cloud prediction baseline is limited by the sparsity of laser points,
# so we aggregate past point clouds to mitigate this in the current version
# More reasonable versions will be released
past_voxels = []
for t in range(1, 4):
scene_token_ = test_seqs_idxs[t-1].split("_")[0]
lidar_token_ = test_seqs_idxs[t-1].split("_")[1]
point_file = test_results_dir+"point_clouds/"+scene_token_present+"_"+lidar_token_present+"/past/00000"+str(t)+".ply"
label_file = test_results_dir+"saved_labels/"+scene_token_present+"_"+lidar_token_present+"/past/00000"+str(t)+".label"
pcd_load = o3d.io.read_point_cloud(point_file)
xyz_load = np.asarray(pcd_load.points)
sample_token_present = lidar_token2sample_token[lidar_token_present]
rec_present = nusc.get('sample', sample_token_present)
translation_present, rotation_present = get_lidar_pose(rec_present)
ego2lidar_translation_present, ego2lidar_rotation_present = get_ego2lidar_pose(rec_present)
sample_token_ = lidar_token2sample_token[lidar_token_]
rec_ = nusc.get('sample', sample_token_)
translation_, rotation_ = get_lidar_pose(rec_)
ego2lidar_translation_, ego2lidar_rotation_ = get_ego2lidar_pose(rec_)
present_global2ego = [translation_present, rotation_present]
present_ego2lidar = [ego2lidar_translation_present, ego2lidar_rotation_present]
cur_global2ego = [translation_, rotation_]
cur_ego2lidar = [ego2lidar_translation_, ego2lidar_rotation_]
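# align the past cloud with the present frame:
# past lidar -> past ego -> global -> present ego -> present lidar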
pcd_np_cor = np.dot(cur_ego2lidar[1].inverse.rotation_matrix, xyz_load.T)
pcd_np_cor = pcd_np_cor.T
pcd_np_cor = pcd_np_cor - cur_ego2lidar[0]
pcd_np_cor = np.dot(cur_global2ego[1].inverse.rotation_matrix, pcd_np_cor.T)
pcd_np_cor = pcd_np_cor.T
pcd_np_cor = pcd_np_cor - cur_global2ego[0]
pcd_np_cor = pcd_np_cor + present_global2ego[0]
pcd_np_cor = np.dot(present_global2ego[1].rotation_matrix, pcd_np_cor.T)
pcd_np_cor = pcd_np_cor.T
pcd_np_cor = pcd_np_cor + present_ego2lidar[0] # trans
pcd_np_cor = np.dot(present_ego2lidar[1].rotation_matrix, pcd_np_cor.T)
xyz_load = pcd_np_cor.T
xyz_load = world2voxel(xyz_load)
label = np.fromfile(label_file, dtype=np.uint32)
label = label.reshape((-1,1))
segmentation_t = np.concatenate((xyz_load, label), axis=-1)
kept = (segmentation_t[:,0]>0) & (segmentation_t[:,0]<dimension[0]) & (segmentation_t[:,1]>0) & (segmentation_t[:,1]<dimension[1]) & (segmentation_t[:,2]>0) & (segmentation_t[:,2]<dimension[2])
# ... (remainder of this file truncated in this extract)
================================================
FILE: projects/configs/baselines/OCFNet_in_Cam4DOcc_V1.1.py
================================================
# Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications
# https://github.com/haomo-ai/Cam4DOcc
# 2 classes: inflated GMO and others
# Basic params ******************************************
_base_ = [
'../datasets/custom_nus-3d.py',
'../_base_/default_runtime.py'
]
find_unused_parameters = True
# whether training and test together with dataset generation
only_generate_dataset = False
# we only consider use_camera in Cam4DOcc in the current version
input_modality = dict(
use_lidar=False,
use_camera=True,
use_radar=False,
use_map=False,
use_external=False)
plugin = True
plugin_dir = "projects/occ_plugin/"
occ_path = "./data/nuScenes-Occupancy"
depth_gt_path = './data/depth_gt'
train_ann_file = "./data/nuscenes/nuscenes_occ_infos_train.pkl"
val_ann_file = "./data/nuscenes/nuscenes_occ_infos_val.pkl"
cam4docc_dataset_path = "./data/cam4docc/"
nusc_root = './data/nuscenes/'
# GMO class names
class_names = ['vehicle', 'human']
use_separate_classes = False
use_fine_occ = False
# Forecasting-related params ******************************************
# we use *time_receptive_field* past frames to forecast future *n_future_frames* frames
# for 3D instance prediction, n_future_frames_plus > n_future_frames has to be set
time_receptive_field = 3
n_future_frames = 4
n_future_frames_plus = 6
iou_thresh_for_vpq = 0.2
test_present = False
# Occupancy-related params ******************************************
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
occ_size = [512, 512, 40]
lss_downsample = [4, 4, 4]
voxel_x = (point_cloud_range[3] - point_cloud_range[0]) / occ_size[0]
voxel_y = (point_cloud_range[4] - point_cloud_range[1]) / occ_size[1]
voxel_z = (point_cloud_range[5] - point_cloud_range[2]) / occ_size[2]
empty_idx = 0
if use_separate_classes:
num_cls = len(class_names) + 1
else:
num_cls = 2
img_norm_cfg = None
# Save params ******************************************
save_pred = False
save_path = "./data/cam4docc/results"
# Data-generation and pipeline params ******************************************
dataset_type = 'Cam4DOccDataset'
file_client_args = dict(backend='disk')
data_config={
'cams': ['CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT',
'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT'],
'Ncams': 6,
'input_size': (896, 1600),
'src_size': (900, 1600),
# image-view augmentation
'resize': (-0.06, 0.11),
'rot': (-5.4, 5.4),
'flip': False,
'crop_h': (0.0, 0.0),
'resize_test': 0.00,
}
bda_aug_conf = dict(
rot_lim=(-0, 0),
scale_lim=(0.95, 1.05),
flip_dx_ratio=0.5,
flip_dy_ratio=0.5)
train_capacity = 23930 # default: use all sequences
test_capacity = 5119 # default: use all sequences
train_pipeline = [
dict(type='LoadInstanceWithFlow', cam4docc_dataset_path=cam4docc_dataset_path, grid_size=occ_size, use_flow=True, background=empty_idx, pc_range=point_cloud_range,
use_separate_classes=use_separate_classes),
dict(type='LoadMultiViewImageFromFiles_BEVDet', is_train=True, data_config=data_config,
sequential=False, aligned=True, trans_only=False, depth_gt_path=depth_gt_path, data_root=nusc_root,
mmlabnorm=True, load_depth=True, img_norm_cfg=img_norm_cfg),
dict(type='LoadOccupancy', to_float32=True, occ_path=occ_path, grid_size=occ_size, unoccupied=empty_idx, pc_range=point_cloud_range, use_fine_occ=use_fine_occ, test_mode=False),
dict(type='OccDefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['img_inputs_seq', 'gt_occ', 'segmentation', 'instance', 'flow', 'future_egomotion']),
]
test_pipeline = [
dict(type='LoadInstanceWithFlow', cam4docc_dataset_path=cam4docc_dataset_path, grid_size=occ_size, use_flow=True, background=empty_idx, pc_range=point_cloud_range,
use_separate_classes=use_separate_classes),
dict(type='LoadMultiViewImageFromFiles_BEVDet', data_config=data_config, depth_gt_path=depth_gt_path, data_root=nusc_root,
sequential=False, aligned=True, trans_only=False, mmlabnorm=True, img_norm_cfg=img_norm_cfg, test_mode=True),
dict(type='LoadOccupancy', to_float32=True, occ_path=occ_path, grid_size=occ_size, unoccupied=empty_idx, pc_range=point_cloud_range, use_fine_occ=use_fine_occ, test_mode=True),
dict(type='OccDefaultFormatBundle3D', class_names=class_names, with_label=False),
dict(type='Collect3D', keys=['img_inputs_seq', 'gt_occ', 'segmentation', 'instance', 'flow', 'future_egomotion'], meta_keys=['pc_range', 'occ_size', 'scene_token', 'lidar_token']),
]
train_config=dict(
type=dataset_type,
data_root=nusc_root,
occ_root=occ_path,
idx_root=cam4docc_dataset_path,
ori_data_root=cam4docc_dataset_path,
ann_file=train_ann_file,
pipeline=train_pipeline,
classes=class_names,
use_separate_classes=use_separate_classes,
modality=input_modality,
test_mode=False,
use_valid_flag=True,
occ_size=occ_size,
pc_range=point_cloud_range,
box_type_3d='LiDAR',
time_receptive_field=time_receptive_field,
n_future_frames=n_future_frames,
train_capacity=train_capacity,
test_capacity=test_capacity,
)
test_config=dict(
type=dataset_type,
occ_root=occ_path,
data_root=nusc_root,
idx_root=cam4docc_dataset_path,
ori_data_root=cam4docc_dataset_path,
ann_file=val_ann_file,
pipeline=test_pipeline,
classes=class_names,
use_separate_classes=use_separate_classes,
modality=input_modality,
occ_size=occ_size,
pc_range=point_cloud_range,
time_receptive_field=time_receptive_field,
n_future_frames=n_future_frames,
train_capacity=train_capacity,
test_capacity=test_capacity,
)
# in our work we use 8 NVIDIA A100 GPUs
data = dict(
samples_per_gpu=1,
workers_per_gpu=1,
train=train_config,
val=test_config,
test=test_config,
shuffler_sampler=dict(type='DistributedGroupSampler'),
nonshuffler_sampler=dict(type='DistributedSampler'),
)
# Model params ******************************************
grid_config = {
'xbound': [point_cloud_range[0], point_cloud_range[3], voxel_x*lss_downsample[0]],
'ybound': [point_cloud_range[1], point_cloud_range[4], voxel_y*lss_downsample[1]],
'zbound': [point_cloud_range[2], point_cloud_range[5], voxel_z*lss_downsample[2]],
'dbound': [2.0, 58.0, 0.5],
}
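# each *bound entry is [min, max, step]; with lss_downsample = [4, 4, 4] the
# LSS view transformer works on a 128x128x10 grid of 0.8 m cells, and dbound
# defines frustum depth bins from 2 m to 58 m in 0.5 m steps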
voxel_channels = [32*(time_receptive_field), 32*2*(time_receptive_field), 32*4*(time_receptive_field), 32*8*(time_receptive_field)]
pred_channels = [32, 32*2, 32*4, 32*8]
decoder_channels = [32*(n_future_frames_plus), 32*2*(n_future_frames_plus), 32*4*(n_future_frames_plus), 32*8*(n_future_frames_plus)]
numC_Trans = 64
occ_encoder_input_channel = (numC_Trans+6)*time_receptive_field
voxel_out_channel = 32*(n_future_frames_plus)
flow_out_channel = 32*(n_future_frames_plus)
voxel_out_channel_per_frame = 32
voxel_out_indices = (0, 1, 2, 3)
my_voxel_out_indices = (0, 1, 2, 3)
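# encoder channels stack the time_receptive_field observed frames along the
# channel dimension; decoder/neck channels scale with n_future_frames_plus
# (32 channels per predicted frame at the finest level)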
model = dict(
type='OCFNet',
only_generate_dataset=only_generate_dataset,
loss_norm=False,
disable_loss_depth=True,
point_cloud_range=point_cloud_range,
time_receptive_field=time_receptive_field,
n_future_frames=n_future_frames,
n_future_frames_plus=n_future_frames_plus,
max_label=num_cls,
iou_thresh_for_vpq=iou_thresh_for_vpq,
test_present=test_present,
record_time=False,
save_pred=save_pred,
save_path=save_path,
img_backbone=dict(
pretrained='torchvision://resnet50',
type='ResNet',
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=0,
with_cp=False,
norm_cfg=dict(type='SyncBN', requires_grad=True),
norm_eval=False,
style='pytorch'),
img_neck=dict(
type='SECONDFPN',
in_channels=[256, 512, 1024, 2048],
upsample_strides=[0.25, 0.5, 1, 2],
out_channels=[128, 128, 128, 128]),
img_view_transformer=dict(type='ViewTransformerLiftSplatShootVoxel',
norm_cfg=dict(type='SyncBN', requires_grad=True),
loss_depth_weight=3.,
loss_depth_type='kld',
grid_config=grid_config,
data_config=data_config,
numC_Trans=numC_Trans,
vp_megvii=False),
occ_encoder_backbone=dict(
type='CustomResNet3D',
depth=18,
n_input_channels=occ_encoder_input_channel,
block_inplanes=voxel_channels,
out_indices=my_voxel_out_indices,
norm_cfg=dict(type='SyncBN', requires_grad=True),
),
occ_predictor=dict(
type='Predictor',
n_input_channels=pred_channels,
in_timesteps=time_receptive_field,
out_timesteps=n_future_frames_plus,
norm_cfg=dict(type='SyncBN', requires_grad=True),
),
occ_encoder_neck=dict(
type='FPN3D',
with_cp=False,
in_channels=decoder_channels,
out_channels=voxel_out_channel,
norm_cfg=dict(type='SyncBN', requires_grad=True),
),
flow_encoder_backbone=dict(
type='CustomResNet3D',
depth=18,
n_input_channels=occ_encoder_input_channel,
block_inplanes=voxel_channels,
out_indices=my_voxel_out_indices,
norm_cfg=dict(type='SyncBN', requires_grad=True),
),
flow_predictor=dict(
type='Predictor',
n_input_channels=pred_channels,
in_timesteps=time_receptive_field,
out_timesteps=n_future_frames_plus,
norm_cfg=dict(type='SyncBN', requires_grad=True),
),
flow_encoder_neck=dict(
type='FPN3D',
with_cp=False,
in_channels=decoder_channels,
out_channels=flow_out_channel,
norm_cfg=dict(type='SyncBN', requires_grad=True),
),
flow_head=dict(
type='FlowHead',
norm_cfg=dict(type='SyncBN', requires_grad=True),
soft_weights=True,
final_occ_size=occ_size,
fine_topk=15000,
empty_idx=empty_idx,
num_level=len(my_voxel_out_indices),
in_channels=[voxel_out_channel_per_frame] * len(my_voxel_out_indices),
out_channel=3, # 3-dim flow
point_cloud_range=point_cloud_range,
),
pts_bbox_head=dict(
type='OccHead',
norm_cfg=dict(type='SyncBN', requires_grad=True),
soft_weights=True,
final_occ_size=occ_size,
fine_topk=15000,
empty_idx=empty_idx,
num_level=len(my_voxel_out_indices),
in_channels=[voxel_out_channel_per_frame] * len(my_voxel_out_indices),
out_channel=num_cls,
point_cloud_range=point_cloud_range,
loss_weight_cfg=dict(
loss_voxel_ce_weight=1.0,
loss_voxel_sem_scal_weight=1.0,
loss_voxel_geo_scal_weight=1.0,
loss_voxel_lovasz_weight=1.0,
),
),
empty_idx=empty_idx,
)
# Learning policy params ******************************************
optimizer = dict(
type='AdamW',
lr=3e-4,
paramwise_cfg=dict(
custom_keys={
'img_backbone': dict(lr_mult=0.1),
}),
weight_decay=0.01)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
lr_config = dict(
policy='CosineAnnealing',
warmup='linear',
warmup_iters=500,
warmup_ratio=1.0 / 3,
min_lr_ratio=1e-3)
runner = dict(type='EpochBasedRunner', max_epochs=24)
evaluation = dict(
interval=1,
pipeline=test_pipeline,
save_best='SSC_mean',
rule='greater',
)
custom_hooks = [
dict(type='OccEfficiencyHook'),
]
================================================
FILE: projects/configs/baselines/OCFNet_in_Cam4DOcc_V1.1_lyft.py
================================================
# Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications
# https://github.com/haomo-ai/Cam4DOcc
# 2 classes: inflated GMO and others
# Basic params ******************************************
_base_ = [
'../datasets/custom_nus-3d.py',
'../_base_/default_runtime.py'
]
find_unused_parameters = True
# whether training and test together with dataset generation
only_generate_dataset = False
# we only consider use_camera in Cam4DOcc in the current version
input_modality = dict(
use_lidar=False,
use_camera=True,
use_radar=False,
use_map=False,
use_external=False)
plugin = True
plugin_dir = "projects/occ_plugin/"
# paths unused for Lyft
occ_path = " "
depth_gt_path = " "
train_ann_file = " "
val_ann_file = " "
cam4docc_dataset_path = "./data/cam4docc/"
nusc_root = './data/lyft/'
# GMO class names
class_names = ['vehicle', 'human']
use_separate_classes = False
use_fine_occ = False
# Forecasting-related params ******************************************
# we use *time_receptive_field* past frames to forecast future *n_future_frames* frames
# for 3D instance prediction, n_future_frames_plus > n_future_frames has to be set
time_receptive_field = 3
n_future_frames = 4
n_future_frames_plus = 6
iou_thresh_for_vpq = 0.2
test_present = False
# Occupancy-related params ******************************************
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
occ_size = [512, 512, 40]
lss_downsample = [4, 4, 4]
voxel_x = (point_cloud_range[3] - point_cloud_range[0]) / occ_size[0]
voxel_y = (point_cloud_range[4] - point_cloud_range[1]) / occ_size[1]
voxel_z = (point_cloud_range[5] - point_cloud_range[2]) / occ_size[2]
empty_idx = 0
if use_separate_classes:
num_cls = len(class_names) + 1
else:
num_cls = 2
img_norm_cfg = None
# Save params ******************************************
save_pred = False
save_path = "./data/cam4docc/results"
# Data-generation and pipeline params ******************************************
dataset_type = 'Cam4DOccLyftDataset'
file_client_args = dict(backend='disk')
data_config={
'cams': ['CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT',
'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT'],
'Ncams': 6,
'input_size': (896, 1600),
'src_size': (900, 1600),
# image-view augmentation
'resize': (-0.06, 0.11),
'rot': (-5.4, 5.4),
'flip': False,
'crop_h': (0.0, 0.0),
'resize_test': 0.00,
}
bda_aug_conf = dict(
rot_lim=(-0, 0),
scale_lim=(0.95, 1.05),
flip_dx_ratio=0.5,
flip_dy_ratio=0.5)
train_capacity = 15720 # default: use all sequences
test_capacity = 5880 # default: use all sequences
train_pipeline = [
dict(type='LoadInstanceWithFlow', cam4docc_dataset_path=cam4docc_dataset_path, grid_size=occ_size, use_flow=True, background=empty_idx, pc_range=point_cloud_range,
use_separate_classes=use_separate_classes, use_lyft=True),
dict(type='LoadMultiViewImageFromFiles_BEVDet', is_train=True, data_config=data_config,
sequential=False, aligned=True, trans_only=False, depth_gt_path=depth_gt_path, data_root=nusc_root,
mmlabnorm=True, load_depth=True, img_norm_cfg=img_norm_cfg, use_lyft=True),
dict(type='LoadOccupancy', to_float32=True, occ_path=occ_path, grid_size=occ_size, unoccupied=empty_idx, pc_range=point_cloud_range, use_fine_occ=use_fine_occ, test_mode=False),
dict(type='OccDefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['img_inputs_seq', 'gt_occ', 'segmentation', 'instance', 'flow', 'future_egomotion']),
]
test_pipeline = [
dict(type='LoadInstanceWithFlow', cam4docc_dataset_path=cam4docc_dataset_path, grid_size=occ_size, use_flow=True, background=empty_idx, pc_range=point_cloud_range,
use_separate_classes=use_separate_classes, use_lyft=True),
dict(type='LoadMultiViewImageFromFiles_BEVDet', data_config=data_config, depth_gt_path=depth_gt_path, data_root=nusc_root,
sequential=False, aligned=True, trans_only=False, mmlabnorm=True, img_norm_cfg=img_norm_cfg, test_mode=True, use_lyft=True),
dict(type='LoadOccupancy', to_float32=True, occ_path=occ_path, grid_size=occ_size, unoccupied=empty_idx, pc_range=point_cloud_range, use_fine_occ=use_fine_occ, test_mode=True),
dict(type='OccDefaultFormatBundle3D', class_names=class_names, with_label=False),
dict(type='Collect3D', keys=['img_inputs_seq', 'gt_occ', 'segmentation', 'instance', 'flow', 'future_egomotion'], meta_keys=['pc_range', 'occ_size', 'scene_token', 'lidar_token']),
]
train_config=dict(
type=dataset_type,
data_root=nusc_root,
occ_root=occ_path,
idx_root=cam4docc_dataset_path,
ori_data_root=cam4docc_dataset_path,
ann_file=train_ann_file,
pipeline=train_pipeline,
classes=class_names,
use_separate_classes=use_separate_classes,
modality=input_modality,
test_mode=False,
use_valid_flag=True,
occ_size=occ_size,
pc_range=point_cloud_range,
box_type_3d='LiDAR',
time_receptive_field=time_receptive_field,
n_future_frames=n_future_frames,
train_capacity=train_capacity,
test_capacity=test_capacity,
)
test_config=dict(
type=dataset_type,
occ_root=occ_path,
data_root=nusc_root,
idx_root=cam4docc_dataset_path,
ori_data_root=cam4docc_dataset_path,
ann_file=val_ann_file,
pipeline=test_pipeline,
classes=class_names,
use_separate_classes=use_separate_classes,
modality=input_modality,
occ_size=occ_size,
pc_range=point_cloud_range,
time_receptive_field=time_receptive_field,
n_future_frames=n_future_frames,
train_capacity=train_capacity,
test_capacity=test_capacity,
)
# in our work we use 8 NVIDIA A100 GPUs
data = dict(
samples_per_gpu=1,
workers_per_gpu=1,
train=train_config,
val=test_config,
test=test_config,
shuffler_sampler=dict(type='DistributedGroupSampler'),
nonshuffler_sampler=dict(type='DistributedSampler'),
)
# Model params ******************************************
grid_config = {
'xbound': [point_cloud_range[0], point_cloud_range[3], voxel_x*lss_downsample[0]],
'ybound': [point_cloud_range[1], point_cloud_range[4], voxel_y*lss_downsample[1]],
'zbound': [point_cloud_range[2], point_cloud_range[5], voxel_z*lss_downsample[2]],
'dbound': [2.0, 58.0, 0.5],
}
voxel_channels = [32*(time_receptive_field), 32*2*(time_receptive_field), 32*4*(time_receptive_field), 32*8*(time_receptive_field)]
pred_channels = [32, 32*2, 32*4, 32*8]
decoder_channels = [32*(n_future_frames_plus), 32*2*(n_future_frames_plus), 32*4*(n_future_frames_plus), 32*8*(n_future_frames_plus)]
numC_Trans = 64
occ_encoder_input_channel = (numC_Trans+6)*time_receptive_field
voxel_out_channel = 32*(n_future_frames_plus)
flow_out_channel = 32*(n_future_frames_plus)
voxel_out_channel_per_frame = 32
voxel_out_indices = (0, 1, 2, 3)
my_voxel_out_indices = (0, 1, 2, 3)
model = dict(
type='OCFNet',
only_generate_dataset=only_generate_dataset,
loss_norm=False,
disable_loss_depth=True,
point_cloud_range=point_cloud_range,
time_receptive_field=time_receptive_field,
n_future_frames=n_future_frames,
n_future_frames_plus=n_future_frames_plus,
max_label=num_cls,
iou_thresh_for_vpq=iou_thresh_for_vpq,
test_present=test_present,
record_time=False,
save_pred=save_pred,
save_path=save_path,
img_backbone=dict(
pretrained='torchvision://resnet50',
type='ResNet',
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=0,
with_cp=False,
norm_cfg=dict(type='SyncBN', requires_grad=True),
norm_eval=False,
style='pytorch'),
img_neck=dict(
type='SECONDFPN',
in_channels=[256, 512, 1024, 2048],
upsample_strides=[0.25, 0.5, 1, 2],
out_channels=[128, 128, 128, 128]),
img_view_transformer=dict(type='ViewTransformerLiftSplatShootVoxel',
norm_cfg=dict(type='SyncBN', requires_grad=True),
loss_depth_weight=3.,
loss_depth_type='kld',
grid_config=grid_config,
data_config=data_config,
numC_Trans=numC_Trans,
vp_megvii=False),
occ_encoder_backbone=dict(
type='CustomResNet3D',
depth=18,
n_input_channels=occ_encoder_input_channel,
block_inplanes=voxel_channels,
out_indices=my_voxel_out_indices,
norm_cfg=dict(type='SyncBN', requires_grad=True),
),
occ_predictor=dict(
type='Predictor',
n_input_channels=pred_channels,
in_timesteps=time_receptive_field,
out_timesteps=n_future_frames_plus,
norm_cfg=dict(type='SyncBN', requires_grad=True),
),
occ_encoder_neck=dict(
type='FPN3D',
with_cp=False,
in_channels=decoder_channels,
out_channels=voxel_out_channel,
norm_cfg=dict(type='SyncBN', requires_grad=True),
),
flow_encoder_backbone=dict(
type='CustomResNet3D',
depth=18,
n_input_channels=occ_encoder_input_channel,
block_inplanes=voxel_channels,
out_indices=my_voxel_out_indices,
norm_cfg=dict(type='SyncBN', requires_grad=True),
),
flow_predictor=dict(
type='Predictor',
n_input_channels=pred_channels,
in_timesteps=time_receptive_field,
out_timesteps=n_future_frames_plus,
norm_cfg=dict(type='SyncBN', requires_grad=True),
),
flow_encoder_neck=dict(
type='FPN3D',
with_cp=False,
in_channels=decoder_channels,
out_channels=flow_out_channel,
norm_cfg=dict(type='SyncBN', requires_grad=True),
),
flow_head=dict(
type='FlowHead',
norm_cfg=dict(type='SyncBN', requires_grad=True),
soft_weights=True,
final_occ_size=occ_size,
fine_topk=15000,
empty_idx=empty_idx,
num_level=len(my_voxel_out_indices),
in_channels=[voxel_out_channel_per_frame] * len(my_voxel_out_indices),
out_channel=3, # 3-dim flow
point_cloud_range=point_cloud_range,
),
pts_bbox_head=dict(
type='OccHead',
norm_cfg=dict(type='SyncBN', requires_grad=True),
soft_weights=True,
final_occ_size=occ_size,
fine_topk=15000,
empty_idx=empty_idx,
num_level=len(my_voxel_out_indices),
in_channels=[voxel_out_channel_per_frame] * len(my_voxel_out_indices),
out_channel=num_cls,
point_cloud_range=point_cloud_range,
loss_weight_cfg=dict(
loss_voxel_ce_weight=1.0,
loss_voxel_sem_scal_weight=1.0,
loss_voxel_geo_scal_weight=1.0,
loss_voxel_lovasz_weight=1.0,
),
),
empty_idx=empty_idx,
)
# Learning policy params ******************************************
optimizer = dict(
type='AdamW',
lr=3e-4,
paramwise_cfg=dict(
custom_keys={
'img_backbone': dict(lr_mult=0.1),
}),
weight_decay=0.01)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
lr_config = dict(
policy='CosineAnnealing',
warmup='linear',
warmup_iters=500,
warmup_ratio=1.0 / 3,
min_lr_ratio=1e-3)
runner = dict(type='EpochBasedRunner', max_epochs=24)
evaluation = dict(
interval=1,
pipeline=test_pipeline,
save_best='SSC_mean',
rule='greater',
)
custom_hooks = [
dict(type='OccEfficiencyHook'),
]
================================================
FILE: projects/configs/baselines/OCFNet_in_Cam4DOcc_V1.2.py
================================================
# Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications
# https://github.com/haomo-ai/Cam4DOcc
# multiple classes: inflated multiple MO classes
# Basic params ******************************************
_base_ = [
'../datasets/custom_nus-3d.py',
'../_base_/default_runtime.py'
]
find_unused_parameters = True
# whether training and test together with dataset generation
only_generate_dataset = False
# we only consider use_camera in Cam4DOcc in the current version
input_modality = dict(
use_lidar=False,
use_camera=True,
use_radar=False,
use_map=False,
use_external=False)
plugin = True
plugin_dir = "projects/occ_plugin/"
occ_path = "./data/nuScenes-Occupancy"
depth_gt_path = './data/depth_gt'
train_ann_file = "./data/nuscenes/nuscenes_occ_infos_train.pkl"
val_ann_file = "./data/nuscenes/nuscenes_occ_infos_val.pkl"
cam4docc_dataset_path = "./data/cam4docc/"
nusc_root = './data/nuscenes/'
# GMO class names
class_names = [
'vehicle.bicycle', 'bus', 'car', 'construction', 'motorcycle', 'trailer', 'truck', 'pedestrian'
]
use_separate_classes = True
use_fine_occ = False
# Forecasting-related params ******************************************
# we use *time_receptive_field* past frames to forecast future *n_future_frames* frames
# for 3D instance prediction, n_future_frames_plus must be set larger than n_future_frames
time_receptive_field = 3
n_future_frames = 4
n_future_frames_plus = 6
iou_thresh_for_vpq = 0.2
test_present = False
# Occupancy-related params ******************************************
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
occ_size = [512, 512, 40]
lss_downsample = [4, 4, 4]
voxel_x = (point_cloud_range[3] - point_cloud_range[0]) / occ_size[0]
voxel_y = (point_cloud_range[4] - point_cloud_range[1]) / occ_size[1]
voxel_z = (point_cloud_range[5] - point_cloud_range[2]) / occ_size[2]
empty_idx = 0
if use_separate_classes:
num_cls = len(class_names) + 1
else:
num_cls = 2
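# For reference (worked example under the defaults above, illustrative comment
# only): with the 8 GMO classes in class_names and use_separate_classes=True,
# num_cls = 8 + 1 = 9, where index 0 is reserved for empty voxels (empty_idx);
# otherwise the task collapses to binary occupied-vs-empty with num_cls = 2.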
img_norm_cfg = None
# Save params ******************************************
save_pred = False
save_path = "./data/cam4docc/results"
# Data-generation and pipeline params ******************************************
dataset_type = 'Cam4DOccDataset'
file_client_args = dict(backend='disk')
data_config={
'cams': ['CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT',
'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT'],
'Ncams': 6,
'input_size': (896, 1600),
'src_size': (900, 1600),
# image-view augmentation
'resize': (-0.06, 0.11),
'rot': (-5.4, 5.4),
'flip': False,
'crop_h': (0.0, 0.0),
'resize_test': 0.00,
}
bda_aug_conf = dict(
rot_lim=(-0, 0),
scale_lim=(0.95, 1.05),
flip_dx_ratio=0.5,
flip_dy_ratio=0.5)
train_capacity = 23930 # default: use all sequences
test_capacity = 5119 # default: use all sequences
train_pipeline = [
dict(type='LoadInstanceWithFlow', cam4docc_dataset_path=cam4docc_dataset_path, grid_size=occ_size, use_flow=True, background=empty_idx, pc_range=point_cloud_range,
use_separate_classes=use_separate_classes),
dict(type='LoadMultiViewImageFromFiles_BEVDet', is_train=True, data_config=data_config,
sequential=False, aligned=True, trans_only=False, depth_gt_path=depth_gt_path, data_root=nusc_root,
mmlabnorm=True, load_depth=True, img_norm_cfg=img_norm_cfg),
dict(type='LoadOccupancy', to_float32=True, occ_path=occ_path, grid_size=occ_size, unoccupied=empty_idx, pc_range=point_cloud_range, use_fine_occ=use_fine_occ, test_mode=False),
dict(type='OccDefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['img_inputs_seq', 'gt_occ', 'segmentation', 'instance', 'flow', 'future_egomotion']),
]
test_pipeline = [
dict(type='LoadInstanceWithFlow', cam4docc_dataset_path=cam4docc_dataset_path, grid_size=occ_size, use_flow=True, background=empty_idx, pc_range=point_cloud_range,
use_separate_classes=use_separate_classes),
dict(type='LoadMultiViewImageFromFiles_BEVDet', data_config=data_config, depth_gt_path=depth_gt_path, data_root=nusc_root,
sequential=False, aligned=True, trans_only=False, mmlabnorm=True, img_norm_cfg=img_norm_cfg, test_mode=True),
dict(type='LoadOccupancy', to_float32=True, occ_path=occ_path, grid_size=occ_size, unoccupied=empty_idx, pc_range=point_cloud_range, use_fine_occ=use_fine_occ, test_mode=True),
dict(type='OccDefaultFormatBundle3D', class_names=class_names, with_label=False),
dict(type='Collect3D', keys=['img_inputs_seq', 'gt_occ', 'segmentation', 'instance', 'flow', 'future_egomotion'], meta_keys=['pc_range', 'occ_size', 'scene_token', 'lidar_token']),
]
train_config=dict(
type=dataset_type,
data_root=nusc_root,
occ_root=occ_path,
idx_root=cam4docc_dataset_path,
ori_data_root=cam4docc_dataset_path,
ann_file=train_ann_file,
pipeline=train_pipeline,
classes=class_names,
use_separate_classes=use_separate_classes,
modality=input_modality,
test_mode=False,
use_valid_flag=True,
occ_size=occ_size,
pc_range=point_cloud_range,
box_type_3d='LiDAR',
time_receptive_field=time_receptive_field,
n_future_frames=n_future_frames,
train_capacity=train_capacity,
test_capacity=test_capacity,
)
test_config=dict(
type=dataset_type,
occ_root=occ_path,
data_root=nusc_root,
idx_root=cam4docc_dataset_path,
ori_data_root=cam4docc_dataset_path,
ann_file=val_ann_file,
pipeline=test_pipeline,
classes=class_names,
use_separate_classes=use_separate_classes,
modality=input_modality,
occ_size=occ_size,
pc_range=point_cloud_range,
time_receptive_field=time_receptive_field,
n_future_frames=n_future_frames,
train_capacity=train_capacity,
test_capacity=test_capacity,
)
# in our work we use 8 NVIDIA A100 GPUs
data = dict(
samples_per_gpu=1,
workers_per_gpu=1,
train=train_config,
val=test_config,
test=test_config,
shuffler_sampler=dict(type='DistributedGroupSampler'),
nonshuffler_sampler=dict(type='DistributedSampler'),
)
# Model params ******************************************
grid_config = {
'xbound': [point_cloud_range[0], point_cloud_range[3], voxel_x*lss_downsample[0]],
'ybound': [point_cloud_range[1], point_cloud_range[4], voxel_y*lss_downsample[1]],
'zbound': [point_cloud_range[2], point_cloud_range[5], voxel_z*lss_downsample[2]],
'dbound': [2.0, 58.0, 0.5],
}
voxel_channels = [32*(time_receptive_field), 32*2*(time_receptive_field), 32*4*(time_receptive_field), 32*8*(time_receptive_field)]
pred_channels = [32, 32*2, 32*4, 32*8]
decoder_channels = [32*(n_future_frames_plus), 32*2*(n_future_frames_plus), 32*4*(n_future_frames_plus), 32*8*(n_future_frames_plus)]
numC_Trans = 64
occ_encoder_input_channel = (numC_Trans+6)*time_receptive_field
voxel_out_channel = 32*(n_future_frames_plus)
flow_out_channel = 32*(n_future_frames_plus)
voxel_out_channel_per_frame = 32
voxel_out_indices = (0, 1, 2, 3)
my_voxel_out_indices = (0, 1, 2, 3)
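# Channel bookkeeping for the definitions above, worked out under this
# config's defaults (illustrative comments only):
#   occ_encoder_input_channel = (64 + 6) * 3  = 210    # 3 stacked past frames
#   voxel_channels            = [96, 192, 384, 768]    # base widths x 3 past frames
#   decoder_channels          = [192, 384, 768, 1536]  # base widths x 6 forecast frames
#   voxel_out_channel = flow_out_channel = 32 * 6 = 192
# i.e., per-frame features of width 32 stay concatenated along channels for
# all n_future_frames_plus forecast steps.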
model = dict(
type='OCFNet',
only_generate_dataset=only_generate_dataset,
loss_norm=False,
disable_loss_depth=True,
point_cloud_range=point_cloud_range,
time_receptive_field=time_receptive_field,
n_future_frames=n_future_frames,
n_future_frames_plus=n_future_frames_plus,
max_label=num_cls,
iou_thresh_for_vpq=iou_thresh_for_vpq,
test_present=test_present,
record_time=False,
save_pred=save_pred,
save_path=save_path,
img_backbone=dict(
pretrained='torchvision://resnet50',
type='ResNet',
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=0,
with_cp=False,
norm_cfg=dict(type='SyncBN', requires_grad=True),
norm_eval=False,
style='pytorch'),
img_neck=dict(
type='SECONDFPN',
in_channels=[256, 512, 1024, 2048],
upsample_strides=[0.25, 0.5, 1, 2],
out_channels=[128, 128, 128, 128]),
img_view_transformer=dict(type='ViewTransformerLiftSplatShootVoxel',
norm_cfg=dict(type='SyncBN', requires_grad=True),
loss_depth_weight=3.,
loss_depth_type='kld',
grid_config=grid_config,
data_config=data_config,
numC_Trans=numC_Trans,
vp_megvii=False),
occ_encoder_backbone=dict(
type='CustomResNet3D',
depth=18,
n_input_channels=occ_encoder_input_channel,
block_inplanes=voxel_channels,
out_indices=my_voxel_out_indices,
norm_cfg=dict(type='SyncBN', requires_grad=True),
),
occ_predictor=dict(
type='Predictor',
n_input_channels=pred_channels,
in_timesteps=time_receptive_field,
out_timesteps=n_future_frames_plus,
norm_cfg=dict(type='SyncBN', requires_grad=True),
),
occ_encoder_neck=dict(
type='FPN3D',
with_cp=False,
in_channels=decoder_channels,
out_channels=voxel_out_channel,
norm_cfg=dict(type='SyncBN', requires_grad=True),
),
flow_encoder_backbone=dict(
type='CustomResNet3D',
depth=18,
n_input_channels=occ_encoder_input_channel,
block_inplanes=voxel_channels,
out_indices=my_voxel_out_indices,
norm_cfg=dict(type='SyncBN', requires_grad=True),
),
flow_predictor=dict(
type='Predictor',
n_input_channels=pred_channels,
in_timesteps=time_receptive_field,
out_timesteps=n_future_frames_plus,
norm_cfg=dict(type='SyncBN', requires_grad=True),
),
flow_encoder_neck=dict(
type='FPN3D',
with_cp=False,
in_channels=decoder_channels,
out_channels=flow_out_channel,
norm_cfg=dict(type='SyncBN', requires_grad=True),
),
flow_head=dict(
type='FlowHead',
norm_cfg=dict(type='SyncBN', requires_grad=True),
soft_weights=True,
final_occ_size=occ_size,
fine_topk=15000,
empty_idx=empty_idx,
num_level=len(my_voxel_out_indices),
in_channels=[voxel_out_channel_per_frame] * len(my_voxel_out_indices),
out_channel=3, # 3-dim flow
point_cloud_range=point_cloud_range,
),
pts_bbox_head=dict(
type='OccHead',
norm_cfg=dict(type='SyncBN', requires_grad=True),
soft_weights=True,
final_occ_size=occ_size,
fine_topk=15000,
empty_idx=empty_idx,
num_level=len(my_voxel_out_indices),
in_channels=[voxel_out_channel_per_frame] * len(my_voxel_out_indices),
out_channel=num_cls,
point_cloud_range=point_cloud_range,
loss_weight_cfg=dict(
loss_voxel_ce_weight=1.0,
loss_voxel_sem_scal_weight=1.0,
loss_voxel_geo_scal_weight=1.0,
loss_voxel_lovasz_weight=1.0,
),
),
empty_idx=empty_idx,
)
# Learning policy params ******************************************
optimizer = dict(
type='AdamW',
lr=3e-4,
paramwise_cfg=dict(
custom_keys={
'img_backbone': dict(lr_mult=0.1),
}),
weight_decay=0.01)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
lr_config = dict(
policy='CosineAnnealing',
warmup='linear',
warmup_iters=500,
warmup_ratio=1.0 / 3,
min_lr_ratio=1e-3)
runner = dict(type='EpochBasedRunner', max_epochs=24)
evaluation = dict(
interval=1,
pipeline=test_pipeline,
save_best='SSC_mean',
rule='greater',
)
custom_hooks = [
dict(type='OccEfficiencyHook'),
]
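# Minimal loading sketch (illustrative; not part of the original config, and
# it assumes mmcv is installed). mmcv imports config files under a temporary
# module name, so the guard below never runs during normal config loading:
if __name__ == '__main__':
    from mmcv import Config
    cfg = Config.fromfile(__file__)
    # values inherited from _base_ are merged with the dicts defined above
    print(cfg.model['type'], cfg.runner['max_epochs'], cfg.data['samples_per_gpu'])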
================================================
FILE: projects/configs/baselines/OCFNet_in_Cam4DOcc_V1.2_lyft.py
================================================
# Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications
# https://github.com/haomo-ai/Cam4DOcc
# multiple classes: inflated multiple MO classes
# Basic params ******************************************
_base_ = [
'../datasets/custom_nus-3d.py',
'../_base_/default_runtime.py'
]
find_unused_parameters = True
# whether to only generate the dataset, instead of training and testing together with dataset generation
only_generate_dataset = False
# we only consider use_camera in Cam4DOcc in the current version
input_modality = dict(
use_lidar=False,
use_camera=True,
use_radar=False,
use_map=False,
use_external=False)
plugin = True
plugin_dir = "projects/occ_plugin/"
# the following paths are unused for Lyft
occ_path = " "
depth_gt_path = " "
train_ann_file = " "
val_ann_file = " "
cam4docc_dataset_path = "./data/cam4docc/"
nusc_root = './data/lyft/'
# GMO class names
# refine the classes for the Lyft dataset according to your needs
class_names = [
'bicycle', 'bus', 'car', 'construction', 'motorcycle', 'trailer', 'truck', 'pedestrian'
]
use_separate_classes = True
use_fine_occ = False
# Forecasting-related params ******************************************
# we use *time_receptive_field* past frames to forecast future *n_future_frames* frames
# for 3D instance prediction, n_future_frames_plus must be set larger than n_future_frames
time_receptive_field = 3
n_future_frames = 4
n_future_frames_plus = 6
iou_thresh_for_vpq = 0.2
test_present = False
# Occupancy-related params ******************************************
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
occ_size = [512, 512, 40]
lss_downsample = [4, 4, 4]
voxel_x = (point_cloud_range[3] - point_cloud_range[0]) / occ_size[0]
voxel_y = (point_cloud_range[4] - point_cloud_range[1]) / occ_size[1]
voxel_z = (point_cloud_range[5] - point_cloud_range[2]) / occ_size[2]
empty_idx = 0
if use_separate_classes:
num_cls = len(class_names) + 1
else:
num_cls = 2
img_norm_cfg = None
# Save params ******************************************
save_pred = False
save_path = "./data/cam4docc/results"
# Data-generation and pipeline params ******************************************
dataset_type = 'Cam4DOccLyftDataset'
file_client_args = dict(backend='disk')
data_config={
'cams': ['CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT',
'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT'],
'Ncams': 6,
'input_size': (896, 1600),
'src_size': (900, 1600),
# image-view augmentation
'resize': (-0.06, 0.11),
'rot': (-5.4, 5.4),
'flip': False,
'crop_h': (0.0, 0.0),
'resize_test': 0.00,
}
bda_aug_conf = dict(
rot_lim=(-0, 0),
scale_lim=(0.95, 1.05),
flip_dx_ratio=0.5,
flip_dy_ratio=0.5)
train_capacity = 15720 # default: use all sequences
test_capacity = 5880 # default: use all sequences
train_pipeline = [
dict(type='LoadInstanceWithFlow', cam4docc_dataset_path=cam4docc_dataset_path, grid_size=occ_size, use_flow=True, background=empty_idx, pc_range=point_cloud_range,
use_separate_classes=use_separate_classes, use_lyft=True),
dict(type='LoadMultiViewImageFromFiles_BEVDet', is_train=True, data_config=data_config,
sequential=False, aligned=True, trans_only=False, depth_gt_path=depth_gt_path, data_root=nusc_root,
mmlabnorm=True, load_depth=True, img_norm_cfg=img_norm_cfg, use_lyft=True),
dict(type='LoadOccupancy', to_float32=True, occ_path=occ_path, grid_size=occ_size, unoccupied=empty_idx, pc_range=point_cloud_range, use_fine_occ=use_fine_occ, test_mode=False),
dict(type='OccDefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['img_inputs_seq', 'gt_occ', 'segmentation', 'instance', 'flow', 'future_egomotion']),
]
test_pipeline = [
dict(type='LoadInstanceWithFlow', cam4docc_dataset_path=cam4docc_dataset_path, grid_size=occ_size, use_flow=True, background=empty_idx, pc_range=point_cloud_range,
use_separate_classes=use_separate_classes, use_lyft=True),
dict(type='LoadMultiViewImageFromFiles_BEVDet', data_config=data_config, depth_gt_path=depth_gt_path, data_root=nusc_root,
sequential=False, aligned=True, trans_only=False, mmlabnorm=True, img_norm_cfg=img_norm_cfg, test_mode=True, use_lyft=True),
dict(type='LoadOccupancy', to_float32=True, occ_path=occ_path, grid_size=occ_size, unoccupied=empty_idx, pc_range=point_cloud_range, use_fine_occ=use_fine_occ, test_mode=True),
dict(type='OccDefaultFormatBundle3D', class_names=class_names, with_label=False),
dict(type='Collect3D', keys=['img_inputs_seq', 'gt_occ', 'segmentation', 'instance', 'flow', 'future_egomotion'], meta_keys=['pc_range', 'occ_size', 'scene_token', 'lidar_token']),
]
train_config=dict(
type=dataset_type,
data_root=nusc_root,
occ_root=occ_path,
idx_root=cam4docc_dataset_path,
ori_data_root=cam4docc_dataset_path,
ann_file=train_ann_file,
pipeline=train_pipeline,
classes=class_names,
use_separate_classes=use_separate_classes,
modality=input_modality,
test_mode=False,
use_valid_flag=True,
occ_size=occ_size,
pc_range=point_cloud_range,
box_type_3d='LiDAR',
time_receptive_field=time_receptive_field,
n_future_frames=n_future_frames,
train_capacity=train_capacity,
test_capacity=test_capacity,
)
test_config=dict(
type=dataset_type,
occ_root=occ_path,
data_root=nusc_root,
idx_root=cam4docc_dataset_path,
ori_data_root=cam4docc_dataset_path,
ann_file=val_ann_file,
pipeline=test_pipeline,
classes=class_names,
use_separate_classes=use_separate_classes,
modality=input_modality,
occ_size=occ_size,
pc_range=point_cloud_range,
time_receptive_field=time_receptive_field,
n_future_frames=n_future_frames,
train_capacity=train_capacity,
test_capacity=test_capacity,
)
# in our work we use 8 NVIDIA A100 GPUs
data = dict(
samples_per_gpu=1,
workers_per_gpu=1,
train=train_config,
val=test_config,
test=test_config,
shuffler_sampler=dict(type='DistributedGroupSampler'),
nonshuffler_sampler=dict(type='DistributedSampler'),
)
# Model params ******************************************
grid_config = {
'xbound': [point_cloud_range[0], point_cloud_range[3], voxel_x*lss_downsample[0]],
'ybound': [point_cloud_range[1], point_cloud_range[4], voxel_y*lss_downsample[1]],
'zbound': [point_cloud_range[2], point_cloud_range[5], voxel_z*lss_downsample[2]],
'dbound': [2.0, 58.0, 0.5],
}
voxel_channels = [32*(time_receptive_field), 32*2*(time_receptive_field), 32*4*(time_receptive_field), 32*8*(time_receptive_field)]
pred_channels = [32, 32*2, 32*4, 32*8]
decoder_channels = [32*(n_future_frames_plus), 32*2*(n_future_frames_plus), 32*4*(n_future_frames_plus), 32*8*(n_future_frames_plus)]
numC_Trans = 64
occ_encoder_input_channel = (numC_Trans+6)*time_receptive_field
voxel_out_channel = 32*(n_future_frames_plus)
flow_out_channel = 32*(n_future_frames_plus)
voxel_out_channel_per_frame = 32
voxel_out_indices = (0, 1, 2, 3)
my_voxel_out_indices = (0, 1, 2, 3)
model = dict(
type='OCFNet',
only_generate_dataset=only_generate_dataset,
loss_norm=False,
disable_loss_depth=True,
point_cloud_range=point_cloud_range,
time_receptive_field=time_receptive_field,
n_future_frames=n_future_frames,
n_future_frames_plus=n_future_frames_plus,
max_label=num_cls,
iou_thresh_for_vpq=iou_thresh_for_vpq,
test_present=test_present,
record_time=False,
save_pred=save_pred,
save_path=save_path,
img_backbone=dict(
pretrained='torchvision://resnet50',
type='ResNet',
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=0,
with_cp=False,
norm_cfg=dict(type='SyncBN', requires_grad=True),
norm_eval=False,
style='pytorch'),
img_neck=dict(
type='SECONDFPN',
in_channels=[256, 512, 1024, 2048],
upsample_strides=[0.25, 0.5, 1, 2],
out_channels=[128, 128, 128, 128]),
img_view_transformer=dict(type='ViewTransformerLiftSplatShootVoxel',
norm_cfg=dict(type='SyncBN', requires_grad=True),
loss_depth_weight=3.,
loss_depth_type='kld',
grid_config=grid_config,
data_config=data_config,
numC_Trans=numC_Trans,
vp_megvii=False),
occ_encoder_backbone=dict(
type='CustomResNet3D',
depth=18,
n_input_channels=occ_encoder_input_channel,
block_inplanes=voxel_channels,
out_indices=my_voxel_out_indices,
norm_cfg=dict(type='SyncBN', requires_grad=True),
),
occ_predictor=dict(
type='Predictor',
n_input_channels=pred_channels,
in_timesteps=time_receptive_field,
out_timesteps=n_future_frames_plus,
norm_cfg=dict(type='SyncBN', requires_grad=True),
),
occ_encoder_neck=dict(
type='FPN3D',
with_cp=False,
in_channels=decoder_channels,
out_channels=voxel_out_channel,
norm_cfg=dict(type='SyncBN', requires_grad=True),
),
flow_encoder_backbone=dict(
type='CustomResNet3D',
depth=18,
n_input_channels=occ_encoder_input_channel,
block_inplanes=voxel_channels,
out_indices=my_voxel_out_indices,
norm_cfg=dict(type='SyncBN', requires_grad=True),
),
flow_predictor=dict(
type='Predictor',
n_input_channels=pred_channels,
in_timesteps=time_receptive_field,
out_timesteps=n_future_frames_plus,
norm_cfg=dict(type='SyncBN', requires_grad=True),
),
flow_encoder_neck=dict(
type='FPN3D',
with_cp=False,
in_channels=decoder_channels,
out_channels=flow_out_channel,
norm_cfg=dict(type='SyncBN', requires_grad=True),
),
flow_head=dict(
type='FlowHead',
norm_cfg=dict(type='SyncBN', requires_grad=True),
soft_weights=True,
final_occ_size=occ_size,
fine_topk=15000,
empty_idx=empty_idx,
num_level=len(my_voxel_out_indices),
in_channels=[voxel_out_channel_per_frame] * len(my_voxel_out_indices),
out_channel=3, # 3-dim flow
point_cloud_range=point_cloud_range,
),
pts_bbox_head=dict(
type='OccHead',
norm_cfg=dict(type='SyncBN', requires_grad=True),
soft_weights=True,
final_occ_size=occ_size,
fine_topk=15000,
empty_idx=empty_idx,
num_level=len(my_voxel_out_indices),
in_channels=[voxel_out_channel_per_frame] * len(my_voxel_out_indices),
out_channel=num_cls,
point_cloud_range=point_cloud_range,
loss_weight_cfg=dict(
loss_voxel_ce_weight=1.0,
loss_voxel_sem_scal_weight=1.0,
loss_voxel_geo_scal_weight=1.0,
loss_voxel_lovasz_weight=1.0,
),
),
empty_idx=empty_idx,
)
# Learning policy params ******************************************
optimizer = dict(
type='AdamW',
lr=3e-4,
paramwise_cfg=dict(
custom_keys={
'img_backbone': dict(lr_mult=0.1),
}),
weight_decay=0.01)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
lr_config = dict(
policy='CosineAnnealing',
warmup='linear',
warmup_iters=500,
warmup_ratio=1.0 / 3,
min_lr_ratio=1e-3)
runner = dict(type='EpochBasedRunner', max_epochs=24)
evaluation = dict(
interval=1,
pipeline=test_pipeline,
save_best='SSC_mean',
rule='greater',
)
custom_hooks = [
dict(type='OccEfficiencyHook'),
]
================================================
FILE: projects/configs/datasets/custom_nus-3d.py
================================================
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range = [-50, -50, -5, 50, 50, 3]
# For nuScenes we usually do 10-class detection
class_names = [
'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
]
dataset_type = 'NuScenesDataset_eval_modified'
data_root = 'data/nuscenes/'
# Input modality for nuScenes dataset, this is consistent with the submission
# format which requires the information in input_modality.
input_modality = dict(
use_lidar=True,
use_camera=False,
use_radar=False,
use_map=False,
use_external=False)
file_client_args = dict(backend='disk')
# Uncomment the following if you use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
# backend='petrel',
# path_mapping=dict({
# './data/nuscenes/': 's3://nuscenes/nuscenes/',
# 'data/nuscenes/': 's3://nuscenes/nuscenes/'
# }))
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10,
file_client_args=file_client_args),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.3925, 0.3925],
scale_ratio_range=[0.95, 1.05],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectNameFilter', classes=class_names),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10,
file_client_args=file_client_args),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D'),
dict(
type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
# construct a pipeline for data and gt loading in show function
# please keep its loading function consistent with test_pipeline (e.g. client)
eval_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10,
file_client_args=file_client_args),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
]
data = dict(
samples_per_gpu=4,
workers_per_gpu=4,
train=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_train.pkl',
pipeline=train_pipeline,
classes=class_names,
modality=input_modality,
test_mode=False,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='LiDAR'),
val=dict(
type=dataset_type,
ann_file=data_root + 'nuscenes_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
modality=input_modality,
test_mode=True,
box_type_3d='LiDAR'),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
modality=input_modality,
test_mode=True,
box_type_3d='LiDAR'))
# For nuScenes dataset, we usually evaluate the model at the end of training.
# Since the models are trained for 24 epochs by default, we set the evaluation
# interval to 24. Please change the interval accordingly if you do not
# use a default schedule.
evaluation = dict(interval=24, pipeline=eval_pipeline)
================================================
FILE: projects/occ_plugin/__init__.py
================================================
from .core.evaluation.eval_hooks import OccDistEvalHook, OccEvalHook
from .core.evaluation.efficiency_hooks import OccEfficiencyHook
from .core.visualizer import save_occ
from .datasets.pipelines import (
PhotoMetricDistortionMultiViewImage, PadMultiViewImage,
NormalizeMultiviewImage, CustomCollect3D)
from .occupancy import *
================================================
FILE: projects/occ_plugin/core/__init__.py
================================================
from .evaluation import *
from .visualizer import *
================================================
FILE: projects/occ_plugin/core/evaluation/__init__.py
================================================
from .eval_hooks import OccDistEvalHook, OccEvalHook
from .efficiency_hooks import OccEfficiencyHook
================================================
FILE: projects/occ_plugin/core/evaluation/efficiency_hooks.py
================================================
import copy
from mmcv.runner import HOOKS, Hook
import time
try:
from mmcv.cnn import get_model_complexity_info
except ImportError:
raise ImportError('Please upgrade mmcv to >0.6.2')
import torch
import torch.distributed as dist
@HOOKS.register_module()
class OccEfficiencyHook(Hook):
def __init__(self, dataloader=None, **kwargs):
self.dataloader = dataloader
self.warm_up = 5
def construct_input(self, DUMMY_SHAPE=None, m_info=None):
if m_info is None:
m_info = next(iter(self.dataloader))
img_metas = m_info['img_metas'].data
input = dict(
img_metas=img_metas,
)
if 'img_inputs' in m_info.keys():
img_inputs = m_info['img_inputs']
for i in range(len(img_inputs)):
if isinstance(img_inputs[i], list):
for j in range(len(img_inputs[i])):
img_inputs[i][j] = img_inputs[i][j].cuda()
else:
img_inputs[i] = img_inputs[i].cuda()
input['img_inputs'] = img_inputs
if 'points' in m_info.keys():
points = m_info['points'].data[0]
points[0] = points[0].cuda()
input['points'] = points
return input
def before_run(self, runner):
torch.cuda.reset_peak_memory_stats()
# model = copy.deepcopy(runner.model)
# if hasattr(model, 'module'):
# model = model.module
# if hasattr(model, 'forward_dummy'):
# model.forward_train = model.forward_dummy
# model.forward_test = model.forward_dummy
# model.eval()
# else:
# raise NotImplementedError(
# 'FLOPs counter is currently not supported for {}'.format(
# model.__class__.__name__))
# # inf time
# pure_inf_time = 0
# itv_sample = 10
# for i, data in enumerate(self.dataloader):
# torch.cuda.synchronize()
# start_time = time.perf_counter()
# with torch.no_grad():
# model(return_loss=False, rescale=True, **self.construct_input(m_info=data))
# torch.cuda.synchronize()
# elapsed = time.perf_counter() - start_time
# if i >= self.warm_up:
# pure_inf_time += elapsed
# if (i + 1) % itv_sample == 0:
# fps = (i + 1 - self.warm_up) / pure_inf_time
# if runner.rank == 0:
# runner.logger.info(f'Done sample [{i + 1:<3}/ {itv_sample*5}], '
# f'fps: {fps:.1f} sample / s')
# if (i + 1) == itv_sample*5:
# pure_inf_time += elapsed
# fps = (i + 1 - self.warm_up) / pure_inf_time
# if runner.rank == 0:
# runner.logger.info(f'Overall fps: {fps:.1f} sample / s')
# break
# # flops and params
# if runner.rank == 0:
# flops, params = get_model_complexity_info(
# model, (None, None), input_constructor=self.construct_input)
# split_line = '=' * 30
# gpu_measure = torch.cuda.max_memory_allocated() / 1024. / 1024. /1024.
# runner.logger.info(f'{split_line}\n' f'Flops: {flops}\nParams: {params}\nGPU memory: {gpu_measure:.2f}GB\n{split_line}')
if dist.is_available() and dist.is_initialized():
dist.barrier()
def after_run(self, runner):
pass
def before_epoch(self, runner):
pass
def after_epoch(self, runner):
pass
def before_iter(self, runner):
pass
def after_iter(self, runner):
pass
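# Manual usage sketch (illustrative; not part of the original module, and it
# assumes an existing mmcv runner and a built dataloader). construct_input
# pulls one batch from the dataloader and moves it to the GPU, so the hook
# needs that dataloader when instantiated directly:
#   hook = OccEfficiencyHook(dataloader)
#   runner.register_hook(hook)  # before_run() resets CUDA peak-memory stats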
================================================
FILE: projects/occ_plugin/core/evaluation/eval_hooks.py
================================================
# Note: Considering that MMCV's EvalHook updated its interface in V1.3.16,
# in order to avoid strong version dependency, we did not directly
# inherit EvalHook but BaseDistEvalHook.
import os.path as osp
import torch.distributed as dist
from mmcv.runner import DistEvalHook as BaseDistEvalHook
from torch.nn.modules.batchnorm import _BatchNorm
from mmcv.runner import EvalHook as BaseEvalHook
class OccEvalHook(BaseEvalHook):
def __init__(self, *args, **kwargs):
super(OccEvalHook, self).__init__(*args, **kwargs)
def _do_evaluate(self, runner):
"""perform evaluation and save ckpt."""
if not self._should_evaluate(runner):
return
from projects.occ_plugin.occupancy.apis.test import custom_single_gpu_test
results = custom_single_gpu_test(runner.model, self.dataloader, show=False)
runner.log_buffer.output['eval_iter_num'] = len(self.dataloader)
key_score = self.evaluate(runner, results)
if self.save_best:
self._save_ckpt(runner, key_score)
class OccDistEvalHook(BaseDistEvalHook):
def __init__(self, *args, **kwargs):
super(OccDistEvalHook, self).__init__(*args, **kwargs)
def _do_evaluate(self, runner):
"""perform evaluation and save ckpt."""
# Synchronization of BatchNorm's buffer (running_mean
# and running_var) is not supported in the DDP of pytorch,
# which may cause the inconsistent performance of models in
# different ranks, so we broadcast BatchNorm's buffers
# of rank 0 to other ranks to avoid this.
if self.broadcast_bn_buffer:
model = runner.model
for name, module in model.named_modules():
if isinstance(module,
_BatchNorm) and module.track_running_stats:
dist.broadcast(module.running_var, 0)
dist.broadcast(module.running_mean, 0)
if not self._should_evaluate(runner):
return
tmpdir = self.tmpdir
if tmpdir is None:
tmpdir = osp.join(runner.work_dir, '.eval_hook')
from projects.occ_plugin.occupancy.apis.test import custom_multi_gpu_test # deferred import to avoid a circular import
results = custom_multi_gpu_test(
runner.model,
self.dataloader,
tmpdir=tmpdir,
gpu_collect=self.gpu_collect)
if runner.rank == 0:
print('\n')
runner.log_buffer.output['eval_iter_num'] = len(self.dataloader)
key_score = self.evaluate(runner, results)
if self.save_best:
self._save_ckpt(runner, key_score)
================================================
FILE: projects/occ_plugin/core/visualizer/__init__.py
================================================
from .show_occ import save_occ
================================================
FILE: projects/occ_plugin/core/visualizer/show_occ.py
================================================
import torch.nn.functional as F
import torch
import numpy as np
from os import path as osp
import os
def save_occ(pred_c, pred_f, img_metas, path, visible_mask=None, gt_occ=None, free_id=0, thres_low=0.4, thres_high=0.99):
"""
visualization saving for paper:
1. gt
2. pred_f pred_c
3. gt visible
4. pred_f visible
"""
pred_f = F.softmax(pred_f, dim=1)
pred_f = pred_f[0].cpu().numpy() # C W H D
pred_c = F.softmax(pred_c, dim=1)
pred_c = pred_c[0].cpu().numpy() # C W H D
visible_mask = visible_mask[0].cpu().numpy().reshape(-1) > 0 # WHD
gt_occ = gt_occ.data[0][0].cpu().numpy() # W H D
gt_occ[gt_occ==255] = 0
_, W, H, D = pred_f.shape
coordinates_3D_f = np.stack(np.meshgrid(np.arange(W), np.arange(H), np.arange(D), indexing='ij'), axis=-1).reshape(-1, 3) # (W*H*D, 3)
_, W, H, D = pred_c.shape
coordinates_3D_c = np.stack(np.meshgrid(np.arange(W), np.arange(H), np.arange(D), indexing='ij'), axis=-1).reshape(-1, 3) # (W*H*D, 3)
pred_f = np.argmax(pred_f, axis=0) # (W, H, D)
pred_c = np.argmax(pred_c, axis=0) # (W, H, D)
occ_pred_f_mask = (pred_f.reshape(-1))!=free_id
occ_pred_c_mask = (pred_c.reshape(-1))!=free_id
occ_gt_mask = (gt_occ.reshape(-1))!=free_id
pred_f_save = np.concatenate([coordinates_3D_f[occ_pred_f_mask], pred_f.reshape(-1)[occ_pred_f_mask].reshape(-1, 1)], axis=1)[:, [2,1,0,3]] # zyx cls
pred_c_save = np.concatenate([coordinates_3D_c[occ_pred_c_mask], pred_c.reshape(-1)[occ_pred_c_mask].reshape(-1, 1)], axis=1)[:, [2,1,0,3]] # zyx cls
pred_f_visible_save = np.concatenate([coordinates_3D_f[occ_pred_f_mask&visible_mask], pred_f.reshape(-1)[occ_pred_f_mask&visible_mask].reshape(-1, 1)], axis=1)[:, [2,1,0,3]] # zyx cls
gt_save = np.concatenate([coordinates_3D_f[occ_gt_mask], gt_occ.reshape(-1)[occ_gt_mask].reshape(-1, 1)], axis=1)[:, [2,1,0,3]] # zyx cls
gt_visible_save = np.concatenate([coordinates_3D_f[occ_gt_mask&visible_mask], gt_occ.reshape(-1)[occ_gt_mask&visible_mask].reshape(-1, 1)], axis=1)[:, [2,1,0,3]] # zyx cls
scene_token = img_metas.data[0][0]['scene_token']
lidar_token = img_metas.data[0][0]['lidar_token']
save_path = osp.join(path, scene_token, lidar_token)
os.makedirs(save_path, exist_ok=True)
save_pred_f_path = osp.join(save_path, 'pred_f.npy')
save_pred_c_path = osp.join(save_path, 'pred_c.npy')
save_pred_f_v_path = osp.join(save_path, 'pred_f_visible.npy')
save_gt_path = osp.join(save_path, 'gt.npy')
save_gt_v_path = osp.join(save_path, 'gt_visible.npy')
np.save(save_pred_f_path, pred_f_save)
np.save(save_pred_c_path, pred_c_save)
np.save(save_pred_f_v_path, pred_f_visible_save)
np.save(save_gt_path, gt_save)
np.save(save_gt_v_path, gt_visible_save)
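# Minimal round-trip sketch (illustrative; not part of the original module):
# every array written above is a sparse list of occupied voxels with one row
# per voxel in (z, y, x, class_id) order.
if __name__ == '__main__':
    demo = np.array([[0, 1, 2, 3],
                     [4, 5, 6, 1]], dtype=np.int64)  # two fake voxels
    np.save('/tmp/demo_occ.npy', demo)
    loaded = np.load('/tmp/demo_occ.npy')
    zyx, cls_ids = loaded[:, :3], loaded[:, 3]
    print('voxel coords (z, y, x):', zyx.tolist(), 'classes:', cls_ids.tolist())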
================================================
FILE: projects/occ_plugin/datasets/__init__.py
================================================
from .nuscenes_dataset import CustomNuScenesDataset
from .cam4docc_dataset import Cam4DOccDataset
from .cam4docc_lyft_dataset import Cam4DOccLyftDataset
from .builder import custom_build_dataset
__all__ = [
'CustomNuScenesDataset', 'Cam4DOccDataset', 'Cam4DOccLyftDataset', 'custom_build_dataset'
]
================================================
FILE: projects/occ_plugin/datasets/builder.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import copy
import platform
import random
from functools import partial
import numpy as np
from mmcv.parallel import collate
from mmcv.runner import get_dist_info
from mmcv.utils import Registry, build_from_cfg
from torch.utils.data import DataLoader
from mmdet.datasets.samplers import GroupSampler
from projects.occ_plugin.datasets.samplers.group_sampler import DistributedGroupSampler
from projects.occ_plugin.datasets.samplers.distributed_sampler import DistributedSampler
from projects.occ_plugin.datasets.samplers.sampler import build_sampler
def build_dataloader(dataset,
samples_per_gpu,
workers_per_gpu,
num_gpus=1,
dist=True,
shuffle=True,
seed=None,
shuffler_sampler=None,
nonshuffler_sampler=None,
**kwargs):
"""Build PyTorch DataLoader.
In distributed training, each GPU/process has a dataloader.
In non-distributed training, there is only one dataloader for all GPUs.
Args:
dataset (Dataset): A PyTorch dataset.
samples_per_gpu (int): Number of training samples on each GPU, i.e.,
batch size of each GPU.
workers_per_gpu (int): How many subprocesses to use for data loading
for each GPU.
num_gpus (int): Number of GPUs. Only used in non-distributed training.
dist (bool): Distributed training/test or not. Default: True.
shuffle (bool): Whether to shuffle the data at every epoch.
Default: True.
kwargs: any keyword argument to be used to initialize DataLoader
Returns:
DataLoader: A PyTorch dataloader.
"""
rank, world_size = get_dist_info()
if dist:
# DistributedGroupSampler always shuffles the data so that
# the images on each GPU belong to the same group
if shuffle:
sampler = build_sampler(shuffler_sampler if shuffler_sampler is not None else dict(type='DistributedGroupSampler'),
dict(
dataset=dataset,
samples_per_gpu=samples_per_gpu,
num_replicas=world_size,
rank=rank,
seed=seed)
)
else:
sampler = build_sampler(nonshuffler_sampler if nonshuffler_sampler is not None else dict(type='DistributedSampler'),
dict(
dataset=dataset,
num_replicas=world_size,
rank=rank,
shuffle=shuffle,
seed=seed)
)
batch_size = samples_per_gpu
num_workers = workers_per_gpu
else:
print('WARNING: the non-distributed dataloader should only be used to measure inference speed!')
sampler = GroupSampler(dataset, samples_per_gpu) if shuffle else None
batch_size = num_gpus * samples_per_gpu
num_workers = num_gpus * workers_per_gpu
init_fn = partial(
worker_init_fn, num_workers=num_workers, rank=rank,
seed=seed) if seed is not None else None
data_loader = DataLoader(
dataset,
batch_size=batch_size,
sampler=sampler,
num_workers=num_workers,
collate_fn=partial(collate, samples_per_gpu=samples_per_gpu),
pin_memory=False,
worker_init_fn=init_fn,
**kwargs)
return data_loader
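# Usage sketch (illustrative only; assumes a dataset has already been built):
#   loader = build_dataloader(dataset, samples_per_gpu=1, workers_per_gpu=1,
#                             dist=True, seed=0,
#                             shuffler_sampler=dict(type='DistributedGroupSampler'),
#                             nonshuffler_sampler=dict(type='DistributedSampler'))
# which mirrors the sampler settings passed through the data dict in the configs.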
def worker_init_fn(worker_id, num_workers, rank, seed):
# The seed of each worker equals
# num_workers * rank + worker_id + user_seed
worker_seed = num_workers * rank + worker_id + seed
np.random.seed(worker_seed)
random.seed(worker_seed)
from mmdet.datasets import DATASETS
from mmdet.datasets.builder import _concat_dataset
if platform.system() != 'Windows':
# https://github.com/pytorch/pytorch/issues/973
import resource
rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
base_soft_limit = rlimit[0]
hard_limit = rlimit[1]
soft_limit = min(max(4096, base_soft_limit), hard_limit)
resource.setrlimit(resource.RLIMIT_NOFILE, (soft_limit, hard_limit))
OBJECTSAMPLERS = Registry('Object sampler')
def custom_build_dataset(cfg, default_args=None):
from mmdet3d.datasets.dataset_wrappers import CBGSDataset
from mmdet.datasets.dataset_wrappers import (ClassBalancedDataset,
ConcatDataset, RepeatDataset)
if isinstance(cfg, (list, tuple)):
dataset = ConcatDataset([custom_build_dataset(c, default_args) for c in cfg])
elif cfg['type'] == 'ConcatDataset':
dataset = ConcatDataset(
[custom_build_dataset(c, default_args) for c in cfg['datasets']],
cfg.get('separate_eval', True))
elif cfg['type'] == 'RepeatDataset':
dataset = RepeatDataset(
custom_build_dataset(cfg['dataset'], default_args), cfg['times'])
elif cfg['type'] == 'ClassBalancedDataset':
dataset = ClassBalancedDataset(
custom_build_dataset(cfg['dataset'], default_args), cfg['oversample_thr'])
elif cfg['type'] == 'CBGSDataset':
dataset = CBGSDataset(custom_build_dataset(cfg['dataset'], default_args))
elif isinstance(cfg.get('ann_file'), (list, tuple)):
dataset = _concat_dataset(cfg, default_args)
else:
dataset = build_from_cfg(cfg, DATASETS, default_args)
return dataset
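# Usage sketch (illustrative only; the dataset types must be registered and
# the data prepared before this works):
#   cfg = dict(type='RepeatDataset', times=2,
#              dataset=dict(type='Cam4DOccDataset', ...))
#   dataset = custom_build_dataset(cfg)
# Wrapper types (Concat/Repeat/ClassBalanced/CBGS) recurse into their inner
# dataset configs; plain configs fall through to build_from_cfg.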
================================================
FILE: projects/occ_plugin/datasets/cam4docc_dataset.py
================================================
# Developed by Junyi Ma based on the codebase of OpenOccupancy and PowerBEV
# Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications
# https://github.com/haomo-ai/Cam4DOcc
import numpy as np
from mmcv.runner import get_dist_info
from mmdet.datasets import DATASETS
from mmdet3d.datasets import NuScenesDataset
import os
from nuscenes.eval.common.utils import quaternion_yaw, Quaternion
from projects.occ_plugin.utils.formating import cm_to_ious, format_iou_results
from projects.occ_plugin.utils.geometry import convert_egopose_to_matrix_numpy, invert_matrix_egopose_numpy
from nuscenes import NuScenes
from pyquaternion import Quaternion
import torch
import random
import time
@DATASETS.register_module()
class Cam4DOccDataset(NuScenesDataset):
def __init__(self, occ_size, pc_range, occ_root, idx_root, ori_data_root, data_root, time_receptive_field, n_future_frames, classes, use_separate_classes,
train_capacity, test_capacity, **kwargs):
'''
Cam4DOccDataset contains sequential occupancy states as well as instance flow for training occupancy forecasting models. We unify the related operations in the LiDAR coordinate system following OpenOccupancy.
occ_size: number of grids along H W L, default: [512, 512, 40]
pc_range: predefined ranges along H W L, default: [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
occ_root: data path of nuScenes-Occupancy
idx_root: save path of test indexes
time_receptive_field: number of historical frames used for forecasting (including the present one), default: 3
n_future_frames: number of forecasted future frames, default: 4
classes: predefined categories in GMO
use_separate_classes: use separate movable-object classes instead of one general GMO class
train_capacity: number of sequences used for training, default: 23930
test_capacity: number of sequences used for testing, default: 5119
'''
self.train_capacity = train_capacity
self.test_capacity = test_capacity
super().__init__(**kwargs)
rank, world_size = get_dist_info()
self.time_receptive_field = time_receptive_field
self.n_future_frames = n_future_frames
self.sequence_length = time_receptive_field + n_future_frames
if rank == 0:
print("-------------")
print("use past " + str(self.time_receptive_field) + " frames to forecast future " + str(self.n_future_frames) + " frames")
print("-------------")
self.data_infos = list(sorted(self.data_infos, key=lambda e: e['timestamp']))
self.data_infos = self.data_infos[::self.load_interval]
self.occ_size = occ_size
self.pc_range = pc_range
self.occ_root = occ_root
self.idx_root = idx_root
self.ori_data_root = ori_data_root
self.data_root = data_root
self.classes = classes
self.use_separate_classes = use_separate_classes
self.indices = self.get_indices()
self.present_scene_lidar_token = " "
self._set_group_flag()
# load the original nuScenes dataset for instance annotations
self.nusc = NuScenes(version='v1.0-trainval', dataroot=self.data_root, verbose=False)
if self.test_mode:
self.chosen_list = random.sample(range(0, self.test_capacity), self.test_capacity)
self.chosen_list_num = len(self.chosen_list)
else:
self.chosen_list = random.sample(range(0, self.train_capacity), self.train_capacity)
self.chosen_list_num = len(self.chosen_list)
def _set_group_flag(self):
if self.test_mode:
self.flag = np.zeros(self.test_capacity, dtype=np.uint8)
else:
self.flag = np.zeros(self.train_capacity, dtype=np.uint8)
def __len__(self):
if self.test_mode:
return self.test_capacity
else:
return self.train_capacity
def __getitem__(self, idx):
idx = int(self.chosen_list[idx])
self.egopose_list = []
self.ego2lidar_list = []
self.visible_instance_set = set()
self.instance_dict = {}
if self.test_mode:
return self.prepare_test_data(idx)
while True:
data = self.prepare_train_data(idx)
if data is None:
idx = self._rand_another(idx)
idx = int(self.chosen_list[idx])
continue
return data
def get_indices(self):
'''
Generate sequential indexes for training and testing
'''
indices = []
for index in range(len(self.data_infos)):
is_valid_data = True
previous_rec = None
current_indices = []
for t in range(self.sequence_length):
index_t = index + t
# Going over the dataset size limit.
if index_t >= len(self.data_infos):
is_valid_data = False
break
rec = self.data_infos[index_t]
# Check if scene is the same
if (previous_rec is not None) and (rec['scene_token'] != previous_rec['scene_token']):
is_valid_data = False
break
current_indices.append(index_t)
previous_rec = rec
if is_valid_data:
indices.append(current_indices)
return np.asarray(indices)
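# Worked example (illustrative): with time_receptive_field=3 and
# n_future_frames=4, sequence_length is 7, so a scene contributing frames
# [0..8] yields the sliding windows [0..6], [1..7], and [2..8]; windows that
# would cross a scene boundary or run past the dataset end are dropped above.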
def get_lidar_pose(self, rec):
'''
Get global poses for subsequent bbox transformation
'''
ego2global_translation = rec['ego2global_translation']
ego2global_rotation = rec['ego2global_rotation']
trans = -np.array(ego2global_translation)
rot = Quaternion(ego2global_rotation).inverse
return trans, rot
def get_ego2lidar_pose(self, rec):
'''
Get the LiDAR pose in the ego coordinate system
'''
lidar2ego_translation = rec['lidar2ego_translation']
lidar2ego_rotation = rec['lidar2ego_rotation']
trans = -np.array(lidar2ego_translation)
rot = Quaternion(lidar2ego_rotation).inverse
return trans, rot
def record_instance(self, idx, instance_map):
"""
Record information about each visible instance in the sequence and assign a unique ID to it
"""
rec = self.data_infos[idx]
translation, rotation = self.get_lidar_pose(rec)
self.egopose_list.append([translation, rotation])
ego2lidar_translation, ego2lidar_rotation = self.get_ego2lidar_pose(rec)
self.ego2lidar_list.append([ego2lidar_translation, ego2lidar_rotation])
current_sample = self.nusc.get('sample', rec['token'])
for annotation_token in current_sample['anns']:
annotation = self.nusc.get('sample_annotation', annotation_token)
# Instance extraction for Cam4DOcc-V1
# Filter out all non vehicle instances
# if 'vehicle' not in annotation['category_name']:
# continue
gmo_flag = False
for class_name in self.classes:
if class_name in annotation['category_name']:
gmo_flag = True
break
if not gmo_flag:
continue
# Specify semantic id if use_separate_classes
semantic_id = 1
if self.use_separate_classes:
if 'vehicle.bicycle' in annotation['category_name']: # rm static_object.bicycle_rack
semantic_id = 1
elif 'bus' in annotation['category_name']:
semantic_id = 2
elif 'car' in annotation['category_name']:
semantic_id = 3
elif 'construction' in annotation['category_name']:
semantic_id = 4
elif 'motorcycle' in annotation['category_name']:
semantic_id = 5
elif 'trailer' in annotation['category_name']:
semantic_id = 6
elif 'truck' in annotation['category_name']:
semantic_id = 7
elif 'pedestrian' in annotation['category_name']:
semantic_id = 8
# Filter out invisible vehicles
FILTER_INVISIBLE_VEHICLES = True
if FILTER_INVISIBLE_VEHICLES and int(annotation['visibility_token']) == 1 and annotation['instance_token'] not in self.visible_instance_set:
continue
# Filter out vehicles that have not been seen in the past
if self.counter >= self.time_receptive_field and annotation['instance_token'] not in self.visible_instance_set:
continue
self.visible_instance_set.add(annotation['instance_token'])
if annotation['instance_token'] not in instance_map:
instance_map[annotation['instance_token']] = len(instance_map) + 1
instance_id = instance_map[annotation['instance_token']]
instance_attribute = int(annotation['visibility_token'])
if annotation['instance_token'] not in self.instance_dict:
# For the first occurrence of an instance
self.instance_dict[annotation['instance_token']] = {
'timestep': [self.counter],
'translation': [annotation['translation']],
'rotation': [annotation['rotation']],
'size': annotation['size'],
'instance_id': instance_id,
'semantic_id': semantic_id,
'attribute_label': [instance_attribute],
}
else:
# For instances that have appeared before
self.instance_dict[annotation['instance_token']]['timestep'].append(self.counter)
self.instance_dict[annotation['instance_token']]['translation'].append(annotation['translation'])
self.instance_dict[annotation['instance_token']]['rotation'].append(annotation['rotation'])
self.instance_dict[annotation['instance_token']]['attribute_label'].append(instance_attribute)
return instance_map
def get_future_egomotion(self, idx):
'''
Calculate LiDAR pose updates between idx and idx+1
'''
rec_t0 = self.data_infos[idx]
future_egomotion = np.eye(4, dtype=np.float32)
if idx < len(self.data_infos) - 1:
rec_t1 = self.data_infos[idx + 1]
if rec_t0['scene_token'] == rec_t1['scene_token']:
egopose_t0_trans = rec_t0['ego2global_translation']
egopose_t0_rot = rec_t0['ego2global_rotation']
egopose_t1_trans = rec_t1['ego2global_translation']
egopose_t1_rot = rec_t1['ego2global_rotation']
egopose_t0 = convert_egopose_to_matrix_numpy(egopose_t0_trans, egopose_t0_rot)
egopose_t1 = convert_egopose_to_matrix_numpy(egopose_t1_trans, egopose_t1_rot)
lidar2ego_t0_trans = rec_t0['lidar2ego_translation']
lidar2ego_t0_rot = rec_t0['lidar2ego_rotation']
lidar2ego_t1_trans = rec_t1['lidar2ego_translation']
lidar2ego_t1_rot = rec_t1['lidar2ego_rotation']
lidar2ego_t0 = convert_egopose_to_matrix_numpy(lidar2ego_t0_trans, lidar2ego_t0_rot)
lidar2ego_t1 = convert_egopose_to_matrix_numpy(lidar2ego_t1_trans, lidar2ego_t1_rot)
future_egomotion = invert_matrix_egopose_numpy(lidar2ego_t1).dot(invert_matrix_egopose_numpy(egopose_t1)).dot(egopose_t0).dot(lidar2ego_t0)
future_egomotion = torch.Tensor(future_egomotion).float()
# Return the 4x4 relative pose with a leading time dimension
return future_egomotion.unsqueeze(0)
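# The composed transform above maps points from the LiDAR frame at t0 to the
# LiDAR frame at t1:
#   T_{t1<-t0} = inv(lidar2ego_t1) @ inv(ego2global_t1) @ ego2global_t0 @ lidar2ego_t0
# which is why both the ego-to-global poses and the LiDAR-to-ego calibrations
# of the two timestamps enter the product.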
@staticmethod
def _check_consistency(translation, prev_translation, threshold=1.0):
"""
Check for significant displacement of the instance between adjacent frames
"""
x, y = translation[:2]
prev_x, prev_y = prev_translation[:2]
if abs(x - prev_x) > threshold or abs(y - prev_y) > threshold:
return False
return True
def refine_instance_poly(self, instance):
"""
Fill in missing frames and suppress ground-truth disturbances caused by observation noise
"""
pointer = 1
for i in range(instance['timestep'][0] + 1, self.sequence_length):
# Fill in the missing frames
if i not in instance['timestep']:
instance['timestep'].insert(pointer, i)
instance['translation'].insert(pointer, instance['translation'][pointer-1])
instance['rotation'].insert(pointer, instance['rotation'][pointer-1])
instance['attribute_label'].insert(pointer, instance['attribute_label'][pointer-1])
pointer += 1
continue
# Eliminate observation disturbances: reuse the previous pose when the frame-to-frame displacement stays within the threshold
if self._check_consistency(instance['translation'][pointer], instance['translation'][pointer-1]):
instance['translation'][pointer] = instance['translation'][pointer-1]
instance['rotation'][pointer] = instance['rotation'][pointer-1]
instance['attribute_label'][pointer] = instance['attribute_label'][pointer-1]
pointer += 1
return instance
def prepare_train_data(self, index):
'''
Generate a training sequence
'''
input_dict = self.get_data_info(index)
if input_dict is None:
return None
example = self.prepare_sequential_data(index)
return example
def prepare_test_data(self, index):
'''
Generate a test sequence
TODO: add extra functionality here, such as visualization
'''
input_dict = self.get_data_info(index)
if input_dict is None:
return None
example = self.prepare_sequential_data(index)
# TODO: visualize example data
return example
def prepare_sequential_data(self, index):
'''
Use the predefined pipeline to generate inputs of the baseline network and ground truth for the standard evaluation protocol in Cam4DOcc
'''
instance_map = {}
input_seq_data = {}
keys = ['input_dict','future_egomotion', 'sample_token']
for key in keys:
input_seq_data[key] = []
scene_lidar_token = []
for self.counter, index_t in enumerate(self.indices[index]):
input_dict_per_frame = self.get_data_info(index_t)
if input_dict_per_frame is None:
return None
input_seq_data['input_dict'].append(input_dict_per_frame)
input_seq_data['sample_token'].append(input_dict_per_frame['sample_idx'])
instance_map = self.record_instance(index_t, instance_map)
future_egomotion = self.get_future_egomotion(index_t)
input_seq_data['future_egomotion'].append(future_egomotion)
scene_lidar_token.append(input_dict_per_frame['scene_token']+"_"+input_dict_per_frame['lidar_token'])
if self.counter == self.time_receptive_field - 1:
self.present_scene_lidar_token = input_dict_per_frame['scene_token']+"_"+input_dict_per_frame['lidar_token']
# save sequential test indexes for possible evaluation
if self.test_mode:
test_idx_path = os.path.join(self.idx_root, "test_ids")
os.makedirs(test_idx_path, exist_ok=True)
np.savez(os.path.join(test_idx_path, self.present_scene_lidar_token), scene_lidar_token)
for token in self.instance_dict.keys():
self.instance_dict[token] = self.refine_instance_poly(self.instance_dict[token])
input_seq_data.update(
dict(
time_receptive_field=self.time_receptive_field,
sequence_length=self.sequence_length,
egopose_list=self.egopose_list,
ego2lidar_list=self.ego2lidar_list,
instance_dict=self.instance_dict,
instance_map=instance_map,
indices=self.indices[index],
scene_token=self.present_scene_lidar_token,
))
example = self.pipeline(input_seq_data)
return example
def get_data_info(self, index):
'''
Load per-frame information from the .pkl annotation files, in the same format used by OpenOccupancy
'''
info = self.data_infos[index]
# standard protocol modified from SECOND.Pytorch
input_dict = dict(
sample_idx=info['token'],
pts_filename=info['lidar_path'],
sweeps=info['sweeps'],
lidar2ego_translation=info['lidar2ego_translation'],
lidar2ego_rotation=info['lidar2ego_rotation'],
ego2global_translation=info['ego2global_translation'],
ego2global_rotation=info['ego2global_rotation'],
prev_idx=info['prev'],
next_idx=info['next'],
scene_token=info['scene_token'],
can_bus=info['can_bus'],
# frame_idx=info['frame_idx'],
timestamp=info['timestamp'] / 1e6,
occ_size = np.array(self.occ_size),
pc_range = np.array(self.pc_range),
lidar_token=info['lidar_token'],
lidarseg=info['lidarseg'],
curr=info,
)
if self.modality['use_camera']:
image_paths = []
lidar2img_rts = []
lidar2cam_rts = []
cam_intrinsics = []
lidar2cam_dic = {}
for cam_type, cam_info in info['cams'].items():
image_paths.append(cam_info['data_path'])
# obtain lidar to image transformation matrix
lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation'])
lidar2cam_t = cam_info[
'sensor2lidar_translation'] @ lidar2cam_r.T
lidar2cam_rt = np.eye(4)
lidar2cam_rt[:3, :3] = lidar2cam_r.T
lidar2cam_rt[3, :3] = -lidar2cam_t
intrinsic = cam_info['cam_intrinsic']
viewpad = np.eye(4)
viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic
lidar2img_rt = (viewpad @ lidar2cam_rt.T)
lidar2img_rts.append(lidar2img_rt)
cam_intrinsics.append(viewpad)
lidar2cam_rts.append(lidar2cam_rt.T)
lidar2cam_dic[cam_type] = lidar2cam_rt.T
input_dict.update(
dict(
img_filename=image_paths,
lidar2img=lidar2img_rts,
cam_intrinsic=cam_intrinsics,
lidar2cam=lidar2cam_rts,
lidar2cam_dic=lidar2cam_dic,
))
return input_dict
def evaluate(self, results, logger=None, **kwargs):
'''
Evaluate the model with IOU and VPQ metrics
'''
eval_results = {}
''' calculate IOU '''
hist_for_iou = sum(results['hist_for_iou'])
ious = cm_to_ious(hist_for_iou)
res_table, res_dic = format_iou_results(ious, return_dic=True)
for key, val in res_dic.items():
eval_results['IOU_{}'.format(key)] = val
if logger is not None:
logger.info('IOU Evaluation')
logger.info(res_table)
''' calculate VPQ '''
if 'vpq_metric' in results.keys() and 'vpq_len' in results.keys():
vpq_sum = sum(results['vpq_metric'])
eval_results['VPQ'] = vpq_sum/results['vpq_len']
return eval_results
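# For reference (shape of the returned dict, illustrative comment only):
# evaluate() yields one 'IOU_<name>' entry per class produced by
# format_iou_results, plus a single 'VPQ' entry when the test loop reported
# 'vpq_metric' and 'vpq_len'.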
================================================
FILE: projects/occ_plugin/datasets/cam4docc_lyft_dataset.py
================================================
# Developed by Junyi Ma based on the codebase of OpenOccupancy and PowerBEV
# Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications
# https://github.com/haomo-ai/Cam4DOcc
import numpy as np
from mmcv.runner import get_dist_info
from mmdet.datasets import DATASETS
from mmdet3d.datasets import NuScenesDataset
from mmdet3d.datasets.pipelines import Compose
from torch.utils.data import Dataset
from lyft_dataset_sdk.lyftdataset import LyftDataset
import os
from nuscenes.eval.common.utils import quaternion_yaw, Quaternion
from projects.occ_plugin.utils.formating import cm_to_ious, format_iou_results
from projects.occ_plugin.utils.geometry import convert_egopose_to_matrix_numpy, invert_matrix_egopose_numpy
from nuscenes import NuScenes
from pyquaternion import Quaternion
import torch
import random
import time
@DATASETS.register_module()
class Cam4DOccLyftDataset(Dataset):
def __init__(self, occ_size, pc_range, occ_root, idx_root, ori_data_root, data_root, time_receptive_field, n_future_frames, classes, use_separate_classes,
train_capacity, test_capacity, test_mode=False, pipeline=None, **kwargs):
'''
Cam4DOccLyftDataset contains sequential occupancy states as well as instance flow for training occupancy forecasting models. We unify the related operations in the LiDAR coordinate system following OpenOccupancy.
occ_size: number of grids along H W L, default: [512, 512, 40]
pc_range: predefined ranges along H W L, default: [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
occ_root: data path of nuScenes-Occupancy (unused for Lyft)
idx_root: save path of test indexes
time_receptive_field: number of historical frames used for forecasting (including the present one), default: 3
n_future_frames: number of forecasted future frames, default: 4
classes: predefined categories in GMO
use_separate_classes: use separate movable-object classes instead of one general GMO class
train_capacity: number of sequences used for training, default: 15720 for Lyft
test_capacity: number of sequences used for testing, default: 5880 for Lyft
'''
self.test_mode = test_mode
self.CLASSES = classes
self.train_capacity = train_capacity
self.test_capacity = test_capacity
super().__init__()
# training and test scene indexes following PowerBEV
self.TRAIN_LYFT_INDICES = [1, 3, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16,
17, 18, 19, 20, 21, 23, 24, 27, 28, 29, 30, 31, 32,
33, 35, 36, 37, 39, 41, 43, 44, 45, 46, 47, 48, 49,
50, 51, 52, 53, 55, 56, 59, 60, 62, 63, 65, 68, 69,
70, 71, 72, 73, 74, 75, 76, 78, 79, 81, 82, 83, 84,
86, 87, 88, 89, 93, 95, 97, 98, 99, 103, 104, 107, 108,
109, 110, 111, 113, 114, 115, 116, 117, 118, 119, 121, 122, 124,
127, 128, 130, 131, 132, 134, 135, 136, 137, 138, 139, 143, 144,
146, 147, 148, 149, 150, 151, 152, 153, 154, 156, 157, 158, 159,
161, 162, 165, 166, 167, 171, 172, 173, 174, 175, 176, 177, 178,
179]
self.VAL_LYFT_INDICES = [0, 2, 4, 13, 22, 25, 26, 34, 38, 40, 42, 54, 57,
58, 61, 64, 66, 67, 77, 80, 85, 90, 91, 92, 94, 96,
100, 101, 102, 105, 106, 112, 120, 123, 125, 126, 129, 133, 140,
141, 142, 145, 155, 160, 163, 164, 168, 169, 170]
rank, world_size = get_dist_info()
self.time_receptive_field = time_receptive_field
self.n_future_frames = n_future_frames
self.sequence_length = time_receptive_field + n_future_frames
if rank == 0:
print("-------------")
print("use past " + str(self.time_receptive_field) + " frames to forecast future " + str(self.n_future_frames) + " frames")
print("-------------")
self.occ_size = occ_size
self.pc_range = pc_range
self.occ_root = occ_root
self.idx_root = idx_root
self.ori_data_root = ori_data_root
self.data_root = data_root
self.classes = classes
self.use_separate_classes = use_separate_classes
self.pipeline = Compose(pipeline)
        # load the original Lyft dataset for instance annotations
self.lyft = LyftDataset(data_path=self.data_root, json_path=os.path.join(self.data_root, 'train_data'), verbose=False)
self.scenes = self.get_scenes()
self.ixes = self.get_samples()
self.indices = self.get_indices()
self.present_scene_lidar_token = " "
self._set_group_flag()
        if self.test_mode:
            # a full random permutation of all test sequence indexes
            self.chosen_list = random.sample(range(0, self.test_capacity), self.test_capacity)
            self.chosen_list_num = len(self.chosen_list)
        else:
            # a full random permutation of all training sequence indexes
            self.chosen_list = random.sample(range(0, self.train_capacity), self.train_capacity)
            self.chosen_list_num = len(self.chosen_list)
def _set_group_flag(self):
if self.test_mode:
self.flag = np.zeros(self.test_capacity, dtype=np.uint8)
else:
self.flag = np.zeros(self.train_capacity, dtype=np.uint8)
def __len__(self):
if self.test_mode:
return self.test_capacity
else:
return self.train_capacity
def __getitem__(self, idx):
idx = int(self.chosen_list[idx])
self.egopose_list = []
self.ego2lidar_list = []
self.visible_instance_set = set()
self.instance_dict = {}
if self.test_mode:
return self.prepare_test_data(idx)
while True:
data = self.prepare_train_data(idx)
if data is None:
idx = self._rand_another(idx)
idx = int(self.chosen_list[idx])
continue
return data
def get_scenes(self):
"""
        Obtain the list of scene names in the given split.
"""
scenes = [row['name'] for row in self.lyft.scene]
# split in train/val
indices = self.VAL_LYFT_INDICES if self.test_mode else self.TRAIN_LYFT_INDICES
scenes = [scenes[i] for i in indices]
return scenes
def get_samples(self):
"""
Find and sort the samples in the given split by scene.
"""
samples = [sample for sample in self.lyft.sample]
# remove samples that aren't in this split
samples = [sample for sample in samples if self.lyft.get('scene', sample['scene_token'])['name'] in self.scenes]
# sort by scene, timestamp (only to make chronological viz easier)
samples.sort(key=lambda x: (x['scene_token'], x['timestamp']))
return samples
def get_indices(self):
'''
Generate sequential indexes for training and testing
'''
indices = []
for index in range(len(self.ixes)):
is_valid_data = True
previous_rec = None
current_indices = []
for t in range(self.sequence_length):
index_t = index + t
# Going over the dataset size limit.
if index_t >= len(self.ixes):
is_valid_data = False
break
rec = self.ixes[index_t]
# Check if scene is the same
if (previous_rec is not None) and (rec['scene_token'] != previous_rec['scene_token']):
is_valid_data = False
break
current_indices.append(index_t)
previous_rec = rec
if is_valid_data:
indices.append(current_indices)
return np.asarray(indices)
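    # Illustrative sketch (not part of the original code): with
    # sequence_length = 7 (3 past + 4 future) and a scene change after the
    # 9th sample, get_indices keeps only windows that stay within one scene:
    #
    #   samples:  s0 s1 s2 s3 s4 s5 s6 s7 s8 | t0 t1 ...
    #   window at s0 -> [0, 1, 2, 3, 4, 5, 6]   (valid)
    #   window at s3 -> [3, 4, 5, 6, 7, 8, 9]   (crosses scenes, dropped)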
def get_lidar_pose(self, rec):
'''
Get global poses for following bbox transforming
'''
current_sample = self.lyft.get('sample', rec['token'])
egopose = self.lyft.get('ego_pose', self.lyft.get('sample_data', current_sample['data']['LIDAR_TOP'])['ego_pose_token'])
ego2global_translation = egopose['translation']
ego2global_rotation = egopose['rotation']
trans = -np.array(ego2global_translation)
rot = Quaternion(ego2global_rotation).inverse
return trans, rot
def get_ego2lidar_pose(self, rec):
'''
Get LiDAR poses in ego system
'''
current_sample = self.lyft.get('sample', rec['token'])
lidar_top_data = self.lyft.get('sample_data', current_sample['data']['LIDAR_TOP'])
lidar2ego_translation = self.lyft.get('calibrated_sensor', lidar_top_data['calibrated_sensor_token'])['translation']
lidar2ego_rotation = self.lyft.get('calibrated_sensor', lidar_top_data['calibrated_sensor_token'])['rotation']
trans = -np.array(lidar2ego_translation)
rot = Quaternion(lidar2ego_rotation).inverse
return trans, rot
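    # Note on the convention above (sketch, not from the original code): both
    # pose getters return the *inverse* transform (negated translation and
    # inverted quaternion), so a globally-positioned box can later be moved
    # into the present LiDAR frame by translate-then-rotate, e.g.
    #
    #   box.translate(trans)   # subtract the ego (or LiDAR) origin
    #   box.rotate(rot)        # undo the ego (or LiDAR) orientation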
def record_instance(self, idx, instance_map):
"""
Record information about each visible instance in the sequence and assign a unique ID to it
"""
rec = self.ixes[idx]
translation, rotation = self.get_lidar_pose(rec)
self.egopose_list.append([translation, rotation])
ego2lidar_translation, ego2lidar_rotation = self.get_ego2lidar_pose(rec)
self.ego2lidar_list.append([ego2lidar_translation, ego2lidar_rotation])
current_sample = self.lyft.get('sample', rec['token'])
for annotation_token in current_sample['anns']:
annotation = self.lyft.get('sample_annotation', annotation_token)
# Instance extraction for Cam4DOcc-V1
# Filter out all non vehicle instances
# if 'vehicle' not in annotation['category_name']:
# continue
gmo_flag = False
for class_name in self.classes:
if class_name in annotation['category_name']:
gmo_flag = True
break
if not gmo_flag:
continue
# Specify semantic id if use_separate_classes
semantic_id = 1
if self.use_separate_classes:
if 'bicycle' in annotation['category_name']:
semantic_id = 1
elif 'bus' in annotation['category_name']:
semantic_id = 2
elif 'car' in annotation['category_name']:
semantic_id = 3
elif 'construction' in annotation['category_name']:
semantic_id = 4
elif 'motorcycle' in annotation['category_name']:
semantic_id = 5
elif 'trailer' in annotation['category_name']:
semantic_id = 6
elif 'truck' in annotation['category_name']:
semantic_id = 7
elif 'pedestrian' in annotation['category_name']:
semantic_id = 8
if annotation['instance_token'] not in instance_map:
instance_map[annotation['instance_token']] = len(instance_map) + 1
instance_id = instance_map[annotation['instance_token']]
instance_attribute = 1 # deprecated
if annotation['instance_token'] not in self.instance_dict:
# For the first occurrence of an instance
self.instance_dict[annotation['instance_token']] = {
'timestep': [self.counter],
'translation': [annotation['translation']],
'rotation': [annotation['rotation']],
'size': annotation['size'],
'instance_id': instance_id,
'semantic_id': semantic_id,
'attribute_label': [instance_attribute],
}
else:
                # For instances that have appeared before
self.instance_dict[annotation['instance_token']]['timestep'].append(self.counter)
self.instance_dict[annotation['instance_token']]['translation'].append(annotation['translation'])
self.instance_dict[annotation['instance_token']]['rotation'].append(annotation['rotation'])
self.instance_dict[annotation['instance_token']]['attribute_label'].append(instance_attribute)
return instance_map
def get_future_egomotion(self, idx):
'''
Calculate LiDAR pose updates between idx and idx+1
'''
rec_t0 = self.ixes[idx]
future_egomotion = np.eye(4, dtype=np.float32)
if idx < len(self.ixes) - 1:
rec_t1 = self.ixes[idx + 1]
if rec_t0['scene_token'] == rec_t1['scene_token']:
egopose_t0 = self.lyft.get('ego_pose', self.lyft.get('sample_data', rec_t0['data']['LIDAR_TOP'])['ego_pose_token'])
egopose_t0_trans = egopose_t0['translation']
egopose_t0_rot = egopose_t0['rotation']
egopose_t1 = self.lyft.get('ego_pose', self.lyft.get('sample_data', rec_t1['data']['LIDAR_TOP'])['ego_pose_token'])
egopose_t1_trans = egopose_t1['translation']
egopose_t1_rot = egopose_t1['rotation']
egopose_t0 = convert_egopose_to_matrix_numpy(egopose_t0_trans, egopose_t0_rot)
egopose_t1 = convert_egopose_to_matrix_numpy(egopose_t1_trans, egopose_t1_rot)
lidar_top_data_t0 = self.lyft.get('sample_data', rec_t0['data']['LIDAR_TOP'])
lidar2ego_t0_trans = self.lyft.get('calibrated_sensor', lidar_top_data_t0['calibrated_sensor_token'])['translation']
lidar2ego_t0_rot = self.lyft.get('calibrated_sensor', lidar_top_data_t0['calibrated_sensor_token'])['rotation']
lidar_top_data_t1 = self.lyft.get('sample_data', rec_t1['data']['LIDAR_TOP'])
lidar2ego_t1_trans = self.lyft.get('calibrated_sensor', lidar_top_data_t1['calibrated_sensor_token'])['translation']
lidar2ego_t1_rot = self.lyft.get('calibrated_sensor', lidar_top_data_t1['calibrated_sensor_token'])['rotation']
lidar2ego_t0 = convert_egopose_to_matrix_numpy(lidar2ego_t0_trans, lidar2ego_t0_rot)
lidar2ego_t1 = convert_egopose_to_matrix_numpy(lidar2ego_t1_trans, lidar2ego_t1_rot)
future_egomotion = invert_matrix_egopose_numpy(lidar2ego_t1).dot(invert_matrix_egopose_numpy(egopose_t1)).dot(egopose_t0).dot(lidar2ego_t0)
future_egomotion = torch.Tensor(future_egomotion).float()
return future_egomotion.unsqueeze(0)
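    # Sketch of the chain composed above (not part of the original code),
    # read right to left: a point in the LiDAR frame at t0 is lifted to the
    # global frame and dropped back into the LiDAR frame at t1:
    #
    #   T(lidar_t1 <- lidar_t0) =
    #       inv(lidar2ego_t1) @ inv(ego2global_t1) @ ego2global_t0 @ lidar2ego_t0
    #
    #   import numpy as np
    #   p_t0 = np.array([1.0, 2.0, 0.0, 1.0])         # homogeneous point
    #   p_t1 = future_egomotion[0].numpy() @ p_t0     # same point at t1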
@staticmethod
def _check_consistency(translation, prev_translation, threshold=1.0):
"""
        Check for significant displacement of the instance between adjacent moments
"""
x, y = translation[:2]
prev_x, prev_y = prev_translation[:2]
if abs(x - prev_x) > threshold or abs(y - prev_y) > threshold:
return False
return True
def refine_instance_poly(self, instance):
"""
Fix the missing frames and disturbances of ground truth caused by noise
"""
pointer = 1
for i in range(instance['timestep'][0] + 1, self.sequence_length):
# Fill in the missing frames
if i not in instance['timestep']:
instance['timestep'].insert(pointer, i)
instance['translation'].insert(pointer, instance['translation'][pointer-1])
instance['rotation'].insert(pointer, instance['rotation'][pointer-1])
instance['attribute_label'].insert(pointer, instance['attribute_label'][pointer-1])
pointer += 1
continue
# Eliminate observation disturbances
if self._check_consistency(instance['translation'][pointer], instance['translation'][pointer-1]):
instance['translation'][pointer] = instance['translation'][pointer-1]
instance['rotation'][pointer] = instance['rotation'][pointer-1]
instance['attribute_label'][pointer] = instance['attribute_label'][pointer-1]
pointer += 1
return instance
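    # Illustrative sketch (not part of the original code): an instance first
    # seen at timestep 2 and unobserved at timestep 4 gets frame 4 filled in
    # with a copy of frame 3, so annotations can be indexed by timestep:
    #
    #   before: timestep [2, 3, 5, 6]
    #   after:  timestep [2, 3, 4, 5, 6]   (frame 4 repeats frame 3's pose)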
def prepare_train_data(self, index):
'''
Generate a training sequence
'''
example = self.prepare_sequential_data(index)
return example
def prepare_test_data(self, index):
'''
Generate a test sequence
TODO: Give additional functions here such as visualization
'''
example = self.prepare_sequential_data(index)
# TODO: visualize example data
return example
def prepare_sequential_data(self, index):
'''
Use the predefined pipeline to generate inputs of the baseline network and ground truth for the standard evaluation protocol in Cam4DOcc
'''
instance_map = {}
input_seq_data = {}
keys = ['input_dict','future_egomotion', 'sample_token']
for key in keys:
input_seq_data[key] = []
scene_lidar_token = []
for self.counter, index_t in enumerate(self.indices[index]):
input_dict_per_frame = {}
rec = self.ixes[index_t] # sample
lidar_top_data = self.lyft.get('sample_data', rec['data']['LIDAR_TOP'])
lidar2ego_translation = self.lyft.get('calibrated_sensor', lidar_top_data['calibrated_sensor_token'])['translation']
lidar2ego_rotation = self.lyft.get('calibrated_sensor', lidar_top_data['calibrated_sensor_token'])['rotation']
egopose = self.lyft.get('ego_pose', self.lyft.get('sample_data', rec['data']['LIDAR_TOP'])['ego_pose_token'])
ego2global_translation = egopose['translation']
ego2global_rotation = egopose['rotation']
input_dict_per_frame['lidar2ego_translation'] = lidar2ego_translation
input_dict_per_frame['lidar2ego_rotation'] = lidar2ego_rotation
input_dict_per_frame['ego2global_translation'] = ego2global_translation
input_dict_per_frame['ego2global_rotation'] = ego2global_rotation
input_dict_per_frame['scene_token'] = rec['scene_token']
input_dict_per_frame['lidar_token'] = rec['data']['LIDAR_TOP']
input_dict_per_frame['occ_size'] = np.array(self.occ_size)
input_dict_per_frame['pc_range'] = np.array(self.pc_range)
input_dict_per_frame['sample_idx'] = rec['token']
image_paths = []
lidar2img_rts = []
lidar2cam_rts = []
cam_intrinsics = []
cam_intrinsics_ori = []
lidar2cam_dic = {}
lidar_sample = self.lyft.get('sample_data', rec['data']['LIDAR_TOP'])
lidar_pose = self.lyft.get('ego_pose', lidar_sample['ego_pose_token'])
lidar_rotation = Quaternion(lidar_pose['rotation'])
lidar_translation = np.array(lidar_pose['translation'])[:, None]
lidar_to_world = np.vstack([
np.hstack((lidar_rotation.rotation_matrix, lidar_translation)),
np.array([0, 0, 0, 1])
])
lidar_sample_calib = self.lyft.get('calibrated_sensor', lidar_sample['calibrated_sensor_token'])
lidar_sensor_rotation = Quaternion(lidar_sample_calib['rotation'])
lidar_sensor_translation = np.array(lidar_sample_calib['translation'])[:, None]
lidar_to_lidarego = np.vstack([
np.hstack((lidar_sensor_rotation.rotation_matrix, lidar_sensor_translation)),
np.array([0, 0, 0, 1])
])
cameras = ['CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT']
for cam in cameras:
camera_sample = self.lyft.get('sample_data', rec['data'][cam])
image_paths.append(os.path.join("/tos://haomo-algorithms/c6089dc67ff976615510d22b5eaaaa4e/mjy/cam4docc/data/lyft/", camera_sample['filename']))
car_egopose = self.lyft.get('ego_pose', camera_sample['ego_pose_token'])
egopose_rotation = Quaternion(car_egopose['rotation']).inverse
egopose_translation = -np.array(car_egopose['translation'])[:, None]
world_to_car_egopose = np.vstack([
np.hstack((egopose_rotation.rotation_matrix, egopose_rotation.rotation_matrix @ egopose_translation)),
np.array([0, 0, 0, 1])
])
sensor_sample = self.lyft.get('calibrated_sensor', camera_sample['calibrated_sensor_token'])
intrinsic = torch.Tensor(sensor_sample['camera_intrinsic'])
cam_intrinsics_ori.append(intrinsic)
sensor_rotation = Quaternion(sensor_sample['rotation'])
sensor_translation = np.array(sensor_sample['translation'])[:, None]
car_egopose_to_sensor = np.vstack([
np.hstack((sensor_rotation.rotation_matrix, sensor_translation)),
np.array([0, 0, 0, 1])
])
car_egopose_to_sensor = np.linalg.inv(car_egopose_to_sensor)
lidar_to_sensor = car_egopose_to_sensor @ world_to_car_egopose @ lidar_to_world @ lidar_to_lidarego
                sensor_to_lidar = np.linalg.inv(lidar_to_sensor)
lidar2cam_r = lidar_to_sensor[:3, :3]
lidar2cam_t = sensor_to_lidar[:3, -1].reshape(1,3) @ lidar2cam_r.T
lidar2cam_rt = np.eye(4)
lidar2cam_rt[:3, :3] = lidar2cam_r.T
lidar2cam_rt[3, :3] = -lidar2cam_t
viewpad = np.eye(4)
viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic
lidar2img_rt = (viewpad @ lidar2cam_rt.T)
lidar2img_rts.append(lidar2img_rt)
cam_intrinsics.append(viewpad)
lidar2cam_rts.append(lidar2cam_rt.T)
lidar2cam_dic[cam] = lidar2cam_rt.T
input_dict_per_frame.update(
dict(
img_filename=image_paths,
lidar2img=lidar2img_rts,
cam_intrinsic=cam_intrinsics,
cam_intrinsics=cam_intrinsics_ori,
lidar2cam=lidar2cam_rts,
lidar2cam_dic=lidar2cam_dic,
))
input_seq_data['input_dict'].append(input_dict_per_frame)
instance_map = self.record_instance(index_t, instance_map)
future_egomotion = self.get_future_egomotion(index_t)
input_seq_data['future_egomotion'].append(future_egomotion)
input_seq_data['sample_token'].append(input_dict_per_frame['sample_idx'])
scene_lidar_token.append(input_dict_per_frame['scene_token']+"_"+input_dict_per_frame['lidar_token'])
if self.counter == self.time_receptive_field - 1:
self.present_scene_lidar_token = input_dict_per_frame['scene_token']+"_"+input_dict_per_frame['lidar_token']
for token in self.instance_dict.keys():
self.instance_dict[token] = self.refine_instance_poly(self.instance_dict[token])
input_seq_data.update(
dict(
time_receptive_field=self.time_receptive_field,
sequence_length=self.sequence_length,
egopose_list=self.egopose_list,
ego2lidar_list=self.ego2lidar_list,
instance_dict=self.instance_dict,
instance_map=instance_map,
indices=self.indices[index],
scene_token=self.present_scene_lidar_token,
))
example = self.pipeline(input_seq_data)
return example
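    # Sketch of the projection assembled above (not part of the original
    # code): lidar2img maps homogeneous LiDAR points into pixels by chaining
    # the extrinsic (lidar2cam) with the 4x4-padded intrinsic (viewpad):
    #
    #   import numpy as np
    #   p_lidar = np.array([10.0, 0.0, 1.0, 1.0])   # a point 10 m ahead
    #   uvw = lidar2img_rts[1] @ p_lidar            # e.g. CAM_FRONT
    #   u, v = uvw[0] / uvw[2], uvw[1] / uvw[2]     # pixel coordinates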
    def evaluate(self, results, logger=None, **kwargs):
'''
Evaluate by IOU and VPQ metrics for model evaluation
'''
eval_results = {}
''' calculate IOU '''
hist_for_iou = sum(results['hist_for_iou'])
ious = cm_to_ious(hist_for_iou)
res_table, res_dic = format_iou_results(ious, return_dic=True)
for key, val in res_dic.items():
eval_results['IOU_{}'.format(key)] = val
if logger is not None:
logger.info('IOU Evaluation')
logger.info(res_table)
''' calculate VPQ '''
if 'vpq_metric' in results.keys() and 'vpq_len' in results.keys():
vpq_sum = sum(results['vpq_metric'])
eval_results['VPQ'] = vpq_sum/results['vpq_len']
return eval_results
================================================
FILE: projects/occ_plugin/datasets/nuscenes_dataset.py
================================================
import copy
import random
import numpy as np
import torch
from mmcv.parallel import DataContainer as DC
from mmdet.datasets import DATASETS
from mmdet3d.datasets import NuScenesDataset
from nuscenes.eval.common.utils import quaternion_yaw, Quaternion
@DATASETS.register_module()
class CustomNuScenesDataset(NuScenesDataset):
r"""NuScenes Dataset.
    This dataset only adds camera intrinsics and extrinsics to the results.
"""
def __init__(self, queue_length=4, bev_size=(200, 200), overlap_test=False, *args, **kwargs):
super().__init__(*args, **kwargs)
self.queue_length = queue_length
self.overlap_test = overlap_test
self.bev_size = bev_size
def prepare_train_data(self, index):
"""
Training data preparation.
Args:
index (int): Index for accessing the target data.
Returns:
dict: Training data dict of the corresponding index.
"""
queue = []
        # randomly drop one of the queue_length previous frames and keep the
        # remaining history in chronological order, ending with the present
        index_list = list(range(index - self.queue_length, index))
        random.shuffle(index_list)
        index_list = sorted(index_list[1:])
        index_list.append(index)
for i in index_list:
i = max(0, i)
input_dict = self.get_data_info(i)
if input_dict is None:
return None
self.pre_pipeline(input_dict)
example = self.pipeline(input_dict)
if self.filter_empty_gt and \
(example is None or ~(example['gt_labels_3d']._data != -1).any()):
return None
queue.append(example)
return self.union2one(queue)
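    # Illustrative sketch (not part of the original code): with
    # queue_length = 4 and index = 10, the candidate history is [6, 7, 8, 9];
    # shuffling and dropping the first entry randomly thins the history
    # before appending the present frame:
    #
    #   [6, 7, 8, 9] -> shuffle -> drop one -> sort -> e.g. [6, 8, 9]
    #   final index_list: [6, 8, 9, 10]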
def union2one(self, queue):
imgs_list = [each['img'].data for each in queue]
metas_map = {}
prev_scene_token = None
prev_pos = None
prev_angle = None
for i, each in enumerate(queue):
metas_map[i] = each['img_metas'].data
if metas_map[i]['scene_token'] != prev_scene_token:
metas_map[i]['prev_bev_exists'] = False
prev_scene_token = metas_map[i]['scene_token']
prev_pos = copy.deepcopy(metas_map[i]['can_bus'][:3])
prev_angle = copy.deepcopy(metas_map[i]['can_bus'][-1])
metas_map[i]['can_bus'][:3] = 0
metas_map[i]['can_bus'][-1] = 0
else:
metas_map[i]['prev_bev_exists'] = True
tmp_pos = copy.deepcopy(metas_map[i]['can_bus'][:3])
tmp_angle = copy.deepcopy(metas_map[i]['can_bus'][-1])
metas_map[i]['can_bus'][:3] -= prev_pos
metas_map[i]['can_bus'][-1] -= prev_angle
prev_pos = copy.deepcopy(tmp_pos)
prev_angle = copy.deepcopy(tmp_angle)
queue[-1]['img'] = DC(torch.stack(imgs_list), cpu_only=False, stack=True)
queue[-1]['img_metas'] = DC(metas_map, cpu_only=True)
queue = queue[-1]
return queue
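    # Sketch of the can_bus convention above (not from the original code):
    # the first frame of each scene zeroes its position/yaw entries, and
    # every later frame stores deltas w.r.t. its predecessor:
    #
    #   frame 0 of a scene: can_bus[:3] = 0,             can_bus[-1] = 0
    #   frame t > 0:        can_bus[:3] = x_t - x_{t-1}, can_bus[-1] = yaw_t - yaw_{t-1}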
def get_data_info(self, index):
"""Get data info according to the given index.
Args:
index (int): Index of the sample data to get.
Returns:
dict: Data information that will be passed to the data \
preprocessing pipelines. It includes the following keys:
- sample_idx (str): Sample index.
- pts_filename (str): Filename of point clouds.
- sweeps (list[dict]): Infos of sweeps.
- timestamp (float): Sample timestamp.
- img_filename (str, optional): Image filename.
- lidar2img (list[np.ndarray], optional): Transformations \
from lidar to different cameras.
- ann_info (dict): Annotation info.
"""
info = self.data_infos[index]
        # standard protocol modified from SECOND.PyTorch
input_dict = dict(
sample_idx=info['token'],
pts_filename=info['lidar_path'],
sweeps=info['sweeps'],
ego2global_translation=info['ego2global_translation'],
ego2global_rotation=info['ego2global_rotation'],
prev_idx=info['prev'],
next_idx=info['next'],
scene_token=info['scene_token'],
can_bus=info['can_bus'],
frame_idx=info['frame_idx'],
timestamp=info['timestamp'] / 1e6,
)
if self.modality['use_camera']:
image_paths = []
lidar2img_rts = []
lidar2cam_rts = []
cam_intrinsics = []
for cam_type, cam_info in info['cams'].items():
image_paths.append(cam_info['data_path'])
# obtain lidar to image transformation matrix
lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation'])
lidar2cam_t = cam_info[
'sensor2lidar_translation'] @ lidar2cam_r.T
lidar2cam_rt = np.eye(4)
lidar2cam_rt[:3, :3] = lidar2cam_r.T
lidar2cam_rt[3, :3] = -lidar2cam_t
intrinsic = cam_info['cam_intrinsic']
viewpad = np.eye(4)
viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic
lidar2img_rt = (viewpad @ lidar2cam_rt.T)
lidar2img_rts.append(lidar2img_rt)
cam_intrinsics.append(viewpad)
lidar2cam_rts.append(lidar2cam_rt.T)
input_dict.update(
dict(
img_filename=image_paths,
lidar2img=lidar2img_rts,
cam_intrinsic=cam_intrinsics,
lidar2cam=lidar2cam_rts,
))
if not self.test_mode:
annos = self.get_ann_info(index)
input_dict['ann_info'] = annos
rotation = Quaternion(input_dict['ego2global_rotation'])
translation = input_dict['ego2global_translation']
can_bus = input_dict['can_bus']
can_bus[:3] = translation
can_bus[3:7] = rotation
patch_angle = quaternion_yaw(rotation) / np.pi * 180
if patch_angle < 0:
patch_angle += 360
can_bus[-2] = patch_angle / 180 * np.pi
can_bus[-1] = patch_angle
return input_dict
def __getitem__(self, idx):
"""Get item from infos according to the given index.
Returns:
dict: Data dictionary of the corresponding index.
"""
if self.test_mode:
return self.prepare_test_data(idx)
while True:
data = self.prepare_train_data(idx)
if data is None:
idx = self._rand_another(idx)
continue
return data
================================================
FILE: projects/occ_plugin/datasets/pipelines/__init__.py
================================================
from .transform_3d import (
PadMultiViewImage, NormalizeMultiviewImage,
PhotoMetricDistortionMultiViewImage, CustomCollect3D, CustomOccCollect3D, RandomScaleImageMultiViewImage)
from .formating import OccDefaultFormatBundle3D
from .loading_occupancy import LoadOccupancy
from .loading_bevdet import LoadAnnotationsBEVDepth, LoadMultiViewImageFromFiles_BEVDet
from .loading_instance import LoadInstanceWithFlow
__all__ = [
    'PadMultiViewImage', 'NormalizeMultiviewImage', 'CustomOccCollect3D', 'LoadAnnotationsBEVDepth', 'LoadMultiViewImageFromFiles_BEVDet', 'LoadOccupancy',
    'PhotoMetricDistortionMultiViewImage', 'OccDefaultFormatBundle3D', 'CustomCollect3D', 'RandomScaleImageMultiViewImage', 'LoadInstanceWithFlow',
]
================================================
FILE: projects/occ_plugin/datasets/pipelines/formating.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
from mmcv.parallel import DataContainer as DC
from mmdet.datasets.builder import PIPELINES
from mmdet.datasets.pipelines import to_tensor
from mmdet3d.datasets.pipelines import DefaultFormatBundle3D
@PIPELINES.register_module()
class OccDefaultFormatBundle3D(DefaultFormatBundle3D):
"""Default formatting bundle.
It simplifies the pipeline of formatting common fields for voxels,
including "proposals", "gt_bboxes", "gt_labels", "gt_masks" and
"gt_semantic_seg".
These fields are formatted as follows.
- img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True)
- proposals: (1)to tensor, (2)to DataContainer
- gt_bboxes: (1)to tensor, (2)to DataContainer
- gt_bboxes_ignore: (1)to tensor, (2)to DataContainer
- gt_labels: (1)to tensor, (2)to DataContainer
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def __call__(self, results):
"""Call function to transform and format common fields in results.
Args:
results (dict): Result dict contains the data to convert.
Returns:
dict: The result dict contains the data that is formatted with
default bundle.
"""
# Format 3D data
results = super(OccDefaultFormatBundle3D, self).__call__(results)
        # all occupancy-related labels are produced together, so gate them on
        # the presence of 'gt_occ' once
        if 'gt_occ' in results.keys():
            results['gt_occ'] = DC(to_tensor(results['gt_occ']), stack=True)
            results['segmentation'] = DC(to_tensor(results['segmentation']), stack=True)
            results['instance'] = DC(to_tensor(results['instance']), stack=True)
            results['attribute_label'] = DC(to_tensor(results['attribute_label']), stack=True)
            results['flow'] = DC(to_tensor(results['flow']), stack=True)
if 'gt_vel' in results.keys():
results['gt_vel'] = DC(to_tensor(results['gt_vel']), stack=False)
return results
================================================
FILE: projects/occ_plugin/datasets/pipelines/loading_bevdet.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import mmcv
import numpy as np
from mmdet.datasets.builder import PIPELINES
import os
import torch
from PIL import Image
from pyquaternion import Quaternion
from mmdet3d.core.bbox import LiDARInstance3DBoxes
from numpy import random
import pdb
def mmlabNormalize(img, img_norm_cfg=None):
from mmcv.image.photometric import imnormalize
if img_norm_cfg is None:
mean = np.array([123.675, 116.28, 103.53], dtype=np.float32)
std = np.array([58.395, 57.12, 57.375], dtype=np.float32)
to_rgb = True
else:
mean = np.array(img_norm_cfg['mean'], dtype=np.float32)
std = np.array(img_norm_cfg['std'], dtype=np.float32)
to_rgb = img_norm_cfg['to_rgb']
img = imnormalize(np.array(img), mean, std, to_rgb)
img = torch.tensor(img).float().permute(2, 0, 1).contiguous()
return img
def depth_transform(cam_depth, resize, resize_dims, crop, flip, rotate):
"""Transform depth based on ida augmentation configuration.
Args:
cam_depth (np array): Nx3, 3: x,y,d.
resize (float): Resize factor.
resize_dims (list): Final dimension.
crop (list): x1, y1, x2, y2
flip (bool): Whether to flip.
rotate (float): Rotation value.
Returns:
np array: [h/down_ratio, w/down_ratio, d]
"""
H, W = resize_dims
cam_depth[:, :2] = cam_depth[:, :2] * resize
cam_depth[:, 0] -= crop[0]
cam_depth[:, 1] -= crop[1]
if flip:
cam_depth[:, 0] = resize_dims[1] - cam_depth[:, 0]
cam_depth[:, 0] -= W / 2.0
cam_depth[:, 1] -= H / 2.0
h = rotate / 180 * np.pi
rot_matrix = [
[np.cos(h), np.sin(h)],
[-np.sin(h), np.cos(h)],
]
cam_depth[:, :2] = np.matmul(rot_matrix, cam_depth[:, :2].T).T
cam_depth[:, 0] += W / 2.0
cam_depth[:, 1] += H / 2.0
depth_coords = cam_depth[:, :2].astype(np.int16)
depth_map = np.zeros(resize_dims)
valid_mask = ((depth_coords[:, 1] < resize_dims[0])
& (depth_coords[:, 0] < resize_dims[1])
& (depth_coords[:, 1] >= 0)
& (depth_coords[:, 0] >= 0))
depth_map[depth_coords[valid_mask, 1],
depth_coords[valid_mask, 0]] = cam_depth[valid_mask, 2]
return torch.Tensor(depth_map)
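# Usage sketch (illustrative, not part of the original file): push N sparse
# LiDAR depth points through the same geometric augmentation as the RGB
# input; the argument values below are made up for the example.
#
#   import numpy as np
#   cam_depth = np.array([[320.0, 240.0, 12.5]])   # one (u, v, depth) point
#   depth_map = depth_transform(cam_depth, resize=0.5, resize_dims=(256, 704),
#                               crop=(0, 0, 704, 256), flip=False, rotate=0.0)
#   # depth_map is a (256, 704) tensor that is zero everywhere except at the
#   # transformed (v, u) locations, which hold the original depth values.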
@PIPELINES.register_module()
class LoadMultiViewImageFromFiles_BEVDet(object):
"""Load multi channel images from a list of separate channel files.
Expects results['img_filename'] to be a list of filenames.
Args:
to_float32 (bool): Whether to convert the img to float32.
Defaults to False.
color_type (str): Color type of the file. Defaults to 'unchanged'.
"""
def __init__(self, data_config, is_train=False, using_ego=True, colorjitter=False,
sequential=False, aligned=False, trans_only=True, img_norm_cfg=None,
mmlabnorm=False, load_depth=False, depth_gt_path=None, data_root=None, test_mode=False, use_lyft=False):
self.is_train = is_train
self.data_config = data_config
# using mean camera ego frame, rather than the lidar coordinates
self.using_ego = using_ego
self.normalize_img = mmlabNormalize
self.img_norm_cfg = img_norm_cfg
self.sequential = sequential
self.aligned = aligned
self.trans_only = trans_only
self.load_depth = load_depth
self.depth_gt_path = depth_gt_path
self.data_root = data_root
self.colorjitter = colorjitter
self.pipeline_colorjitter = PhotoMetricDistortionMultiViewImage()
self.test_mode = test_mode
self.use_lyft = use_lyft
def get_rot(self,h):
return torch.Tensor([
[np.cos(h), np.sin(h)],
[-np.sin(h), np.cos(h)],
])
def img_transform(self, img, post_rot, post_tran,
resize, resize_dims, crop,
flip, rotate):
# adjust image
img = self.img_transform_core(img, resize_dims, crop, flip, rotate)
# post-homography transformation
post_rot *= resize
post_tran -= torch.Tensor(crop[:2])
if flip:
A = torch.Tensor([[-1, 0], [0, 1]])
b = torch.Tensor([crop[2] - crop[0], 0])
post_rot = A.matmul(post_rot)
post_tran = A.matmul(post_tran) + b
A = self.get_rot(rotate / 180 * np.pi)
b = torch.Tensor([crop[2] - crop[0], crop[3] - crop[1]]) / 2
b = A.matmul(-b) + b
post_rot = A.matmul(post_rot)
post_tran = A.matmul(post_tran) + b
return img, post_rot, post_tran
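    # Sketch of the bookkeeping above (not part of the original code):
    # post_rot / post_tran accumulate the 2D pixel-space effect of resize,
    # crop, flip and rotation, so a source pixel p lands in the augmented
    # image at
    #
    #   p' = post_rot @ p + post_tran
    #
    # which lets the view transformer undo the augmentation when lifting
    # pixels into 3D.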
def img_transform_core(self, img, resize_dims, crop, flip, rotate):
# adjust image
img = img.resize(resize_dims)
img = img.crop(crop)
if flip:
img = img.transpose(method=Image.FLIP_LEFT_RIGHT)
img = img.rotate(rotate)
return img
def choose_cams(self):
if self.is_train and self.data_config['Ncams'] < len(self.data_config['cams']):
cam_names = np.random.choice(self.data_config['cams'], self.data_config['Ncams'],
replace=False)
else:
cam_names = self.data_config['cams']
return cam_names
    def sample_augmentation(self, H, W, flip=None, scale=None):
fH, fW = self.data_config['input_size']
if self.is_train:
resize = float(fW)/float(W)
resize += np.random.uniform(*self.data_config['resize'])
resize_dims = (int(W * resize), int(H * resize))
newW, newH = resize_dims
crop_h = int((1 - np.random.uniform(*self.data_config['crop_h'])) * newH) - fH
crop_w = int(np.random.uniform(0, max(0, newW - fW)))
crop = (crop_w, crop_h, crop_w + fW, crop_h + fH)
            # We do not use flip here so that the forecasting direction stays consistent
flip = None
rotate = 0
else:
resize = float(fW)/float(W)
resize += self.data_config.get('resize_test', 0.0)
if scale is not None:
resize = scale
resize_dims = (int(W * resize), int(H * resize))
newW, newH = resize_dims
crop_h = int((1 - np.mean(self.data_config['crop_h'])) * newH) - fH
crop_w = int(max(0, newW - fW) / 2)
crop = (crop_w, crop_h, crop_w + fW, crop_h + fH)
flip = None
rotate = 0
return resize, resize_dims, crop, flip, rotate
def get_sensor2ego_transformation(self, cam_info, key_info, cam_name):
w, x, y, z = cam_info['cams'][cam_name]['sensor2ego_rotation']
# sweep sensor to sweep ego
sweepsensor2sweepego_rot = torch.Tensor(
Quaternion(w, x, y, z).rotation_matrix)
sweepsensor2sweepego_tran = torch.Tensor(
cam_info['cams'][cam_name]['sensor2ego_translation'])
sweepsensor2sweepego = sweepsensor2sweepego_rot.new_zeros(
(4, 4))
sweepsensor2sweepego[3, 3] = 1
sweepsensor2sweepego[:3, :3] = sweepsensor2sweepego_rot
sweepsensor2sweepego[:3, -1] = sweepsensor2sweepego_tran
# sweep ego to global
w, x, y, z = cam_info['cams'][cam_name]['ego2global_rotation']
sweepego2global_rot = torch.Tensor(
Quaternion(w, x, y, z).rotation_matrix)
sweepego2global_tran = torch.Tensor(
cam_info['cams'][cam_name]['ego2global_translation'])
sweepego2global = sweepego2global_rot.new_zeros((4, 4))
sweepego2global[3, 3] = 1
sweepego2global[:3, :3] = sweepego2global_rot
sweepego2global[:3, -1] = sweepego2global_tran
# global sensor to cur ego
w, x, y, z = key_info['cams'][cam_name]['ego2global_rotation']
keyego2global_rot = torch.Tensor(
Quaternion(w, x, y, z).rotation_matrix)
keyego2global_tran = torch.Tensor(
key_info['cams'][cam_name]['ego2global_translation'])
keyego2global = keyego2global_rot.new_zeros((4, 4))
keyego2global[3, 3] = 1
keyego2global[:3, :3] = keyego2global_rot
keyego2global[:3, -1] = keyego2global_tran
global2keyego = keyego2global.inverse()
# cur ego to sensor
w, x, y, z = key_info['cams'][cam_name]['sensor2ego_rotation']
keysensor2keyego_rot = torch.Tensor(
Quaternion(w, x, y, z).rotation_matrix)
keysensor2keyego_tran = torch.Tensor(
key_info['cams'][cam_name]['sensor2ego_translation'])
keysensor2keyego = keysensor2keyego_rot.new_zeros((4, 4))
keysensor2keyego[3, 3] = 1
keysensor2keyego[:3, :3] = keysensor2keyego_rot
keysensor2keyego[:3, -1] = keysensor2keyego_tran
keyego2keysensor = keysensor2keyego.inverse()
keysensor2sweepsensor = (
keyego2keysensor @ global2keyego @ sweepego2global
@ sweepsensor2sweepego).inverse()
sweepsensor2keyego = global2keyego @ sweepego2global @ \
sweepsensor2sweepego
return sweepsensor2keyego, keysensor2sweepsensor
def get_sensor2lidar_transformation(self, cam_info, cam_name, sample_info):
w, x, y, z = cam_info['cams'][cam_name]['sensor2ego_rotation']
# sweep sensor to sweep ego
sweepsensor2sweepego_rot = torch.Tensor(
Quaternion(w, x, y, z).rotation_matrix)
sweepsensor2sweepego_tran = torch.Tensor(
cam_info['cams'][cam_name]['sensor2ego_translation'])
sweepsensor2sweepego = sweepsensor2sweepego_rot.new_zeros(
(4, 4))
sweepsensor2sweepego[3, 3] = 1
sweepsensor2sweepego[:3, :3] = sweepsensor2sweepego_rot
sweepsensor2sweepego[:3, -1] = sweepsensor2sweepego_tran
# sweep ego to global
w, x, y, z = cam_info['cams'][cam_name]['ego2global_rotation']
sweepego2global_rot = torch.Tensor(
Quaternion(w, x, y, z).rotation_matrix)
sweepego2global_tran = torch.Tensor(
cam_info['cams'][cam_name]['ego2global_translation'])
sweepego2global = sweepego2global_rot.new_zeros((4, 4))
sweepego2global[3, 3] = 1
sweepego2global[:3, :3] = sweepego2global_rot
sweepego2global[:3, -1] = sweepego2global_tran
# global to lidar ego
w, x, y, z = sample_info['ego2global_rotation']
lidarego2global_rot = torch.Tensor(
Quaternion(w, x, y, z).rotation_matrix)
lidarego2global_tran = torch.Tensor(sample_info['ego2global_translation'])
lidarego2global = lidarego2global_rot.new_zeros((4, 4))
lidarego2global[3, 3] = 1
lidarego2global[:3, :3] = lidarego2global_rot
lidarego2global[:3, -1] = lidarego2global_tran
global2lidarego = lidarego2global.inverse()
# lidar ego to lidar
w, x, y, z = sample_info['lidar2ego_rotation']
lidar2ego_rot = torch.Tensor(
Quaternion(w, x, y, z).rotation_matrix)
lidar2ego_tran = torch.Tensor(sample_info['lidar2ego_translation'])
lidar2ego = lidar2ego_rot.new_zeros((4, 4))
lidar2ego[3, 3] = 1
lidar2ego[:3, :3] = lidar2ego_rot
lidar2ego[:3, -1] = lidar2ego_tran
ego2lidar = lidar2ego.inverse()
# camera to lidar
sweepsensor2lidar = ego2lidar @ global2lidarego @ sweepego2global @ sweepsensor2sweepego
return sweepsensor2lidar
def get_seq_inputs(self, results, flip=None, scale=None):
cam_names = self.choose_cams()
results['cam_names'] = cam_names
if self.use_lyft:
filename = results['input_dict'][0]['img_filename'][0]
else:
cam_data = results['input_dict'][0]['curr']['cams'][cam_names[0]]
filename = cam_data['data_path']
filename = os.path.join(self.data_root, filename.split('/')[-3], filename.split('/')[-2], filename.split('/')[-1])
img = Image.open(filename)
img_augs = self.sample_augmentation(H=img.height,
W=img.width,
flip=flip,
scale=scale)
resize, resize_dims, crop, flip, rotate = img_augs
sequence_length = results['sequence_length']
imgs_seq = []
rots_seq = []
trans_seq = []
intrins_seq = []
post_rots_seq = []
post_trans_seq = []
gt_depths_seq = list()
canvas_seq = []
sensor2sensors_seq = []
for counter in range(sequence_length):
input_dict_curr = results['input_dict'][counter]
imgs = []
rots = []
trans = []
intrins = []
post_rots = []
post_trans = []
gt_depths = list()
canvas = []
sensor2sensors = []
for cam_idx, cam_name in enumerate(cam_names):
if self.use_lyft:
cam_data = None
filename = input_dict_curr['img_filename'][cam_idx]
else:
cam_data = input_dict_curr['curr']['cams'][cam_name]
filename = cam_data['data_path']
filename = os.path.join(self.data_root, filename.split('/')[-3], filename.split('/')[-2], filename.split('/')[-1])
img = Image.open(filename)
post_rot = torch.eye(2)
post_tran = torch.zeros(2)
if self.use_lyft:
intrin = torch.Tensor(input_dict_curr['cam_intrinsics'][cam_idx])
else:
intrin = torch.Tensor(cam_data['cam_intrinsic'])
# from camera to lidar
sensor2lidar = torch.tensor(input_dict_curr['lidar2cam_dic'][cam_name]).inverse().float()
rot = sensor2lidar[:3, :3]
tran = sensor2lidar[:3, 3]
img, post_rot2, post_tran2 = \
self.img_transform(img, post_rot,
post_tran,
resize=resize,
resize_dims=resize_dims,
crop=crop,
flip=flip,
rotate=rotate)
# for convenience, make augmentation matrices 3x3
post_tran = torch.zeros(3)
post_rot = torch.eye(3)
post_tran[:2] = post_tran2
post_rot[:2, :2] = post_rot2
# TODO: open source depth enhancement
gt_depths.append(torch.zeros(1))
canvas.append(np.array(img))
if self.colorjitter and self.is_train:
img = self.pipeline_colorjitter(img)
imgs.append(self.normalize_img(img, img_norm_cfg=self.img_norm_cfg))
intrins.append(intrin)
rots.append(rot)
trans.append(tran)
post_rots.append(post_rot)
post_trans.append(post_tran)
sensor2sensors.append(sensor2lidar)
imgs = torch.stack(imgs)
rots = torch.stack(rots)
trans = torch.stack(trans)
intrins = torch.stack(intrins)
post_rots = torch.stack(post_rots)
post_trans = torch.stack(post_trans)
gt_depths = torch.stack(gt_depths)
sensor2sensors = torch.stack(sensor2sensors)
imgs_seq.append(imgs)
rots_seq.append(rots)
trans_seq.append(trans)
intrins_seq.append(intrins)
post_rots_seq.append(post_rots)
post_trans_seq.append(post_trans)
gt_depths_seq.append(gt_depths)
canvas_seq.append(canvas)
sensor2sensors_seq.append(sensor2sensors)
imgs_seq = torch.stack(imgs_seq)
rots_seq = torch.stack(rots_seq)
trans_seq = torch.stack(trans_seq)
intrins_seq = torch.stack(intrins_seq)
post_rots_seq = torch.stack(post_rots_seq)
post_trans_seq = torch.stack(post_trans_seq)
gt_depths_seq = torch.stack(gt_depths_seq)
sensor2sensors_seq = torch.stack(sensor2sensors_seq)
        # keep the un-normalized canvases for the whole sequence (for vis)
        results['canvas'] = canvas_seq
return imgs_seq, rots_seq, trans_seq, intrins_seq, post_rots_seq, post_trans_seq, gt_depths_seq, sensor2sensors_seq
def __call__(self, results):
results['img_inputs_seq'] = self.get_seq_inputs(results)
return results
def bev_transform(rotate_angle, scale_ratio, flip_dx, flip_dy):
rotate_angle = torch.tensor(rotate_angle / 180 * np.pi)
rot_sin = torch.sin(rotate_angle)
rot_cos = torch.cos(rotate_angle)
rot_mat = torch.Tensor([[rot_cos, -rot_sin, 0], [rot_sin, rot_cos, 0],
[0, 0, 1]])
scale_mat = torch.Tensor([[scale_ratio, 0, 0], [0, scale_ratio, 0],
[0, 0, scale_ratio]])
flip_mat = torch.Tensor([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
if flip_dx:
flip_mat = flip_mat @ torch.Tensor([[-1, 0, 0], [0, 1, 0], [0, 0, 1]])
if flip_dy:
flip_mat = flip_mat @ torch.Tensor([[1, 0, 0], [0, -1, 0], [0, 0, 1]])
rot_mat = flip_mat @ (scale_mat @ rot_mat)
return rot_mat
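# Usage sketch (illustrative, not part of the original file): bev_transform
# composes flip @ scale @ rotation into one 3x3 matrix that is applied to
# BEV features and, when present, to the point cloud; the values are made up.
#
#   rot = bev_transform(rotate_angle=10.0, scale_ratio=1.05,
#                       flip_dx=True, flip_dy=False)
#   # rot is a 3x3 torch.Tensor; rot @ p rotates p by 10 degrees around z,
#   # scales it by 1.05, then mirrors its x coordinate.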
@PIPELINES.register_module()
class LoadAnnotationsBEVDepth():
def __init__(self, bda_aug_conf, classes, is_train=True,
input_modality=None):
self.bda_aug_conf = bda_aug_conf
self.is_train = is_train
self.classes = classes
        if input_modality is None:
input_modality = dict(
use_lidar=True,
use_camera=True,
use_radar=False,
use_map=False,
use_external=False)
self.input_modality = input_modality
def sample_bda_augmentation(self):
"""Generate bda augmentation values based on bda_config."""
if self.is_train:
rotate_bda = np.random.uniform(*self.bda_aug_conf['rot_lim'])
scale_bda = np.random.uniform(*self.bda_aug_conf['scale_lim'])
flip_dx = np.random.uniform() < self.bda_aug_conf['flip_dx_ratio']
flip_dy = np.random.uniform() < self.bda_aug_conf['flip_dy_ratio']
else:
rotate_bda = 0
scale_bda = 1.0
flip_dx = False
flip_dy = False
return rotate_bda, scale_bda, flip_dx, flip_dy
def __call__(self, results):
rotate_bda, scale_bda, flip_dx, flip_dy = self.sample_bda_augmentation()
bda_mat = torch.zeros(4, 4)
bda_mat[3, 3] = 1
bda_rot = bev_transform(rotate_bda, scale_bda, flip_dx, flip_dy)
bda_mat[:3, :3] = bda_rot
results['bda_mat'] = bda_rot
if 'points' in results.keys():
results['points'].rotate(bda_rot)
if self.input_modality['use_camera']:
assert len(results['img_inputs']) == 8
imgs, rots, trans, intrins, post_rots, post_trans, gt_depths, sensor2sensors = results['img_inputs']
results['img_inputs'] = (imgs, rots, trans, intrins, post_rots, post_trans, bda_rot, imgs.shape[-2:], gt_depths, sensor2sensors)
return results
class PhotoMetricDistortionMultiViewImage(object):
"""Apply photometric distortion to image sequentially, every transformation
is applied with a probability of 0.5. The position of random contrast is in
second or second to last.
1. random brightness
2. random contrast (mode 0)
3. convert color from BGR to HSV
4. random saturation
5. random hue
6. convert color from HSV to BGR
7. random contrast (mode 1)
8. randomly swap channels
Args:
brightness_delta (int): delta of brightness.
contrast_range (tuple): range of contrast.
saturation_range (tuple): range of saturation.
hue_delta (int): delta of hue.
"""
def __init__(self,
brightness_delta=32,
contrast_range=(0.5, 1.5),
saturation_range=(0.5, 1.5),
hue_delta=18):
self.brightness_delta = brightness_delta
self.contrast_lower, self.contrast_upper = contrast_range
self.saturation_lower, self.saturation_upper = saturation_range
self.hue_delta = hue_delta
def __call__(self, img):
"""Call function to perform photometric distortion on images.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Result dict with images distorted.
"""
# convert PIL Image to Ndarray float32
img = np.array(img, dtype=np.float32)
assert img.dtype == np.float32, \
'PhotoMetricDistortion needs the input image of dtype np.float32,'\
' please set "to_float32=True" in "LoadImageFromFile" pipeline'
# random brightness
if random.randint(2):
delta = random.uniform(-self.brightness_delta,
self.brightness_delta)
img += delta
# mode == 0 --> do random contrast first
# mode == 1 --> do random contrast last
mode = random.randint(2)
if mode == 1:
if random.randint(2):
alpha = random.uniform(self.contrast_lower,
self.contrast_upper)
img *= alpha
# convert color from BGR to HSV
img = mmcv.bgr2hsv(img)
# random saturation
if random.randint(2):
img[..., 1] *= random.uniform(self.saturation_lower,
self.saturation_upper)
# random hue
if random.randint(2):
img[..., 0] += random.uniform(-self.hue_delta, self.hue_delta)
img[..., 0][img[..., 0] > 360] -= 360
img[..., 0][img[..., 0] < 0] += 360
# convert color from HSV to BGR
img = mmcv.hsv2bgr(img)
# random contrast
if mode == 0:
if random.randint(2):
alpha = random.uniform(self.contrast_lower,
self.contrast_upper)
img *= alpha
# randomly swap channels
if random.randint(2):
img = img[..., random.permutation(3)]
img = Image.fromarray(img.astype(np.uint8))
return img
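    # Usage sketch (illustrative, not part of the original code): the
    # transform consumes and returns a PIL image, so it can be applied per
    # camera view before normalization:
    #
    #   from PIL import Image
    #   jitter = PhotoMetricDistortionMultiViewImage(brightness_delta=16)
    #   out = jitter(Image.new('RGB', (704, 256)))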
================================================
FILE: projects/occ_plugin/datasets/pipelines/loading_instance.py
================================================
# Developed by Junyi Ma based on the codebase of PowerBEV
# Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications
# https://github.com/haomo-ai/Cam4DOcc
import numpy as np
from mmdet.datasets.builder import PIPELINES
import os
import torch
from pyquaternion import Quaternion
from nuscenes.utils.data_classes import Box
import time
@PIPELINES.register_module()
class LoadInstanceWithFlow(object):
def __init__(self, cam4docc_dataset_path, grid_size=[512, 512, 40], pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0], background=0,
use_flow=True, use_separate_classes=False, use_lyft=False):
        '''
        Loading sequential occupancy labels and instance flows for training and testing
        cam4docc_dataset_path: data path of Cam4DOcc dataset, including 'segmentation', 'instance', and 'flow'
        grid_size: number of grids along H W L, default: [512, 512, 40]
        pc_range: predefined ranges along H W L, default: [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
        background: background value for segmentation/instance/flow volumes, default: 0
        use_flow: whether to generate flow ground truth for training, default: True
        use_separate_classes: separate movable objects instead of the general GMO class
        use_lyft: whether labels are generated for the Lyft-based split
        '''
self.cam4docc_dataset_path = cam4docc_dataset_path
self.pc_range = pc_range
self.resolution = [(self.pc_range[3+i] - self.pc_range[i])/grid_size[i] for i in range(len(self.pc_range[:3]))]
self.start_position = [self.pc_range[i] + self.resolution[i] / 2.0 for i in range(len(self.pc_range[:3]))]
self.dimension = grid_size
self.pc_range = np.array(self.pc_range)
self.resolution = np.array(self.resolution)
self.start_position = np.array(self.start_position)
self.dimension = np.array(self.dimension)
self.background = background
self.use_flow = use_flow
self.use_separate_classes = use_separate_classes
self.use_lyft = use_lyft
def get_poly_region(self, instance_annotation, present_egopose, present_ego2lidar):
"""
Obtain the bounding box polygon of the instance
"""
present_ego_translation, present_ego_rotation = present_egopose
present_ego2lidar_translation, present_ego2lidar_rotation = present_ego2lidar
box = Box(
instance_annotation['translation'], instance_annotation['size'], Quaternion(instance_annotation['rotation'])
)
box.translate(present_ego_translation)
box.rotate(present_ego_rotation)
box.translate(present_ego2lidar_translation)
box.rotate(present_ego2lidar_rotation)
        pts = box.corners().T
X_min_box = pts.min(axis=0)[0]
X_max_box = pts.max(axis=0)[0]
Y_min_box = pts.min(axis=0)[1]
Y_max_box = pts.max(axis=0)[1]
Z_min_box = pts.min(axis=0)[2]
Z_max_box = pts.max(axis=0)[2]
if self.pc_range[0] <= X_min_box and X_max_box <= self.pc_range[3] \
and self.pc_range[1] <= Y_min_box and Y_max_box <= self.pc_range[4] \
and self.pc_range[2] <= Z_min_box and Z_max_box <= self.pc_range[5]:
pts = np.round((pts - self.start_position[:3] + self.resolution[:3] / 2.0) / self.resolution[:3]).astype(np.int32)
return pts
else:
return None
def fill_occupancy(self, occ_instance, occ_segmentation, occ_attribute_label, instance_fill_info):
x_grid = torch.linspace(0, self.dimension[0]-1, self.dimension[0], dtype=torch.float)
x_grid = x_grid.view(self.dimension[0], 1, 1).expand(self.dimension[0], self.dimension[1], self.dimension[2])
y_grid = torch.linspace(0, self.dimension[1]-1, self.dimension[1], dtype=torch.float)
y_grid = y_grid.view(1, self.dimension[1], 1).expand(self.dimension[0], self.dimension[1], self.dimension[2])
z_grid = torch.linspace(0, self.dimension[2]-1, self.dimension[2], dtype=torch.float)
z_grid = z_grid.view(1, 1, self.dimension[2]).expand(self.dimension[0], self.dimension[1], self.dimension[2])
mesh_grid_3d = torch.stack((x_grid, y_grid, z_grid), -1)
mesh_grid_3d = mesh_grid_3d.view(-1, 3)
occ_instance = torch.from_numpy(occ_instance).view(-1, 1)
occ_segmentation = torch.from_numpy(occ_segmentation).view(-1, 1)
occ_attribute_label = torch.from_numpy(occ_attribute_label).view(-1, 1)
for instance_info in instance_fill_info:
poly_region_pts = instance_info['poly_region']
semantic_id = instance_info['semantic_id']
instance_id = instance_info['instance_id']
            attribute_label = instance_info['attribute_label']
X_min_box = poly_region_pts.min(axis=0)[0]
X_max_box = poly_region_pts.max(axis=0)[0]
Y_min_box = poly_region_pts.min(axis=0)[1]
Y_max_box = poly_region_pts.max(axis=0)[1]
Z_min_box = poly_region_pts.min(axis=0)[2]
Z_max_box = poly_region_pts.max(axis=0)[2]
mask_cur_instance = (mesh_grid_3d[:,0] >= X_min_box) & (X_max_box >= mesh_grid_3d[:,0]) \
& (mesh_grid_3d[:,1] >= Y_min_box) & (Y_max_box >= mesh_grid_3d[:,1]) \
& (mesh_grid_3d[:,2] >= Z_min_box) & (Z_max_box >= mesh_grid_3d[:,2])
occ_instance[mask_cur_instance] = instance_id
occ_segmentation[mask_cur_instance] = semantic_id
occ_attribute_label[mask_cur_instance] = attribute_label
occ_instance = occ_instance.view(self.dimension[0], self.dimension[1], self.dimension[2]).long()
occ_segmentation = occ_segmentation.view(self.dimension[0], self.dimension[1], self.dimension[2]).long()
occ_attribute_label = occ_attribute_label.view(self.dimension[0], self.dimension[1], self.dimension[2]).long()
return occ_instance, occ_segmentation, occ_attribute_label
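    # Sketch of the fill above (not part of the original code): each instance
    # is rasterized as the axis-aligned bounding box of its 8 projected
    # corners, i.e. a voxel (x, y, z) is assigned to the instance iff
    #
    #   x_min <= x <= x_max and y_min <= y <= y_max and z_min <= z <= z_max
    #
    # trading slight over-coverage of rotated boxes for a fully vectorized
    # mask over the 512 x 512 x 40 grid.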
def get_label(self, input_seq_data):
"""
        Generate labels for semantic segmentation, instance segmentation, and attributes from the raw annotations
"""
timestep = self.counter
# Background is ID 0
segmentation = np.ones((self.dimension[0], self.dimension[1], self.dimension[2])) * self.background
instance = np.ones((self.dimension[0], self.dimension[1], self.dimension[2])) * self.background
attribute_label = np.ones((self.dimension[0], self.dimension[1], self.dimension[2])) * self.background
instance_dict = input_seq_data['instance_dict']
egopose_list = input_seq_data['egopose_list']
ego2lidar_list = input_seq_data['ego2lidar_list']
time_receptive_field = input_seq_data['time_receptive_field']
instance_fill_info = []
for instance_token, instance_annotation in instance_dict.items():
if timestep not in instance_annotation['timestep']:
continue
pointer = instance_annotation['timestep'].index(timestep)
annotation = {
'translation': instance_annotation['translation'][pointer],
'rotation': instance_annotation['rotation'][pointer],
'size': instance_annotation['size'],
}
poly_region = self.get_poly_region(annotation, egopose_list[time_receptive_field - 1], ego2lidar_list[time_receptive_field - 1])
if isinstance(poly_region, np.ndarray):
if self.counter >= time_receptive_field and instance_token not in self.visible_instance_set:
continue
self.visible_instance_set.add(instance_token)
prepare_for_fill = dict(
poly_region=poly_region,
instance_id=instance_annotation['instance_id'],
attribute_label=instance_annotation['attribute_label'][pointer],
semantic_id=instance_annotation['semantic_id'],
)
instance_fill_info.append(prepare_for_fill)
instance, segmentation, attribute_label = self.fill_occupancy(instance, segmentation, attribute_label, instance_fill_info)
segmentation = segmentation.unsqueeze(0)
instance = instance.unsqueeze(0)
attribute_label = attribute_label.unsqueeze(0).unsqueeze(0)
return segmentation, instance, attribute_label
@staticmethod
def generate_flow(flow, occ_instance_seq, instance, instance_id):
"""
Generate ground truth for the flow of each instance based on instance segmentation
"""
seg_len, wx, wy, wz = occ_instance_seq.shape
ratio = 4
occ_instance_seq = occ_instance_seq.reshape(seg_len, wx//ratio, ratio, wy//ratio, ratio, wz//ratio, ratio).permute(0,1,3,5,2,4,6).reshape(seg_len, wx//ratio, wy//ratio, wz//ratio, ratio**3)
empty_mask = occ_instance_seq.sum(-1) == 0
occ_instance_seq = occ_instance_seq.to(torch.int64)
occ_space = occ_instance_seq[~empty_mask]
occ_space[occ_space==0] = -torch.arange(len(occ_space[occ_space==0])).to(occ_space.device) - 1
occ_instance_seq[~empty_mask] = occ_space
occ_instance_seq = torch.mode(occ_instance_seq, dim=-1)[0]
occ_instance_seq[occ_instance_seq<0] = 0
occ_instance_seq = occ_instance_seq.long()
_, wx, wy, wz = occ_instance_seq.shape
x, y, z = torch.meshgrid(torch.arange(wx, dtype=torch.float), torch.arange(wy, dtype=torch.float), torch.arange(wz, dtype=torch.float))
grid = torch.stack((x, y, z), dim=0)
# Set the first frame
init_pointer = instance['timestep'][0]
instance_mask = (occ_instance_seq[init_pointer] == instance_id)
flow[init_pointer, 0, instance_mask] = grid[0, instance_mask].mean(dim=0, keepdim=True).round() - grid[0, instance_mask]
flow[init_pointer, 1, instance_mask] = grid[1, instance_mask].mean(dim=0, keepdim=True).round() - grid[1, instance_mask]
flow[init_pointer, 2, instance_mask] = grid[2, instance_mask].mean(dim=0, keepdim=True).round() - grid[2, instance_mask]
for i, timestep in enumerate(instance['timestep']):
if i == 0:
continue
instance_mask = (occ_instance_seq[timestep] == instance_id)
prev_instance_mask = (occ_instance_seq[timestep-1] == instance_id)
if instance_mask.sum() == 0 or prev_instance_mask.sum() == 0:
continue
flow[timestep, 0, instance_mask] = grid[0, prev_instance_mask].mean(dim=0, keepdim=True).round() - grid[0, instance_mask]
flow[timestep, 1, instance_mask] = grid[1, prev_instance_mask].mean(dim=0, keepdim=True).round() - grid[1, instance_mask]
flow[timestep, 2, instance_mask] = grid[2, prev_instance_mask].mean(dim=0, keepdim=True).round() - grid[2, instance_mask]
return flow
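    # Sketch of the 4x downsampling trick above (not part of the original
    # code): each 4x4x4 block of the instance volume is reduced to its most
    # frequent instance id; empty voxels are first given unique negative ids
    # so the background cannot outvote a present instance:
    #
    #   import torch
    #   block = torch.tensor([0, 0, 7, 7])           # one flattened block
    #   n0 = int((block == 0).sum())
    #   block[block == 0] = -torch.arange(n0) - 1    # -> [-1, -2, 7, 7]
    #   v = torch.mode(block, dim=0)[0]              # -> 7
    #   v = v if v > 0 else torch.tensor(0)          # negatives -> background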
def get_flow_label(self, input_seq_data, ignore_index=255):
"""
Generate the global map of the flow ground truth
"""
occ_instance = input_seq_data['instance']
instance_dict = input_seq_data['instance_dict']
instance_map = input_seq_data['instance_map']
seq_len, wx, wy, wz = occ_instance.shape
ratio = 4
flow = ignore_index * torch.ones(seq_len, 3, wx//ratio, wy//ratio, wz//ratio)
# ignore flow generation for faster pipelines
if not self.use_flow:
return flow
for token, instance in instance_dict.items():
flow = self.generate_flow(flow, occ_instance, instance, instance_map[token])
return flow.float()
# set ignore index to 0 for vis
@staticmethod
def convert_instance_mask_to_center_and_offset_label(input_seq_data, ignore_index=255, sigma=3):
occ_instance = input_seq_data['instance']
num_instances=len(input_seq_data['instance_map'])
seq_len, wx, wy, wz = occ_instance.shape
center_label = torch.zeros(seq_len, 1, wx, wy, wz)
offset_label = ignore_index * torch.ones(seq_len, 3, wx, wy, wz)
# x is vertical displacement, y is horizontal displacement
x, y, z = torch.meshgrid(torch.arange(wx, dtype=torch.float), torch.arange(wy, dtype=torch.float), torch.arange(wz, dtype=torch.float))
# Ignore id 0 which is the background
for instance_id in range(1, num_instances+1):
for t in range(seq_len):
instance_mask = (occ_instance[t] == instance_id)
xc = x[instance_mask].mean().round().long()
yc = y[instance_mask].mean().round().long()
zc = z[instance_mask].mean().round().long()
off_x = xc - x
off_y = yc - y
off_z = zc - z
g = torch.exp(-(off_x ** 2 + off_y ** 2 + off_z ** 2) / sigma ** 2)
center_label[t, 0] = torch.maximum(center_label[t, 0], g)
offset_label[t, 0, instance_mask] = off_x[instance_mask]
offset_label[t, 1, instance_mask] = off_y[instance_mask]
offset_label[t, 2, instance_mask] = off_z[instance_mask]
return center_label, offset_label
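    # Sketch of the center/offset labels above (not part of the original
    # code): every instance contributes a 3D Gaussian centred on its voxel
    # centroid (xc, yc, zc),
    #
    #   g(x, y, z) = exp(-((xc - x)^2 + (yc - y)^2 + (zc - z)^2) / sigma^2)
    #
    # merged across instances with an element-wise max, while the offset map
    # stores, for each foreground voxel, the vector pointing to its centroid.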
def __call__(self, results):
assert 'segmentation' not in results.keys()
assert 'instance' not in results.keys()
assert 'attribute_label' not in results.keys()
time_receptive_field = results['time_receptive_field']
prefix = "MMO" if self.use_separate_classes else "GMO"
if self.use_lyft:
prefix = prefix + "_lyft"
seg_label_dir = os.path.join(self.cam4docc_dataset_path, prefix, "segmentation")
if not os.path.exists(seg_label_dir):
os.mkdir(seg_label_dir)
seg_label_path = os.path.join(seg_label_dir, \
results['input_dict'][time_receptive_field-1]['scene_token']+"_"+results['input_dict'][time_receptive_field-1]['lidar_token'])
instance_label_dir = os.path.join(self.cam4docc_dataset_path, prefix, "instance")
if not os.path.exists(instance_label_dir):
os.mkdir(instance_label_dir)
instance_label_path = os.path.join(instance_label_dir, \
results['input_dict'][time_receptive_field-1]['scene_token']+"_"+results['input_dict'][time_receptive_field-1]['lidar_token'])
flow_label_dir = os.path.join(self.cam4docc_dataset_path, prefix, "flow")
if not os.path.exists(flow_label_dir):
os.mkdir(flow_label_dir)
flow_label_path = os.path.join(flow_label_dir, \
results['input_dict'][time_receptive_field-1]['scene_token']+"_"+results['input_dict'][time_receptive_field-1]['lidar_token'])
segmentation_list = []
if os.path.exists(seg_label_path+".npz"):
gt_segmentation_arr = np.load(seg_label_path+".npz",allow_pickle=True)['arr_0']
for j in range(len(gt_segmentation_arr)):
                segmentation = np.ones((self.dimension[0], self.dimension[1], self.dimension[2])) * self.background
gt_segmentation = gt_segmentation_arr[j]
gt_segmentation = torch.from_numpy(gt_segmentation)
# for i in range(gt_segmentation.shape[0]):
# cur_ind = gt_segmentation[i, :3].long()
# cur_label = gt_segmentation[i, -1]
# segmentation[cur_ind[0],cur_ind[1],cur_ind[2]] = cur_label
segmentation[gt_segmentation[:, 0].long(), gt_segmentation[:, 1].long(), gt_segmentation[:, 2].long()] = gt_segmentation[:, -1]
segmentation = torch.from_numpy(segmentation).unsqueeze(0)
segmentation_list.append(segmentation)
instance_list = []
if os.path.exists(instance_label_path+".npz"):
gt_instance_arr = np.load(instance_label_path+".npz",allow_pickle=True)['arr_0']
for j in range(len(gt_instance_arr)):
instance = np.ones((self.dimension[0], self.dimension[1], self.dimension[2])) * self.background
gt_instance = gt_instance_arr[j]
gt_instance = torch.from_numpy(gt_instance)
# for i in range(gt_instance.shape[0]):
# cur_ind = gt_instance[i, :3].long()
# cur_label = gt_instance[i, -1]
# instance[cur_ind[0],cur_ind[1],cur_ind[2]] = cur_label
instance[gt_instance[:, 0].long(), gt_instance[:, 1].long(), gt_instance[:, 2].long()] = gt_instance[:, -1]
instance = torch.from_numpy(instance).unsqueeze(0)
instance_list.append(instance)
flow_list = []
if os.path.exists(flow_label_path+".npz"):
gt_flow_arr = np.load(flow_label_path+".npz",allow_pickle=True)['arr_0']
for j in range(len(gt_flow_arr)):
flow = np.ones((3, self.dimension[0]//4, self.dimension[1]//4, self.dimension[2]//4)) * 255
gt_flow = gt_flow_arr[j]
gt_flow = torch.from_numpy(gt_flow)
# for i in range(gt_flow.shape[0]):
# cur_ind = gt_flow[i, :3].long()
# cur_label = gt_flow[i, 3:]
# flow[0, cur_ind[0],cur_ind[1],cur_ind[2]] = cur_label[0]
# flow[1, cur_ind[0],cur_ind[1],cur_ind[2]] = cur_label[1]
# flow[2, cur_ind[0],cur_ind[1],cur_ind[2]] = cur_label[2]
flow[:, gt_flow[:, 0].long(), gt_flow[:, 1].long(), gt_flow[:, 2].long()] = gt_flow[:, 3:].permute(1, 0)
flow = torch.from_numpy(flow).unsqueeze(0)
flow_list.append(flow)
if os.path.exists(seg_label_path+".npz") and os.path.exists(instance_label_path+".npz") and os.path.exists(flow_label_path+".npz"):
results['segmentation'] = torch.cat(segmentation_list, dim=0)
results['instance'] = torch.cat(instance_list, dim=0)
results['attribute_label'] = torch.from_numpy(np.zeros((self.dimension[0], self.dimension[1], self.dimension[2]))).unsqueeze(0)
results['flow'] = torch.cat(flow_list, dim=0).float()
for key, value in results.items():
if key in ['sample_token', 'centerness', 'offset', 'flow', 'time_receptive_field', "indices", \
'segmentation','instance','attribute_label','sequence_length', 'instance_dict', 'instance_map', 'input_dict', 'egopose_list','ego2lidar_list','scene_token']:
continue
results[key] = torch.cat(value, dim=0)
return results
else:
results['segmentation'] = []
results['instance'] = []
results['attribute_label'] = []
segmentation_saved_list = []
instance_saved_list = []
sequence_length = results['sequence_length']
self.visible_instance_set = set()
for self.counter in range(sequence_length):
segmentation, instance, attribute_label = self.get_label(results)
results['segmentation'].append(segmentation)
results['instance'].append(instance)
results['attribute_label'].append(attribute_label)
x_grid = torch.linspace(0, self.dimension[0]-1, self.dimension[0], dtype=torch.long)
x_grid = x_grid.view(self.dimension[0], 1, 1).expand(self.dimension[0], self.dimension[1], self.dimension[2])
y_grid = torch.linspace(0, self.dimension[1]-1, self.dimension[1], dtype=torch.long)
y_grid = y_grid.view(1, self.dimension[1], 1).expand(self.dimension[0], self.dimension[1], self.dimension[2])
z_grid = torch.linspace(0, self.dimension[2]-1, self.dimension[2], dtype=torch.long)
z_grid = z_grid.view(1, 1, self.dimension[2]).expand(self.dimension[0], self.dimension[1], self.dimension[2])
segmentation_for_save = torch.stack((x_grid, y_grid, z_grid), -1)
segmentation_for_save = segmentation_for_save.view(-1, 3)
segmentation_label = segmentation.squeeze(0).view(-1,1)
segmentation_for_save = torch.cat((segmentation_for_save, segmentation_label), dim=-1)
kept = segmentation_for_save[:,-1]!=0
segmentation_for_save= segmentation_for_save[kept]
segmentation_saved_list.append(segmentation_for_save)
x_grid = torch.linspace(0, self.dimension[0]-1, self.dimension[0], dtype=torch.long)
x_grid = x_grid.view(self.dimension[0], 1, 1).expand(self.dimension[0], self.dimension[1], self.dimension[2])
y_grid = torch.linspace(0, self.dimension[1]-1, self.dimension[1], dtype=torch.long)
y_grid = y_grid.view(1, self.dimension[1], 1).expand(self.dimension[0], self.dimension[1], self.dimension[2])
z_grid = torch.linspace(0, self.dimension[2]-1, self.dimension[2], dtype=torch.long)
z_grid = z_grid.view(1, 1, self.dimension[2]).expand(self.dimension[0], self.dimension[1], self.dimension[2])
instance_for_save = torch.stack((x_grid, y_grid, z_grid), -1)
instance_for_save = instance_for_save.view(-1, 3)
instance_label = instance.squeeze(0).view(-1,1)
instance_for_save = torch.cat((instance_for_save, instance_label), dim=-1)
kept = instance_for_save[:,-1]!=0
instance_for_save= instance_for_save[kept]
instance_saved_list.append(instance_for_save)
segmentation_saved_list2 = [item.cpu().detach().numpy() for item in segmentation_saved_list]
instance_saved_list2 = [item.cpu().detach().numpy() for item in instance_saved_list]
np.savez(seg_label_path, segmentation_saved_list2)
np.savez(instance_label_path, instance_saved_list2)
results['segmentation'] = torch.cat(results['segmentation'], dim=0)
results['instance'] = torch.cat(results['instance'], dim=0)
results['attribute_label'] = torch.from_numpy(np.zeros((self.dimension[0], self.dimension[1], self.dimension[2]))).unsqueeze(0)
results['flow'] = self.get_flow_label(results, ignore_index=255)
flow_saved_list = []
sequence_length = results['sequence_length']
d0 = self.dimension[0]//4
d1 = self.dimension[1]//4
d2 = self.dimension[2]//4
for cnt in range(sequence_length):
flow = results['flow'][cnt, ...]
x_grid = torch.linspace(0, d0-1, d0, dtype=torch.long)
x_grid = x_grid.view(d0, 1, 1).expand(d0, d1, d2)
y_grid = torch.linspace(0, d1-1, d1, dtype=torch.long)
y_grid = y_grid.view(1, d1, 1).expand(d0, d1, d2)
z_grid = torch.linspace(0, d2-1, d2, dtype=torch.long)
z_grid = z_grid.view(1, 1, d2).expand(d0, d1, d2)
flow_for_save = torch.stack((x_grid, y_grid, z_grid), -1)
flow_for_save = flow_for_save.view(-1, 3)
flow_label = flow.permute(1,2,3,0).view(-1,3)
flow_for_save = torch.cat((flow_for_save, flow_label), dim=-1)
kept = (flow_for_save[:,-1]!=255) & (flow_for_save[:,-2]!=255) & (flow_for_save[:,-3]!=255)
flow_for_save= flow_for_save[kept]
flow_saved_list.append(flow_for_save)
flow_saved_list2 = [item.cpu().detach().numpy() for item in flow_saved_list]
np.savez(flow_label_path, flow_saved_list2)
for key, value in results.items():
if key in ['sample_token', 'centerness', 'offset', 'flow', 'time_receptive_field', "indices", \
'segmentation','instance','attribute_label','sequence_length', 'instance_dict', 'instance_map', 'input_dict', 'egopose_list','ego2lidar_list','scene_token']:
continue
results[key] = torch.cat(value, dim=0)
return results
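# ------------------------------------------------------------------
# Hedged sketch (assumed helper, not called by the pipeline above): the
# cached .npz files written in __call__ store each frame as an (N, 4)
# array of occupied voxel indices plus a label, and the dense (X, Y, Z)
# grid is rebuilt by scatter-assignment, mirroring the reload branch at
# the top of __call__.
def _densify_sparse_label_demo(sparse_xyzl, dims, background=0):
    """sparse_xyzl: (N, 4) numpy array with rows [x, y, z, label]."""
    dense = np.full(dims, background, dtype=sparse_xyzl.dtype)
    idx = sparse_xyzl[:, :3].astype(np.int64)
    dense[idx[:, 0], idx[:, 1], idx[:, 2]] = sparse_xyzl[:, -1]
    return dense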
================================================
FILE: projects/occ_plugin/datasets/pipelines/loading_occupancy.py
================================================
# Developed by Junyi Ma based on the codebase of OpenOccupancy
# Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications
# https://github.com/haomo-ai/Cam4DOcc
import numpy as np
import numba as nb
from mmdet.datasets.builder import PIPELINES
import yaml, os
import torch
import torch.nn.functional as F
import copy
@PIPELINES.register_module()
class LoadOccupancy(object):
def __init__(self, to_float32=True, occ_path=None, grid_size=[512, 512, 40], pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0], unoccupied=0, gt_resize_ratio=1, use_fine_occ=False, test_mode=False):
'''
Read sequential fine-grained occupancy labels from nuScenes-Occupancy if use_fine_occ=True
'''
self.to_float32 = to_float32
self.occ_path = occ_path
self.grid_size = np.array(grid_size)
self.unoccupied = unoccupied
self.pc_range = np.array(pc_range)
self.voxel_size = (self.pc_range[3:] - self.pc_range[:3]) / self.grid_size
self.gt_resize_ratio = gt_resize_ratio
self.use_fine_occ = use_fine_occ
self.test_mode = test_mode
def get_seq_pseudo_occ(self, results):
sequence_length = results['sequence_length']
gt_occ_seq = []
for count in range(sequence_length):
processed_label = np.ones(self.grid_size, dtype=np.uint8) * self.unoccupied
processed_label = torch.from_numpy(processed_label)
gt_occ_seq.append(processed_label)
gt_occ_seq = torch.stack(gt_occ_seq)
return gt_occ_seq
def get_seq_occ(self, results):
sequence_length = results['sequence_length']
gt_occ_seq = []
for count in range(sequence_length):
scene_token_cur = results['input_dict'][count]['scene_token']
lidar_token_cur = results['input_dict'][count]['lidar_token']
rel_path = 'scene_{0}/occupancy/{1}.npy'.format(scene_token_cur, lidar_token_cur)
# [z y x cls] or [z y x vx vy vz cls]
pcd = np.load(os.path.join(self.occ_path, rel_path))
pcd_label = pcd[..., -1:]
pcd_label[pcd_label==0] = 255
pcd_np_cor = self.voxel2world(pcd[..., [2,1,0]] + 0.5)
untransformed_occ = copy.deepcopy(pcd_np_cor)
egopose_list = results['egopose_list']
ego2lidar_list = results['ego2lidar_list']
time_receptive_field = results['time_receptive_field']
present_global2ego = egopose_list[time_receptive_field - 1]
present_ego2lidar = ego2lidar_list[time_receptive_field - 1]
cur_global2ego = egopose_list[count]
cur_ego2lidar = ego2lidar_list[count]
pcd_np_cor = np.dot(cur_ego2lidar[1].inverse.rotation_matrix, pcd_np_cor.T)
pcd_np_cor = pcd_np_cor.T
pcd_np_cor = pcd_np_cor - cur_ego2lidar[0] # trans
# cur_ego -> global
pcd_np_cor = np.dot(cur_global2ego[1].inverse.rotation_matrix, pcd_np_cor.T) # rot
pcd_np_cor = pcd_np_cor.T
pcd_np_cor = pcd_np_cor - cur_global2ego[0] # trans
# global -> present_ego
pcd_np_cor = pcd_np_cor + present_global2ego[0] # trans
pcd_np_cor = np.dot(present_global2ego[1].rotation_matrix, pcd_np_cor.T)
pcd_np_cor = pcd_np_cor.T
# present_ego -> present_lidar
pcd_np_cor = pcd_np_cor + present_ego2lidar[0] # trans
pcd_np_cor = np.dot(present_ego2lidar[1].rotation_matrix, pcd_np_cor.T) # rot
pcd_np_cor = pcd_np_cor.T
pcd_np_cor = self.world2voxel(pcd_np_cor)
# make sure the point is in the grid
pcd_np_cor = np.clip(pcd_np_cor, np.array([0,0,0]), self.grid_size - 1)
transformed_occ = copy.deepcopy(pcd_np_cor)
pcd_np = np.concatenate([pcd_np_cor, pcd_label], axis=-1)
# 255: noise, 1-16 normal classes, 0 unoccupied
pcd_np = pcd_np[np.lexsort((pcd_np_cor[:, 0], pcd_np_cor[:, 1], pcd_np_cor[:, 2])), :]
pcd_np = pcd_np.astype(np.int64)
processed_label = np.ones(self.grid_size, dtype=np.uint8) * self.unoccupied
processed_label = nb_process_label(processed_label, pcd_np)
processed_label = torch.from_numpy(processed_label)
# TODO: hard coding
for otheridx in [0,1,7,8,11,12,13,14,15,16,17,18,255]:
processed_label[processed_label==otheridx] = 0
for vehidx in [2,3,4,5,6,9,10]:
processed_label[processed_label==vehidx] = 1
gt_occ_seq.append(processed_label)
gt_occ_seq = torch.stack(gt_occ_seq)
return gt_occ_seq
def __call__(self, results):
if self.use_fine_occ:
results['gt_occ'] = self.get_seq_occ(results)
else:
results['gt_occ'] = self.get_seq_pseudo_occ(results)
return results
def voxel2world(self, voxel):
"""
voxel: [N, 3]
"""
return voxel * self.voxel_size[None, :] + self.pc_range[:3][None, :]
def world2voxel(self, world):
"""
world: [N, 3]
"""
return (world - self.pc_range[:3][None, :]) / self.voxel_size[None, :]
def __repr__(self):
"""str: Return a string that describes the module."""
repr_str = self.__class__.__name__
repr_str += f'(to_float32={self.to_float32})'
return repr_str
def project_points(self, points, rots, trans, intrins, post_rots, post_trans):
# from lidar to camera
points = points.reshape(-1, 1, 3)
points = points - trans.reshape(1, -1, 3)
inv_rots = rots.inverse().unsqueeze(0)
points = (inv_rots @ points.unsqueeze(-1))
# from camera to raw pixel
points = (intrins.unsqueeze(0) @ points).squeeze(-1)
points_d = points[..., 2:3]
points_uv = points[..., :2] / points_d
# from raw pixel to transformed pixel
points_uv = post_rots[:, :2, :2].unsqueeze(0) @ points_uv.unsqueeze(-1)
points_uv = points_uv.squeeze(-1) + post_trans[..., :2].unsqueeze(0)
points_uvd = torch.cat((points_uv, points_d), dim=2)
return points_uvd
# b1:boolean, u1: uint8, i2: int16, u2: uint16
@nb.jit('b1[:](i2[:,:],u2[:,:],b1[:])', nopython=True, cache=True, parallel=False)
def nb_process_img_points(basic_valid_occ, depth_canva, nb_valid_mask):
# basic_valid_occ M 3
# depth_canva H W
# label_size = M  # number of occupied voxels per frame in the original occupancy: small ~20k, mid ~80k, base ~300k
canva_idx = -1 * np.ones_like(depth_canva, dtype=np.int16)
for i in range(basic_valid_occ.shape[0]):
occ = basic_valid_occ[i]
if occ[2] < depth_canva[occ[1], occ[0]]:
if canva_idx[occ[1], occ[0]] != -1:
nb_valid_mask[canva_idx[occ[1], occ[0]]] = False
canva_idx[occ[1], occ[0]] = i
depth_canva[occ[1], occ[0]] = occ[2]
nb_valid_mask[i] = True
return nb_valid_mask
# u1: uint8, u2: uint16, i8: int64
@nb.jit('u1[:,:,:](u1[:,:,:],i8[:,:])', nopython=True, cache=True, parallel=False)
def nb_process_label_withvel(processed_label, sorted_label_voxel_pair):
label_size = 256
counter = np.zeros((label_size,), dtype=np.uint16)
counter[sorted_label_voxel_pair[0, 3]] = 1
cur_sear_ind = sorted_label_voxel_pair[0, :3]
for i in range(1, sorted_label_voxel_pair.shape[0]):
cur_ind = sorted_label_voxel_pair[i, :3]
if not np.all(np.equal(cur_ind, cur_sear_ind)):
processed_label[cur_sear_ind[0], cur_sear_ind[1], cur_sear_ind[2]] = np.argmax(counter)
counter = np.zeros((label_size,), dtype=np.uint16)
cur_sear_ind = cur_ind
counter[sorted_label_voxel_pair[i, 3]] += 1
processed_label[cur_sear_ind[0], cur_sear_ind[1], cur_sear_ind[2]] = np.argmax(counter)
return processed_label
# u1: uint8, u2: uint16, i8: int64
@nb.jit('u1[:,:,:](u1[:,:,:],i8[:,:])', nopython=True, cache=True, parallel=False)
def nb_process_label(processed_label, sorted_label_voxel_pair):
label_size = 256
counter = np.zeros((label_size,), dtype=np.uint16)
counter[sorted_label_voxel_pair[0, 3]] = 1
cur_sear_ind = sorted_label_voxel_pair[0, :3]
for i in range(1, sorted_label_voxel_pair.shape[0]):
cur_ind = sorted_label_voxel_pair[i, :3]
if not np.all(np.equal(cur_ind, cur_sear_ind)):
processed_label[cur_sear_ind[0], cur_sear_ind[1], cur_sear_ind[2]] = np.argmax(counter)
counter = np.zeros((label_size,), dtype=np.uint16)
cur_sear_ind = cur_ind
counter[sorted_label_voxel_pair[i, 3]] += 1
processed_label[cur_sear_ind[0], cur_sear_ind[1], cur_sear_ind[2]] = np.argmax(counter)
return processed_label
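# ------------------------------------------------------------------
# Hedged sketch (illustration only, not part of the pipeline):
# voxel2world / world2voxel above are plain affine maps between voxel
# indices and metric lidar coordinates. With the default pc_range and
# grid_size of LoadOccupancy they are mutual inverses, as checked below
# for a couple of voxel centers (the +0.5 shift used in get_seq_occ).
def _voxel_world_roundtrip_demo():
    pc_range = np.array([-51.2, -51.2, -5.0, 51.2, 51.2, 3.0])
    grid_size = np.array([512, 512, 40])
    voxel_size = (pc_range[3:] - pc_range[:3]) / grid_size
    vox = np.array([[0.0, 0.0, 0.0], [255.0, 255.0, 19.0]]) + 0.5
    world = vox * voxel_size[None, :] + pc_range[:3][None, :]      # voxel2world
    back = (world - pc_range[:3][None, :]) / voxel_size[None, :]   # world2voxel
    assert np.allclose(back, vox)
    return world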
================================================
FILE: projects/occ_plugin/datasets/pipelines/transform_3d.py
================================================
import numpy as np
from numpy import random
import mmcv
from mmdet.datasets.builder import PIPELINES
from mmcv.parallel import DataContainer as DC
@PIPELINES.register_module()
class PadMultiViewImage(object):
"""Pad the multi-view image.
There are two padding modes: (1) pad to a fixed size and (2) pad to the
minimum size that is divisible by some number.
Added keys are "pad_shape", "pad_fixed_size", and "pad_size_divisor".
Args:
size (tuple, optional): Fixed padding size.
size_divisor (int, optional): The divisor of padded size.
pad_val (float, optional): Padding value, 0 by default.
"""
def __init__(self, size=None, size_divisor=None, pad_val=0):
self.size = size
self.size_divisor = size_divisor
self.pad_val = pad_val
# only one of size and size_divisor should be valid
assert size is not None or size_divisor is not None
assert size is None or size_divisor is None
def _pad_img(self, results):
"""Pad images according to ``self.size``."""
if self.size is not None:
padded_img = [mmcv.impad(
img, shape=self.size, pad_val=self.pad_val) for img in results['img']]
elif self.size_divisor is not None:
padded_img = [mmcv.impad_to_multiple(
img, self.size_divisor, pad_val=self.pad_val) for img in results['img']]
results['ori_shape'] = [img.shape for img in results['img']]
results['img'] = padded_img
results['img_shape'] = [img.shape for img in padded_img]
results['pad_shape'] = [img.shape for img in padded_img]
results['pad_fixed_size'] = self.size
results['pad_size_divisor'] = self.size_divisor
def __call__(self, results):
"""Call function to pad images, masks, semantic segmentation maps.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Updated result dict.
"""
self._pad_img(results)
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(size={self.size}, '
repr_str += f'size_divisor={self.size_divisor}, '
repr_str += f'pad_val={self.pad_val})'
return repr_str
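# ------------------------------------------------------------------
# Hedged sketch (illustration only): in the size_divisor mode above, each
# spatial dimension is rounded up to the next multiple of the divisor,
# e.g. a 900x1600 nuScenes image with size_divisor=32 is padded to
# 928x1600 by mmcv.impad_to_multiple.
def _pad_to_multiple_shape_demo(h=900, w=1600, divisor=32):
    pad_h = int(np.ceil(h / divisor)) * divisor
    pad_w = int(np.ceil(w / divisor)) * divisor
    return pad_h, pad_w  # (928, 1600)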
@PIPELINES.register_module()
class NormalizeMultiviewImage(object):
"""Normalize the image.
Added key is "img_norm_cfg".
Args:
mean (sequence): Mean values of 3 channels.
std (sequence): Std values of 3 channels.
to_rgb (bool): Whether to convert the image from BGR to RGB,
default is true.
"""
def __init__(self, mean, std, to_rgb=True):
self.mean = np.array(mean, dtype=np.float32)
self.std = np.array(std, dtype=np.float32)
self.to_rgb = to_rgb
def __call__(self, results):
"""Call function to normalize images.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Normalized results, 'img_norm_cfg' key is added into
result dict.
"""
results['img'] = [mmcv.imnormalize(img, self.mean, self.std, self.to_rgb) for img in results['img']]
results['img_norm_cfg'] = dict(
mean=self.mean, std=self.std, to_rgb=self.to_rgb)
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})'
return repr_str
@PIPELINES.register_module()
class PhotoMetricDistortionMultiViewImage:
"""Apply photometric distortion to image sequentially, every transformation
is applied with a probability of 0.5. The position of random contrast is in
second or second to last.
1. random brightness
2. random contrast (mode 0)
3. convert color from BGR to HSV
4. random saturation
5. random hue
6. convert color from HSV to BGR
7. random contrast (mode 1)
8. randomly swap channels
Args:
brightness_delta (int): delta of brightness.
contrast_range (tuple): range of contrast.
saturation_range (tuple): range of saturation.
hue_delta (int): delta of hue.
"""
def __init__(self,
brightness_delta=32,
contrast_range=(0.5, 1.5),
saturation_range=(0.5, 1.5),
hue_delta=18):
self.brightness_delta = brightness_delta
self.contrast_lower, self.contrast_upper = contrast_range
self.saturation_lower, self.saturation_upper = saturation_range
self.hue_delta = hue_delta
def __call__(self, results):
"""Call function to perform photometric distortion on images.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Result dict with images distorted.
"""
imgs = results['img']
new_imgs = []
for img in imgs:
assert img.dtype == np.float32, \
'PhotoMetricDistortion needs the input image of dtype np.float32,'\
' please set "to_float32=True" in "LoadImageFromFile" pipeline'
# random brightness
if random.randint(2):
delta = random.uniform(-self.brightness_delta,
self.brightness_delta)
img += delta
# mode == 0 --> do random contrast first
# mode == 1 --> do random contrast last
mode = random.randint(2)
if mode == 1:
if random.randint(2):
alpha = random.uniform(self.contrast_lower,
self.contrast_upper)
img *= alpha
# convert color from BGR to HSV
img = mmcv.bgr2hsv(img)
# random saturation
if random.randint(2):
img[..., 1] *= random.uniform(self.saturation_lower,
self.saturation_upper)
# random hue
if random.randint(2):
img[..., 0] += random.uniform(-self.hue_delta, self.hue_delta)
img[..., 0][img[..., 0] > 360] -= 360
img[..., 0][img[..., 0] < 0] += 360
# convert color from HSV to BGR
img = mmcv.hsv2bgr(img)
# random contrast
if mode == 0:
if random.randint(2):
alpha = random.uniform(self.contrast_lower,
self.contrast_upper)
img *= alpha
# randomly swap channels
if random.randint(2):
img = img[..., random.permutation(3)]
new_imgs.append(img)
results['img'] = new_imgs
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(\nbrightness_delta={self.brightness_delta},\n'
repr_str += 'contrast_range='
repr_str += f'{(self.contrast_lower, self.contrast_upper)},\n'
repr_str += 'saturation_range='
repr_str += f'{(self.saturation_lower, self.saturation_upper)},\n'
repr_str += f'hue_delta={self.hue_delta})'
return repr_str
@PIPELINES.register_module()
class CustomCollect3D(object):
"""Collect data from the loader relevant to the specific task.
This is usually the last stage of the data loader pipeline. Typically keys
is set to some subset of "img", "proposals", "gt_bboxes",
"gt_bboxes_ignore", "gt_labels", and/or "gt_masks".
The "img_meta" item is always populated. The contents of the "img_meta"
dictionary depend on "meta_keys". By default this includes:
- 'img_shape': shape of the image input to the network as a tuple \
(h, w, c). Note that images may be zero padded on the \
bottom/right if the batch tensor is larger than this shape.
- 'scale_factor': a float indicating the preprocessing scale
- 'flip': a boolean indicating if image flip transform was used
- 'filename': path to the image file
- 'ori_shape': original shape of the image as a tuple (h, w, c)
- 'pad_shape': image shape after padding
- 'lidar2img': transform from lidar to image
- 'depth2img': transform from depth to image
- 'cam2img': transform from camera to image
- 'pcd_horizontal_flip': a boolean indicating if point cloud is \
flipped horizontally
- 'pcd_vertical_flip': a boolean indicating if point cloud is \
flipped vertically
- 'box_mode_3d': 3D box mode
- 'box_type_3d': 3D box type
- 'img_norm_cfg': a dict of normalization information:
- mean: per channel mean subtraction
- std: per channel std divisor
- to_rgb: bool indicating if bgr was converted to rgb
- 'pcd_trans': point cloud transformations
- 'sample_idx': sample index
- 'pcd_scale_factor': point cloud scale factor
- 'pcd_rotation': rotation applied to point cloud
- 'pts_filename': path to point cloud file.
Args:
keys (Sequence[str]): Keys of results to be collected in ``data``.
meta_keys (Sequence[str], optional): Meta keys to be converted to
``mmcv.DataContainer`` and collected in ``data[img_metas]``.
Default: ('filename', 'ori_shape', 'img_shape', 'lidar2img',
'depth2img', 'cam2img', 'pad_shape', 'scale_factor', 'flip',
'pcd_horizontal_flip', 'pcd_vertical_flip', 'box_mode_3d',
'box_type_3d', 'img_norm_cfg', 'pcd_trans',
'sample_idx', 'pcd_scale_factor', 'pcd_rotation', 'pts_filename')
"""
def __init__(self,
keys,
meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img',
'depth2img', 'cam2img', 'pad_shape',
'scale_factor', 'flip', 'pcd_horizontal_flip',
'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d',
'img_norm_cfg', 'pcd_trans', 'sample_idx', 'prev_idx', 'next_idx',
'pcd_scale_factor', 'pcd_rotation', 'pts_filename',
'transformation_3d_flow', 'scene_token',
'can_bus'
)):
self.keys = keys
self.meta_keys = meta_keys
def __call__(self, results):
"""Call function to collect keys in results. The keys in ``meta_keys``
will be converted to :obj:`mmcv.DataContainer`.
Args:
results (dict): Result dict contains the data to collect.
Returns:
dict: The result dict contains the following keys
- keys in ``self.keys``
- ``img_metas``
"""
data = {}
img_metas = {}
for key in self.meta_keys:
if key in results:
img_metas[key] = results[key]
data['img_metas'] = DC(img_metas, cpu_only=True)
for key in self.keys:
data[key] = results[key]
return data
def __repr__(self):
"""str: Return a string that describes the module."""
return self.__class__.__name__ + \
f'(keys={self.keys}, meta_keys={self.meta_keys})'
@PIPELINES.register_module()
class CustomOccCollect3D(object):
"""Collect data from the loader relevant to the specific task.
This is usually the last stage of the data loader pipeline. Typically keys
is set to some subset of "img", "proposals", "gt_bboxes",
"gt_bboxes_ignore", "gt_labels", and/or "gt_masks".
The "img_meta" item is always populated. The contents of the "img_meta"
dictionary depend on "meta_keys". By default this includes:
- 'img_shape': shape of the image input to the network as a tuple \
(h, w, c). Note that images may be zero padded on the \
bottom/right if the batch tensor is larger than this shape.
- 'scale_factor': a float indicating the preprocessing scale
- 'flip': a boolean indicating if image flip transform was used
- 'filename': path to the image file
- 'ori_shape': original shape of the image as a tuple (h, w, c)
- 'pad_shape': image shape after padding
- 'lidar2img': transform from lidar to image
- 'depth2img': transform from depth to image
- 'cam2img': transform from camera to image
- 'pcd_horizontal_flip': a boolean indicating if point cloud is \
flipped horizontally
- 'pcd_vertical_flip': a boolean indicating if point cloud is \
flipped vertically
- 'box_mode_3d': 3D box mode
- 'box_type_3d': 3D box type
- 'img_norm_cfg': a dict of normalization information:
- mean: per channel mean subtraction
- std: per channel std divisor
- to_rgb: bool indicating if bgr was converted to rgb
- 'pcd_trans': point cloud transformations
- 'sample_idx': sample index
- 'pcd_scale_factor': point cloud scale factor
- 'pcd_rotation': rotation applied to point cloud
- 'pts_filename': path to point cloud file.
Args:
keys (Sequence[str]): Keys of results to be collected in ``data``.
meta_keys (Sequence[str], optional): Meta keys to be converted to
``mmcv.DataContainer`` and collected in ``data[img_metas]``.
Default: ('filename', 'ori_shape', 'img_shape', 'lidar2img',
'depth2img', 'cam2img', 'pad_shape', 'scale_factor', 'flip',
'pcd_horizontal_flip', 'pcd_vertical_flip', 'box_mode_3d',
'box_type_3d', 'img_norm_cfg', 'pcd_trans',
'sample_idx', 'pcd_scale_factor', 'pcd_rotation', 'pts_filename')
"""
def __init__(self,
keys,
meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img',
'depth2img', 'cam2img', 'pad_shape',
'scale_factor', 'flip', 'pcd_horizontal_flip',
'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d',
'img_norm_cfg', 'pcd_trans', 'sample_idx', 'prev_idx', 'next_idx',
'pcd_scale_factor', 'pcd_rotation', 'pts_filename',
'transformation_3d_flow', 'scene_token',
'can_bus', 'pc_range', 'occ_size', 'lidar_token'
)):
self.keys = keys
self.meta_keys = meta_keys
def __call__(self, results):
"""Call function to collect keys in results. The keys in ``meta_keys``
will be converted to :obj:`mmcv.DataContainer`.
Args:
results (dict): Result dict contains the data to collect.
Returns:
dict: The result dict contains the following keys
- keys in ``self.keys``
- ``img_metas``
"""
data = {}
img_metas = {}
for key in self.meta_keys:
if key in results:
img_metas[key] = results[key]
data['img_metas'] = DC(img_metas, cpu_only=True)
for key in self.keys:
if key in results.keys():
data[key] = results[key]
print("self.keys", self.keys)
# if 'gt_occ' in results.keys():
# data['gt_occ'] = results['gt_occ']
return data
def __repr__(self):
"""str: Return a string that describes the module."""
return self.__class__.__name__ + \
f'(keys={self.keys}, meta_keys={self.meta_keys})'
@PIPELINES.register_module()
class RandomScaleImageMultiViewImage(object):
"""Random scale the image
Args:
scales
"""
def __init__(self, scales=[]):
self.scales = scales
assert len(self.scales)==1
def __call__(self, results):
"""Call function to pad images, masks, semantic segmentation maps.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Updated result dict.
"""
rand_ind = np.random.permutation(range(len(self.scales)))[0]
rand_scale = self.scales[rand_ind]
y_size = [int(img.shape[0] * rand_scale) for img in results['img']]
x_size = [int(img.shape[1] * rand_scale) for img in results['img']]
scale_factor = np.eye(4)
scale_factor[0, 0] *= rand_scale
scale_factor[1, 1] *= rand_scale
results['img'] = [mmcv.imresize(img, (x_size[idx], y_size[idx]), return_scale=False) for idx, img in
enumerate(results['img'])]
lidar2img = [scale_factor @ l2i for l2i in results['lidar2img']]
results['lidar2img'] = lidar2img
results['img_shape'] = [img.shape for img in results['img']]
results['ori_shape'] = [img.shape for img in results['img']]
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(scales={self.scales})'
return repr_str
================================================
FILE: projects/occ_plugin/datasets/samplers/__init__.py
================================================
from .group_sampler import DistributedGroupSampler
from .distributed_sampler import DistributedSampler
from .sampler import SAMPLER, build_sampler
================================================
FILE: projects/occ_plugin/datasets/samplers/distributed_sampler.py
================================================
import math
import torch
from torch.utils.data import DistributedSampler as _DistributedSampler
from .sampler import SAMPLER
@SAMPLER.register_module()
class DistributedSampler(_DistributedSampler):
def __init__(self,
dataset=None,
num_replicas=None,
rank=None,
shuffle=True,
seed=0):
super().__init__(
dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
# for the compatibility from PyTorch 1.3+
self.seed = seed if seed is not None else 0
def __iter__(self):
# deterministically shuffle based on epoch
if self.shuffle:
assert False
else:
indices = torch.arange(len(self.dataset)).tolist()
# add extra samples to make it evenly divisible
# in case that indices is shorter than half of total_size
indices = (indices *
math.ceil(self.total_size / len(indices)))[:self.total_size]
assert len(indices) == self.total_size
# subsample
per_replicas = self.total_size//self.num_replicas
# indices = indices[self.rank:self.total_size:self.num_replicas]
indices = indices[self.rank*per_replicas:(self.rank+1)*per_replicas]
assert len(indices) == self.num_samples
return iter(indices)
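# ------------------------------------------------------------------
# Hedged sketch (illustration only): how the sampler above pads and
# slices indices in the non-shuffled case. With 10 samples and 4
# replicas, total_size becomes 12, the index list is repeated and
# truncated to 12, and each rank takes a contiguous block of 3.
def _contiguous_shard_demo(dataset_len=10, num_replicas=4, rank=1):
    num_samples = math.ceil(dataset_len / num_replicas)
    total_size = num_samples * num_replicas
    indices = list(range(dataset_len))
    indices = (indices * math.ceil(total_size / len(indices)))[:total_size]
    per_replicas = total_size // num_replicas
    return indices[rank * per_replicas:(rank + 1) * per_replicas]  # [3, 4, 5]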
================================================
FILE: projects/occ_plugin/datasets/samplers/group_sampler.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import math
import numpy as np
import torch
from mmcv.runner import get_dist_info
from torch.utils.data import Sampler
from .sampler import SAMPLER
import random
from IPython import embed
@SAMPLER.register_module()
class DistributedGroupSampler(Sampler):
"""Sampler that restricts data loading to a subset of the dataset.
It is especially useful in conjunction with
:class:`torch.nn.parallel.DistributedDataParallel`. In such case, each
process can pass a DistributedSampler instance as a DataLoader sampler,
and load a subset of the original dataset that is exclusive to it.
.. note::
Dataset is assumed to be of constant size.
Arguments:
dataset: Dataset used for sampling.
num_replicas (optional): Number of processes participating in
distributed training.
rank (optional): Rank of the current process within num_replicas.
seed (int, optional): random seed used to shuffle the sampler if
``shuffle=True``. This number should be identical across all
processes in the distributed group. Default: 0.
"""
def __init__(self,
dataset,
samples_per_gpu=1,
num_replicas=None,
rank=None,
seed=0):
_rank, _num_replicas = get_dist_info()
if num_replicas is None:
num_replicas = _num_replicas
if rank is None:
rank = _rank
self.dataset = dataset
self.samples_per_gpu = samples_per_gpu
self.num_replicas = num_replicas
self.rank = rank
self.epoch = 0
self.seed = seed if seed is not None else 0
assert hasattr(self.dataset, 'flag')
self.flag = self.dataset.flag
self.group_sizes = np.bincount(self.flag)
self.num_samples = 0
for i, j in enumerate(self.group_sizes):
self.num_samples += int(
math.ceil(self.group_sizes[i] * 1.0 / self.samples_per_gpu /
self.num_replicas)) * self.samples_per_gpu
self.total_size = self.num_samples * self.num_replicas
def __iter__(self):
# deterministically shuffle based on epoch
g = torch.Generator()
g.manual_seed(self.epoch + self.seed)
indices = []
for i, size in enumerate(self.group_sizes):
if size > 0:
indice = np.where(self.flag == i)[0]
assert len(indice) == size
# add .numpy() to avoid bug when selecting indice in parrots.
# TODO: check whether torch.randperm() can be replaced by
# numpy.random.permutation().
indice = indice[list(
torch.randperm(int(size), generator=g).numpy())].tolist()
extra = int(
math.ceil(
size * 1.0 / self.samples_per_gpu / self.num_replicas)
) * self.samples_per_gpu * self.num_replicas - len(indice)
# pad indice
tmp = indice.copy()
for _ in range(extra // size):
indice.extend(tmp)
indice.extend(tmp[:extra % size])
indices.extend(indice)
assert len(indices) == self.total_size
indices = [
indices[j] for i in list(
torch.randperm(
len(indices) // self.samples_per_gpu, generator=g))
for j in range(i * self.samples_per_gpu, (i + 1) *
self.samples_per_gpu)
]
# subsample
offset = self.num_samples * self.rank
indices = indices[offset:offset + self.num_samples]
assert len(indices) == self.num_samples
return iter(indices)
def __len__(self):
return self.num_samples
def set_epoch(self, epoch):
self.epoch = epoch
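# ------------------------------------------------------------------
# Hedged sketch (illustration only): each size group above is padded so
# that it splits into whole per-GPU batches across all replicas. For a
# group of 10 samples with samples_per_gpu=4 and num_replicas=2, the
# group is padded to 16 indices (2 full batches per replica).
def _group_padding_demo(size=10, samples_per_gpu=4, num_replicas=2):
    padded = int(
        math.ceil(size * 1.0 / samples_per_gpu / num_replicas)
    ) * samples_per_gpu * num_replicas
    return padded  # 16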
================================================
FILE: projects/occ_plugin/datasets/samplers/sampler.py
================================================
from mmcv.utils.registry import Registry, build_from_cfg
SAMPLER = Registry('sampler')
def build_sampler(cfg, default_args):
return build_from_cfg(cfg, SAMPLER, default_args)
================================================
FILE: projects/occ_plugin/occupancy/__init__.py
================================================
from .dense_heads import *
from .detectors import *
from .backbones import *
from .image2bev import *
from .voxel_encoder import *
from .necks import *
from .fuser import *
================================================
FILE: projects/occ_plugin/occupancy/apis/__init__.py
================================================
from .train import custom_train_model
from .mmdet_train import custom_train_detector
# from .test import custom_multi_gpu_test
================================================
FILE: projects/occ_plugin/occupancy/apis/mmdet_train.py
================================================
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Junyi Ma, following OpenOccupancy of Xiaofeng Wang
# ---------------------------------------------
import random
import warnings
import numpy as np
import torch
import torch.distributed as dist
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import (HOOKS, DistSamplerSeedHook, EpochBasedRunner,
Fp16OptimizerHook, OptimizerHook, build_optimizer,
build_runner, get_dist_info)
from mmcv.utils import build_from_cfg
from mmdet.core import EvalHook
from mmdet.datasets import (build_dataset,
replace_ImageToTensor)
from mmdet.utils import get_root_logger
import time
import os.path as osp
from projects.occ_plugin.datasets.builder import build_dataloader
from projects.occ_plugin.core.evaluation.eval_hooks import OccDistEvalHook, OccEvalHook
from projects.occ_plugin.datasets import custom_build_dataset
def custom_train_detector(model,
dataset,
cfg,
distributed=False,
validate=False,
timestamp=None,
meta=None):
logger = get_root_logger(cfg.log_level)
dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
data_loaders = [
build_dataloader(
ds,
cfg.data.samples_per_gpu,
cfg.data.workers_per_gpu,
# cfg.gpus will be ignored if distributed
len(cfg.gpu_ids),
dist=distributed,
seed=cfg.seed,
shuffler_sampler=cfg.data.shuffler_sampler,
nonshuffler_sampler=cfg.data.nonshuffler_sampler,
) for ds in dataset
]
# torch.distributed.init_process_group(backend='nccl')
if distributed:
find_unused_parameters = cfg.get('find_unused_parameters', False)
model = MMDistributedDataParallel(
model.cuda(),
device_ids=[torch.cuda.current_device()],
broadcast_buffers=False,
find_unused_parameters=find_unused_parameters)
else:
model = MMDataParallel(
model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)
# build runner
optimizer = build_optimizer(model, cfg.optimizer)
assert 'runner' in cfg
runner = build_runner(
cfg.runner,
default_args=dict(
model=model,
optimizer=optimizer,
work_dir=cfg.work_dir,
logger=logger,
meta=meta))
# an ugly workaround to make .log and .log.json filenames the same
runner.timestamp = timestamp
# fp16 setting TODO
fp16_cfg = cfg.get('fp16', None)
if fp16_cfg is not None:
optimizer_config = Fp16OptimizerHook(
**cfg.optimizer_config, **fp16_cfg, distributed=distributed)
elif distributed and 'type' not in cfg.optimizer_config:
optimizer_config = OptimizerHook(**cfg.optimizer_config)
else:
optimizer_config = cfg.optimizer_config
# register hooks
runner.register_training_hooks(cfg.lr_config, optimizer_config,
cfg.checkpoint_config, cfg.log_config,
cfg.get('momentum_config', None))
if distributed:
if isinstance(runner, EpochBasedRunner):
runner.register_hook(DistSamplerSeedHook())
rank, world_size = get_dist_info()
if cfg.resume_from:
if rank == 0:
print("-------------")
print("resume from " + cfg.resume_from)
print("-------------")
runner.resume(cfg.resume_from)
elif cfg.load_from:
if rank == 0:
print("-------------")
print("load from " + cfg.load_from)
print("-------------")
runner.load_checkpoint(cfg.load_from)
runner.run(data_loaders, cfg.workflow)
================================================
FILE: projects/occ_plugin/occupancy/apis/test.py
================================================
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
import os.path as osp
import pickle
import shutil
import tempfile
import time
import mmcv
import torch
import torch.distributed as dist
from mmcv.image import tensor2imgs
from mmcv.runner import get_dist_info
from mmdet.utils import get_root_logger
from mmdet.core import encode_mask_results
import numpy as np
import pycocotools.mask as mask_util
from fvcore.nn import FlopCountAnalysis, parameter_count_table
def custom_encode_mask_results(mask_results):
"""Encode bitmap mask to RLE code. Semantic Masks only
Args:
mask_results (list | tuple[list]): bitmap mask results.
In mask scoring rcnn, mask_results is a tuple of (segm_results,
segm_cls_score).
Returns:
list | tuple: RLE encoded mask.
"""
cls_segms = mask_results
num_classes = len(cls_segms)
encoded_mask_results = []
for i in range(len(cls_segms)):
encoded_mask_results.append(
mask_util.encode(
np.array(
cls_segms[i][:, :, np.newaxis], order='F',
dtype='uint8'))[0]) # encoded with RLE
return [encoded_mask_results]
def custom_single_gpu_test(model, data_loader, show=False, out_dir=None, show_score_thr=0.3):
model.eval()
iou_metric = 0
vpq_metric = 0
dataset = data_loader.dataset
prog_bar = mmcv.ProgressBar(len(dataset))
logger = get_root_logger()
logger.info(parameter_count_table(model))
for i, data in enumerate(data_loader):
with torch.no_grad():
result = model(return_loss=False, rescale=True, **data)
if 'hist_for_iou' in result.keys():
iou_metric += result['hist_for_iou']
vpq_metric += result['vpq']
prog_bar.update()
res = {
'hist_for_iou': iou_metric,
'vpq_len': len(dataset),
'vpq_metric': vpq_metric,
}
return res
def custom_multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False, show=False, out_dir=None):
"""Test model with multiple gpus.
This method tests the model with multiple gpus and collects the results
under two different modes: gpu and cpu. By setting 'gpu_collect=True'
it encodes results to gpu tensors and uses gpu communication for result
collection. In cpu mode it saves the results from different gpus to 'tmpdir'
and collects them with the rank 0 worker.
Args:
model (nn.Module): Model to be tested.
data_loader (nn.Dataloader): Pytorch data loader.
tmpdir (str): Path of directory to save the temporary results from
different gpus under cpu mode.
gpu_collect (bool): Option to use either gpu or cpu to collect results.
Returns:
list: The prediction results.
"""
model.eval()
# init predictions
iou_metric = []
vpq_metric = []
dataset = data_loader.dataset
rank, world_size = get_dist_info()
if rank == 0:
prog_bar = mmcv.ProgressBar(len(dataset))
time.sleep(2) # This line can prevent deadlock problem in some cases.
logger = get_root_logger()
logger.info(parameter_count_table(model))
for i, data in enumerate(data_loader):
with torch.no_grad():
result = model(return_loss=False, rescale=True, **data)
if 'hist_for_iou' in result.keys():
iou_metric.append(result['hist_for_iou'])
if 'vpq' in result.keys():
vpq_metric.append(result['vpq'])
batch_size = 1
if rank == 0:
for _ in range(batch_size * world_size):
prog_bar.update()
# collect lists from multi-GPUs
res = {}
if 'hist_for_iou' in result.keys():
iou_metric = [sum(iou_metric)]
iou_metric = collect_results_cpu(iou_metric, len(dataset), tmpdir)
res['hist_for_iou'] = iou_metric
if 'vpq' in result.keys():
res['vpq_len'] = len(dataset)
vpq_metric = [sum(vpq_metric)]
vpq_metric = collect_results_cpu(vpq_metric, len(dataset), tmpdir)
res['vpq_metric'] = vpq_metric
return res
def collect_results_cpu(result_part, size, tmpdir=None, type='list'):
rank, world_size = get_dist_info()
# create a tmp dir if it is not specified
if tmpdir is None:
MAX_LEN = 512
# 32 is whitespace
dir_tensor = torch.full((MAX_LEN,), 32, dtype=torch.uint8, device='cuda')
if rank == 0:
mmcv.mkdir_or_exist('.dist_test')
tmpdir = tempfile.mkdtemp(dir='.dist_test')
tmpdir = torch.tensor(
bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
dir_tensor[:len(tmpdir)] = tmpdir
dist.broadcast(dir_tensor, 0)
tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
else:
mmcv.mkdir_or_exist(tmpdir)
# dump the part result to the dir
mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl'))
dist.barrier()
# collect all parts
if rank == 0:
# load results of all parts from tmp dir
part_list = []
for i in range(world_size):
part_file = osp.join(tmpdir, f'part_{i}.pkl')
part_list.append(mmcv.load(part_file))
# sort the results
if type == 'list':
ordered_results = []
for res in part_list:
ordered_results.extend(list(res))
# the dataloader may pad some samples
ordered_results = ordered_results[:size]
else:
raise NotImplementedError
# remove tmp dir
shutil.rmtree(tmpdir)
dist.barrier()
if rank != 0:
return None
return ordered_results
================================================
FILE: projects/occ_plugin/occupancy/apis/train.py
================================================
from .mmdet_train import custom_train_detector
from mmseg.apis import train_segmentor
from mmdet.apis import train_detector
def custom_train_model(model,
dataset,
cfg,
distributed=False,
validate=False,
timestamp=None,
meta=None):
"""A function wrapper for launching model training according to cfg.
Because we need a different eval_hook in the runner; this should be
deprecated in the future.
"""
if cfg.model.type in ['EncoderDecoder3D']:
assert False
else:
custom_train_detector(
model,
dataset,
cfg,
distributed=distributed,
validate=validate,
timestamp=timestamp,
meta=meta)
def train_model(model,
dataset,
cfg,
distributed=False,
validate=False,
timestamp=None,
meta=None):
"""A function wrapper for launching model training according to cfg.
Because we need a different eval_hook in the runner; this should be
deprecated in the future.
"""
if cfg.model.type in ['EncoderDecoder3D']:
train_segmentor(
model,
dataset,
cfg,
distributed=distributed,
validate=validate,
timestamp=timestamp,
meta=meta)
else:
train_detector(
model,
dataset,
cfg,
distributed=distributed,
validate=validate,
timestamp=timestamp,
meta=meta)
================================================
FILE: projects/occ_plugin/occupancy/backbones/__init__.py
================================================
from .resnet3d import CustomResNet3D
from .pred_block import Predictor
================================================
FILE: projects/occ_plugin/occupancy/backbones/pred_block.py
================================================
# Developed by Junyi Ma
# Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications
# https://github.com/haomo-ai/Cam4DOcc
import copy
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmdet3d.models.builder import BACKBONES
from collections import OrderedDict
from mmcv.cnn import build_norm_layer
class Residual(nn.Module):
def __init__(
self,
in_channels,
out_channels,
kernel_size=(3,3,1),
dilation=1,
norm_cfg=None
):
super().__init__()
out_channels = out_channels or in_channels
# padding_size = ((kernel_size - 1) * dilation + 1) // 2
padding_size = [0,0,0]
if dilation!=0:
padding_size[0] = ((kernel_size[0] - 1) * dilation + 1) // 2
padding_size[1] = ((kernel_size[1] - 1) * dilation + 1) // 2
padding_size[2] = ((kernel_size[2] - 1) * dilation + 1) // 2
padding_size = tuple(padding_size)
conv = nn.Conv3d(in_channels, out_channels, kernel_size=kernel_size, bias=False, dilation=dilation, padding=padding_size)
self.layers = nn.Sequential(conv, build_norm_layer(norm_cfg, out_channels)[1], nn.LeakyReLU(inplace=True))
if out_channels == in_channels :
self.projection = None
else:
projection = OrderedDict()
projection.update(
{
'conv_skip_proj': nn.Conv3d(in_channels, out_channels, kernel_size=1, bias=False),
'bn_skip_proj': build_norm_layer(norm_cfg, out_channels)[1],
}
)
self.projection = nn.Sequential(projection)
def forward(self, x):
x_residual = self.layers(x)
if self.projection is not None:
x_projected = self.projection(x)
return x_residual + x_projected
return x_residual + x
@BACKBONES.register_module()
class Predictor(nn.Module):
def __init__(
self,
n_input_channels=None,
in_timesteps=None,
out_timesteps=None,
norm_cfg=None,
):
super(Predictor, self).__init__()
self.predictor = nn.ModuleList()
for nf in n_input_channels:
self.predictor.append(nn.Sequential(
Residual(nf * in_timesteps, nf * in_timesteps, norm_cfg=norm_cfg),
Residual(nf * in_timesteps, nf * in_timesteps, norm_cfg=norm_cfg),
Residual(nf * in_timesteps, nf * out_timesteps, norm_cfg=norm_cfg),
Residual(nf * out_timesteps, nf * out_timesteps, norm_cfg=norm_cfg),
Residual(nf * out_timesteps, nf * out_timesteps, norm_cfg=norm_cfg),
))
def forward(self, x):
assert len(x) == len(self.predictor), f'The number of input feature tensors ({len(x)}) must be the same as the number of STPredictor blocks {len(self.predictor)}.'
y = []
for i in range(len(x)):
b, c, _, _, _ = x[i].shape
y.append(self.predictor[i](x[i]))
return y
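# ------------------------------------------------------------------
# Hedged usage sketch (shapes are assumed, not taken from a config):
# each Predictor branch consumes a feature tensor whose temporal frames
# are stacked along the channel axis, (B, nf * in_timesteps, X, Y, Z),
# and emits (B, nf * out_timesteps, X, Y, Z) for the forecast horizon.
def _predictor_shape_demo():
    norm_cfg = dict(type='BN3d', requires_grad=True)
    net = Predictor(n_input_channels=[8], in_timesteps=2,
                    out_timesteps=3, norm_cfg=norm_cfg)
    x = [torch.randn(1, 8 * 2, 16, 16, 4)]
    y = net(x)
    return y[0].shape  # torch.Size([1, 24, 16, 16, 4])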
================================================
FILE: projects/occ_plugin/occupancy/backbones/resnet3d.py
================================================
import math
from functools import partial
from mmdet3d.models.builder import BACKBONES
from mmcv.cnn import build_conv_layer, build_norm_layer, build_upsample_layer
from mmcv.runner import BaseModule
import torch
import torch.nn as nn
import torch.nn.functional as F
import pdb
def get_inplanes():
return [64, 128, 256, 512]
def conv3x3x3(in_planes, out_planes, stride=1):
return nn.Conv3d(in_planes,
out_planes,
kernel_size=3,
stride=stride,
padding=1,
bias=False)
def conv1x1x1(in_planes, out_planes, stride=1):
return nn.Conv3d(in_planes,
out_planes,
kernel_size=1,
stride=stride,
bias=False)
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, in_planes, planes, stride=1, downsample=None, norm_cfg=None):
super().__init__()
self.conv1 = conv3x3x3(in_planes, planes, stride)
self.bn1 = build_norm_layer(norm_cfg, planes)[1]
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3x3(planes, planes)
self.bn2 = build_norm_layer(norm_cfg, planes)[1]
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class Bottleneck(nn.Module):
expansion = 4
def __init__(self, in_planes, planes, stride=1, downsample=None, norm_cfg=None):
super().__init__()
self.conv1 = conv1x1x1(in_planes, planes)
self.bn1 = build_norm_layer(norm_cfg, planes)[1]
self.conv2 = conv3x3x3(planes, planes, stride)
self.bn2 = build_norm_layer(norm_cfg, planes)[1]
self.conv3 = conv1x1x1(planes, planes * self.expansion)
self.bn3 = build_norm_layer(norm_cfg, planes * self.expansion)[1]
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
@BACKBONES.register_module()
class CustomResNet3D(BaseModule):
def __init__(self,
depth,
block_inplanes=[64, 128, 256, 512],
block_strides=[1, 2, 2, 2],
out_indices=(0, 1, 2, 3),
n_input_channels=3,
shortcut_type='B',
norm_cfg=dict(type='BN3d', requires_grad=True),
widen_factor=1.0):
super().__init__()
layer_metas = {
10: [1, 1, 1, 1],
18: [2, 2, 2, 2],
34: [3, 4, 6, 3],
50: [3, 4, 6, 3],
101: [3, 4, 23, 3],
}
if depth in [10, 18, 34]:
block = BasicBlock
else:
assert depth in [50, 101]
block = Bottleneck
layers = layer_metas[depth]
block_inplanes = [int(x * widen_factor) for x in block_inplanes]
self.in_planes = block_inplanes[0]
self.out_indices = out_indices
# replace the first several downsampling layers with the channel-squeeze layers
self.input_proj = nn.Sequential(
nn.Conv3d(n_input_channels, self.in_planes, kernel_size=(1, 1, 1),
stride=(1, 1, 1), bias=False),
build_norm_layer(norm_cfg, self.in_planes)[1],
nn.ReLU(inplace=True),
)
self.layers = nn.ModuleList()
for i in range(len(block_inplanes)):
self.layers.append(self._make_layer(block, block_inplanes[i], layers[i],
shortcut_type, block_strides[i], norm_cfg=norm_cfg))
for m in self.modules():
if isinstance(m, nn.Conv3d):
nn.init.kaiming_normal_(m.weight,
mode='fan_out',
nonlinearity='relu')
elif isinstance(m, nn.BatchNorm3d):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
def _downsample_basic_block(self, x, planes, stride):
out = F.avg_pool3d(x, kernel_size=1, stride=stride)
zero_pads = torch.zeros(out.size(0), planes - out.size(1), out.size(2),
out.size(3), out.size(4))
if isinstance(out.data, torch.cuda.FloatTensor):
zero_pads = zero_pads.cuda()
out = torch.cat([out.data, zero_pads], dim=1)
return out
def _make_layer(self, block, planes, blocks, shortcut_type, stride=1, norm_cfg=None):
downsample = None
if stride != 1 or self.in_planes != planes * block.expansion:
if shortcut_type == 'A':
downsample = partial(self._downsample_basic_block,
planes=planes * block.expansion,
stride=stride)
else:
downsample = nn.Sequential(
conv1x1x1(self.in_planes, planes * block.expansion, stride),
build_norm_layer(norm_cfg, planes * block.expansion)[1])
layers = []
layers.append(
block(in_planes=self.in_planes,
planes=planes,
stride=stride,
downsample=downsample,
norm_cfg=norm_cfg))
self.in_planes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.in_planes, planes, norm_cfg=norm_cfg))
return nn.Sequential(*layers)
def forward(self, x):
x = self.input_proj(x)
res = []
for index, layer in enumerate(self.layers):
x = layer(x)
if index in self.out_indices:
res.append(x)
return res
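# Note: `generate_model` below is a legacy helper kept from the original
# 3D-ResNet reference implementation; `ResNet` is not defined in this file,
# so the function is unused here.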
def generate_model(model_depth, **kwargs):
assert model_depth in [10, 18, 34, 50, 101, 152, 200]
if model_depth == 10:
model = ResNet(BasicBlock, [1, 1, 1, 1], get_inplanes(), **kwargs)
elif model_depth == 18:
model = ResNet(BasicBlock, [2, 2, 2, 2], get_inplanes(), **kwargs)
elif model_depth == 34:
model = ResNet(BasicBlock, [3, 4, 6, 3], get_inplanes(), **kwargs)
elif model_depth == 50:
model = ResNet(Bottleneck, [3, 4, 6, 3], get_inplanes(), **kwargs)
elif model_depth == 101:
model = ResNet(Bottleneck, [3, 4, 23, 3], get_inplanes(), **kwargs)
elif model_depth == 152:
model = ResNet(Bottleneck, [3, 8, 36, 3], get_inplanes(), **kwargs)
elif model_depth == 200:
model = ResNet(Bottleneck, [3, 24, 36, 3], get_inplanes(), **kwargs)
return model
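# ------------------------------------------------------------------
# Hedged usage sketch (feature sizes for illustration only): the first
# stage of CustomResNet3D keeps the input resolution and the remaining
# stages halve it, following block_strides=[1, 2, 2, 2].
def _custom_resnet3d_demo():
    net = CustomResNet3D(depth=18, n_input_channels=32,
                         block_inplanes=[32, 64, 128, 256])
    feats = net(torch.randn(1, 32, 64, 64, 8))
    return [f.shape for f in feats]
    # roughly: [1, 32, 64, 64, 8], [1, 64, 32, 32, 4],
    #          [1, 128, 16, 16, 2], [1, 256, 8, 8, 1]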
================================================
FILE: projects/occ_plugin/occupancy/dense_heads/__init__.py
================================================
from .occ_head import OccHead
from .flow_head import FlowHead
================================================
FILE: projects/occ_plugin/occupancy/dense_heads/flow_head.py
================================================
# Developed by Junyi Ma
# Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications
# https://github.com/haomo-ai/Cam4DOcc
import copy
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmdet.core import reduce_mean
from mmdet.models import HEADS
from mmcv.cnn import build_conv_layer, build_norm_layer
from .lovasz_softmax import lovasz_softmax
from projects.occ_plugin.utils.nusc_param import nusc_class_names
from projects.occ_plugin.utils.semkitti import Smooth_L1_loss
@HEADS.register_module()
class FlowHead(nn.Module):
def __init__(
self,
in_channels,
out_channel,
num_level=1,
num_img_level=1,
soft_weights=False,
loss_weight_cfg=None,
conv_cfg=dict(type='Conv3d', bias=False),
norm_cfg=dict(type='GN', num_groups=32, requires_grad=True),
fine_topk=20000,
point_cloud_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
final_occ_size=[256, 256, 20],
empty_idx=0,
visible_loss=False,
balance_cls_weight=True,
train_cfg=None,
test_cfg=None,
):
super(FlowHead, self).__init__()
if type(in_channels) is not list:
in_channels = [in_channels]
self.in_channels = in_channels
self.out_channel = out_channel
self.num_level = num_level
self.fine_topk = fine_topk
self.point_cloud_range = torch.tensor(np.array(point_cloud_range)).float()
self.final_occ_size = final_occ_size
self.visible_loss = visible_loss
# voxel-level prediction
self.occ_convs = nn.ModuleList()
for i in range(self.num_level):
mid_channel = self.in_channels[i]
occ_conv = nn.Sequential(
build_conv_layer(conv_cfg, in_channels=self.in_channels[i],
out_channels=mid_channel, kernel_size=3, stride=1, padding=1),
build_norm_layer(norm_cfg, mid_channel)[1],
nn.ReLU(inplace=True))
self.occ_convs.append(occ_conv)
self.occ_pred_conv = nn.Sequential(
build_conv_layer(conv_cfg, in_channels=mid_channel,
out_channels=mid_channel//2, kernel_size=1, stride=1, padding=0),
build_norm_layer(norm_cfg, mid_channel//2)[1],
nn.ReLU(inplace=True),)
self.last_conv = build_conv_layer(conv_cfg, in_channels=mid_channel//2,
out_channels=out_channel, kernel_size=1, stride=1, padding=0)
self.last_conv.bias = nn.parameter.Parameter(torch.tensor([0.0, 0.0, 0.0], requires_grad=True))
self.soft_weights = soft_weights
self.num_img_level = num_img_level
self.num_point_sampling_feat = self.num_level
if self.soft_weights:
soft_in_channel = mid_channel
self.voxel_soft_weights = nn.Sequential(
build_conv_layer(conv_cfg, in_channels=soft_in_channel,
out_channels=soft_in_channel//2, kernel_size=1, stride=1, padding=0),
build_norm_layer(norm_cfg, soft_in_channel//2)[1],
nn.ReLU(inplace=True),
build_conv_layer(conv_cfg, in_channels=soft_in_channel//2,
out_channels=self.num_point_sampling_feat, kernel_size=1, stride=1, padding=0))
self.class_names = nusc_class_names
self.empty_idx = empty_idx
def forward_coarse_voxel(self, voxel_feats):
output_occs = []
output = {}
for feats, occ_conv in zip(voxel_feats, self.occ_convs):
output_occs.append(occ_conv(feats))
if self.soft_weights:
voxel_soft_weights = self.voxel_soft_weights(output_occs[0])
voxel_soft_weights = torch.softmax(voxel_soft_weights, dim=1)
else:
voxel_soft_weights = torch.ones([output_occs[0].shape[0], self.num_point_sampling_feat, 1, 1, 1],).to(output_occs[0].device) / self.num_point_sampling_feat
out_voxel_feats = 0
_, _, H, W, D= output_occs[0].shape
for feats, weights in zip(output_occs, torch.unbind(voxel_soft_weights, dim=1)):
feats = F.interpolate(feats, size=[H, W, D], mode='trilinear', align_corners=False).contiguous()
out_voxel_feats += feats * weights.unsqueeze(1)
output['out_voxel_feats'] = [out_voxel_feats]
out_voxel = self.occ_pred_conv(out_voxel_feats)
out_voxel = self.last_conv(out_voxel)
output['occ'] = [out_voxel]
return output
def forward(self, voxel_feats, img_feats=None, transform=None, **kwargs):
assert type(voxel_feats) is list and len(voxel_feats) == self.num_level
# forward voxel
output = self.forward_coarse_voxel(voxel_feats)
res = {
'output_voxels': output['occ'],
}
return res
def loss_voxel(self, output_voxels, target_voxels, tag):
B, C, H, W, D = output_voxels.shape
tB, tC, tF, tH, tW, tD = target_voxels.shape
target_voxels = target_voxels.view(tB*tC, tF, tH, tW, tD)
assert torch.isnan(output_voxels).sum().item() == 0
output_voxels = output_voxels.permute(0,2,3,4,1)
target_voxels = target_voxels.permute(0,2,3,4,1)
loss_dict = {}
loss_dict['loss_flow_l1_{}'.format(tag)] = (0.5) * (0.1) * Smooth_L1_loss(output_voxels, target_voxels, ignore_index=255)
return loss_dict
def loss_point(self, fine_coord, fine_output, target_voxels, tag):
selected_gt = target_voxels[:, fine_coord[0,:], fine_coord[1,:], fine_coord[2,:]].long()[0]
assert torch.isnan(selected_gt).sum().item() == 0, torch.isnan(selected_gt).sum().item()
assert torch.isnan(fine_output).sum().item() == 0, torch.isnan(fine_output).sum().item()
loss_dict = {}
# ignore 255 = ignore noise. we keep the loss backward for the label=0 (free voxels)
loss_dict['loss_voxel_ce_{}'.format(tag)] = self.loss_voxel_ce_weight * CE_ssc_loss(fine_output, selected_gt, ignore_index=255)
loss_dict['loss_voxel_sem_scal_{}'.format(tag)] = self.loss_voxel_sem_scal_weight * sem_scal_loss(fine_output, selected_gt, ignore_index=255)
loss_dict['loss_voxel_geo_scal_{}'.format(tag)] = self.loss_voxel_geo_scal_weight * geo_scal_loss(fine_output, selected_gt, ignore_index=255, non_empty_idx=self.empty_idx)
loss_dict['loss_voxel_lovasz_{}'.format(tag)] = self.loss_voxel_lovasz_weight * lovasz_softmax(torch.softmax(fine_output, dim=1), selected_gt, ignore=255)
return loss_dict
def loss(self, output_voxels=None,
output_coords_fine=None, output_voxels_fine=None,
target_voxels=None, **kwargs):
loss_dict = {}
for index, output_voxel in enumerate(output_voxels):
loss_dict.update(self.loss_voxel(output_voxel, target_voxels, tag='c_{}'.format(index)))
return loss_dict
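# ------------------------------------------------------------------
# Hedged sketch (a standalone stand-in for illustration, not the
# Smooth_L1_loss imported above): voxels whose target flow equals the
# ignore value 255 are masked out before averaging the smooth-L1 error
# over the remaining cells, which is the role ignore_index=255 plays in
# loss_voxel.
def _masked_smooth_l1_demo(pred, target, ignore_index=255):
    """pred/target: tensors of shape (..., 3) holding per-voxel flow."""
    valid = (target != ignore_index).all(dim=-1)
    if valid.sum() == 0:
        return pred.sum() * 0.
    return F.smooth_l1_loss(pred[valid], target[valid])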
================================================
FILE: projects/occ_plugin/occupancy/dense_heads/lovasz_softmax.py
================================================
# -*- coding:utf-8 -*-
# author: Xinge
"""
Lovasz-Softmax and Jaccard hinge loss in PyTorch
Maxim Berman 2018 ESAT-PSI KU Leuven (MIT License)
"""
from __future__ import print_function, division
import torch
from torch.autograd import Variable
import torch.nn.functional as F
import numpy as np
try:
from itertools import ifilterfalse
except ImportError: # py3k
from itertools import filterfalse as ifilterfalse
def lovasz_grad(gt_sorted):
"""
Computes gradient of the Lovasz extension w.r.t sorted errors
See Alg. 1 in paper
"""
p = len(gt_sorted)
gts = gt_sorted.sum()
intersection = gts - gt_sorted.float().cumsum(0)
union = gts + (1 - gt_sorted).float().cumsum(0)
jaccard = 1. - intersection / union
if p > 1: # cover 1-pixel case
jaccard[1:p] = jaccard[1:p] - jaccard[0:-1]
return jaccard
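# A minimal illustrative sketch (the _lovasz_grad_demo helper is editorial
# and unused by the benchmark): for gt_sorted = [1, 1, 0] the cumulative
# Jaccard values are [0.5, 1.0, 1.0], so the marginal gains weight the
# first two sorted errors and zero out the last.
def _lovasz_grad_demo():
    grad = lovasz_grad(torch.tensor([1., 1., 0.]))
    assert torch.allclose(grad, torch.tensor([0.5, 0.5, 0.0]))
    return grad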
def iou_binary(preds, labels, EMPTY=1., ignore=None, per_image=True):
"""
IoU for foreground class
binary: 1 foreground, 0 background
"""
if not per_image:
preds, labels = (preds,), (labels,)
ious = []
for pred, label in zip(preds, labels):
intersection = ((label == 1) & (pred == 1)).sum()
union = ((label == 1) | ((pred == 1) & (label != ignore))).sum()
if not union:
iou = EMPTY
else:
iou = float(intersection) / float(union)
ious.append(iou)
iou = mean(ious) # mean across images if per_image
return 100 * iou
def iou(preds, labels, C, EMPTY=1., ignore=None, per_image=False):
"""
Array of IoU for each (non ignored) class
"""
if not per_image:
preds, labels = (preds,), (labels,)
ious = []
for pred, label in zip(preds, labels):
iou = []
for i in range(C):
if i != ignore: # The ignored label is sometimes among predicted classes (ENet - CityScapes)
intersection = ((label == i) & (pred == i)).sum()
union = ((label == i) | ((pred == i) & (label != ignore))).sum()
if not union:
iou.append(EMPTY)
else:
iou.append(float(intersection) / float(union))
ious.append(iou)
ious = [mean(iou) for iou in zip(*ious)] # mean across images if per_image
return 100 * np.array(ious)
# --------------------------- BINARY LOSSES ---------------------------
def lovasz_hinge(logits, labels, per_image=True, ignore=None):
"""
Binary Lovasz hinge loss
logits: [B, H, W] Variable, logits at each pixel (between -\infty and +\infty)
labels: [B, H, W] Tensor, binary ground truth masks (0 or 1)
per_image: compute the loss per image instead of per batch
ignore: void class id
"""
if per_image:
loss = mean(lovasz_hinge_flat(*flatten_binary_scores(log.unsqueeze(0), lab.unsqueeze(0), ignore))
for log, lab in zip(logits, labels))
else:
loss = lovasz_hinge_flat(*flatten_binary_scores(logits, labels, ignore))
return loss
def lovasz_hinge_flat(logits, labels):
"""
Binary Lovasz hinge loss
logits: [P] Variable, logits at each prediction (between -\infty and +\infty)
labels: [P] Tensor, binary ground truth labels (0 or 1)
ignore: label to ignore
"""
if len(labels) == 0:
# only void pixels, the gradients should be 0
return logits.sum() * 0.
signs = 2. * labels.float() - 1.
errors = (1. - logits * Variable(signs))
errors_sorted, perm = torch.sort(errors, dim=0, descending=True)
perm = perm.data
gt_sorted = labels[perm]
grad = lovasz_grad(gt_sorted)
loss = torch.dot(F.relu(errors_sorted), Variable(grad))
return loss
def flatten_binary_scores(scores, labels, ignore=None):
"""
Flattens predictions in the batch (binary case)
Remove labels equal to 'ignore'
"""
scores = scores.view(-1)
labels = labels.view(-1)
if ignore is None:
return scores, labels
valid = (labels != ignore)
vscores = scores[valid]
vlabels = labels[valid]
return vscores, vlabels
class StableBCELoss(torch.nn.modules.Module):
def __init__(self):
super(StableBCELoss, self).__init__()
def forward(self, input, target):
neg_abs = - input.abs()
loss = input.clamp(min=0) - input * target + (1 + neg_abs.exp()).log()
return loss.mean()
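# A minimal illustrative sketch (the _stable_bce_demo helper is editorial
# and unused by the benchmark): StableBCELoss evaluates binary cross-entropy
# on raw logits via max(x, 0) - x*t + log(1 + exp(-|x|)), so it stays finite
# for large-magnitude logits and matches the fused PyTorch op.
def _stable_bce_demo():
    logits = torch.tensor([100.0, -100.0, 0.5])
    target = torch.tensor([1.0, 0.0, 1.0])
    ours = StableBCELoss()(logits, target)
    ref = F.binary_cross_entropy_with_logits(logits, target)
    assert torch.allclose(ours, ref)
    return ours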
def binary_xloss(logits, labels, ignore=None):
"""
Binary Cross entropy loss
logits: [B, H, W] Variable, logits at each pixel (between -\infty and +\infty)
labels: [B, H, W] Tensor, binary ground truth masks (0 or 1)
ignore: void class id
"""
logits, labels = flatten_binary_scores(logits, labels, ignore)
loss = StableBCELoss()(logits, Variable(labels.float()))
return loss
# --------------------------- MULTICLASS LOSSES ---------------------------
def lovasz_softmax(probas, labels, classes='present', per_image=False, ignore=None):
"""
Multi-class Lovasz-Softmax loss
probas: [B, C, H, W] Variable, class probabilities at each prediction (between 0 and 1).
Interpreted as binary (sigmoid) output with outputs of size [B, H, W].
labels: [B, H, W] Tensor, ground truth labels (between 0 and C - 1)
classes: 'all' for all, 'present' for classes present in labels, or a list of classes to average.
per_image: compute the loss per image instead of per batch
ignore: void class labels
"""
if per_image:
loss = mean(lovasz_softmax_flat(*flatten_probas(prob.unsqueeze(0), lab.unsqueeze(0), ignore), classes=classes)
for prob, lab in zip(probas, labels))
else:
loss = lovasz_softmax_flat(*flatten_probas(probas, labels, ignore), classes=classes)
return loss
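# A minimal illustrative sketch (the _lovasz_softmax_demo helper is
# editorial and unused by the benchmark), assuming softmax probabilities
# over C classes and integer labels with 255 as the void id, the convention
# used by the occupancy heads in this repo.
def _lovasz_softmax_demo():
    probas = torch.softmax(torch.randn(2, 4, 8, 8), dim=1)  # [B, C, H, W]
    labels = torch.randint(0, 4, (2, 8, 8))                 # [B, H, W]
    labels[0, 0, 0] = 255                                   # void pixel
    return lovasz_softmax(probas, labels, ignore=255)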
def lovasz_softmax_flat(probas, labels, classes='present'):
"""
Multi-class Lovasz-Softmax loss
probas: [P, C] Variable, class probabilities at each prediction (between 0 and 1)
labels: [P] Tensor, ground truth labels (between 0 and C - 1)
classes: 'all' for all, 'present' for classes present in labels, or a list of classes to average.
"""
if probas.numel() == 0:
# only void pixels, the gradients should be 0
return probas * 0.
C = probas.size(1)
losses = []
class_to_sum = list(range(C)) if classes in ['all', 'present'] else classes
for c in class_to_sum:
fg = (labels == c).float() # foreground for class c
if (classes == 'present' and fg.sum() == 0):
continue
if C == 1:
if len(classes) > 1:
raise ValueError('Sigmoid output possible only with 1 class')
class_pred = probas[:, 0]
else:
class_pred = probas[:, c]
errors = (Variable(fg) - class_pred).abs()
errors_sorted, perm = torch.sort(errors, 0, descending=True)
perm = perm.data
fg_sorted = fg[perm]
losses.append(torch.dot(errors_sorted, Variable(lovasz_grad(fg_sorted))))
return mean(losses)
def flatten_probas(probas, labels, ignore=None):
"""
Flattens predictions in the batch
"""
if probas.dim() == 2:
if ignore is not None:
valid = (labels != ignore)
probas = probas[valid]
labels = labels[valid]
return probas, labels
elif probas.dim() == 3:
# assumes output of a sigmoid layer
B, H, W = probas.size()
probas = probas.view(B, 1, H, W)
elif probas.dim() == 5:
#3D segmentation
B, C, L, H, W = probas.size()
probas = probas.contiguous().view(B, C, L, H*W)
B, C, H, W = probas.size()
probas = probas.permute(0, 2, 3, 1).contiguous().view(-1, C) # B * H * W, C = P, C
labels = labels.view(-1)
if ignore is None:
return probas, labels
valid = (labels != ignore)
vprobas = probas[valid.nonzero().squeeze()]
vlabels = labels[valid]
return vprobas, vlabels
def xloss(logits, labels, ignore=None):
"""
Cross entropy loss
"""
return F.cross_entropy(logits, Variable(labels), ignore_index=255)
def jaccard_loss(probas, labels,ignore=None, smooth = 100, bk_class = None):
"""
Multi-class Jaccard loss
probas: [B, C, H, W] Variable, class probabilities at each prediction (between 0 and 1).
Interpreted as binary (sigmoid) output with outputs of size [B, H, W].
labels: [B, H, W] Tensor, ground truth labels (between 0 and C - 1)
classes: 'all' for all, 'present' for classes present in labels, or a list of classes to average.
per_image: compute the loss per image instead of per batch
ignore: void class labels
"""
vprobas, vlabels = flatten_probas(probas, labels, ignore)
true_1_hot = torch.eye(vprobas.shape[1])[vlabels]
if bk_class:
one_hot_assignment = torch.ones_like(vlabels)
one_hot_assignment[vlabels == bk_class] = 0
one_hot_assignment = one_hot_assignment.float().unsqueeze(1)
true_1_hot = true_1_hot*one_hot_assignment
true_1_hot = true_1_hot.to(vprobas.device)
intersection = torch.sum(vprobas * true_1_hot)
cardinality = torch.sum(vprobas + true_1_hot)
loss = ((intersection + smooth) / (cardinality - intersection + smooth)).mean()
return (1-loss)*smooth
def hinge_jaccard_loss(probas, labels,ignore=None, classes = 'present', hinge = 0.1, smooth =100):
"""
Multi-class Hinge Jaccard loss
probas: [B, C, H, W] Variable, class probabilities at each prediction (between 0 and 1).
Interpreted as binary (sigmoid) output with outputs of size [B, H, W].
labels: [B, H, W] Tensor, ground truth labels (between 0 and C - 1)
classes: 'all' for all, 'present' for classes present in labels, or a list of classes to average.
ignore: void class labels
"""
vprobas, vlabels = flatten_probas(probas, labels, ignore)
C = vprobas.size(1)
losses = []
class_to_sum = list(range(C)) if classes in ['all', 'present'] else classes
for c in class_to_sum:
if c in vlabels:
c_sample_ind = vlabels == c
cprobas = vprobas[c_sample_ind,:]
non_c_ind =np.array([a for a in class_to_sum if a != c])
class_pred = cprobas[:,c]
max_non_class_pred = torch.max(cprobas[:,non_c_ind],dim = 1)[0]
TP = torch.sum(torch.clamp(class_pred - max_non_class_pred, max = hinge)+1.) + smooth
FN = torch.sum(torch.clamp(max_non_class_pred - class_pred, min = -hinge)+hinge)
if (~c_sample_ind).sum() == 0:
FP = 0
else:
nonc_probas = vprobas[~c_sample_ind,:]
class_pred = nonc_probas[:,c]
max_non_class_pred = torch.max(nonc_probas[:,non_c_ind],dim = 1)[0]
FP = torch.sum(torch.clamp(class_pred - max_non_class_pred, max = hinge)+1.)
losses.append(1 - TP/(TP+FP+FN))
if len(losses) == 0: return 0
return mean(losses)
# --------------------------- HELPER FUNCTIONS ---------------------------
def isnan(x):
return x != x
def mean(l, ignore_nan=False, empty=0):
"""
nanmean compatible with generators.
"""
l = iter(l)
if ignore_nan:
l = ifilterfalse(isnan, l)
try:
n = 1
acc = next(l)
except StopIteration:
if empty == 'raise':
raise ValueError('Empty mean')
return empty
for n, v in enumerate(l, 2):
acc += v
if n == 1:
return acc
return acc / n
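# A minimal illustrative sketch (the _mean_demo helper is editorial and
# unused by the benchmark): mean() consumes generators lazily and can skip
# NaNs, which is why the per-image loss paths above pass generator
# expressions directly.
def _mean_demo():
    vals = (v for v in [1.0, float('nan'), 3.0])
    return mean(vals, ignore_nan=True)  # -> 2.0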
================================================
FILE: projects/occ_plugin/occupancy/dense_heads/occ_head.py
================================================
# Developed by Junyi Ma based on the codebase of OpenOccupancy
# Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications
# https://github.com/haomo-ai/Cam4DOcc
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmdet.core import reduce_mean
from mmdet.models import HEADS
from mmcv.cnn import build_conv_layer, build_norm_layer
from .lovasz_softmax import lovasz_softmax
from projects.occ_plugin.utils.nusc_param import nusc_class_names
from projects.occ_plugin.utils.semkitti import geo_scal_loss, sem_scal_loss, CE_ssc_loss
@HEADS.register_module()
class OccHead(nn.Module):
def __init__(
self,
in_channels,
out_channel,
num_level=1,
num_img_level=1,
soft_weights=False,
loss_weight_cfg=None,
conv_cfg=dict(type='Conv3d', bias=False),
norm_cfg=dict(type='GN', num_groups=32, requires_grad=True),
fine_topk=20000,
point_cloud_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
final_occ_size=[256, 256, 20],
empty_idx=0,
visible_loss=False,
balance_cls_weight=True,
train_cfg=None,
test_cfg=None,
):
super(OccHead, self).__init__()
if type(in_channels) is not list:
in_channels = [in_channels]
self.in_channels = in_channels
self.out_channel = out_channel
self.num_level = num_level
self.fine_topk = fine_topk
self.point_cloud_range = torch.tensor(np.array(point_cloud_range)).float()
self.final_occ_size = final_occ_size
self.visible_loss = visible_loss
if loss_weight_cfg is None:
self.loss_weight_cfg = {
"loss_voxel_ce_weight": 1.0,
"loss_voxel_sem_scal_weight": 1.0,
"loss_voxel_geo_scal_weight": 1.0,
"loss_voxel_lovasz_weight": 1.0,
}
else:
self.loss_weight_cfg = loss_weight_cfg
# voxel losses
self.loss_voxel_ce_weight = self.loss_weight_cfg.get('loss_voxel_ce_weight', 1.0)
self.loss_voxel_sem_scal_weight = self.loss_weight_cfg.get('loss_voxel_sem_scal_weight', 1.0)
self.loss_voxel_geo_scal_weight = self.loss_weight_cfg.get('loss_voxel_geo_scal_weight', 1.0)
self.loss_voxel_lovasz_weight = self.loss_weight_cfg.get('loss_voxel_lovasz_weight', 1.0)
# voxel-level prediction
self.occ_convs = nn.ModuleList()
for i in range(self.num_level):
mid_channel = self.in_channels[i]
occ_conv = nn.Sequential(
build_conv_layer(conv_cfg, in_channels=self.in_channels[i],
out_channels=mid_channel, kernel_size=3, stride=1, padding=1),
build_norm_layer(norm_cfg, mid_channel)[1],
nn.ReLU(inplace=True))
self.occ_convs.append(occ_conv)
self.occ_pred_conv = nn.Sequential(
build_conv_layer(conv_cfg, in_channels=mid_channel,
out_channels=mid_channel//2, kernel_size=1, stride=1, padding=0),
build_norm_layer(norm_cfg, mid_channel//2)[1],
nn.ReLU(inplace=True),
build_conv_layer(conv_cfg, in_channels=mid_channel//2,
out_channels=out_channel, kernel_size=1, stride=1, padding=0))
self.soft_weights = soft_weights
self.num_img_level = num_img_level
self.num_point_sampling_feat = self.num_level
if self.soft_weights:
soft_in_channel = mid_channel
self.voxel_soft_weights = nn.Sequential(
build_conv_layer(conv_cfg, in_channels=soft_in_channel,
out_channels=soft_in_channel//2, kernel_size=1, stride=1, padding=0),
build_norm_layer(norm_cfg, soft_in_channel//2)[1],
nn.ReLU(inplace=True),
build_conv_layer(conv_cfg, in_channels=soft_in_channel//2,
out_channels=self.num_point_sampling_feat, kernel_size=1, stride=1, padding=0)) # num_point_sampling_feat=4
if balance_cls_weight:
# out_channel
self.class_weights = np.ones((out_channel,))
self.class_weights[1:] = 5
self.class_weights = torch.from_numpy(self.class_weights)
else:
self.class_weights = np.ones((out_channel,))
self.class_names = nusc_class_names
self.empty_idx = empty_idx
def forward_coarse_voxel(self, voxel_feats):
output_occs = []
output = {}
for feats, occ_conv in zip(voxel_feats, self.occ_convs):
output_occs.append(occ_conv(feats))
if self.soft_weights:
voxel_soft_weights = self.voxel_soft_weights(output_occs[0])
voxel_soft_weights = torch.softmax(voxel_soft_weights, dim=1)
else:
voxel_soft_weights = torch.ones([output_occs[0].shape[0], self.num_point_sampling_feat, 1, 1, 1],).to(output_occs[0].device) / self.num_point_sampling_feat
out_voxel_feats = 0
_, _, H, W, D= output_occs[0].shape
for feats, weights in zip(output_occs, torch.unbind(voxel_soft_weights, dim=1)):
feats = F.interpolate(feats, size=[H, W, D], mode='trilinear', align_corners=False).contiguous()
out_voxel_feats += feats * weights.unsqueeze(1)
output['out_voxel_feats'] = [out_voxel_feats]
out_voxel = self.occ_pred_conv(out_voxel_feats)
output['occ'] = [out_voxel]
return output
def forward(self, voxel_feats, img_feats=None, transform=None, **kwargs):
assert type(voxel_feats) is list and len(voxel_feats) == self.num_level
# forward voxel
output = self.forward_coarse_voxel(voxel_feats)
res = {
'output_voxels': output['occ'],
}
return res
def loss_voxel(self, output_voxels, target_voxels, tag):
B, C, H, W, D = output_voxels.shape
tB, tC, tH, tW, tD = target_voxels.shape
target_voxels = target_voxels.view(tB*tC, tH, tW, tD)
ratio = target_voxels.shape[2] // H
if ratio != 1:
target_voxels = target_voxels.reshape(B, H, ratio, W, ratio, D, ratio).permute(0,1,3,5,2,4,6).reshape(B, H, W, D, ratio**3)
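# The ground truth is stored at a finer resolution than the prediction, so
# each output voxel covers ratio**3 GT subvoxels. The trick below gives
# every free (label 0) subvoxel inside a non-empty voxel a unique negative
# id, so the majority vote by torch.mode cannot be won by the free label;
# a negative mode is then mapped to the ignore id 255.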
empty_mask = target_voxels.sum(-1) == self.empty_idx
target_voxels = target_voxels.to(torch.int64)
occ_space = target_voxels[~empty_mask]
occ_space[occ_space==0] = -torch.arange(len(occ_space[occ_space==0])).to(occ_space.device) - 1
target_voxels[~empty_mask] = occ_space
target_voxels = torch.mode(target_voxels, dim=-1)[0]
target_voxels[target_voxels<0] = 255
target_voxels = target_voxels.long()
assert torch.isnan(output_voxels).sum().item() == 0
assert torch.isnan(target_voxels).sum().item() == 0
loss_dict = {}
loss_dict['loss_voxel_ce_{}'.format(tag)] = (0.5) * CE_ssc_loss(output_voxels, target_voxels, self.class_weights.type_as(output_voxels), ignore_index=255)
return loss_dict
def loss_point(self, fine_coord, fine_output, target_voxels, tag):
selected_gt = target_voxels[:, fine_coord[0,:], fine_coord[1,:], fine_coord[2,:]].long()[0]
assert torch.isnan(selected_gt).sum().item() == 0, torch.isnan(selected_gt).sum().item()
assert torch.isnan(fine_output).sum().item() == 0, torch.isnan(fine_output).sum().item()
loss_dict = {}
# ignore 255 = ignore noise. we keep the loss backward for the label=0 (free voxels)
loss_dict['loss_voxel_ce_{}'.format(tag)] = self.loss_voxel_ce_weight * CE_ssc_loss(fine_output, selected_gt, ignore_index=255)
loss_dict['loss_voxel_sem_scal_{}'.format(tag)] = self.loss_voxel_sem_scal_weight * sem_scal_loss(fine_output, selected_gt, ignore_index=255)
loss_dict['loss_voxel_geo_scal_{}'.format(tag)] = self.loss_voxel_geo_scal_weight * geo_scal_loss(fine_output, selected_gt, ignore_index=255, non_empty_idx=self.empty_idx)
loss_dict['loss_voxel_lovasz_{}'.format(tag)] = self.loss_voxel_lovasz_weight * lovasz_softmax(torch.softmax(fine_output, dim=1), selected_gt, ignore=255)
return loss_dict
def loss(self, output_voxels=None,
output_coords_fine=None, output_voxels_fine=None,
target_voxels=None, **kwargs):
loss_dict = {}
for index, output_voxel in enumerate(output_voxels):
loss_dict.update(self.loss_voxel(output_voxel, target_voxels, tag='c_{}'.format(index)))
return loss_dict
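# A minimal illustrative sketch (the _occ_head_demo helper is editorial and
# not part of the original API), assuming the default conv/norm configs:
# forward() expects a list of num_level voxel feature maps [B, C_in, H, W, D].
def _occ_head_demo():
    head = OccHead(in_channels=[64], out_channel=2, num_level=1)
    feats = [torch.randn(1, 64, 32, 32, 4)]
    out = head(feats)
    return out['output_voxels'][0].shape  # torch.Size([1, 2, 32, 32, 4])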
================================================
FILE: projects/occ_plugin/occupancy/dense_heads/utils.py
================================================
# borrowed from https://github.com/GuoPingPan/RPVNet/blob/main/core/models/utils/utils.py
import time
import numpy as np
import torch
from torch.nn.functional import grid_sample
import torchsparse.nn.functional as F
from torchsparse import PointTensor, SparseTensor
from torchsparse.nn.utils import get_kernel_offsets
__all__ = ['initial_voxelize', 'point_to_voxel', 'voxel_to_point',
'range_to_point','point_to_range']
def initial_voxelize(z: PointTensor, after_res) -> SparseTensor:
new_float_coord = torch.cat(
[z.C[:, :3] / after_res, z.C[:, -1].view(-1, 1)], 1)
pc_hash = F.sphash(torch.round(new_float_coord).int())
sparse_hash = torch.unique(pc_hash)
idx_query = F.sphashquery(pc_hash, sparse_hash)
counts = F.spcount(idx_query.int(), len(sparse_hash))
inserted_coords = F.spvoxelize(torch.round(new_float_coord), idx_query,counts)
inserted_coords = torch.round(inserted_coords).int()
inserted_feat = F.spvoxelize(z.F, idx_query, counts)
new_tensor = SparseTensor(inserted_feat, inserted_coords, 1)
new_tensor.cmaps.setdefault((1,1,1), new_tensor.coords)
z.additional_features['idx_query'][(1,1,1)] = idx_query
z.additional_features['counts'][(1,1,1)] = counts
return new_tensor.to(z.F.device)
def point_to_voxel(x: SparseTensor, z: PointTensor) -> SparseTensor:
if z.additional_features is None or z.additional_features['idx_query'] is None \
or z.additional_features['idx_query'].get(x.s) is None:
pc_hash = F.sphash(
torch.cat([
torch.round(z.C[:, :3] / x.s[0]).int(),
z.C[:, -1].int().view(-1, 1)
], 1))
sparse_hash = F.sphash(x.C)
idx_query = F.sphashquery(pc_hash, sparse_hash)
counts = F.spcount(idx_query.int(), x.C.shape[0])
else:
idx_query = z.additional_features['idx_query'][x.s]
counts = z.additional_features['counts'][x.s]
inserted_feat = F.spvoxelize(z.F, idx_query, counts)
new_tensor = SparseTensor(inserted_feat, x.C, x.s)
new_tensor.cmaps = x.cmaps
new_tensor.kmaps = x.kmaps
return new_tensor
def voxel_to_point(x: SparseTensor, z: PointTensor, nearest=False) -> torch.Tensor:
if z.idx_query is None or z.weights is None or z.idx_query.get(x.s) is None \
or z.weights.get(x.s) is None:
off = get_kernel_offsets(2, x.s, 1, device=z.F.device)
old_hash = F.sphash(
torch.cat([
torch.round(z.C[:, :3] / x.s[0]).int(),
z.C[:, -1].int().view(-1, 1)
], 1), off)
pc_hash = F.sphash(x.C.to(z.F.device))
idx_query = F.sphashquery(old_hash, pc_hash)
weights = F.calc_ti_weights(z.C, idx_query,
scale=x.s[0]).transpose(0, 1).contiguous()
idx_query = idx_query.transpose(0, 1).contiguous()
if nearest:
weights[:, 1:] = 0.
idx_query[:, 1:] = -1
new_feat = F.spdevoxelize(x.F, idx_query, weights)
if x.s == (1,1,1):
z.idx_query[x.s] = idx_query
z.weights[x.s] = weights
else:
new_feat = F.spdevoxelize(x.F, z.idx_query.get(x.s), z.weights.get(x.s))
return new_feat
def range_to_point(x,px,py):
r2p = []
for batch,(p_x,p_y) in enumerate(zip(px,py)):
pypx = torch.stack([p_x,p_y],dim=2).to(px[0].device)
resampled = grid_sample(x[batch].unsqueeze(0),pypx.unsqueeze(0))
r2p.append(resampled.squeeze().permute(1,0))
return torch.concat(r2p,dim=0)
def point_to_range(range_shape,pF,px,py):
H, W = range_shape
cnt = 0
r = []
# t1 = time.time()
for batch,(p_x,p_y) in enumerate(zip(px,py)):
image = torch.zeros(size=(H,W,pF.shape[1])).to(px[0].device)
image_cumsum = torch.zeros(size=(H,W,pF.shape[1])) + 1e-5
p_x = torch.floor((p_x/2. + 0.5) * W).long()
p_y = torch.floor((p_y/2. + 0.5) * H).long()
''' v1: directly assign '''
# image[p_y,p_x] = pF[cnt:cnt+p_x.shape[1]]
''' v2: use average '''
image[p_y,p_x] += pF[cnt:cnt+p_x.shape[1]]
image_cumsum[p_y,p_x] += torch.ones(pF.shape[1])
image = image/image_cumsum.to(px[0].device)
r.append(image.permute(2,0,1))
cnt += p_x.shape[1]
return torch.stack(r,dim=0).to(px[0].device)
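# Note: range_to_point/point_to_range expect px, py already normalized to
# [-1, 1] (the grid_sample convention); point_to_range inverts this with
# floor((p/2 + 0.5) * size) and averages features that land on the same
# range-image pixel via the image_cumsum accumulator.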
================================================
FILE: projects/occ_plugin/occupancy/detectors/__init__.py
================================================
from .ocfnet import OCFNet
================================================
FILE: projects/occ_plugin/occupancy/detectors/bevdepth.py
================================================
# Copyright (c) Phigent Robotics. All rights reserved.
import math
import torch
from mmcv.runner import force_fp32
import torch.nn.functional as F
from mmdet.models import DETECTORS
from mmdet3d.models import builder
from torch.utils.checkpoint import checkpoint
from mmdet3d.models.detectors import CenterPoint
import pdb
@DETECTORS.register_module()
class BEVDet(CenterPoint):
def __init__(self, img_view_transformer=None,
img_bev_encoder_backbone=None,
img_bev_encoder_neck=None, **kwargs):
super(BEVDet, self).__init__(**kwargs)
if img_view_transformer is not None:
self.img_view_transformer = builder.build_neck(img_view_transformer)
else:
self.img_view_transformer = None
if img_bev_encoder_backbone is not None:
self.img_bev_encoder_backbone = builder.build_backbone(img_bev_encoder_backbone)
else:
self.img_bev_encoder_backbone = torch.nn.Identity()
if img_bev_encoder_neck is not None:
self.img_bev_encoder_neck = builder.build_neck(img_bev_encoder_neck)
else:
self.img_bev_encoder_neck = torch.nn.Identity()
def image_encoder(self, img):
imgs = img
B, N, C, imH, imW = imgs.shape
imgs = imgs.view(B * N, C, imH, imW)
x = self.img_backbone(imgs)
if self.with_img_neck:
x = self.img_neck(x)
if type(x) in [list, tuple]:
x = x[0]
_, output_dim, output_H, output_W = x.shape
x = x.view(B, N, output_dim, output_H, output_W)
return x
@force_fp32()
def bev_encoder(self, x):
x = self.img_bev_encoder_backbone(x)
x = self.img_bev_encoder_neck(x)
if type(x) in [list, tuple]:
x = x[0]
return x
def extract_img_feat(self, img, img_metas):
"""Extract features of images."""
x = self.image_encoder(img[0])
x = self.img_view_transformer([x] + img[1:7])
x = self.bev_encoder(x)
return [x]
def extract_feat(self, points, img, img_metas):
"""Extract features from images and points."""
img_feats = self.extract_img_feat(img, img_metas)
pts_feats = None
return (img_feats, pts_feats)
def forward_train(self,
points=None,
img_metas=None,
gt_bboxes_3d=None,
gt_labels_3d=None,
gt_labels=None,
gt_bboxes=None,
img_inputs=None,
proposals=None,
gt_bboxes_ignore=None):
"""Forward training function.
Args:
points (list[torch.Tensor], optional): Points of each sample.
Defaults to None.
img_metas (list[dict], optional): Meta information of each sample.
Defaults to None.
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional):
Ground truth 3D boxes. Defaults to None.
gt_labels_3d (list[torch.Tensor], optional): Ground truth labels
of 3D boxes. Defaults to None.
gt_labels (list[torch.Tensor], optional): Ground truth labels
of 2D boxes in images. Defaults to None.
gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in
images. Defaults to None.
img (torch.Tensor optional): Images of each sample with shape
(N, C, H, W). Defaults to None.
proposals ([list[torch.Tensor], optional): Predicted proposals
used for training Fast RCNN. Defaults to None.
gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth
2D boxes in images to be ignored. Defaults to None.
Returns:
dict: Losses of different branches.
"""
img_feats, pts_feats = self.extract_feat(
points, img=img_inputs, img_metas=img_metas)
assert self.with_pts_bbox
losses = dict()
losses_pts = self.forward_pts_train(img_feats, gt_bboxes_3d,
gt_labels_3d, img_metas,
gt_bboxes_ignore)
losses.update(losses_pts)
return losses
def forward_test(self, points=None, img_metas=None, img_inputs=None, **kwargs):
"""
Args:
points (list[torch.Tensor]): the outer list indicates test-time
augmentations and inner torch.Tensor should have a shape NxC,
which contains all points in the batch.
img_metas (list[list[dict]]): the outer list indicates test-time
augs (multiscale, flip, etc.) and the inner list indicates
images in a batch
img (list[torch.Tensor], optional): the outer
list indicates test-time augmentations and inner
torch.Tensor should have a shape NxCxHxW, which contains
all images in the batch. Defaults to None.
"""
for var, name in [(img_inputs, 'img_inputs'), (img_metas, 'img_metas')]:
if not isinstance(var, list):
raise TypeError('{} must be a list, but got {}'.format(
name, type(var)))
num_augs = len(img_inputs)
if num_augs != len(img_metas):
raise ValueError(
'num of augmentations ({}) != num of image meta ({})'.format(
len(img_inputs), len(img_metas)))
if not isinstance(img_inputs[0][0],list):
img_inputs = [img_inputs] if img_inputs is None else img_inputs
points = [points] if points is None else points
return self.simple_test(points[0], img_metas[0], img_inputs[0], **kwargs)
else:
return self.aug_test(None, img_metas[0], img_inputs[0], **kwargs)
def aug_test(self, points, img_metas, img=None, rescale=False):
"""Test function without augmentaiton."""
combine_type = self.test_cfg.get('combine_type','output')
if combine_type=='output':
return self.aug_test_combine_output(points, img_metas, img, rescale)
elif combine_type=='feature':
return self.aug_test_combine_feature(points, img_metas, img, rescale)
else:
assert False
def simple_test(self, points, img_metas, img=None, rescale=False):
"""Test function without augmentaiton."""
img_feats, _ = self.extract_feat(points, img=img, img_metas=img_metas)
bbox_list = [dict() for _ in range(len(img_metas))]
bbox_pts = self.simple_test_pts(img_feats, img_metas, rescale=rescale)
for result_dict, pts_bbox in zip(bbox_list, bbox_pts):
result_dict['pts_bbox'] = pts_bbox
return bbox_list
def forward_dummy(self, points=None, img_metas=None, img_inputs=None, **kwargs):
img_feats, _ = self.extract_feat(points, img=img_inputs, img_metas=img_metas)
from mmdet3d.core.bbox.structures.box_3d_mode import LiDARInstance3DBoxes
img_metas=[dict(box_type_3d=LiDARInstance3DBoxes)]
bbox_list = [dict() for _ in range(1)]
assert self.with_pts_bbox
bbox_pts = self.simple_test_pts(
img_feats, img_metas, rescale=False)
for result_dict, pts_bbox in zip(bbox_list, bbox_pts):
result_dict['pts_bbox'] = pts_bbox
return bbox_list
@DETECTORS.register_module()
class BEVDet4D(BEVDet):
def __init__(self, pre_process=None,
align_after_view_transfromation=False,
detach=True,
detach_pre_process=False, **kwargs):
super(BEVDet4D, self).__init__(**kwargs)
self.pre_process = pre_process is not None
if self.pre_process:
self.pre_process_net = builder.build_backbone(pre_process)
self.align_after_view_transfromation = align_after_view_transfromation
self.detach = detach
self.detach_pre_process = detach_pre_process
@force_fp32()
def shift_feature(self, input, trans, rots):
n, c, h, w = input.shape
_, v, _ = trans[0].shape
# generate grid
xs = torch.linspace(0, w - 1, w, dtype=input.dtype,
device=input.device).view(1, w).expand(h, w)
ys = torch.linspace(0, h - 1, h, dtype=input.dtype,
device=input.device).view(h, 1).expand(h, w)
grid = torch.stack((xs, ys, torch.ones_like(xs)), -1)
grid = grid.view(1, h, w, 3).expand(n,h,w,3).view(n, h, w, 3, 1)
# get transformation from current lidar frame to adjacent lidar frame
# transformation from current camera frame to current lidar frame
c02l0 = torch.zeros((n, v, 4, 4), dtype=grid.dtype).to(grid)
c02l0[:, :, :3, :3] = rots[0]
c02l0[:, :, :3, 3] = trans[0]
c02l0[:, :, 3, 3] = 1
# transformation from adjacent camera frame to current lidar frame
c12l0 = torch.zeros((n, v, 4, 4), dtype=grid.dtype).to(grid)
c12l0[:, :, :3, :3] = rots[1]
c12l0[:, :, :3, 3] = trans[1]
c12l0[:, :, 3, 3] = 1
# transformation from current lidar frame to adjacent lidar frame
l02l1 = c02l0.matmul(torch.inverse(c12l0))[:, 0, :, :].view(n, 1, 1, 4, 4)
'''
c02l0 * inv(c12l0)
= c02l0 * inv(l12l0 * c12l1)
= c02l0 * inv(c12l1) * inv(l12l0)
= l02l1 # c02l0==c12l1
'''
l02l1 = l02l1[:, :, :, [True, True, False, True], :][:, :, :, :,
[True, True, False, True]]
feat2bev = torch.zeros((3, 3), dtype=grid.dtype).to(grid)
feat2bev[0, 0] = self.img_view_transformer.dx[0]
feat2bev[1, 1] = self.img_view_transformer.dx[1]
feat2bev[0, 2] = self.img_view_transformer.bx[0] - \
self.img_view_transformer.dx[0] / 2.
feat2bev[1, 2] = self.img_view_transformer.bx[1] - \
self.img_view_transformer.dx[1] / 2.
feat2bev[2, 2] = 1
feat2bev = feat2bev.view(1, 3, 3)
tf = torch.inverse(feat2bev).matmul(l02l1).matmul(feat2bev)
# transform and normalize
grid = tf.matmul(grid)
normalize_factor = torch.tensor([w - 1.0, h - 1.0], dtype=input.dtype,
device=input.device)
grid = grid[:, :, :, :2, 0] / normalize_factor.view(1, 1, 1,
2) * 2.0 - 1.0
output = F.grid_sample(input, grid.to(input.dtype), align_corners=True)
return output
def prepare_bev_feat(self, img, rot, tran, intrin, post_rot, post_tran, bda):
x = self.image_encoder(img)
bev_feat = self.img_view_transformer([x, rot, tran, intrin,
post_rot, post_tran, bda])
if self.pre_process:
bev_feat = self.pre_process_net(bev_feat)[0]
return bev_feat
def extract_img_feat(self, img, img_metas):
"""Extract features of images."""
inputs = img
B, N, _, H, W = inputs[0].shape
N = N//2
imgs = inputs[0].view(B,N,2,3,H,W)
imgs = torch.split(imgs,1,2)
imgs = [t.squeeze(2) for t in imgs]
rots, trans, intrins, post_rots, post_trans, bda = inputs[1:7]
extra = [rots.view(B,2,N,3,3),
trans.view(B,2,N,3),
intrins.view(B,2,N,3,3),
post_rots.view(B,2,N,3,3),
post_trans.view(B,2,N,3)]
extra = [torch.split(t, 1, 1) for t in extra]
extra = [[p.squeeze(1) for p in t] for t in extra]
rots, trans, intrins, post_rots, post_trans = extra
bev_feat_list = []
key_frame=True # back propagation for key frame only
for img, rot, tran, intrin, post_rot, \
post_tran in zip(imgs, rots, trans, intrins, post_rots, post_trans):
if self.align_after_view_transfromation:
rot, tran = rots[0], trans[0]
inputs_curr = (img, rot, tran, intrin, post_rot, post_tran, bda)
if not key_frame and self.detach:
with torch.no_grad():
bev_feat = self.prepare_bev_feat(*inputs_curr)
else:
bev_feat = self.prepare_bev_feat(*inputs_curr)
bev_feat_list.append(bev_feat)
key_frame = False
if self.align_after_view_transfromation:
bev_feat_list[1] = self.shift_feature(bev_feat_list[1],
trans, rots)
bev_feat = torch.cat(bev_feat_list, dim=1)
x = self.bev_encoder(bev_feat)
return [x]
class BEVDepth_Base(object):
def extract_feat(self, points, img, img_metas):
"""Extract features from images and points."""
img_feats, depth = self.extract_img_feat(img, img_metas)
pts_feats = None
return (img_feats, pts_feats, depth)
def simple_test(self, points, img_metas, img=None, rescale=False):
"""Test function without augmentaiton."""
img_feats, _, _ = self.extract_feat(points, img=img, img_metas=img_metas)
bbox_list = [dict() for _ in range(len(img_metas))]
bbox_pts = self.simple_test_pts(img_feats, img_metas, rescale=rescale)
for result_dict, pts_bbox in zip(bbox_list, bbox_pts):
result_dict['pts_bbox'] = pts_bbox
return bbox_list
def forward_train(self,
points=None,
img_metas=None,
gt_bboxes_3d=None,
gt_labels_3d=None,
gt_labels=None,
gt_bboxes=None,
img_inputs=None,
proposals=None,
gt_bboxes_ignore=None):
"""Forward training function.
Args:
points (list[torch.Tensor], optional): Points of each sample.
Defaults to None.
img_metas (list[dict], optional): Meta information of each sample.
Defaults to None.
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional):
Ground truth 3D boxes. Defaults to None.
gt_labels_3d (list[torch.Tensor], optional): Ground truth labels
of 3D boxes. Defaults to None.
gt_labels (list[torch.Tensor], optional): Ground truth labels
of 2D boxes in images. Defaults to None.
gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in
images. Defaults to None.
img (torch.Tensor optional): Images of each sample with shape
(N, C, H, W). Defaults to None.
proposals ([list[torch.Tensor], optional): Predicted proposals
used for training Fast RCNN. Defaults to None.
gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth
2D boxes in images to be ignored. Defaults to None.
Returns:
dict: Losses of different branches.
"""
img_feats, pts_feats, depth = self.extract_feat(
points, img=img_inputs, img_metas=img_metas)
assert self.with_pts_bbox
# assert len(img_inputs) == 8
depth_gt = img_inputs[7]
loss_depth = self.img_view_transformer.get_depth_loss(depth_gt, depth)
losses = dict(loss_depth=loss_depth)
losses_pts = self.forward_pts_train(img_feats, gt_bboxes_3d,
gt_labels_3d, img_metas,
gt_bboxes_ignore)
losses.update(losses_pts)
# some modifications
if hasattr(self.img_view_transformer, 'loss_depth_reg_weight') and self.img_view_transformer.loss_depth_reg_weight > 0:
losses['loss_depth_reg'] = self.img_view_transformer.get_depth_reg_loss(depth_gt, depth)
return losses
@DETECTORS.register_module()
class BEVDepth(BEVDepth_Base, BEVDet):
def extract_img_feat(self, img, img_metas):
"""Extract features of images."""
x = self.image_encoder(img[0])
# img: imgs, rots, trans, intrins, post_rots, post_trans, gt_depths, sensor2sensors
rots, trans, intrins, post_rots, post_trans, bda = img[1:7]
mlp_input = self.img_view_transformer.get_mlp_input(rots, trans, intrins, post_rots, post_trans, bda)
geo_inputs = [rots, trans, intrins, post_rots, post_trans, bda, mlp_input]
x, depth = self.img_view_transformer([x] + geo_inputs)
x = self.bev_encoder(x)
return [x], depth
@DETECTORS.register_module()
class BEVDepth4D(BEVDepth_Base, BEVDet4D):
def prepare_bev_feat(self, img, rot, tran, intrin,
post_rot, post_tran, bda, mlp_input):
x = self.image_encoder(img)
bev_feat, depth = self.img_view_transformer([x, rot, tran, intrin,
post_rot, post_tran, bda, mlp_input])
if self.detach_pre_process and self.pre_process:
bev_feat = self.pre_process_net(bev_feat)[0]
return bev_feat, depth
def extract_img_feat(self, img, img_metas):
"""Extract features of images."""
inputs = img
B, N, _, H, W = inputs[0].shape
N = N//2
imgs = inputs[0].view(B,N,2,3,H,W)
imgs = torch.split(imgs,1,2)
imgs = [t.squeeze(2) for t in imgs]
rots, trans, intrins, post_rots, post_trans, bda = inputs[1:7]
extra = [rots.view(B,2,N,3,3),
trans.view(B,2,N,3),
intrins.view(B,2,N,3,3),
post_rots.view(B,2,N,3,3),
post_trans.view(B,2,N,3)]
extra = [torch.split(t, 1, 1) for t in extra]
extra = [[p.squeeze(1) for p in t] for t in extra]
rots, trans, intrins, post_rots, post_trans = extra
bev_feat_list = []
depth_list = []
key_frame=True # back propagation for key frame only
for img, rot, tran, intrin, post_rot, \
post_tran in zip(imgs, rots, trans, intrins, post_rots, post_trans):
if self.align_after_view_transfromation:
rot, tran = rots[0], trans[0]
mlp_input = self.img_view_transformer.get_mlp_input(
rots[0], trans[0], intrin,post_rot, post_tran, bda)
inputs_curr = (img, rot, tran, intrin, post_rot, post_tran, bda, mlp_input)
if not key_frame and self.detach:
with torch.no_grad():
bev_feat, depth = self.prepare_bev_feat(*inputs_curr)
else:
bev_feat, depth = self.prepare_bev_feat(*inputs_curr)
if not self.detach_pre_process and self.pre_process:
bev_feat = self.pre_process_net(bev_feat)[0]
bev_feat_list.append(bev_feat)
depth_list.append(depth)
key_frame = False
if self.align_after_view_transfromation:
bev_feat_list[1] = self.shift_feature(bev_feat_list[1],
trans, rots)
bev_feat = torch.cat(bev_feat_list, dim=1)
x = self.bev_encoder(bev_feat)
return [x], depth_list[0]
@DETECTORS.register_module()
class BEVStereo(BEVDepth4D):
def __init__(self, bevdet_model=False, **kwargs):
super(BEVStereo, self).__init__(**kwargs)
self.bevdet_model = bevdet_model
def image_encoder(self, img):
imgs = img
B, N, C, imH, imW = imgs.shape
imgs = imgs.view(B * N, C, imH, imW)
x = self.img_backbone(imgs)
stereo_feat = x[0].detach()
# if isinstance(self.img_backbone, CustomSwin):
# stereo_feat = stereo_feat.permute(0,2,3,1)
# stereo_feat = self.img_backbone.norm0(stereo_feat)
# stereo_feat = stereo_feat.permute(0,3,1,2)
if self.bevdet_model:
x = x[-2:]
if self.with_img_neck:
x = self.img_neck(x)
if type(x) in [list, tuple]:
x = x[0]
_, output_dim, output_H, output_W = x.shape
x = x.view(B, N, output_dim, output_H, output_W)
return x, stereo_feat
def extract_img_feat(self, img, img_metas):
"""Extract features of images."""
inputs = img
B, N, _, H, W = inputs[0].shape
N = N//2
imgs = inputs[0].view(B,N,2,3,H,W)
imgs = torch.split(imgs,1,2)
imgs = [t.squeeze(2) for t in imgs]
rots, trans, intrins, post_rots, post_trans, bda, _, sensor2sensors = inputs[1:9]
extra = [rots.view(B,2,N,3,3),
trans.view(B,2,N,3),
intrins.view(B,2,N,3,3),
post_rots.view(B,2,N,3,3),
post_trans.view(B,2,N,3),
sensor2sensors.view(B,2,N,4,4)]
sensor2ego_mats = torch.eye(4).view(1,1,1,4,4).repeat(B,2,N,1,1).to(rots)
sensor2ego_mats[:,:,:,:3,:3] = extra[0]
sensor2ego_mats[:,:,:,:3,3] = extra[1]
intrin_mats = torch.eye(4).view(1,1,1,4,4).repeat(B,2,N,1,1).to(rots)
intrin_mats[:,:,:,:3,:3] = extra[2]
ida_mats = torch.eye(4).view(1,1,1,4,4).repeat(B,2,N,1,1).to(rots)
ida_mats[:,:,:,:3,:3] = extra[3]
ida_mats[:,:,:,:3,3] = extra[4]
mats_dict = dict(sensor2ego_mats=sensor2ego_mats,
intrin_mats=intrin_mats,
ida_mats=ida_mats,
sensor2sensor_mats=extra[5],
bda_mat=bda)
extra = [torch.split(t, 1, 1) for t in extra]
extra = [[p.squeeze(1) for p in t] for t in extra]
rots, trans, intrins, post_rots, post_trans, sensor2sensors = extra
# forward stereo depth
context_all_sweeps = list()
depth_feat_all_sweeps = list()
img_feats_all_sweeps = list()
stereo_feats_all_sweeps = list()
mu_all_sweeps = list()
sigma_all_sweeps = list()
mono_depth_all_sweeps = list()
range_score_all_sweeps = list()
key_frame=True # back propagation for key frame only
for img, rot, tran, intrin, post_rot, post_tran in zip(imgs, rots, trans, intrins, post_rots, post_trans):
if not key_frame:
with torch.no_grad():
img_feats, stereo_feats = self.image_encoder(img)
img_feats = img_feats.view(B * N, *img_feats.shape[2:])
mlp_input = \
self.img_view_transformer.get_mlp_input(rots[0], trans[0], intrin, post_rot, post_tran, bda)
depth_feat, context, mu, sigma, range_score, mono_depth = \
self.img_view_transformer.depth_net(img_feats,
mlp_input)
context = self.img_view_transformer.context_downsample_net(
context)
else:
img_feats, stereo_feats = self.image_encoder(img)
img_feats = img_feats.view(B * N, *img_feats.shape[2:])
mlp_input = \
self.img_view_transformer.get_mlp_input(rots[0], trans[0], intrin,
post_rot,
post_tran, bda)
depth_feat, context, mu, sigma, range_score, mono_depth = \
self.img_view_transformer.depth_net(img_feats,
mlp_input)
context = self.img_view_transformer.context_downsample_net(
context)
img_feats_all_sweeps.append(img_feats)
stereo_feats_all_sweeps.append(stereo_feats)
depth_feat_all_sweeps.append(depth_feat)
context_all_sweeps.append(context)
mu_all_sweeps.append(mu)
sigma_all_sweeps.append(sigma)
mono_depth_all_sweeps.append(mono_depth)
range_score_all_sweeps.append(range_score)
key_frame = False
depth_score_all_sweeps = list()
num_sweeps = 2
for ref_idx in range(num_sweeps):
sensor2sensor_mats = list()
for src_idx in range(num_sweeps):
ref2keysensor_mats = sensor2sensors[ref_idx].inverse()
key2srcsensor_mats = sensor2sensors[src_idx]
ref2srcsensor_mats = key2srcsensor_mats @ ref2keysensor_mats
sensor2sensor_mats.append(ref2srcsensor_mats)
if ref_idx == 0:
# last iteration on stage 1 does not have propagation
# (photometric consistency filtering)
if self.img_view_transformer.use_mask:
stereo_depth, mask = self.img_view_transformer._forward_stereo(
ref_idx,
stereo_feats_all_sweeps,
mono_depth_all_sweeps,
mats_dict,
sensor2sensor_mats,
mu_all_sweeps,
sigma_all_sweeps,
range_score_all_sweeps,
depth_feat_all_sweeps,
)
else:
stereo_depth = self.img_view_transformer._forward_stereo(
ref_idx,
stereo_feats_all_sweeps,
mono_depth_all_sweeps,
mats_dict,
sensor2sensor_mats,
mu_all_sweeps,
sigma_all_sweeps,
range_score_all_sweeps,
depth_feat_all_sweeps,
)
else:
with torch.no_grad():
# last iteration on stage 1 does not have
# propagation (photometric consistency filtering)
if self.img_view_transformer.use_mask:
stereo_depth, mask = self.img_view_transformer._forward_stereo(
ref_idx,
stereo_feats_all_sweeps,
mono_depth_all_sweeps,
mats_dict,
sensor2sensor_mats,
mu_all_sweeps,
sigma_all_sweeps,
range_score_all_sweeps,
depth_feat_all_sweeps,
)
else:
stereo_depth = self.img_view_transformer._forward_stereo(
ref_idx,
stereo_feats_all_sweeps,
mono_depth_all_sweeps,
mats_dict,
sensor2sensor_mats,
mu_all_sweeps,
sigma_all_sweeps,
range_score_all_sweeps,
depth_feat_all_sweeps,
)
if self.img_view_transformer.use_mask:
depth_score = (
mono_depth_all_sweeps[ref_idx] +
self.img_view_transformer.depth_downsample_net(
stereo_depth) * mask).softmax(1)
else:
depth_score = (
mono_depth_all_sweeps[ref_idx] +
self.img_view_transformer.depth_downsample_net(stereo_depth)).softmax(1)
depth_score_all_sweeps.append(depth_score)
# forward view transformation
bev_feat_list = []
key_frame=True # back propagation for key frame only
for image_feat, depth_prob, rot, tran, intrin, post_rot, post_tran in \
zip(context_all_sweeps, depth_score_all_sweeps, rots, trans,
intrins, post_rots, post_trans):
if not key_frame:
with torch.no_grad():
input_curr = (image_feat.view(B,N,*image_feat.shape[1:]),
depth_prob, rot, tran, intrin, post_rot,
post_tran, bda)
bev_feat = self.img_view_transformer(input_curr)
else:
input_curr = (image_feat.view(B,N,*image_feat.shape[1:]),
depth_prob, rot, tran, intrin, post_rot,
post_tran, bda)
bev_feat = self.img_view_transformer(input_curr)
if self.pre_process:
bev_feat = self.pre_process_net(bev_feat)[0]
bev_feat_list.append(bev_feat)
key_frame = False
bev_feat = torch.cat(bev_feat_list, dim=1)
x = self.bev_encoder(bev_feat)
return [x], depth_score_all_sweeps[0]
================================================
FILE: projects/occ_plugin/occupancy/detectors/ocfnet.py
================================================
# Developed by Junyi Ma based on the codebase of OpenOccupancy and PowerBEV
# Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications
# https://github.com/haomo-ai/Cam4DOcc
from sys import api_version
import torch
import collections
import torch.nn.functional as F
import os
from mmdet.models import DETECTORS
from mmcv.runner import auto_fp16, force_fp32
from .bevdepth import BEVDepth
from mmdet3d.models import builder
import numpy as np
import time
import copy
from typing import Tuple
@DETECTORS.register_module()
class OCFNet(BEVDepth):
def __init__(self,
loss_cfg=None,
only_generate_dataset=False,
disable_loss_depth=False,
test_present=False,
empty_idx=0,
max_label=2,
occ_encoder_backbone=None,
occ_predictor=None,
occ_encoder_neck=None,
flow_encoder_backbone=None,
flow_predictor=None,
flow_encoder_neck=None,
flow_head=None,
loss_norm=False,
point_cloud_range=None,
time_receptive_field=None,
n_future_frames=None,
n_future_frames_plus=None,
iou_thresh_for_vpq=None,
record_time=False,
save_pred=False,
save_path=None,
**kwargs):
'''
OCFNet is our end-to-end baseline for 4D camera-only occupancy forecasting
There are two streams for the forecasting task, with aggregated voxel features as inputs:
1. occ_encoder_backbone -> occ_predictor -> occ_encoder_neck -> pts_bbox_head
2. flow_encoder_backbone -> flow_predictor -> flow_encoder_neck -> flow_head
time_receptive_field: number of historical frames used for forecasting (including the present one), default: 3
n_future_frames: number of forecasted future frames, default: 4
n_future_frames_plus: number of estimated frames (> n_future_frames), default: 6 (if only forecasting occupancy states rather than instances, n_future_frames=n_future_frames_plus can be set)
iou_thresh_for_vpq: iou threshold to associate instances in 3D instance prediction, default: 0.2 (adjusted by occupancy forecasting performance)
'''
super().__init__(**kwargs)
self.loss_cfg = loss_cfg
self.disable_loss_depth = disable_loss_depth
self.only_generate_dataset = only_generate_dataset
self.loss_norm = loss_norm
self.time_receptive_field = time_receptive_field
self.n_future_frames = n_future_frames
self.n_future_frames_plus = n_future_frames_plus
self.eval_start_moment = self.n_future_frames_plus - self.n_future_frames - 1
self.iou_thresh_for_vpq = iou_thresh_for_vpq
self.record_time = record_time
self.time_stats = collections.defaultdict(list)
self.empty_idx = empty_idx
self.max_label = max_label
self.occ_encoder_backbone = builder.build_backbone(occ_encoder_backbone)
self.occ_predictor = builder.build_neck(occ_predictor)
self.occ_encoder_neck = builder.build_neck(occ_encoder_neck)
self.flow_encoder_backbone = builder.build_backbone(flow_encoder_backbone)
self.flow_encoder_neck = builder.build_neck(flow_encoder_neck)
self.flow_predictor = builder.build_neck(flow_predictor)
self.flow_head = builder.build_head(flow_head)
self.point_cloud_range = point_cloud_range
self.spatial_extent3d = (self.point_cloud_range[3]-self.point_cloud_range[0], \
self.point_cloud_range[4]-self.point_cloud_range[1], \
self.point_cloud_range[5]-self.point_cloud_range[2])
self.ego_center_shift_proportion_x = abs(self.point_cloud_range[0])/(self.point_cloud_range[3]-self.point_cloud_range[0])
self.ego_center_shift_proportion_y = abs(self.point_cloud_range[1])/(self.point_cloud_range[4]-self.point_cloud_range[1])
self.ego_center_shift_proportion_z = abs(self.point_cloud_range[2])/(self.point_cloud_range[5]-self.point_cloud_range[2])
self.n_cam = 6
self.fine_grained = False
self.vehicles_id = 1
self.test_present = test_present
self.save_pred = save_pred
self.save_path = save_path
def image_encoder(self, img):
imgs = img
B, N, C, imH, imW = imgs.shape
imgs = imgs.view(B * N, C, imH, imW)
backbone_feats = self.img_backbone(imgs)
if self.with_img_neck:
x = self.img_neck(backbone_feats)
if type(x) in [list, tuple]:
x = x[0]
else:
x = backbone_feats
_, output_dim, output_H, output_W = x.shape
x = x.view(B, N, output_dim, output_H, output_W)
return {'x': x,
'img_feats': [x.clone()]}
@force_fp32()
def occ_encoder(self, x):
b, t, _, _, _, _ = x.shape
x = x.reshape(b, -1, *x.shape[3:])
x = self.occ_encoder_backbone(x)
x = self.occ_predictor(x)
x = self.occ_encoder_neck(x)
return x
@force_fp32()
def flow_encoder(self, x):
b, t, _, _, _, _ = x.shape
x = x.reshape(b, -1, *x.shape[3:])
x = self.flow_encoder_backbone(x)
x = self.flow_predictor(x)
x = self.flow_encoder_neck(x)
return x
def mat2pose_vec(self, matrix: torch.Tensor):
"""
Converts a 4x4 pose matrix into a 6-dof pose vector
Args:
matrix (ndarray): 4x4 pose matrix
Returns:
vector (ndarray): 6-dof pose vector comprising translation components (tx, ty, tz) and
rotation components (rx, ry, rz)
"""
# M[1, 2] = -sinx*cosy, M[2, 2] = +cosx*cosy
rotx = torch.atan2(-matrix[..., 1, 2], matrix[..., 2, 2])
# M[0, 2] = +siny, M[1, 2] = -sinx*cosy, M[2, 2] = +cosx*cosy
cosy = torch.sqrt(matrix[..., 1, 2] ** 2 + matrix[..., 2, 2] ** 2)
roty = torch.atan2(matrix[..., 0, 2], cosy)
# M[0, 0] = +cosy*cosz, M[0, 1] = -cosy*sinz
rotz = torch.atan2(-matrix[..., 0, 1], matrix[..., 0, 0])
rotation = torch.stack((rotx, roty, rotz), dim=-1)
# Extract translation params
translation = matrix[..., :3, 3]
return torch.cat((translation, rotation), dim=-1)
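# Note: for an identity pose the vector is all zeros; rotations are
# recovered as x-y-z Euler angles (matching the matrix-entry comments
# above) and concatenated after the translation as (tx, ty, tz, rx, ry, rz).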
def pack_dbatch_and_dtime(self, x):
b = x.shape[0]
s = x.shape[1]
x = x.view(b*s, *x.shape[2:])
return x
def unpack_dbatch_and_dtime(self, x, b, s):
assert (b*s) == x.shape[0]
x = x.view(b, s, *x.shape[1:])
return x
def extract_img_feat(self, img_inputs_seq, img_metas):
'''
Extract features of sequential input images
'''
if self.record_time:
torch.cuda.synchronize()
t0 = time.time()
imgs_seq, rots_seq, trans_seq, intrins_seq, post_rots_seq, post_trans_seq, gt_depths_seq, sensor2sensors_seq = img_inputs_seq
self.batch_size = imgs_seq.shape[0]
self.sequence_length = imgs_seq.shape[1]
imgs_seq = imgs_seq[:,0:self.time_receptive_field,...].contiguous()
rots_seq = rots_seq[:,0:self.time_receptive_field,...].contiguous()
trans_seq = trans_seq[:,0:self.time_receptive_field,...].contiguous()
intrins_seq = intrins_seq[:,0:self.time_receptive_field,...].contiguous()
post_rots_seq = post_rots_seq[:,0:self.time_receptive_field,...].contiguous()
post_trans_seq = post_trans_seq[:,0:self.time_receptive_field,...].contiguous()
gt_depths_seq = gt_depths_seq[:,0:self.time_receptive_field,...].contiguous()
sensor2sensors_seq = sensor2sensors_seq[:,0:self.time_receptive_field,...].contiguous()
imgs_seq = self.pack_dbatch_and_dtime(imgs_seq)
rots_seq = self.pack_dbatch_and_dtime(rots_seq)
trans_seq = self.pack_dbatch_and_dtime(trans_seq)
intrins_seq = self.pack_dbatch_and_dtime(intrins_seq)
post_rots_seq = self.pack_dbatch_and_dtime(post_rots_seq)
post_trans_seq = self.pack_dbatch_and_dtime(post_trans_seq)
gt_depths_seq = self.pack_dbatch_and_dtime(gt_depths_seq)
sensor2sensors_seq = self.pack_dbatch_and_dtime(sensor2sensors_seq)
self.n_cam = imgs_seq.shape[1]
img_enc_feats = self.image_encoder(imgs_seq)
x = img_enc_feats['x']
img_feats = img_enc_feats['img_feats']
if self.record_time:
torch.cuda.synchronize()
t1 = time.time()
self.time_stats['img_encoder'].append(t1 - t0)
mlp_input_seq = self.img_view_transformer.get_mlp_input(rots_seq, trans_seq, intrins_seq, post_rots_seq, post_trans_seq)
geo_inputs = [rots_seq, trans_seq, intrins_seq, post_rots_seq, post_trans_seq, None, mlp_input_seq]
x, depth = self.img_view_transformer([x] + geo_inputs)
if self.record_time:
torch.cuda.synchronize()
t2 = time.time()
self.time_stats['view_transformer'].append(t2 - t1)
return x, depth, img_feats
def warp_features(self, x, flow, tseq):
'''
Warp features by motion flow
'''
if flow is None:
return x
b, dc, dx, dy, dz = x.shape
# normalize 3D motion flow
flow[:,0,-1] = flow[:,0,-1]*dx/self.spatial_extent3d[0]
flow[:,1,-1] = flow[:,1,-1]*dy/self.spatial_extent3d[1]
flow[:,2,-1] = flow[:,2,-1]*dz/self.spatial_extent3d[2]
nx, ny, nz = torch.meshgrid(torch.arange(dx, dtype=torch.float, device=x.device), \
torch.arange(dy, dtype=torch.float, device=x.device), \
torch.arange(dz, dtype=torch.float, device=x.device))
tmp = torch.ones((dx, dy, dz), device=x.device)
grid = torch.stack((nx, ny, nz, tmp), dim=-1)
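# The voxel grid is kept in homogeneous coordinates so the 4x4 flow
# transform can be applied as a single batched matmul; indices are first
# shifted so the ego vehicle sits at the origin, transformed, and shifted
# back before being rounded to integer voxel locations.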
# centralize shift
shift_x = self.ego_center_shift_proportion_x * dx
shift_y = self.ego_center_shift_proportion_y * dy
shift_z = self.ego_center_shift_proportion_z * dz
grid[:, :, :, 0] = grid[:, :, :, 0] - shift_x
grid[:, :, :, 1] = grid[:, :, :, 1] - shift_y
grid[:, :, :, 2] = grid[:, :, :, 2] - shift_z
grid = grid.view(dx*dy*dz, grid.shape[-1]).unsqueeze(-1) #[N,4,1]
transformation = flow.unsqueeze(1) # [bs, 1, 4, 4]
transformed_grid = transformation @ grid # [bs, N, 4, 1]
transformed_grid = transformed_grid.squeeze(-1) # [bs, N, 4]
transformed_grid = transformed_grid.view(-1, 4)
# de-centralize
transformed_grid[:, 0] = (transformed_grid[:, 0] + shift_x)
transformed_grid[:, 1] = (transformed_grid[:, 1] + shift_y)
transformed_grid[:, 2] = (transformed_grid[:, 2] + shift_z)
transformed_grid = transformed_grid.round().long()
# de-normalize
grid = grid.squeeze(-1)
grid = grid.view(-1, 4)
grid[:, 0] = (grid[:, 0] + shift_x)
grid[:, 1] = (grid[:, 1] + shift_y)
grid[:, 2] = (grid[:, 2] + shift_z)
grid = grid.round().long()
batch_ix = torch.cat([torch.full([transformed_grid.shape[0] // b, 1], ix, device=x.device, dtype=torch.long) for ix in range(b)])
kept = (transformed_grid[:,0] >= 0) & (transformed_grid[:,0] < dx) & (transformed_grid[:,1] >= 0) & (transformed_grid[:,1] < dy) & (transformed_grid[:,2] >= 0) & (transformed_grid[:,2] < dz)
transformed_grid = transformed_grid[kept]
batch_ix = batch_ix[kept]
grid = grid[kept]
warped_x = torch.zeros_like(x, device=x.device)
# hard coding for reducing memory usage
# erratum for new version
split_num = 32
gap = transformed_grid.shape[0]//split_num
for tt in range(split_num-1):
start_idx_tt = int(tt*gap)
end_idx_tt = int((tt+1)*gap)
current_batch = batch_ix[start_idx_tt:end_idx_tt]
ixx = transformed_grid[start_idx_tt:end_idx_tt, 0]
ixy = transformed_grid[start_idx_tt:end_idx_tt, 1]
ixz = transformed_grid[start_idx_tt:end_idx_tt, 2]
ixx_ori = grid[start_idx_tt:end_idx_tt, 0]
ixy_ori = grid[start_idx_tt:end_idx_tt, 1]
ixz_ori = grid[start_idx_tt:end_idx_tt, 2]
warped_x[current_batch, :, ixx, ixy, ixz] = x[current_batch, :, ixx_ori, ixy_ori, ixz_ori]
# for i in range(transformed_grid.shape[0]):
# current_batch = batch_ix[i]
# ixx = transformed_grid[i, 0]
# ixy = transformed_grid[i, 1]
# ixz = transformed_grid[i, 2]
# ixx_ori = grid[i, 0]
# ixy_ori = grid[i, 1]
# ixz_ori = grid[i, 2]
# warped_x[current_batch, :, ixx, ixy, ixz] = x[current_batch, :, ixx_ori, ixy_ori, ixz_ori]
return warped_x
def cumulative_warp_occ(self, lifted_feature_seq, future_egomotion, mode='bilinear'):
'''
Warp sequential voxel features to the present frame by ego pose updates
'''
future_egomotion = future_egomotion[:, :self.time_receptive_field, ...].contiguous()
out = [lifted_feature_seq[:, -1]]
cum_future_egomotion = future_egomotion[:, -2]
for t in reversed(range(self.time_receptive_field - 1)):
out.append(self.warp_features(lifted_feature_seq[:, t], cum_future_egomotion, t))
cum_future_egomotion = cum_future_egomotion @ future_egomotion[:, t - 1]
return torch.stack(out[::-1], 1)
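# Features are accumulated back-to-front: the present frame (index -1) is
# kept as-is, each earlier frame is warped by the egomotion composed from
# its timestamp up to the present, and the reversed list is stacked so
# out[:, t] holds frame t expressed in present ego coordinates.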
def extract_feat(self, img_inputs_seq, img_metas, future_egomotion):
'''
Extract voxel features from input sequential images
'''
voxel_feats = None
depth, img_feats = None, None
if img_inputs_seq is not None:
voxel_feats, depth, img_feats = self.extract_img_feat(img_inputs_seq, img_metas)
if self.record_time:
torch.cuda.synchronize()
t0 = time.time()
voxel_feats = self.unpack_dbatch_and_dtime(voxel_feats, self.batch_size, self.time_receptive_field)
voxel_feats = self.cumulative_warp_occ(voxel_feats.clone(), future_egomotion, mode='bilinear')
if self.record_time:
torch.cuda.synchronize()
t1 = time.time()
self.time_stats['feature warping'].append(t1 - t0)
# egomotion-aware
future_egomotion_vec = self.mat2pose_vec(future_egomotion)
batch_size, sequence_length, nbr_pose_channels = future_egomotion_vec.shape
dx, dy, dz = voxel_feats.shape[-3:]
future_egomotions_spatial = future_egomotion_vec.view(batch_size, sequence_length, nbr_pose_channels, 1, 1, 1).expand(batch_size, sequence_length, nbr_pose_channels, dx, dy, dz)
# at time 0, no egomotion so feed zero vector
future_egomotions_spatial = torch.cat([torch.zeros_like(future_egomotions_spatial[:, :1]),
future_egomotions_spatial[:, :(self.time_receptive_field-1)]], dim=1)
voxel_feats = torch.cat([voxel_feats, future_egomotions_spatial], dim=-4)
voxel_feats_enc = self.occ_encoder(voxel_feats)
if type(voxel_feats_enc) is not list:
voxel_feats_enc = [voxel_feats_enc]
if self.record_time:
torch.cuda.synchronize()
t2 = time.time()
self.time_stats['occ_encoder'].append(t2 - t1)
flow_feats_enc = self.flow_encoder(voxel_feats)
if type(flow_feats_enc) is not list:
flow_feats_enc = [flow_feats_enc]
if self.record_time:
torch.cuda.synchronize()
t3 = time.time()
self.time_stats['flow_encoder'].append(t3 - t2)
depth = depth.view(-1, self.n_cam, *depth.shape[-3:])
return (voxel_feats_enc, flow_feats_enc, img_feats, depth)
@force_fp32(apply_to=('voxel_feats'))
def forward_pts_train(
self,
voxel_feats,
gt_occ=None,
points_occ=None,
img_metas=None,
transform=None,
img_feats=None,
):
if self.record_time:
torch.cuda.synchronize()
t0 = time.time()
outs = self.pts_bbox_head(
voxel_feats=voxel_feats,
points=points_occ,
img_metas=img_metas,
img_feats=img_feats,
transform=transform,
)
if self.record_time:
torch.cuda.synchronize()
t1 = time.time()
self.time_stats['occ_head'].append(t1 - t0)
losses = self.pts_bbox_head.loss(
output_voxels=outs['output_voxels'],
target_voxels=gt_occ,
target_points=points_occ,
img_metas=img_metas,
)
if self.record_time:
torch.cuda.synchronize()
t2 = time.time()
self.time_stats['loss_occ'].append(t2 - t1)
return losses
@force_fp32(apply_to=('voxel_feats'))
def forward_flow_train(
self,
voxel_feats,
gt_occ=None,
points_occ=None,
img_metas=None,
transform=None,
img_feats=None,
):
if self.record_time:
torch.cuda.synchronize()
t0 = time.time()
outs = self.flow_head(
voxel_feats=voxel_feats,
points=points_occ,
img_metas=img_metas,
img_feats=img_feats,
transform=transform,
)
if self.record_time:
torch.cuda.synchronize()
t1 = time.time()
self.time_stats['flow_head'].append(t1 - t0)
losses = self.flow_head.loss(
output_voxels=outs['output_voxels'],
target_voxels=gt_occ,
target_points=points_occ,
img_metas=img_metas,
)
if self.record_time:
torch.cuda.synchronize()
t2 = time.time()
self.time_stats['loss_flow'].append(t2 - t1)
return losses
def forward_train(self,
img_inputs_seq=None,
segmentation=None,
instance=None,
attribute_label=None,
flow=None,
future_egomotion=None,
gt_occ=None,
img_metas=None,
points_occ=None,
**kwargs,
):
'''
Train OCFNet using bbox-wise occupancy labels if self.fine_grained=False, else using voxel-wise labels from nuScenes-Occupancy
'''
# manually stop forward
if self.only_generate_dataset:
return {"pseudo_loss": torch.tensor(0.0, device=segmentation.device, requires_grad=True)}
if not self.fine_grained:
gt_occ = segmentation
voxel_feats, flow_feats, img_feats, depth = self.extract_feat(
img_inputs_seq=img_inputs_seq, img_metas=img_metas, future_egomotion=future_egomotion)
# training losses
losses = dict()
if self.record_time:
torch.cuda.synchronize()
t0 = time.time()
# TODO: we will release the version with depth fine-tuning in the future
if not self.disable_loss_depth and depth is not None:
depth_gt = img_inputs_seq[-2][:,0:self.time_receptive_field,...].contiguous()
depth_gt = depth_gt.view(depth_gt.shape[0]*depth_gt.shape[1],*depth_gt.shape[2:])
depth = depth.view(-1, *depth.shape[2:])
losses['loss_depth'] = self.img_view_transformer.get_depth_loss(depth_gt, depth)
if self.record_time:
torch.cuda.synchronize()
t1 = time.time()
self.time_stats['loss_depth'].append(t1 - t0)
transform = img_inputs_seq[1:8] if img_inputs_seq is not None else None
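# img_inputs_seq[1:8] packs the camera parameters (rots, trans, intrins,
# post_rots, post_trans, bda, mlp_input); they are forwarded to the heads as `transform`.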
voxel_feats_seq = []
for voxel_feats_stage in voxel_feats:
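# Each stage stacks n_future_frames_plus frames along the channel dimension;
# reshape so the head processes (B*T, C, X, Y, Z) volumes.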
bs, sfeatures = voxel_feats_stage.shape[:2]
voxel_feats_stage_ = voxel_feats_stage.view(bs*self.n_future_frames_plus, sfeatures//self.n_future_frames_plus, *voxel_feats_stage.shape[2:])
voxel_feats_seq.append(voxel_feats_stage_)
gt_occ = gt_occ[:, -self.n_future_frames_plus:, ...]
flow = flow[:, -self.n_future_frames_plus:, ...]
losses_occupancy = self.forward_pts_train(voxel_feats_seq, gt_occ,
points_occ, img_metas, img_feats=img_feats, transform=transform)
losses.update(losses_occupancy)
flow_feats_seq = []
for flow_feats_stage in flow_feats:
bs, sfeatures = flow_feats_stage.shape[:2]
flow_feats_stage_ = flow_feats_stage.view(bs*self.n_future_frames_plus, sfeatures//self.n_future_frames_plus, *flow_feats_stage.shape[2:])
flow_feats_seq.append(flow_feats_stage_)
losses_flow = self.forward_flow_train(flow_feats_seq, flow,
points_occ, img_metas, img_feats=img_feats, transform=transform)
losses.update(losses_flow)
if self.loss_norm:
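# Loss normalization: dividing each term by its own detached value rescales
# every loss to ~1, so its gradient is divided by the loss magnitude and no
# single task dominates training.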
for loss_key in losses.keys():
if loss_key.startswith('loss'):
losses[loss_key] = losses[loss_key] / (losses[loss_key].detach() + 1e-9)
def logging_latencies():
# logging latencies
avg_time = {key: sum(val) / len(val) for key, val in self.time_stats.items()}
sum_time = sum(list(avg_time.values()))
out_res = ''
for key, val in avg_time.items():
out_res += '{}: {:.4f}, {:.1f}, '.format(key, val, val / sum_time)
print(out_res)
if self.record_time:
logging_latencies()
return losses
def forward_test(self,
img_inputs_seq=None,
segmentation=None,
instance=None,
attribute_label=None,
flow=None,
future_egomotion=None,
gt_occ=None,
img_metas=None,
points_occ=None,
**kwargs,
):
'''
Test OCFNet using IOU and VPQ metrics
'''
# batch size must equal 1 during testing
assert segmentation.shape[0] == 1
return self.simple_test(img_metas, img_inputs_seq, gt_occ=gt_occ, gt_flow=flow, segmentation=segmentation, instance=instance, future_egomotion=future_egomotion, **kwargs)
def simple_test(self, img_metas, img_inputs_seq=None, rescale=False, points_occ=None,
gt_occ=None, gt_flow=None, segmentation=None, instance=None, future_egomotion=None):
# manually stop forward
if self.only_generate_dataset:
return {'hist_for_iou': 0, 'pred_c': 0, 'vpq':0}
if not self.fine_grained:
gt_occ = segmentation
voxel_feats, flow_feats, img_feats, depth = self.extract_feat(
img_inputs_seq=img_inputs_seq, img_metas=img_metas, future_egomotion=future_egomotion)
transform = img_inputs_seq[1:8] if img_inputs_seq is not None else None
voxel_feats_seq = []
for voxel_feats_stage in voxel_feats:
bs, sfeatures = voxel_feats_stage.shape[:2]
voxel_feats_stage_ = voxel_feats_stage.view(bs*self.n_future_frames_plus, sfeatures//self.n_future_frames_plus, *voxel_feats_stage.shape[2:])
voxel_feats_seq.append(voxel_feats_stage_)
gt_occ = gt_occ[:, -self.n_future_frames_plus:, ...].contiguous()
gt_occ = gt_occ.view(gt_occ.shape[0]*gt_occ.shape[1], *gt_occ.shape[2:])
instance = instance[:, -self.n_future_frames_plus:, ...].contiguous()
instance = instance.view(instance.shape[0]*instance.shape[1], *instance.shape[2:])
output = self.pts_bbox_head(
voxel_feats=voxel_feats_seq,
points=points_occ,
img_metas=img_metas,
img_feats=img_feats,
transform=transform,
)
pred_c = output['output_voxels'][0]
flow_feats_seq = []
for flow_feats_stage in flow_feats:
bs, sfeatures = flow_feats_stage.shape[:2]
flow_feats_stage_ = flow_feats_stage.view(bs*self.n_future_frames_plus, sfeatures//self.n_future_frames_plus, *flow_feats_stage.shape[2:])
flow_feats_seq.append(flow_feats_stage_)
output_flow = self.flow_head(
voxel_feats=flow_feats_seq,
points=points_occ,
img_metas=img_metas,
img_feats=img_feats,
transform=transform,
)
gt_flow = gt_flow[:, -self.n_future_frames_plus:, ...].contiguous()
gt_flow = gt_flow.view(gt_flow.shape[0]*gt_flow.shape[1], *gt_flow.shape[2:])
# pred_flow = output_flow['output_voxels'][0]
# vpq = self.evaluate_instance_prediction(pred_c, pred_flow, gt_occ, instance)
vpq = 0.1
if self.test_present:
pred_c = pred_c[self.eval_start_moment:(self.eval_start_moment+1), ...]
gt_occ = gt_occ[self.eval_start_moment:(self.eval_start_moment+1), ...]
else:
pred_c = pred_c[self.eval_start_moment+1:, ...]
gt_occ = gt_occ[self.eval_start_moment+1:, ...]
hist_for_iou = self.evaluate_occupancy_forecasting(pred_c, gt_occ, img_metas=img_metas, save_pred=self.save_pred, save_path=self.save_path)
test_output = {
'hist_for_iou': hist_for_iou,
'pred_c': pred_c,
'vpq': vpq,
}
return test_output
def evaluate_occupancy_forecasting(self, pred, gt, img_metas=None, save_pred=False, save_path=None):
B, H, W, D = gt.shape
pred = F.interpolate(pred, size=[H, W, D], mode='trilinear', align_corners=False).contiguous()
hist_all = 0
iou_per_pred_list = []
pred_list = []
gt_list = []
for i in range(B):
pred_cur = pred[i,...]
pred_cur = torch.argmax(pred_cur, dim=0).cpu().numpy()
gt_cur = gt[i, ...].cpu().numpy()
gt_cur = gt_cur.astype(np.int64)
pred_list.append(pred_cur)
gt_list.append(gt_cur)
# ignore noise
noise_mask = gt_cur != 255
# GMO and others for max_label=2
# multiple movable objects for max_label=9
hist_cur, iou_per_pred = fast_hist(pred_cur[noise_mask], gt_cur[noise_mask], max_label=self.max_label)
hist_all = hist_all + hist_cur
iou_per_pred_list.append(iou_per_pred)
# whether save prediction results
if save_pred:
if not os.path.exists(save_path):
os.mkdir(save_path)
pred_for_save_list = []
for k in range(B):
pred_for_save = torch.argmax(pred[k], dim=0).cpu()
x_grid = torch.linspace(0, H-1, H, dtype=torch.long)
x_grid = x_grid.view(H, 1, 1).expand(H, W, D)
y_grid = torch.linspace(0, W-1, W, dtype=torch.long)
y_grid = y_grid.view(1, W, 1).expand(H, W, D)
z_grid = torch.linspace(0, D-1, D, dtype=torch.long)
z_grid = z_grid.view(1, 1, D).expand(H, W, D)
segmentation_for_save = torch.stack((x_grid, y_grid, z_grid), -1)
segmentation_for_save = segmentation_for_save.view(-1, 3)
segmentation_label = pred_for_save.squeeze(0).view(-1,1)
segmentation_for_save = torch.cat((segmentation_for_save, segmentation_label), dim=-1) # N,4
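# Keep only non-empty voxels so the saved file stores a sparse (x, y, z, label) point list.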
kept = segmentation_for_save[:,-1]!=0
segmentation_for_save= segmentation_for_save[kept].cpu().numpy()
pred_for_save_list.append(segmentation_for_save)
np.savez(os.path.join(save_path, img_metas[0]["scene_token"]), pred_for_save_list)
return hist_all
def find_instance_centers(self, center_prediction: torch.Tensor, conf_threshold: float = 0.1, nms_kernel_size: int = 3):
assert len(center_prediction.shape) == 4
center_prediction = F.threshold(center_prediction, threshold=conf_threshold, value=-1)
nms_padding = (nms_kernel_size - 1) // 2
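# Non-maximum suppression via 3D max-pooling: a voxel is kept as a center
# only if it equals the local maximum within the kernel window.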
maxpooled_center_prediction = F.max_pool3d(
center_prediction, kernel_size=nms_kernel_size, stride=1, padding=nms_padding
)
# Filter all elements that are not the maximum (i.e. the center of the heatmap instance)
center_prediction[center_prediction != maxpooled_center_prediction] = -1
return torch.nonzero(center_prediction > 0)[:, 1:]
def group_pixels(self, centers: torch.Tensor, offset_predictions: torch.Tensor) -> torch.Tensor:
dx, dy, dz = offset_predictions.shape[-3:]
x_grid = (
torch.arange(dx, dtype=offset_predictions.dtype, device=offset_predictions.device)
.view(1, dx, 1, 1)
.repeat(1, 1, dy, dz)
)
y_grid = (
torch.arange(dy, dtype=offset_predictions.dtype, device=offset_predictions.device)
.view(1, 1, dy, 1)
.repeat(1, dx, 1, dz)
)
z_grid = (
torch.arange(dz, dtype=offset_predictions.dtype, device=offset_predictions.device)
.view(1, 1, 1, dz)
.repeat(1, dx, dy, 1)
)
pixel_grid = torch.cat((x_grid, y_grid, z_grid), dim=0)
center_locations = (pixel_grid + offset_predictions).view(3, dx*dy*dz, 1).permute(2, 1, 0)
centers = centers.view(-1, 1, 3)
distances = torch.norm(centers - center_locations, dim=-1)
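# Assign every voxel (shifted by its predicted offset) to the nearest detected
# center; +1 reserves id 0 for the background.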
instance_id = torch.argmin(distances, dim=0).reshape(1, dx, dy, dz) + 1
return instance_id
def update_instance_ids(self, instance_seg, old_ids, new_ids):
indices = torch.arange(old_ids.max() + 1, device=instance_seg.device)
for old_id, new_id in zip(old_ids, new_ids):
indices[old_id] = new_id
return indices[instance_seg].long()
def make_instance_seg_consecutive(self, instance_seg):
# Make the indices of instance_seg consecutive
unique_ids = torch.unique(instance_seg)
new_ids = torch.arange(len(unique_ids), device=instance_seg.device)
instance_seg = self.update_instance_ids(instance_seg, unique_ids, new_ids)
return instance_seg
def get_instance_segmentation_and_centers(self,
center_predictions: torch.Tensor,
offset_predictions: torch.Tensor,
foreground_mask: torch.Tensor,
conf_threshold: float = 0.1,
nms_kernel_size: int = 5,
max_n_instance_centers: int = 100,
) -> torch.Tensor:
dx, dy, dz = offset_predictions.shape[-3:]
center_predictions = center_predictions.view(1, dx, dy, dz)
offset_predictions = offset_predictions.view(3, dx, dy, dz)
foreground_mask = foreground_mask.view(1, dx, dy, dz)
centers = self.find_instance_centers(center_predictions, conf_threshold=conf_threshold, nms_kernel_size=nms_kernel_size)
if not len(centers):
return torch.zeros(center_predictions.shape, dtype=torch.int64, device=center_predictions.device)
if len(centers) > max_n_instance_centers:
centers = centers[:max_n_instance_centers].clone()
instance_ids = self.group_pixels(centers, offset_predictions * foreground_mask.float())
instance_seg = (instance_ids * foreground_mask.float()).long()
# Make the indices of instance_seg consecutive
instance_seg = self.make_instance_seg_consecutive(instance_seg)
return instance_seg.long()
def flow_warp(self, occupancy, flow, mode='nearest', padding_mode='zeros'):
'''
Warp ground-truth flow-origin occupancies according to predicted flows
'''
_, num_waypoints, _, grid_dx_cells, grid_dy_cells, grid_dz_cells = occupancy.size()
dx = torch.linspace(-1, 1, steps=grid_dx_cells)
dy = torch.linspace(-1, 1, steps=grid_dy_cells)
dz = torch.linspace(-1, 1, steps=grid_dz_cells)
x_idx, y_idx, z_idx = torch.meshgrid(dx, dy, dz)
identity_indices = torch.stack((x_idx, y_idx, z_idx), dim=0).to(device=occupancy.device)
warped_occupancy = []
for k in range(num_waypoints): # 1
flow_origin_occupancy = occupancy[:, k] # B T 1 dx dy dz -> B 1 dx dy dz
pred_flow = flow[:, k] # B T 3 dx dy dz -> B 3 dx dy dz
# Normalize along the width and height direction
normalize_pred_flow = torch.stack(
(2.0 * pred_flow[:, 0] / (grid_dx_cells - 1),
2.0 * pred_flow[:, 1] / (grid_dy_cells - 1),
2.0 * pred_flow[:, 2] / (grid_dz_cells - 1),),
dim=1,
)
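# F.grid_sample expects sampling locations in normalized [-1, 1] coordinates,
# so voxel-space flow is rescaled by 2 / (grid_size - 1) per axis.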
warped_indices = identity_indices + normalize_pred_flow
warped_indices = warped_indices.permute(0, 2, 3, 4, 1)
flow_origin_occupancy = flow_origin_occupancy.permute(0, 1, 4, 3, 2)
sampled_occupancy = F.grid_sample(
input=flow_origin_occupancy,
grid=warped_indices,
mode=mode,
padding_mode='zeros',
align_corners=True,
)
warped_occupancy.append(sampled_occupancy)
return warped_occupancy[0]
def make_instance_id_temporally_consecutive(self, pred_inst, preds, backward_flow, ignore_index=255.0):
assert pred_inst.shape[0] == 1, 'Assumes batch size = 1'
# Initialise instance segmentations with prediction corresponding to the present
consistent_instance_seg = [pred_inst.unsqueeze(0)]
backward_flow = backward_flow.clone().detach()
backward_flow[backward_flow == ignore_index] = 0.0
seq_len, _, dx, dy, dz = preds.shape
for t in range(1, seq_len):
init_warped_instance_seg = self.flow_warp(consistent_instance_seg[-1].unsqueeze(0).float(), backward_flow[t:t+1].unsqueeze(0)).int()
warped_instance_seg = init_warped_instance_seg * preds[t:t+1, 0]
consistent_instance_seg.append(warped_instance_seg)
consistent_instance_seg = torch.cat(consistent_instance_seg, dim=1)
return consistent_instance_seg
def predict_instance_segmentation(self, pred_seg, pred_flow):
pred_seg_sm = pred_seg.detach()
pred_seg_sm = torch.argmax(pred_seg_sm, dim=1, keepdims=True)
foreground_masks = pred_seg_sm.squeeze(1) == self.vehicles_id
pred_inst_batch = self.get_instance_segmentation_and_centers(
torch.softmax(pred_seg, dim=1)[0:1, self.vehicles_id].detach(),
pred_flow[1:2].detach(),
foreground_masks[1:2].detach(),
nms_kernel_size=7,
)
consistent_instance_seg = self.make_instance_id_temporally_consecutive(
pred_inst_batch,
pred_seg_sm[1:],
pred_flow[1:].detach(),
)
consistent_instance_seg = torch.cat([torch.zeros_like(pred_inst_batch.unsqueeze(0)), consistent_instance_seg], dim=1)
return consistent_instance_seg.permute(1, 0, 2, 3, 4).long() # [1, 6, 512, 512, 40]
def combine_mask(self, segmentation: torch.Tensor, instance: torch.Tensor, n_classes: int, n_all_things: int):
'''
Shift all things ids by num_classes and combine things and stuff into a single mask
'''
instance = instance.view(-1)
instance_mask = instance > 0
instance = instance - 1 + n_classes
segmentation = segmentation.clone().view(-1)
segmentation_mask = segmentation < n_classes
# Build an index from instance id to class id.
instance_id_to_class_tuples = torch.cat(
(
instance[instance_mask & segmentation_mask].unsqueeze(1),
segmentation[instance_mask & segmentation_mask].unsqueeze(1),
),
dim=1,
)
instance_id_to_class = -instance_id_to_class_tuples.new_ones((n_all_things,))
instance_id_to_class[instance_id_to_class_tuples[:, 0]] = instance_id_to_class_tuples[:, 1]
instance_id_to_class[torch.arange(n_classes, device=segmentation.device)] = torch.arange(
n_classes, device=segmentation.device
)
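# Stuff voxels keep their class id, thing voxels take their (shifted) instance id;
# the +1 below frees id 0 to mark void/ignored voxels.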
segmentation[instance_mask] = instance[instance_mask]
segmentation += 1
segmentation[~segmentation_mask] = 0
return segmentation, instance_id_to_class
def panoptic_metrics(self, pred_segmentation, pred_instance, gt_segmentation, gt_instance, unique_id_mapping):
# GMO and others
n_classes = 2
self.keys = ['iou', 'true_positive', 'false_positive', 'false_negative'] # hard coding
result = {key: torch.zeros(n_classes, dtype=torch.float32, device=gt_instance.device) for key in self.keys}
assert pred_segmentation.dim() == 3
assert pred_segmentation.shape == pred_instance.shape == gt_segmentation.shape == gt_instance.shape
n_instances = int(torch.cat([pred_instance, gt_instance]).max().item())
n_all_things = n_instances + n_classes # Classes + instances.
n_things_and_void = n_all_things + 1
pred_segmentation = pred_segmentation.long().detach().cpu()
pred_instance = pred_instance.long().detach().cpu()
gt_segmentation = gt_segmentation.long().detach().cpu()
gt_instance = gt_instance.long().detach().cpu()
prediction, pred_to_cls = self.combine_mask(pred_segmentation, pred_instance, n_classes, n_all_things)
target, target_to_cls = self.combine_mask(gt_segmentation, gt_instance, n_classes, n_all_things)
# Compute ious between all stuff and things
# hack for bincounting 2 arrays together
x = prediction + n_things_and_void * target
bincount_2d = torch.bincount(x.long(), minlength=n_things_and_void ** 2)
if bincount_2d.shape[0] != n_things_and_void ** 2:
raise ValueError('Incorrect bincount size.')
conf = bincount_2d.reshape((n_things_and_void, n_things_and_void))
# Drop void class
conf = conf[1:, 1:]
# Confusion matrix contains intersections between all combinations of classes
union = conf.sum(0).unsqueeze(0) + conf.sum(1).unsqueeze(1) - conf
iou = torch.where(union > 0, (conf.float() + 1e-9) / (union.float() + 1e-9), torch.zeros_like(union).float())
mapping = (iou > self.iou_thresh_for_vpq).nonzero(as_tuple=False)
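# Candidate matches are all (target, prediction) pairs whose IoU exceeds the
# VPQ threshold; they are kept only if their classes agree.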
# Check that classes match.
is_matching = pred_to_cls[mapping[:, 1]] == target_to_cls[mapping[:, 0]]
mapping = mapping[is_matching.detach().cpu().numpy()]
tp_mask = torch.zeros_like(conf, dtype=torch.bool)
tp_mask[mapping[:, 0], mapping[:, 1]] = True
# First ids correspond to "stuff" i.e. semantic seg.
# Instance ids are offset accordingly
for target_id, pred_id in mapping:
cls_id = pred_to_cls[pred_id]
self.temporally_consistent = True # hard coding !
if self.temporally_consistent and cls_id == self.vehicles_id:
if target_id.item() in unique_id_mapping and unique_id_mapping[target_id.item()] != pred_id.item():
# Not temporally consistent
result['false_negative'][target_to_cls[target_id]] += 1
result['false_positive'][pred_to_cls[pred_id]] += 1
unique_id_mapping[target_id.item()] = pred_id.item()
continue
result['true_positive'][cls_id] += 1
result['iou'][cls_id] += iou[target_id][pred_id]
unique_id_mapping[target_id.item()] = pred_id.item()
for target_id in range(n_classes, n_all_things):
# If this is a true positive do nothing.
if tp_mask[target_id, n_classes:].any():
continue
# If this target instance didn't match any prediction but was present, count it as a false negative.
if target_to_cls[target_id] != -1:
result['false_negative'][target_to_cls[target_id]] += 1
for pred_id in range(n_classes, n_all_things):
# If this is a true positive do nothing.
if tp_mask[n_classes:, pred_id].any():
continue
# If this predicted instance didn't match any target and actually contains voxels, count it as a false positive.
if pred_to_cls[pred_id] != -1 and (conf[:, pred_id] > 0).any():
result['false_positive'][pred_to_cls[pred_id]] += 1
return result
def evaluate_instance_prediction(self, pred_seg, pred_flow, gt_seg, gt_instance):
B, H, W, D = gt_seg.shape
pred_consistent_instance_seg = self.predict_instance_segmentation(pred_seg, pred_flow)
# add one feature dimension for interpolate
pred_consistent_instance_seg = F.interpolate(pred_consistent_instance_seg.float(), size=[H, W, D], mode='nearest').contiguous()
pred_consistent_instance_seg = pred_consistent_instance_seg.squeeze(1) # [6,512,512,40]
iou = 0
true_positive = 0
false_positive = 0
false_negative = 0
# starting from the present frame
pred_instance = pred_consistent_instance_seg[self.eval_start_moment:]
gt_instance = gt_instance[self.eval_start_moment:].long()
assert gt_instance.min() == 0, 'ID 0 of gt_instance must be background'
pred_segmentation = (pred_instance > 0).long()
gt_segmentation = (gt_instance > 0).long()
unique_id_mapping = {}
for t in range(pred_segmentation.shape[0]):
result = self.panoptic_metrics(
pred_segmentation[t].detach(),
pred_instance[t].detach(),
gt_segmentation[t],
gt_instance[t],
unique_id_mapping,
)
iou += result['iou']
true_positive += result['true_positive']
false_positive += result['false_positive']
false_negative += result['false_negative']
denominator = torch.maximum(
(true_positive + false_positive / 2 + false_negative / 2),
torch.ones_like(true_positive)
)
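# Panoptic-quality style metric: PQ = sum of matched IoUs / (TP + 0.5*FP + 0.5*FN),
# accumulated over frames for VPQ.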
pq = iou / denominator
return pq.cpu().numpy()
def forward_dummy(self,
points=None,
img_metas=None,
img_inputs=None,
points_occ=None,
**kwargs,
):
voxel_feats, flow_feats, img_feats, depth = self.extract_feat(img=img_inputs, img_metas=img_metas)
transform = img_inputs[1:8] if img_inputs is not None else None
output = self.pts_bbox_head(
voxel_feats=voxel_feats,
points=points_occ,
img_metas=img_metas,
img_feats=img_feats,
transform=transform,
)
return output
def fast_hist(pred, label, max_label=18):
pred = copy.deepcopy(pred.flatten())
label = copy.deepcopy(label.flatten())
bin_count = np.bincount(max_label * label.astype(int) + pred, minlength=max_label ** 2)
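# For the binary GMO setting (max_label=2) the flattened 2x2 histogram is
# [TN, FP, FN, TP], so iou_per_pred below is TP / (TP + FP + FN).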
iou_per_pred = (bin_count[-1]/(bin_count[-1]+bin_count[1]+bin_count[2]))
return bin_count[:max_label ** 2].reshape(max_label, max_label),iou_per_pred
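# A minimal sketch (not part of the pipeline) of how the accumulated histogram
# could be reduced to a per-class IoU, assuming max_label=2 and numpy arrays in
# pred_list / gt_list:
#   hist = np.zeros((2, 2))
#   for pred, gt in zip(pred_list, gt_list):
#       h, _ = fast_hist(pred, gt, max_label=2)
#       hist += h
#   iou_per_class = np.diag(hist) / (hist.sum(0) + hist.sum(1) - np.diag(hist) + 1e-9)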
================================================
FILE: projects/occ_plugin/occupancy/fuser/__init__.py
================================================
from .addfuse import AddFuser
from .visfuse import VisFuser
from .convfuse import ConvFuser
================================================
FILE: projects/occ_plugin/occupancy/fuser/addfuse.py
================================================
import random
from typing import List
import torch
from torch import nn
from mmdet3d.models.builder import FUSION_LAYERS
@FUSION_LAYERS.register_module()
class AddFuser(nn.Module):
def __init__(self, in_channels, out_channels, dropout, input_modality=None) -> None:
super().__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.dropout = dropout
if input_modality is None:
input_modality = dict(
use_lidar=True,
use_camera=True,
use_radar=False,
use_map=False,
use_external=False)
self.use_lidar = input_modality['use_lidar']
self.use_img = input_modality['use_camera']
if self.use_img:
self.img_enc = nn.Sequential(
nn.Conv3d(in_channels, out_channels, 3, padding=1, bias=False),
nn.BatchNorm3d(out_channels),
nn.ReLU(True),
)
if self.use_lidar:
self.pts_enc = nn.Sequential(
nn.Conv3d(in_channels, out_channels, 3, padding=1, bias=False),
nn.BatchNorm3d(out_channels),
nn.ReLU(True),
)
def forward(self, img_voxel_feats, pts_voxel_feats):
features = []
if self.use_img:
img_voxel_feats = self.img_enc(img_voxel_feats)
features.append(img_voxel_feats)
if self.use_lidar:
pts_voxel_feats = self.pts_enc(pts_voxel_feats)
features.append(pts_voxel_feats)
weights = [1] * len(features)
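# Modality dropout: with probability `dropout`, zero out one modality during
# training so the fuser does not over-rely on a single sensor.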
if self.training and random.random() < self.dropout:
index = random.randint(0, len(features) - 1)
weights[index] = 0
return sum(w * f for w, f in zip(weights, features)) / sum(weights)
================================================
FILE: projects/occ_plugin/occupancy/fuser/convfuse.py
================================================
import random
from typing import List
import torch
from torch import nn
from mmdet3d.models.builder import FUSION_LAYERS
@FUSION_LAYERS.register_module()
class ConvFuser(nn.Module):
def __init__(self, in_channels, out_channels) -> None:
super().__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.occ_enc = nn.Sequential(
nn.Conv3d(in_channels*2, out_channels, 3, padding=1, bias=False),
nn.BatchNorm3d(out_channels),
nn.ReLU(True),
)
def forward(self, img_voxel_feats, pts_voxel_feats):
return self.occ_enc(torch.cat([img_voxel_feats, pts_voxel_feats], dim=1))
================================================
FILE: projects/occ_plugin/occupancy/fuser/visfuse.py
================================================
import random
from typing import List
import torch
from torch import nn
import torch.nn.functional as F
from mmdet3d.models.builder import FUSION_LAYERS
from mmcv.cnn import build_norm_layer
@FUSION_LAYERS.register_module()
class VisFuser(nn.Module):
def __init__(self, in_channels, out_channels, norm_cfg=None) -> None:
super().__init__()
self.in_channels = in_channels
self.out_channels = out_channels
if norm_cfg is None:
norm_cfg = dict(type='BN3d', eps=1e-3, momentum=0.01)
self.img_enc = nn.Sequential(
nn.Conv3d(in_channels, out_channels, 7, padding=3, bias=False),
build_norm_layer(norm_cfg, out_channels)[1],
# nn.BatchNorm3d(out_channels),
nn.ReLU(True),
)
self.pts_enc = nn.Sequential(
nn.Conv3d(in_channels, out_channels, 7, padding=3, bias=False),
build_norm_layer(norm_cfg, out_channels)[1],
# nn.BatchNorm3d(out_channels),
nn.ReLU(True),
)
self.vis_enc = nn.Sequential(
nn.Conv3d(2*out_channels, 16, 3, padding=1, bias=False),
build_norm_layer(norm_cfg, 16)[1],
# nn.BatchNorm3d(16),
nn.ReLU(True),
nn.Conv3d(16, 1, 1, padding=0, bias=False),
nn.Sigmoid(),
)
def forward(self, img_voxel_feats, pts_voxel_feats):
img_voxel_feats = self.img_enc(img_voxel_feats)
pts_voxel_feats = self.pts_enc(pts_voxel_feats)
vis_weight = self.vis_enc(torch.cat([img_voxel_feats, pts_voxel_feats], dim=1))
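# Predict a per-voxel visibility weight in [0, 1] and use it to blend camera
# and LiDAR voxel features.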
voxel_feats = vis_weight * img_voxel_feats + (1 - vis_weight) * pts_voxel_feats
return voxel_feats
================================================
FILE: projects/occ_plugin/occupancy/image2bev/ViewTransformerLSSBEVDepth.py
================================================
# Copyright (c) Phigent Robotics. All rights reserved.
import math
import torch
import torch.nn as nn
from mmcv.runner import BaseModule
from mmdet3d.models.builder import NECKS
from projects.occ_plugin.ops.occ_pooling import occ_pool
from mmcv.cnn import build_conv_layer, build_norm_layer
from mmcv.runner import force_fp32
from torch.cuda.amp.autocast_mode import autocast
from mmdet.models.backbones.resnet import BasicBlock
import torch.nn.functional as F
from torch.utils.checkpoint import checkpoint
from scipy.special import erf
from scipy.stats import norm
import numpy as np
import copy
import pdb
def gen_dx_bx(xbound, ybound, zbound):
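# For each axis given as [min, max, step], return the cell size (dx), the center
# of the first cell (bx), and the number of cells (nx).
# e.g. gen_dx_bx([-51.2, 51.2, 0.8], [-51.2, 51.2, 0.8], [-10.0, 10.0, 20.0])
# gives dx = (0.8, 0.8, 20.0), bx = (-50.8, -50.8, 0.0), nx = (128, 128, 1).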
dx = torch.Tensor([row[2] for row in [xbound, ybound, zbound]])
bx = torch.Tensor([row[0] + row[2]/2.0 for row in [xbound, ybound, zbound]])
nx = torch.Tensor([(row[1] - row[0]) / row[2] for row in [xbound, ybound, zbound]])
return dx, bx, nx
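# The "cumsum trick" from Lift-Splat-Shoot: after sorting points by voxel rank,
# a cumulative sum followed by differencing at rank boundaries yields the
# per-voxel feature sum without an explicit scatter-add; QuickCumsum below
# implements the same trick with a custom backward pass.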
def cumsum_trick(x, geom_feats, ranks):
x = x.cumsum(0)
kept = torch.ones(x.shape[0], device=x.device, dtype=torch.bool)
kept[:-1] = (ranks[1:] != ranks[:-1])
x, geom_feats = x[kept], geom_feats[kept]
x = torch.cat((x[:1], x[1:] - x[:-1]))
return x, geom_feats
class QuickCumsum(torch.autograd.Function):
@staticmethod
def forward(ctx, x, geom_feats, ranks):
x = x.cumsum(0)
kept = torch.ones(x.shape[0], device=x.device, dtype=torch.bool)
kept[:-1] = (ranks[1:] != ranks[:-1])
x, geom_feats = x[kept], geom_feats[kept]
x = torch.cat((x[:1], x[1:] - x[:-1]))
# save kept for backward
ctx.save_for_backward(kept)
# no gradient for geom_feats
ctx.mark_non_differentiable(geom_feats)
return x, geom_feats
@staticmethod
def backward(ctx, gradx, gradgeom):
kept, = ctx.saved_tensors
back = torch.cumsum(kept, 0)
back[kept] -= 1
val = gradx[back]
return val, None, None
class ViewTransformerLiftSplatShoot(BaseModule):
def __init__(self, grid_config=None, data_config=None,
numC_input=512, numC_Trans=64, downsample=16,
accelerate=False, use_bev_pool=True, vp_megvii=False,
vp_stero=False, **kwargs):
super(ViewTransformerLiftSplatShoot, self).__init__()
if grid_config is None:
grid_config = {
'xbound': [-51.2, 51.2, 0.8],
'ybound': [-51.2, 51.2, 0.8],
'zbound': [-10.0, 10.0, 20.0],
'dbound': [1.0, 60.0, 1.0],}
self.grid_config = grid_config
dx, bx, nx = gen_dx_bx(self.grid_config['xbound'],
self.grid_config['ybound'],
self.grid_config['zbound'],
)
self.dx = nn.Parameter(dx, requires_grad=False)
self.bx = nn.Parameter(bx, requires_grad=False)
self.nx = nn.Parameter(nx, requires_grad=False)
if data_config is None:
data_config = {'input_size': (256, 704)}
self.data_config = data_config
self.downsample = downsample
self.frustum = self.create_frustum() # D x H x W x 3
self.D, _, _, _ = self.frustum.shape
self.numC_input = numC_input
self.numC_Trans = numC_Trans
self.depth_net = nn.Conv2d(self.numC_input, self.D + self.numC_Trans, kernel_size=1, padding=0)
self.geom_feats = None
self.accelerate = accelerate
self.use_bev_pool = use_bev_pool
self.vp_megvii = vp_megvii
self.vp_stereo = vp_stero
def get_depth_dist(self, x):
return x.softmax(dim=1)
def create_frustum(self):
# make grid in image plane
ogfH, ogfW = self.data_config['input_size']
fH, fW = ogfH // self.downsample, ogfW // self.downsample
ds = torch.arange(*self.grid_config['dbound'], dtype=torch.float).view(-1, 1, 1).expand(-1, fH, fW) # dbound=[2.0, 58.0, 0.5]
D, _, _ = ds.shape
xs = torch.linspace(0, ogfW - 1, fW, dtype=torch.float).view(1, 1, fW).expand(D, fH, fW)
ys = torch.linspace(0, ogfH - 1, fH, dtype=torch.float).view(1, fH, 1).expand(D, fH, fW)
# D x H x W x 3
frustum = torch.stack((xs, ys, ds), -1)
return nn.Parameter(frustum, requires_grad=False)
def get_geometry(self, rots_seq, trans_seq, intrins_seq, post_rots_seq, post_trans_seq, bda):
"""Determine the (x,y,z) locations (in the ego frame)
of the points in the point cloud.
Returns B x N x D x H/downsample x W/downsample x 3
"""
B, N, _ = trans_seq.shape
# undo post-transformation
# B x N x D x H x W x 3
points = self.frustum - post_trans_seq.view(B, N, 1, 1, 1, 3)
points = torch.inverse(post_rots_seq).view(B, N, 1, 1, 1, 3, 3).matmul(points.unsqueeze(-1))
# cam_to_ego
points = torch.cat((points[:, :, :, :, :, :2] * points[:, :, :, :, :, 2:3],
points[:, :, :, :, :, 2:3]
), 5)
if intrins_seq.shape[3] == 4:
shift = intrins_seq[:, :, :3, 3]
points = points - shift.view(B, N, 1, 1, 1, 3, 1)
intrins_seq = intrins_seq[:, :, :3, :3]
combine = rots_seq.matmul(torch.inverse(intrins_seq))
points = combine.view(B, N, 1, 1, 1, 3, 3).matmul(points).squeeze(-1)
points += trans_seq.view(B, N, 1, 1, 1, 3)
return points
def voxel_pooling(self, geom_feats, x):
B, N, D, H, W, C = x.shape
Nprime = B * N * D * H * W
nx = self.nx.to(torch.long)
# flatten x
x = x.reshape(Nprime, C)
# flatten indices
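# Quantize ego-frame xyz into integer voxel indices: subtract the grid origin
# (bx - dx/2) and divide by the cell size dx.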
geom_feats = ((geom_feats - (self.bx - self.dx / 2.)) / self.dx).long()
geom_feats = geom_feats.view(Nprime, 3)
batch_ix = torch.cat([torch.full([Nprime // B, 1], ix,
device=x.device, dtype=torch.long) for ix in range(B)])
geom_feats = torch.cat((geom_feats, batch_ix), 1)
# filter out points that are outside box
kept = (geom_feats[:, 0] >= 0) & (geom_feats[:, 0] < self.nx[0]) \
& (geom_feats[:, 1] >= 0) & (geom_feats[:, 1] < self.nx[1]) \
& (geom_feats[:, 2] >= 0) & (geom_feats[:, 2] < self.nx[2])
x = x[kept]
geom_feats = geom_feats[kept]
if self.use_bev_pool:
final = occ_pool(x, geom_feats, B, self.nx[2], self.nx[0],
self.nx[1])
final = final.transpose(dim0=-2, dim1=-1)
else:
# get tensors from the same voxel next to each other
ranks = geom_feats[:, 0] * (self.nx[1] * self.nx[2] * B) \
+ geom_feats[:, 1] * (self.nx[2] * B) \
+ geom_feats[:, 2] * B \
+ geom_feats[:, 3]
sorts = ranks.argsort()
x, geom_feats, ranks = x[sorts], geom_feats[sorts], ranks[sorts]
# cumsum trick
x, geom_feats = QuickCumsum.apply(x, geom_feats, ranks)
# griddify (B x C x Z x X x Y)
final = torch.zeros((B, C, nx[2], nx[1], nx[0]), device=x.device)
final[geom_feats[:, 3], :, geom_feats[:, 2], geom_feats[:, 1], geom_feats[:, 0]] = x
# collapse Z
final = torch.cat(final.unbind(dim=2), 1)
return final
def voxel_pooling_accelerated(self, rots, trans, intrins, post_rots, post_trans, bda, x):
B, N, D, H, W, C = x.shape
Nprime = B * N * D * H * W
nx = self.nx.to(torch.long)
# flatten x
x = x.reshape(Nprime, C)
max = 300
# flatten indices
if self.geom_feats is None:
geom_feats = self.get_geometry(rots, trans, intrins,
post_rots, post_trans, bda)
geom_feats = ((geom_feats - (self.bx - self.dx / 2.)) /
self.dx).long()
geom_feats = geom_feats.view(Nprime, 3)
batch_ix = torch.cat([torch.full([Nprime // B, 1], ix,
device=x.device, dtype=torch.long)
for ix in range(B)])
geom_feats = torch.cat((geom_feats, batch_ix), 1)
# filter out points that are outside box
kept1 = (geom_feats[:, 0] >= 0) & (geom_feats[:, 0] < self.nx[0]) \
& (geom_feats[:, 1] >= 0) & (geom_feats[:, 1] < self.nx[1]) \
& (geom_feats[:, 2] >= 0) & (geom_feats[:, 2] < self.nx[2])
idx = torch.arange(x.shape[0], dtype=torch.long, device=x.device)
x = x[kept1]
idx = idx[kept1]
geom_feats = geom_feats[kept1]
# get tensors from the same voxel next to each other
ranks = geom_feats[:, 0] * (self.nx[1] * self.nx[2] * B) \
+ geom_feats[:, 1] * (self.nx[2] * B) \
+ geom_feats[:, 2] * B \
+ geom_feats[:, 3]
sorts = ranks.argsort()
x, geom_feats, ranks, idx = x[sorts], geom_feats[sorts], ranks[sorts], idx[sorts]
repeat_id = torch.ones(geom_feats.shape[0], device=geom_feats.device, dtype=geom_feats.dtype)
curr = 0
repeat_id[0] = 0
curr_rank = ranks[0]
for i in range(1, ranks.shape[0]):
if curr_rank == ranks[i]:
curr += 1
repeat_id[i] = curr
else:
curr_rank = ranks[i]
curr = 0
repeat_id[i] = curr
kept2 = repeat_id < max
repeat_id, geom_feats, x, idx = repeat_id[kept2], geom_feats[kept2], x[kept2], idx[kept2]
geom_feats = torch.cat([geom_feats,
repeat_id.unsqueeze(-1)], dim=-1)
self.geom_feats = geom_feats
self.idx = idx
else:
geom_feats = self.geom_feats
idx = self.idx
x = x[idx]
# griddify (B x C x Z x X x Y)
final = torch.zeros((B, C, nx[2], nx[1], nx[0], max), device=x.device)
final[geom_feats[:, 3], :, geom_feats[:, 2], geom_feats[:, 1],
geom_feats[:, 0], geom_feats[:, 4]] = x
final = final.sum(-1)
# collapse Z
final = torch.cat(final.unbind(dim=2), 1)
return final
def voxel_pooling_bevdepth(self, geom_feats, x):
nx = self.nx.to(torch.long)
geom_feats = ((geom_feats - (self.bx - self.dx / 2.)) / self.dx).int()
# FIXME
# final = voxel_pooling(geom_feats, x.contiguous(), nx)
final = self.voxel_pooling(geom_feats, x.contiguous(), nx)
return final
def forward(self, input):
x, rots, trans, intrins, post_rots, post_trans, bda = input
B, N, C, H, W = x.shape
x = x.view(B * N, C, H, W)
x = self.depth_net(x)
depth = self.get_depth_dist(x[:, :self.D])
img_feat = x[:, self.D:(self.D + self.numC_Trans)]
# Lift
volume = depth.unsqueeze(1) * img_feat.unsqueeze(2)
volume = volume.view(B, N, self.numC_Trans, self.D, H, W)
volume = volume.permute(0, 1, 3, 4, 5, 2)
# Splat
if self.accelerate:
bev_feat = self.voxel_pooling_accelerated(rots, trans, intrins,
post_rots, post_trans,
bda, volume)
else:
geom = self.get_geometry(rots, trans, intrins,
post_rots, post_trans, bda)
if self.vp_megvii:
bev_feat = self.voxel_pooling_bevdepth(geom, volume)
else:
bev_feat = self.voxel_pooling(geom, volume)
return bev_feat
class _ASPPModule(nn.Module):
def __init__(self, inplanes, planes, kernel_size, padding, dilation,
BatchNorm):
super(_ASPPModule, self).__init__()
self.atrous_conv = nn.Conv2d(inplanes,
planes,
kernel_size=kernel_size,
stride=1,
padding=padding,
dilation=dilation,
bias=False)
self.bn = BatchNorm
self.relu = nn.ReLU()
self._init_weight()
def forward(self, x):
x = self.atrous_conv(x)
x = self.bn(x)
return self.relu(x)
def _init_weight(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
torch.nn.init.kaiming_normal_(m.weight)
elif isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
class ASPP(nn.Module):
def __init__(self, inplanes, mid_channels=256, norm_cfg=dict(type='BN2d')):
super(ASPP, self).__init__()
dilations = [1, 6, 12, 18]
self.aspp1 = _ASPPModule(inplanes,
mid_channels,
1,
padding=0,
dilation=dilations[0],
BatchNorm=build_norm_layer(norm_cfg, mid_channels)[1])
self.aspp2 = _ASPPModule(inplanes,
mid_channels,
3,
padding=dilations[1],
dilation=dilations[1],
BatchNorm=build_norm_layer(norm_cfg, mid_channels)[1])
self.aspp3 = _ASPPModule(inplanes,
mid_channels,
3,
padding=dilations[2],
dilation=dilations[2],
BatchNorm=build_norm_layer(norm_cfg, mid_channels)[1])
self.aspp4 = _ASPPModule(inplanes,
mid_channels,
3,
padding=dilations[3],
dilation=dilations[3],
BatchNorm=build_norm_layer(norm_cfg, mid_channels)[1])
self.global_avg_pool = nn.Sequential(
nn.AdaptiveAvgPool2d((1, 1)),
nn.Conv2d(inplanes, mid_channels, 1, stride=1, bias=False),
build_norm_layer(norm_cfg, mid_channels)[1],
nn.ReLU(),
)
self.conv1 = nn.Conv2d(int(mid_channels * 5),
mid_channels,
1,
bias=False)
self.bn1 = build_norm_layer(norm_cfg, mid_channels)[1]
self.relu = nn.ReLU()
self.dropout = nn.Dropout(0.5)
self._init_weight()
def forward(self, x):
x1 = self.aspp1(x)
x2 = self.aspp2(x)
x3 = self.aspp3(x)
x4 = self.aspp4(x)
x5 = self.global_avg_pool(x)
x5 = F.interpolate(x5,
size=x4.size()[2:],
mode='bilinear',
align_corners=True)
x = torch.cat((x1, x2, x3, x4, x5), dim=1)
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
return self.dropout(x)
def _init_weight(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
torch.nn.init.kaiming_normal_(m.weight)
elif isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
class Mlp(nn.Module):
def __init__(self,
in_features,
hidden_features=None,
out_features=None,
act_layer=nn.ReLU,
drop=0.0):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.drop1 = nn.Dropout(drop)
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop2 = nn.Dropout(drop)
def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.drop1(x)
x = self.fc2(x)
x = self.drop2(x)
return x
class SELayer(nn.Module):
def __init__(self, channels, act_layer=nn.ReLU, gate_layer=nn.Sigmoid):
super().__init__()
self.conv_reduce = nn.Conv2d(channels, channels, 1, bias=True)
self.act1 = act_layer()
self.conv_expand = nn.Conv2d(channels, channels, 1, bias=True)
self.gate = gate_layer()
def forward(self, x, x_se):
x_se = self.conv_reduce(x_se)
x_se = self.act1(x_se)
x_se = self.conv_expand(x_se)
return x * self.gate(x_se)
class DepthNet(nn.Module):
def __init__(self, in_channels, mid_channels, context_channels,
depth_channels, cam_channels=27, norm_cfg=None):
super(DepthNet, self).__init__()
self.reduce_conv = nn.Sequential(
nn.Conv2d(in_channels,
mid_channels,
kernel_size=3,
stride=1,
padding=1),
build_norm_layer(norm_cfg, mid_channels)[1],
nn.ReLU(inplace=True),
)
self.context_conv = nn.Conv2d(mid_channels,
context_channels,
kernel_size=1,
stride=1,
padding=0)
self.bn = build_norm_layer(dict(type='GN', num_groups=9, requires_grad=True), cam_channels)[1]
self.depth_mlp = Mlp(cam_channels, mid_channels, mid_channels)
self.depth_se = SELayer(mid_channels) # NOTE: add camera-aware
self.context_mlp = Mlp(cam_channels, mid_channels, mid_channels)
self.context_se = SELayer(mid_channels) # NOTE: add camera-aware
self.depth_conv = nn.Sequential(
BasicBlock(mid_channels, mid_channels, norm_cfg=norm_cfg),
BasicBlock(mid_channels, mid_channels, norm_cfg=norm_cfg),
BasicBlock(mid_channels, mid_channels, norm_cfg=norm_cfg),
ASPP(mid_channels, mid_channels, norm_cfg=norm_cfg),
build_conv_layer(cfg=dict(
type='DCN',
in_channels=mid_channels,
out_channels=mid_channels,
kernel_size=3,
padding=1,
groups=4,
im2col_step=128,
)),
nn.Conv2d(mid_channels,
depth_channels,
kernel_size=1,
stride=1,
padding=0),
)
def forward(self, x, mlp_input):
mlp_input = self.bn(mlp_input.reshape(-1, mlp_input.shape[-1]))
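# The normalized camera parameters modulate the image features through the SE
# layers below, making both the context and the depth prediction camera-aware.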
x = self.reduce_conv(x)
context_se = self.context_mlp(mlp_input)[..., None, None]
context = self.context_se(x, context_se)
context = self.context_conv(context)
depth_se = self.depth_mlp(mlp_input)[..., None, None]
depth = self.depth_se(x, depth_se)
depth = self.depth_conv(depth)
return torch.cat([depth, context], dim=1)
class DepthAggregation(nn.Module):
"""
pixel cloud feature extraction
"""
def __init__(self, in_channels, mid_channels, out_channels, norm_cfg):
super(DepthAggregation, self).__init__()
self.reduce_conv = nn.Sequential(
nn.Conv2d(in_channels,
mid_channels,
kernel_size=3,
stride=1,
padding=1,
bias=False),
build_norm_layer(norm_cfg, mid_channels)[1],
nn.ReLU(inplace=True),
)
self.conv = nn.Sequential(
nn.Conv2d(mid_channels,
mid_channels,
kernel_size=3,
stride=1,
padding=1,
bias=False),
build_norm_layer(norm_cfg, mid_channels)[1],
nn.ReLU(inplace=True),
nn.Conv2d(mid_channels,
mid_channels,
kernel_size=3,
stride=1,
padding=1,
bias=False),
build_norm_layer(norm_cfg, mid_channels)[1],
nn.ReLU(inplace=True),
)
self.out_conv = nn.Sequential(
nn.Conv2d(mid_channels,
out_channels,
kernel_size=3,
stride=1,
padding=1,
bias=True),
# nn.BatchNorm3d(out_channels),
# nn.ReLU(inplace=True),
)
@autocast(False)
def forward(self, x):
x = checkpoint(self.reduce_conv, x)
short_cut = x
x = checkpoint(self.conv, x)
x = short_cut + x
x = self.out_conv(x)
return x
@NECKS.register_module()
class ViewTransformerLSSBEVDepth(ViewTransformerLiftSplatShoot):
def __init__(self, loss_depth_weight, cam_channels=27, loss_depth_reg_weight=0.0, use_voxel_net=False,
norm_cfg=dict(type='BN2d', eps=1e-3, momentum=0.01), **kwargs):
super(ViewTransformerLSSBEVDepth, self).__init__(**kwargs)
self.loss_depth_weight = loss_depth_weight
self.loss_depth_reg_weight = loss_depth_reg_weight
self.cam_channels = cam_channels
self.depth_net = DepthNet(self.numC_input, self.numC_input,
self.numC_Trans, self.D, cam_channels=self.cam_channels,
norm_cfg=norm_cfg)
self.depth_aggregation_net = DepthAggregation(self.numC_Trans,
self.numC_Trans,
self.numC_Trans,
norm_cfg=norm_cfg) if use_voxel_net else None
def _forward_voxel_net(self, img_feat_with_depth):
# BEVConv2D [n, c, d, h, w] -> [n, h, c, w, d]
if self.depth_aggregation_net is None:
return img_feat_with_depth
img_feat_with_depth = img_feat_with_depth.permute(
0, 3, 1, 4, 2).contiguous() # [n, c, d, h, w] -> [n, h, c, w, d]
n, h, c, w, d = img_feat_with_depth.shape
img_feat_with_depth = img_feat_with_depth.view(-1, c, w, d)
img_feat_with_depth = (
self.depth_aggregation_net(img_feat_with_depth).view(
n, h, c, w, d).permute(0, 2, 4, 1, 3).contiguous().float())
return img_feat_with_depth
def get_mlp_input(self, rot, tran, intrin, post_rot, post_tran, bda=None):
B,N,_,_ = rot.shape
if bda is None:
bda = torch.eye(3).to(rot).view(1,3,3).repeat(B,1,1)
bda = bda.view(B,1,3,3).repeat(1,N,1,1)
if intrin.shape[-1] == 4:
# for KITTI, the intrin matrix is 3x4
mlp_input = torch.stack([
intrin[:, :, 0, 0],
intrin[:, :, 1, 1],
intrin[:, :, 0, 2],
intrin[:, :, 1, 2],
intrin[:, :, 0, 3],
intrin[:, :, 1, 3],
intrin[:, :, 2, 3],
post_rot[:, :, 0, 0],
post_rot[:, :, 0, 1],
post_tran[:, :, 0],
post_rot[:, :, 1, 0],
post_rot[:, :, 1, 1],
post_tran[:, :, 1],
bda[:, :, 0, 0],
bda[:, :, 0, 1],
bda[:, :, 1, 0],
bda[:, :, 1, 1],
bda[:, :, 2, 2],
], dim=-1)
else:
mlp_input = torch.stack([
intrin[:, :, 0, 0],
intrin[:, :, 1, 1],
intrin[:, :, 0, 2],
intrin[:, :, 1, 2],
post_rot[:, :, 0, 0],
post_rot[:, :, 0, 1],
post_tran[:, :, 0],
post_rot[:, :, 1, 0],
post_rot[:, :, 1, 1],
post_tran[:, :, 1],
bda[:, :, 0, 0],
bda[:, :, 0, 1],
bda[:, :, 1, 0],
bda[:, :, 1, 1],
bda[:, :, 2, 2],
], dim=-1)
sensor2ego = torch.cat([rot, tran.reshape(B, N, 3, 1)], dim=-1).reshape(B, N, -1)
mlp_input = torch.cat([mlp_input, sensor2ego], dim=-1)
return mlp_input
def get_downsampled_gt_depth(self, gt_depths):
"""
Input:
gt_depths: [B, N, H, W]
Output:
gt_depths: [B*N*h*w, d]
"""
B, N, H, W = gt_depths.shape
gt_depths = gt_depths.view(B * N,
H // self.downsample, self.downsample,
W // self.downsample, self.downsample, 1)
gt_depths = gt_depths.permute(0, 1, 3, 5, 2, 4).contiguous()
gt_depths = gt_depths.view(-1, self.downsample * self.downsample)
gt_depths_tmp = torch.where(gt_depths == 0.0, 1e5 * torch.ones_like(gt_depths), gt_depths)
gt_depths = torch.min(gt_depths_tmp, dim=-1).values
gt_depths = gt_depths.view(B * N, H // self.downsample, W // self.downsample)
# [min - step / 2, min + step / 2] creates min depth
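# Map metric depth to a discrete bin index; pixels with no depth or with
# out-of-range depth end up in bin 0, which the [:, 1:] slice after one-hot
# encoding discards.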
gt_depths = (gt_depths - (self.grid_config['dbound'][0] - self.grid_config['dbound'][2] / 2)) / self.grid_config['dbound'][2]
gt_depths = torch.where((gt_depths < self.D + 1) & (gt_depths >= 0.0), gt_depths, torch.zeros_like(gt_depths))
gt_depths = F.one_hot(gt_depths.long(), num_classes=self.D + 1).view(-1, self.D + 1)[:, 1:]
return gt_depths.float()
def _prepare_depth_gt(self, gt_depths):
"""
Input:
gt_depths: [B, N, H, W]
Output:
gt_depths: [B*N*H*W, d]
"""
gt_depths = (gt_depths - (self.grid_config['dbound'][0] -
self.grid_config['dbound'][2])) / \
self.grid_config['dbound'][2]
gt_depths = torch.where((gt_depths < self.D + 1) & (gt_depths >= 0.0),
gt_depths, torch.zeros_like(gt_depths))
gt_depths = F.one_hot(gt_depths.long(),
num_classes=self.D + 1).view(-1,
self.D + 1)[:, 1:]
return gt_depths.float()
@force_fp32()
def get_depth_reg_loss(self, depth_labels, depth_preds):
depth_labels = self.get_downsampled_gt_depth(depth_labels)
# depth_labels = self._prepare_depth_gt(depth_labels)
depth_preds = depth_preds.permute(0, 2, 3, 1).contiguous().view(-1, self.D)
# foreground predictions & labels
fg_mask = torch.max(depth_labels, dim=1).values > 0.0
depth_labels = depth_labels[fg_mask]
depth_preds = depth_preds[fg_mask]
# cls_targets ==> reg_targets
ds = torch.arange(*self.grid_config['dbound'], dtype=torch.float).view(1, -1).type_as(depth_preds)
depth_reg_labels = torch.sum(depth_labels * ds, dim=1)
depth_reg_preds = torch.sum(depth_preds * ds, dim=1)
with autocast(enabled=False):
loss_depth = F.smooth_l1_loss(depth_reg_preds, depth_reg_labels, reduction='mean')
return self.loss_depth_reg_weight * loss_depth
@force_fp32()
def get_depth_loss(self, depth_labels, depth_preds):
depth_labels = self.get_downsampled_gt_depth(depth_labels)
# depth_labels = self._prepare_depth_gt(depth_labels)
depth_preds = depth_preds.permute(0, 2, 3, 1).contiguous().view(
-1, self.D)
fg_mask = torch.max(depth_labels, dim=1).values > 0.0
depth_labels = depth_labels[fg_mask]
depth_preds = depth_preds[fg_mask]
with autocast(enabled=False):
depth_loss = F.binary_cross_entropy(
depth_preds,
depth_labels,
reduction='none',
).sum() / max(1.0, fg_mask.sum())
return self.loss_depth_weight * depth_loss
def forward(self, input):
(x, rots, trans, intrins, post_rots, post_trans, bda, mlp_input) = input[:8]
B, N, C, H, W = x.shape
x = x.view(B * N, C, H, W)
x = self.depth_net(x, mlp_input)
depth_digit = x[:, :self.D, ...]
img_feat = x[:, self.D:self.D+self.numC_Trans, ...]
depth_prob = self.get_depth_dist(depth_digit)
# Lift
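# Outer product of the per-pixel depth distribution and the image context
# features builds the camera-frustum feature volume.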
volume = depth_prob.unsqueeze(1) * img_feat.unsqueeze(2)
volume = self._forward_voxel_net(volume)
volume = volume.view(B, N, self.numC_Trans, self.D, H, W)
volume = volume.permute(0, 1, 3, 4, 5, 2)
# Splat
if self.accelerate:
bev_feat = self.voxel_pooling_accelerated(rots, trans, intrins,
post_rots, post_trans,
bda, volume)
else:
geom = self.get_geometry(rots, trans, intrins,
post_rots, post_trans, bda)
if self.vp_megvii:
bev_feat = self.voxel_pooling_bevdepth(geom, volume)
else:
bev_feat = self.voxel_pooling(geom, volume)
return bev_feat, depth_prob
class ConvBnReLU3D(nn.Module):
"""Implements of 3d convolution + batch normalization + ReLU."""
def __init__(
self,
in_channels: int,
out_channels: int,
kernel_size: int = 3,
stride: int = 1,
pad: int = 1,
dilation: int = 1,
) -> None:
"""initialization method for convolution3D + batch normalization + relu module
Args:
in_channels: input channel number of convolution layer
out_channels: output channel number of convolution layer
kernel_size: kernel size of convolution layer
stride: stride of convolution layer
pad: pad of convolution layer
dilation: dilation of convolution layer
"""
super(ConvBnReLU3D, self).__init__()
self.conv = nn.Conv3d(in_channels,
out_channels,
kernel_size,
stride=stride,
padding=pad,
dilation=dilation,
bias=False)
self.bn = nn.BatchNorm3d(out_channels)
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""forward method"""
return F.relu(self.bn(self.conv(x)), inplace=True)
class DepthNetStereo(nn.Module):
def __init__(self,
in_channels,
mid_channels,
context_channels,
depth_channels,
d_bound,
num_ranges=4,
norm_cfg=dict(type='BN', requires_grad=True)):
super(DepthNetStereo, self).__init__()
self.reduce_conv = nn.Sequential(
nn.Conv2d(in_channels,
mid_channels,
kernel_size=3,
stride=1,
padding=1),
nn.BatchNorm2d(mid_channels),
nn.ReLU(inplace=True),
)
self.context_conv = nn.Conv2d(mid_channels,
context_channels,
kernel_size=1,
stride=1,
padding=0)
self.bn = nn.BatchNorm1d(27)
self.depth_mlp = Mlp(27, mid_channels, mid_channels)
self.depth_se = SELayer(mid_channels) # NOTE: add camera-aware
self.context_mlp = Mlp(27, mid_channels, mid_channels)
self.context_se = SELayer(mid_channels) # NOTE: add camera-aware
self.depth_feat_conv = nn.Sequential(
BasicBlock(mid_channels, mid_channels, norm_cfg=norm_cfg),
BasicBlock(mid_channels, mid_channels, norm_cfg=norm_cfg),
ASPP(mid_channels, mid_channels, norm_cfg=norm_cfg),
build_conv_layer(cfg=dict(
type='DCN',
in_channels=mid_channels,
out_channels=mid_channels,
kernel_size=3,
padding=1,
groups=4,
im2col_step=128,
)),
)
self.mu_sigma_range_net = nn.Sequential(
BasicBlock(mid_channels, mid_channels),
nn.ConvTranspose2d(mid_channels,
mid_channels,
3,
stride=2,
padding=1,
output_padding=1),
nn.BatchNorm2d(mid_channels),
nn.ReLU(inplace=True),
nn.ConvTranspose2d(mid_channels,
mid_channels,
3,
stride=2,
padding=1,
output_padding=1),
nn.BatchNorm2d(mid_channels),
nn.ReLU(inplace=True),
nn.Conv2d(mid_channels,
num_ranges * 3,
kernel_size=1,
stride=1,
padding=0),
)
self.mono_depth_net = nn.Sequential(
BasicBlock(mid_channels, mid_channels),
nn.Conv2d(mid_channels,
depth_channels,
kernel_size=1,
stride=1,
padding=0),
)
self.d_bound = d_bound
self.num_ranges = num_ranges
# @autocast(False)
def forward(self, x, mlp_input):
B, _, H, W = x.shape
mlp_input = self.bn(mlp_input.reshape(-1, mlp_input.shape[-1]))
x = self.reduce_conv(x)
context_se = self.context_mlp(mlp_input)[..., None, None]
context = self.context_se(x, context_se)
context = self.context_conv(context)
depth_se = self.depth_mlp(mlp_input)[..., None, None]
depth_feat = self.depth_se(x, depth_se)
depth_feat = checkpoint(self.depth_feat_conv, depth_feat)
mono_depth = checkpoint(self.mono_depth_net, depth_feat)
mu_sigma_score = checkpoint(self.mu_sigma_range_net, depth_feat)
mu = mu_sigma_score[:, 0:self.num_ranges, ...]
sigma = mu_sigma_score[:, self.num_ranges:2 * self.num_ranges, ...]
range_score = mu_sigma_score[:,
2 * self.num_ranges:3 * self.num_ranges,
...]
sigma = F.elu(sigma) + 1.0 + 1e-10
return x, context, mu, sigma, range_score, mono_depth
@NECKS.register_module()
class ViewTransformerLSSBEVStereo(ViewTransformerLSSBEVDepth):
def __init__(self, num_ranges=4, use_mask=True, em_iteration=3,
range_list=[[2, 8], [8, 16], [16, 28], [28, 58]],
sampling_range=3, num_samples=3,
k_list=None, min_sigma=1.0,
num_groups=8,
stereo_downsample_factor=4,
norm_cfg=dict(type='BN2d'), **kwargs):
super(ViewTransformerLSSBEVStereo, self).__init__(**kwargs)
self.num_ranges = num_ranges
self.depth_net = DepthNetStereo(self.numC_input, self.numC_input,
self.numC_Trans, self.D,
self.grid_config['dbound'],
self.num_ranges,
norm_cfg=norm_cfg)
self.context_downsample_net = nn.Identity()
self.use_mask = use_mask
self.stereo_downsample_factor = stereo_downsample_factor
self.num_ranges = num_ranges
self.min_sigma = min_sigma
self.sampling_range = sampling_range
self.num_samples = num_samples
self.num_groups=num_groups
self.similarity_net = nn.Sequential(
ConvBnReLU3D(in_channels=num_groups,
out_channels=16,
kernel_size=1,
stride=1,
pad=0),
ConvBnReLU3D(in_channels=16,
out_channels=8,
kernel_size=1,
stride=1,
pad=0),
nn.Conv3d(in_channels=8,
out_channels=1,
kernel_size=1,
stride=1,
padding=0),
)
self.depth_downsample_net = nn.Sequential(
nn.Conv2d(self.D, 256, 3, 2, 1),
nn.BatchNorm2d(256),
nn.ReLU(),
nn.Conv2d(256, 256, 3, 2, 1),
nn.BatchNorm2d(256),
nn.ReLU(),
nn.Conv2d(256, self.D, 1, 1, 0),
)
if range_list is None:
range_length = (self.grid_config['dbound'][1] -
self.grid_config['dbound'][0]) / num_ranges
self.range_list = [[
self.grid_config['dbound'][0] + range_length * i,
self.grid_config['dbound'][0] + range_length * (i + 1)
] for i in range(num_ranges)]
else:
assert len(range_list) == num_ranges
self.range_list = range_list
self.em_iteration = em_iteration
if k_list is None:
self.register_buffer('k_list', torch.Tensor(self.depth_sampling()))
else:
self.register_buffer('k_list', torch.Tensor(k_list))
if self.use_mask:
self.mask_net = nn.Sequential(
nn.Conv2d(self.D*2, 64, 3, 1, 1),
nn.BatchNorm2d(64),
nn.ReLU(inplace=True),
BasicBlock(64, 64),
BasicBlock(64, 64),
nn.Conv2d(64, 1, 1, 1, 0),
nn.Sigmoid(),
)
def depth_sampling(self):
"""Generate sampling range of candidates.
Returns:
list[float]: List of all candidates.
"""
P_total = erf(self.sampling_range /
np.sqrt(2)) # Probability covered by the sampling range
idx_list = np.arange(0, self.num_samples + 1)
p_list = (1 - P_total) / 2 + ((idx_list / self.num_samples) * P_total)
k_list = norm.ppf(p_list)
k_list = (k_list[1:] + k_list[:-1]) / 2
return list(k_list)
def create_depth_sample_frustum(self, depth_sample, downsample_factor=16):
"""Generate frustum"""
# make grid in image plane
ogfH, ogfW = self.data_config['input_size']
fH, fW = ogfH // downsample_factor, ogfW // downsample_factor
batch_size, num_depth, _, _ = depth_sample.shape
x_coords = (torch.linspace(0,
ogfW - 1,
fW,
dtype=torch.float,
device=depth_sample.device).view(
1, 1, 1,
fW).expand(batch_size, num_depth, fH,
fW))
y_coords = (torch.linspace(0,
ogfH - 1,
fH,
dtype=torch.float,
device=depth_sample.device).view(
1, 1, fH,
1).expand(batch_size, num_depth, fH,
fW))
paddings = torch.ones_like(depth_sample)
# D x H x W x 3
frustum = torch.stack((x_coords, y_coords, depth_sample, paddings), -1)
return frustum
def homo_warping(
self,
stereo_feat,
key_intrin_mats,
sweep_intrin_mats,
sensor2sensor_mats,
key_ida_mats,
sweep_ida_mats,
depth_sample,
frustum,
):
"""Used for mvs method to transfer sweep image feature to
key image feature.
Args:
stereo_feat (Tensor): stereo image feature of the sweep frame.
key_intrin_mats(Tensor): Intrin matrix for key sensor.
sweep_intrin_mats(Tensor): Intrin matrix for sweep sensor.
sensor2sensor_mats(Tensor): Transformation matrix from key
sensor to sweep sensor.
key_ida_mats(Tensor): Ida matrix for key frame.
sweep_ida_mats(Tensor): Ida matrix for sweep frame.
depth_sample (Tensor): Depth map of all candidates.
depth_sample_frustum (Tensor): Pre-generated frustum.
"""
batch_size_with_num_cams, channels = stereo_feat.shape[
0], stereo_feat.shape[1]
height, width = stereo_feat.shape[2], stereo_feat.shape[3]
with torch.no_grad():
points = frustum
points = points.reshape(points.shape[0], -1, points.shape[-1])
points[..., 2] = 1
# Undo ida for key frame.
points = key_ida_mats.reshape(batch_size_with_num_cams,
*key_ida_mats.shape[2:]).inverse(
).unsqueeze(1) @ points.unsqueeze(-1)
# Convert points from pixel coord to key camera coord.
points[..., :3, :] *= depth_sample.reshape(
batch_size_with_num_cams, -1, 1, 1)
num_depth = frustum.shape[1]
points = (key_intrin_mats.reshape(
batch_size_with_num_cams,
*key_intrin_mats.shape[2:]).inverse().unsqueeze(1) @ points)
points = (sensor2sensor_mats.reshape(
batch_size_with_num_cams,
*sensor2sensor_mats.shape[2:]).unsqueeze(1) @ points)
# points in sweep sensor coord.
points = (sweep_intrin_mats.reshape(
batch_size_with_num_cams,
*sweep_intrin_mats.shape[2:]).unsqueeze(1) @ points)
# points in sweep pixel coord.
points[..., :2, :] = points[..., :2, :] / points[
..., 2:3, :] # [B, 2, Ndepth, H*W]
points = (sweep_ida_mats.reshape(
batch_size_with_num_cams,
*sweep_ida_mats.shape[2:]).unsqueeze(1) @ points).squeeze(-1)
neg_mask = points[..., 2] < 1e-3
points[..., 0][neg_mask] = width * self.stereo_downsample_factor
points[..., 1][neg_mask] = height * self.stereo_downsample_factor
points[..., 2][neg_mask] = 1
proj_x_normalized = points[..., 0] / (
(width * self.stereo_downsample_factor - 1) / 2) - 1
proj_y_normalized = points[..., 1] / (
(height * self.stereo_downsample_factor - 1) / 2) - 1
grid = torch.stack([proj_x_normalized, proj_y_normalized],
dim=2) # [B, Ndepth, H*W, 2]
warped_stereo_fea = F.grid_sample(
stereo_feat,
grid.view(batch_size_with_num_cams, num_depth * height, width, 2),
mode='bilinear',
padding_mode='zeros',
)
warped_stereo_fea = warped_stereo_fea.view(batch_size_with_num_cams,
channels, num_depth, height,
width)
return warped_stereo_fea
def _forward_mask(
self,
sweep_index,
mono_depth_all_sweeps,
mats_dict,
depth_sample,
depth_sample_frustum,
sensor2sensor_mats,
):
"""Forward function to generate mask.
Args:
sweep_index (int): Index of sweep.
mono_depth_all_sweeps (list[Tensor]): List of mono_depth for
all sweeps.
mats_dict (dict):
sensor2ego_mats (Tensor): Transformation matrix from
camera to ego with shape of (B, num_sweeps,
num_cameras, 4, 4).
intrin_mats (Tensor): Intrinsic matrix with shape
of (B, num_sweeps, num_cameras, 4, 4).
ida_mats (Tensor): Transformation matrix for ida with
shape of (B, num_sweeps, num_cameras, 4, 4).
sensor2sensor_mats (Tensor): Transformation matrix
from key frame camera to sweep frame camera with
shape of (B, num_sweeps, num_cameras, 4, 4).
bda_mat (Tensor): Rotation matrix for bda with shape
of (B, 4, 4).
depth_sample (Tensor): Depth map of all candidates.
depth_sample_frustum (Tensor): Pre-generated frustum.
sensor2sensor_mats (Tensor): Transformation matrix from reference
sensor to source sensor.
Returns:
Tensor: Generated mask.
"""
num_sweeps = len(mono_depth_all_sweeps)
mask_all_sweeps = list()
for idx in range(num_sweeps):
if idx == sweep_index:
continue
warped_mono_depth = self.homo_warping(
mono_depth_all_sweeps[idx],
mats_dict['intrin_mats'][:, sweep_index, ...],
mats_dict['intrin_mats'][:, idx, ...],
sensor2sensor_mats[idx],
mats_dict['ida_mats'][:, sweep_index, ...],
mats_dict['ida_mats'][:, idx, ...],
depth_sample,
depth_sample_frustum.type_as(mono_depth_all_sweeps[idx]),
)
mask = self.mask_net(
torch.cat([
mono_depth_all_sweeps[sweep_index].detach(),
warped_mono_depth.mean(2).detach()
], 1))
mask_all_sweeps.append(mask)
return torch.stack(mask_all_sweeps).mean(0)
def _generate_cost_volume(
self,
sweep_index,
stereo_feats_all_sweeps,
mats_dict,
depth_sample,
depth_sample_frustum,
sensor2sensor_mats,
):
"""Generate cost volume based on depth sample.
Args:
sweep_index (int): Index of sweep.
stereo_feats_all_sweeps (list[Tensor]): Stereo feature
of all sweeps.
mats_dict (dict):
sensor2ego_mats (Tensor): Transformation matrix from
camera to ego with shape of (B, num_sweeps,
num_cameras, 4, 4).
intrin_mats (Tensor): Intrinsic matrix with shape
of (B, num_sweeps, num_cameras, 4, 4).
ida_mats (Tensor): Transformation matrix for ida with
shape of (B, num_sweeps, num_cameras, 4, 4).
sensor2sensor_mats (Tensor): Transformation matrix
from key frame camera to sweep frame camera with
shape of (B, num_sweeps, num_cameras, 4, 4).
bda_mat (Tensor): Rotation matrix for bda with shape
of (B, 4, 4).
depth_sample (Tensor): Depth map of all candidates.
depth_sample_frustum (Tensor): Pre-generated frustum.
sensor2sensor_mats (Tensor): Transformation matrix from reference
sensor to source sensor.
Returns:
Tensor: Depth score for all sweeps.
"""
batch_size, num_channels, height, width = stereo_feats_all_sweeps[
0].shape
# thres = int(self.mvs_weighting.split("CW")[1])
num_sweeps = len(stereo_feats_all_sweeps)
depth_score_all_sweeps = list()
for idx in range(num_sweeps):
if idx == sweep_index:
continue
warped_stereo_fea = self.homo_warping(
stereo_feats_all_sweeps[idx],
mats_dict['intrin_mats'][:, sweep_index, ...],
mats_dict['intrin_mats'][:, idx, ...],
sensor2sensor_mats[idx],
mats_dict['ida_mats'][:, sweep_index, ...],
mats_dict['ida_mats'][:, idx, ...],
depth_sample,
depth_sample_frustum.type_as(stereo_feats_all_sweeps[idx]),
)
warped_stereo_fea = warped_stereo_fea.reshape(
batch_size, self.num_groups, num_channels // self.num_groups,
self.num_samples, height, width)
ref_stereo_feat = stereo_feats_all_sweeps[sweep_index].reshape(
batch_size, self.num_groups, num_channels // self.num_groups,
height, width)
feat_cost = torch.mean(
(ref_stereo_feat.unsqueeze(3) * warped_stereo_fea), axis=2)
depth_score = self.similarity_net(feat_cost).squeeze(1)
depth_score_all_sweeps.append(depth_score)
return torch.stack(depth_score_all_sweeps).mean(0)
def _forward_stereo(
self,
sweep_index,
stereo_feats_all_sweeps,
mono_depth_all_sweeps,
mats_dict,
sensor2sensor_mats,
mu_all_sweeps,
sigma_all_sweeps,
range_score_all_sweeps,
depth_feat_all_sweeps,
):
"""Forward function to generate stereo depth.
Args:
sweep_index (int): Index of sweep.
stereo_feats_all_sweeps (list[Tensor]): Stereo feature
of all sweeps.
mono_depth_all_sweeps (list[Tensor]): List of mono_depth for all sweeps.
mats_dict (dict):
sensor2ego_mats (Tensor): Transformation matrix from
camera to ego with shape of (B, num_sweeps,
num_cameras, 4, 4).
intrin_mats (Tensor): Intrinsic matrix with shape
of (B, num_sweeps, num_cameras, 4, 4).
ida_mats (Tensor): Transformation matrix for ida with
shape of (B, num_sweeps, num_cameras, 4, 4).
sensor2sensor_mats (Tensor): Transformation matrix
from key frame camera to sweep frame camera with
shape of (B, num_sweeps, num_cameras, 4, 4).
bda_mat (Tensor): Rotation matrix for bda with shape
of (B, 4, 4).
sensor2sensor_mats(Tensor): Transformation matrix from key
sensor to sweep sensor.
mu_all_sweeps (list[Tensor]): List of mu for all sweeps.
sigma_all_sweeps (list[Tensor]): List of sigma for all sweeps.
range_score_all_sweeps (list[Tensor]): List of all range score
for all sweeps.
depth_feat_all_sweeps (list[Tensor]): List of all depth feat for
all sweeps.
Returns:
Tensor: stereo_depth (and mask_score when use_mask is True).
"""
batch_size_with_cams, _, feat_height, feat_width = \
stereo_feats_all_sweeps[0].shape
device = stereo_feats_all_sweeps[0].device
d_coords = torch.arange(*self.grid_config['dbound'],
dtype=torch.float,
device=device).reshape(1, -1, 1, 1)
d_coords = d_coords.repeat(batch_size_with_cams, 1, feat_height,
feat_width)
stereo_depth = stereo_feats_all_sweeps[0].new_zeros(
batch_size_with_cams, self.D, feat_height, feat_width)
mask_score = stereo_feats_all_sweeps[0].new_zeros(
batch_size_with_cams,
self.D,
feat_height * self.stereo_downsample_factor //
self.downsample,
feat_width * self.stereo_downsample_factor //
self.downsample,
)
score_all_ranges = list()
range_score = range_score_all_sweeps[sweep_index].softmax(1)
for range_idx in range(self.num_ranges):
# Map mu to the corresponding interval.
range_start = self.range_list[range_idx][0]
mu_all_sweeps_single_range = [
mu[:, range_idx:range_idx + 1, ...].sigmoid() *
(self.range_list[range_idx][1] - self.range_list[range_idx][0])
+ range_start for mu in mu_all_sweeps
]
sigma_all_sweeps_single_range = [
sigma[:, range_idx:range_idx + 1, ...]
for sigma in sigma_all_sweeps
]
batch_size_with_cams, _, feat_height, feat_width =\
stereo_feats_all_sweeps[0].shape
mu = mu_all_sweeps_single_range[sweep_index]
sigma = sigma_all_sweeps_single_range[sweep_index]
for _ in range(self.em_iteration):
depth_sample = torch.cat([mu + sigma * k for k in self.k_list],
1)
depth_sample_frustum = self.create_depth_sample_frustum(
depth_sample, self.stereo_downsample_factor)
mu_score = self._generate_cost_volume(
sweep_index,
stereo_feats_all_sweeps,
mats_dict,
depth_sample,
depth_sample_frustum,
sensor2sensor_mats,
)
mu_score = mu_score.softmax(1)
scale_factor = torch.clamp(
0.5 / (1e-4 + mu_score[:, self.num_samples //
2:self.num_samples // 2 + 1, ...]),
min=0.1,
max=10)
sigma = torch.clamp(sigma * scale_factor, min=0.1, max=10)
mu = (depth_sample * mu_score).sum(1, keepdim=True)
del depth_sample
del depth_sample_frustum
mu = torch.clamp(mu,
max=self.range_list[range_idx][1],
min=self.range_list[range_idx][0])
range_length = int(
(self.range_list[range_idx][1] - self.range_list[range_idx][0])
// self.grid_config['dbound'][2])
if self.use_mask:
depth_sample = F.avg_pool2d(
mu,
self.downsample // self.stereo_downsample_factor,
self.downsample // self.stereo_downsample_factor,
)
depth_sample_frustum = self.create_depth_sample_frustum(
depth_sample, self.downsample)
mask = self._forward_mask(
sweep_index,
mono_depth_all_sweeps,
mats_dict,
depth_sample,
depth_sample_frustum,
sensor2sensor_mats,
)
mask_score[:,
int((range_start - self.grid_config['dbound'][0]) //
self.grid_config['dbound'][2]):range_length +
int((range_start - self.grid_config['dbound'][0]) //
self.grid_config['dbound'][2]), ..., ] += mask
del depth_sample
del depth_sample_frustum
sigma = torch.clamp(sigma, self.min_sigma)
mu_repeated = mu.repeat(1, range_length, 1, 1)
eps = 1e-6
depth_score_single_range = (-1 / 2 * (
(d_coords[:,
int((range_start - self.grid_config['dbound'][0]) //
self.grid_config['dbound'][2]):range_length + int(
(range_start - self.grid_config['dbound'][0]) //
self.grid_config['dbound'][2]), ..., ] - mu_repeated) /
torch.sqrt(sigma))**2)
depth_score_single_range = depth_score_single_range.exp()
score_all_ranges.append(mu_score.sum(1).unsqueeze(1))
depth_score_single_range = depth_score_single_range / (
sigma * math.sqrt(2 * math.pi) + eps)
stereo_depth[:,
int((range_start - self.grid_config['dbound'][0]) //
self.grid_config['dbound'][2]):range_length +
int((range_start - self.grid_config['dbound'][0]) //
self.grid_config['dbound'][2]), ..., ] = (
depth_score_single_range *
range_score[:, range_idx:range_idx + 1, ...])
# del range_score
del depth_score_single_range
del mu_repeated
if self.use_mask:
return stereo_depth, mask_score
else:
return stereo_depth
def forward(self, input):
img_feat, depth_prob, rots, trans, intrins, post_rots, post_trans, bda = input
B, N, C, H, W = img_feat.shape
img_feat = img_feat.view(B*N,C,H,W)
# Lift
volume = depth_prob.unsqueeze(1) * img_feat.unsqueeze(2)
volume = self._forward_voxel_net(volume)
volume = volume.view(B, N, self.numC_Trans, self.D, H, W)
volume = volume.permute(0, 1, 3, 4, 5, 2)
# Splat
if self.accelerate:
bev_feat = self.voxel_pooling_accelerated(rots, trans, intrins,
post_rots, post_trans,
bda, volume)
else:
geom = self.get_geometry(rots, trans, intrins,
post_rots, post_trans, bda)
if self.vp_megvii:
bev_feat = self.voxel_pooling_bevdepth(geom, volume)
else:
bev_feat = self.voxel_pooling(geom, volume)
return bev_feat
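# A minimal, self-contained sketch of the "lift" step in forward above:
# the depth distribution (B*N, D, H, W) and the image feature (B*N, C, H, W)
# form an outer product along the depth axis, yielding a per-pixel frustum
# of features with shape (B*N, C, D, H, W). The toy sizes below are
# illustrative assumptions, not values used by this repository.
if __name__ == '__main__':
    _bn, _c, _d, _h, _w = 2, 8, 4, 16, 44
    _feat = torch.rand(_bn, _c, _h, _w)
    _depth_prob = torch.rand(_bn, _d, _h, _w).softmax(1)
    _volume = _depth_prob.unsqueeze(1) * _feat.unsqueeze(2)
    assert _volume.shape == (_bn, _c, _d, _h, _w)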
================================================
FILE: projects/occ_plugin/occupancy/image2bev/ViewTransformerLSSVoxel.py
================================================
# Copyright (c) Phigent Robotics. All rights reserved.
import math
import torch
import torch.nn as nn
from mmcv.runner import BaseModule
from mmdet3d.models.builder import NECKS
from projects.occ_plugin.ops.occ_pooling import occ_pool
from mmcv.cnn import build_conv_layer
from mmcv.runner import force_fp32
from torch.cuda.amp.autocast_mode import autocast
from projects.occ_plugin.utils.gaussian import generate_guassian_depth_target
import torch.nn.functional as F
import numpy as np
import pdb
from .ViewTransformerLSSBEVDepth import *
import torch.cuda as cuda
def get_gpu_memory_usage():
allocated = cuda.memory_allocated()
reserved = cuda.memory_reserved()
return allocated, reserved
@NECKS.register_module()
class ViewTransformerLiftSplatShootVoxel(ViewTransformerLSSBEVDepth):
def __init__(self, loss_depth_weight, loss_depth_type='bce', **kwargs):
super(ViewTransformerLiftSplatShootVoxel, self).__init__(loss_depth_weight=loss_depth_weight, **kwargs)
self.loss_depth_type = loss_depth_type
self.cam_depth_range = self.grid_config['dbound']
self.constant_std = 0.5
def get_downsampled_gt_depth(self, gt_depths):
"""
Input:
gt_depths: [B, N, H, W]
Output:
gt_depths: [B*N*h*w, d]
"""
B, N, H, W = gt_depths.shape
gt_depths = gt_depths.view(B * N,
H // self.downsample, self.downsample,
W // self.downsample, self.downsample, 1)
gt_depths = gt_depths.permute(0, 1, 3, 5, 2, 4).contiguous()
gt_depths = gt_depths.view(-1, self.downsample * self.downsample)
gt_depths_tmp = torch.where(gt_depths == 0.0, 1e5 * torch.ones_like(gt_depths), gt_depths)
gt_depths = torch.min(gt_depths_tmp, dim=-1).values
gt_depths = gt_depths.view(B * N, H // self.downsample, W // self.downsample)
# [min - step / 2, min + step / 2] creates min depth
gt_depths = (gt_depths - (self.grid_config['dbound'][0] - self.grid_config['dbound'][2] / 2)) / self.grid_config['dbound'][2]
gt_depths_vals = gt_depths.clone()
gt_depths = torch.where((gt_depths < self.D + 1) & (gt_depths >= 0.0), gt_depths, torch.zeros_like(gt_depths))
gt_depths = F.one_hot(gt_depths.long(), num_classes=self.D + 1).view(-1, self.D + 1)[:, 1:]
return gt_depths_vals, gt_depths.float()
@force_fp32()
def get_bce_depth_loss(self, depth_labels, depth_preds):
_, depth_labels = self.get_downsampled_gt_depth(depth_labels)
depth_preds = depth_preds.permute(0, 2, 3, 1).contiguous().view(-1, self.D)
fg_mask = torch.max(depth_labels, dim=1).values > 0.0
depth_labels = depth_labels[fg_mask]
depth_preds = depth_preds[fg_mask]
with autocast(enabled=False):
depth_loss = F.binary_cross_entropy(depth_preds, depth_labels, reduction='none').sum() / max(1.0, fg_mask.sum())
return depth_loss
@force_fp32()
def get_klv_depth_loss(self, depth_labels, depth_preds):
depth_gaussian_labels, depth_values = generate_guassian_depth_target(depth_labels,
self.downsample, self.cam_depth_range, constant_std=self.constant_std)
depth_values = depth_values.view(-1)
fg_mask = (depth_values >= self.cam_depth_range[0]) & (depth_values <= (self.cam_depth_range[1] - self.cam_depth_range[2]))
depth_gaussian_labels = depth_gaussian_labels.view(-1, self.D)[fg_mask]
depth_preds = depth_preds.permute(0, 2, 3, 1).contiguous().view(-1, self.D)[fg_mask]
depth_loss = F.kl_div(torch.log(depth_preds + 1e-4), depth_gaussian_labels, reduction='batchmean', log_target=False)
return depth_loss
@force_fp32()
def get_depth_loss(self, depth_labels, depth_preds):
if self.loss_depth_type == 'bce':
depth_loss = self.get_bce_depth_loss(depth_labels, depth_preds)
elif self.loss_depth_type == 'kld':
depth_loss = self.get_klv_depth_loss(depth_labels, depth_preds)
else:
pdb.set_trace()
return self.loss_depth_weight * depth_loss
def voxel_pooling(self, geom_feats, x):
B, N, D, H, W, C = x.shape
Nprime = B * N * D * H * W
x = x.contiguous().view(Nprime, C)
# flatten indices
geom_feats = ((geom_feats - (self.bx - self.dx / 2.)) / self.dx).long()
geom_feats = geom_feats.view(Nprime, 3)
batch_ix = torch.cat([torch.full([Nprime // B, 1], ix, device=x.device, dtype=torch.long) for ix in range(B)])
geom_feats = torch.cat((geom_feats, batch_ix), 1)
# filter out points that are outside box
kept = (geom_feats[:, 0] >= 0) & (geom_feats[:, 0] < self.nx[0]) \
& (geom_feats[:, 1] >= 0) & (geom_feats[:, 1] < self.nx[1]) \
& (geom_feats[:, 2] >= 0) & (geom_feats[:, 2] < self.nx[2])
x = x[kept]
geom_feats = geom_feats[kept]
final = occ_pool(x, geom_feats, B, self.nx[2], self.nx[0], self.nx[1])
final = final.permute(0, 1, 3, 4, 2)
return final
def forward(self, input):
(x, rots_seq, trans_seq, intrins_seq, post_rots_seq, post_trans_seq, bda, mlp_input_seq) = input[:8]
B, N, C, H, W = x.shape
x = x.view(B * N, C, H, W)
x = self.depth_net(x, mlp_input_seq)
depth_digit = x[:, :self.D, ...]
img_feat = x[:, self.D:self.D + self.numC_Trans, ...]
depth_prob = self.get_depth_dist(depth_digit)
volume = depth_prob.unsqueeze(1) * img_feat.unsqueeze(2)
volume = volume.view(B, N, self.numC_Trans, self.D, H, W)
volume = volume.permute(0, 1, 3, 4, 5, 2)
geom = self.get_geometry(rots_seq, trans_seq, intrins_seq, post_rots_seq, post_trans_seq, bda)
bev_feat = self.voxel_pooling(geom, volume)
return bev_feat, depth_prob
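# A small numeric check of the binning rule in get_downsampled_gt_depth
# above: a metric depth d maps to index floor((d - (d0 - step/2)) / step),
# and index 0 is dropped by the one-hot slice [:, 1:], so it acts as the
# "invalid" bin. The dbound values below are illustrative assumptions,
# not the configured grid of this repository.
if __name__ == '__main__':
    _d0, _step = 2.0, 0.5
    for _depth, _expected in [(2.0, 0), (2.3, 1), (2.8, 2)]:
        _idx = int((_depth - (_d0 - _step / 2)) // _step)
        assert _idx == _expected, (_depth, _idx)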
================================================
FILE: projects/occ_plugin/occupancy/image2bev/__init__.py
================================================
from .ViewTransformerLSSBEVDepth import ViewTransformerLSSBEVDepth
from .ViewTransformerLSSVoxel import ViewTransformerLiftSplatShootVoxel
================================================
FILE: projects/occ_plugin/occupancy/necks/__init__.py
================================================
from .second_fpn_3d import SECONDFPN3D
from .fpn3d import FPN3D
================================================
FILE: projects/occ_plugin/occupancy/necks/fpn3d.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import torch
from mmcv.cnn import build_conv_layer, build_norm_layer, build_upsample_layer
from mmcv.runner import BaseModule, auto_fp16
from torch import nn as nn
from mmcv.cnn import ConvModule
from mmdet.models import NECKS
import torch.nn.functional as F
import pdb
@NECKS.register_module()
class FPN3D(BaseModule):
"""FPN used in SECOND/PointPillars/PartA2/MVXNet.
Args:
in_channels (list[int]): Input channels of multi-scale feature maps.
out_channels (list[int]): Output channels of feature maps.
upsample_strides (list[int]): Strides used to upsample the
feature maps.
norm_cfg (dict): Config dict of normalization layers.
upsample_cfg (dict): Config dict of upsample layers.
conv_cfg (dict): Config dict of conv layers.
use_conv_for_no_stride (bool): Whether to use conv when stride is 1.
"""
def __init__(self,
in_channels=[80, 160, 320, 640],
out_channels=256,
norm_cfg=dict(type='GN', num_groups=32, requires_grad=True),
conv_cfg=dict(type='Conv3d'),
act_cfg=dict(type='ReLU'),
with_cp=False,
upsample_cfg=dict(mode='trilinear'),
init_cfg=None):
super(FPN3D, self).__init__(init_cfg=init_cfg)
self.in_channels = in_channels
self.out_channels = out_channels
self.fp16_enabled = False
self.upsample_cfg = upsample_cfg
self.with_cp = with_cp
self.num_out = len(self.in_channels)
self.lateral_convs = nn.ModuleList()
self.fpn_convs = nn.ModuleList()
for i in range(self.num_out):
l_conv = nn.Sequential(
ConvModule(in_channels[i], out_channels,
kernel_size=1, padding=0,
conv_cfg=conv_cfg, norm_cfg=norm_cfg,
act_cfg=act_cfg, bias=False,
inplace=True),
)
fpn_conv = nn.Sequential(
ConvModule(out_channels, out_channels,
kernel_size=3, padding=1,
conv_cfg=conv_cfg, norm_cfg=norm_cfg,
act_cfg=act_cfg, bias=False,
inplace=True),
)
self.lateral_convs.append(l_conv)
self.fpn_convs.append(fpn_conv)
@auto_fp16()
def forward(self, inputs):
"""Forward function.
Args:
inputs (list[torch.Tensor]): Multi-scale 5D feature maps in (N, C, X, Y, Z) shape.
Returns:
list[torch.Tensor]: Multi-level feature maps.
"""
assert len(inputs) == len(self.in_channels)
# build laterals
laterals = []
for i, lateral_conv in enumerate(self.lateral_convs):
if self.with_cp:
lateral_i = torch.utils.checkpoint.checkpoint(lateral_conv, inputs[i])
else:
lateral_i = lateral_conv(inputs[i])
laterals.append(lateral_i)
# build down-top path
for i in range(self.num_out - 1, 0, -1):
prev_shape = laterals[i - 1].shape[2:]
laterals[i - 1] = laterals[i - 1] + F.interpolate(laterals[i],
size=prev_shape, align_corners=False, **self.upsample_cfg)
# outs = [
# self.fpn_convs[i](laterals[i]) for i in range(self.num_out)
# ]
outs = []
for i, fpn_conv in enumerate(self.fpn_convs):
if self.with_cp:
out_i = torch.utils.checkpoint.checkpoint(fpn_conv, laterals[i])
else:
out_i = fpn_conv(laterals[i])
outs.append(out_i)
return outs
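# A minimal sketch of the top-down merge in FPN3D.forward above: each
# coarser lateral is trilinearly upsampled to the next finer level's
# spatial shape and added element-wise. The channel/spatial sizes below
# are arbitrary assumptions for illustration.
if __name__ == '__main__':
    _laterals = [torch.rand(1, 4, 16, 16, 4), torch.rand(1, 4, 8, 8, 2)]
    for _i in range(len(_laterals) - 1, 0, -1):
        _prev_shape = _laterals[_i - 1].shape[2:]
        _laterals[_i - 1] = _laterals[_i - 1] + F.interpolate(
            _laterals[_i], size=_prev_shape, mode='trilinear',
            align_corners=False)
    assert _laterals[0].shape == (1, 4, 16, 16, 4)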
================================================
FILE: projects/occ_plugin/occupancy/necks/second_fpn_3d.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import torch
from mmcv.cnn import build_conv_layer, build_norm_layer, build_upsample_layer
from mmcv.runner import BaseModule, auto_fp16
from torch import nn as nn
from mmdet.models import NECKS
import pdb
@NECKS.register_module()
class SECONDFPN3D(BaseModule):
"""FPN used in SECOND/PointPillars/PartA2/MVXNet.
Args:
in_channels (list[int]): Input channels of multi-scale feature maps.
out_channels (list[int]): Output channels of feature maps.
upsample_strides (list[int]): Strides used to upsample the
feature maps.
norm_cfg (dict): Config dict of normalization layers.
upsample_cfg (dict): Config dict of upsample layers.
conv_cfg (dict): Config dict of conv layers.
use_conv_for_no_stride (bool): Whether to use conv when stride is 1.
"""
def __init__(self,
in_channels=[128, 128, 256],
out_channels=[256, 256, 256],
upsample_strides=[1, 2, 4],
norm_cfg=dict(type='GN', num_groups=32, requires_grad=True),
upsample_cfg=dict(type='deconv3d', bias=False),
conv_cfg=dict(type='Conv3d', bias=False),
use_conv_for_no_stride=False,
init_cfg=None):
# Replacing GN with BN3D makes performance drop from 42.5 to 40.9;
# the difference may be exaggerated because the performance can fluctuate a lot.
super(SECONDFPN3D, self).__init__(init_cfg=init_cfg)
assert len(out_channels) == len(upsample_strides) == len(in_channels)
self.in_channels = in_channels
self.out_channels = out_channels
self.fp16_enabled = False
deblocks = []
for i, out_channel in enumerate(out_channels):
stride = upsample_strides[i]
if stride > 1 or (stride == 1 and not use_conv_for_no_stride):
upsample_layer = build_upsample_layer(
upsample_cfg,
in_channels=in_channels[i],
out_channels=out_channel,
kernel_size=upsample_strides[i],
stride=upsample_strides[i])
else:
stride = np.round(1 / stride).astype(np.int64)
upsample_layer = build_conv_layer(
conv_cfg,
in_channels=in_channels[i],
out_channels=out_channel,
kernel_size=stride,
stride=stride)
deblock = nn.Sequential(upsample_layer,
build_norm_layer(norm_cfg, out_channel)[1],
nn.ReLU(inplace=True))
deblocks.append(deblock)
self.deblocks = nn.ModuleList(deblocks)
if init_cfg is None:
self.init_cfg = [
dict(type='Kaiming', layer='ConvTranspose2d'),
dict(type='Constant', layer='NaiveSyncBatchNorm2d', val=1.0)
]
@auto_fp16()
def forward(self, x):
"""Forward function.
Args:
x (list[torch.Tensor]): Multi-scale 5D feature maps.
Returns:
list[torch.Tensor]: Multi-level feature maps.
"""
assert len(x) == len(self.in_channels)
ups = [deblock(x[i]) for i, deblock in enumerate(self.deblocks)]
if len(ups) > 1:
out = torch.cat(ups, dim=1)
else:
out = ups[0]
return [out]
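# A minimal sketch of what the deblocks above do: a transposed 3D conv
# with kernel_size == stride brings a coarser level to the finer level's
# spatial size so the levels can be concatenated along channels. The
# sizes below are illustrative assumptions.
if __name__ == '__main__':
    _x0 = torch.rand(1, 8, 16, 16, 4)  # stride-1 level
    _x1 = torch.rand(1, 8, 8, 8, 2)    # stride-2 level
    _up = nn.ConvTranspose3d(8, 8, kernel_size=2, stride=2, bias=False)
    _out = torch.cat([_x0, _up(_x1)], dim=1)
    assert _out.shape == (1, 16, 16, 16, 4)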
================================================
FILE: projects/occ_plugin/occupancy/voxel_encoder/__init__.py
================================================
from .sparse_lidar_enc import SparseLiDAREnc4x, SparseLiDAREnc8x
================================================
FILE: projects/occ_plugin/occupancy/voxel_encoder/sparse_lidar_enc.py
================================================
import math
from functools import partial
from mmcv.cnn import build_conv_layer, build_norm_layer, build_upsample_layer
from mmcv.runner import BaseModule
import torch
import torch.nn as nn
import torch.nn.functional as F
import spconv.pytorch as spconv
from spconv.pytorch import functional as Fsp
from mmdet3d.models.builder import MIDDLE_ENCODERS
import copy
def post_act_block(in_channels, out_channels, kernel_size, indice_key=None, stride=1, padding=0,
conv_type='subm', norm_cfg=None):
if conv_type == 'subm':
conv = spconv.SubMConv3d(in_channels, out_channels, kernel_size, bias=False, indice_key=indice_key)
elif conv_type == 'spconv':
conv = spconv.SparseConv3d(in_channels, out_channels, kernel_size, stride=stride, padding=padding,
bias=False, indice_key=indice_key)
elif conv_type == 'inverseconv':
conv = spconv.SparseInverseConv3d(in_channels, out_channels, kernel_size, indice_key=indice_key, bias=False)
else:
raise NotImplementedError
m = spconv.SparseSequential(
conv,
build_norm_layer(norm_cfg, out_channels)[1],
nn.ReLU(inplace=True),
)
return m
class SparseBasicBlock(spconv.SparseModule):
def __init__(self, inplanes, planes, stride=1, norm_cfg=None, indice_key=None):
super(SparseBasicBlock, self).__init__()
self.net = spconv.SparseSequential(
spconv.SubMConv3d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False, indice_key=indice_key),
build_norm_layer(norm_cfg, planes)[1],
nn.ReLU(inplace=True),
spconv.SubMConv3d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False, indice_key=indice_key),
build_norm_layer(norm_cfg, planes)[1],
)
self.relu = nn.ReLU(inplace=True)
def forward(self, x):
identity = x
out = self.net(x)
out = out.replace_feature(out.features + identity.features)
out = out.replace_feature(self.relu(out.features))
return out
@MIDDLE_ENCODERS.register_module()
class SparseLiDAREnc4x(nn.Module):
def __init__(self, input_channel, norm_cfg, base_channel, out_channel,
sparse_shape_xyz, **kwargs):
super().__init__()
block = post_act_block
self.sparse_shape_xyz = sparse_shape_xyz
self.conv_input = spconv.SparseSequential(
spconv.SubMConv3d(input_channel, base_channel, 3),
nn.GroupNorm(16, base_channel),
nn.ReLU(inplace=True))
self.conv1 = spconv.SparseSequential(
SparseBasicBlock(base_channel, base_channel, norm_cfg=norm_cfg, indice_key='res1'),
SparseBasicBlock(base_channel, base_channel, norm_cfg=norm_cfg, indice_key='res1'),
)
self.conv2 = spconv.SparseSequential(
block(base_channel, base_channel*2, 3, norm_cfg=norm_cfg, stride=2, padding=1, indice_key='spconv2', conv_type='spconv'),
SparseBasicBlock(base_channel*2, base_channel*2, norm_cfg=norm_cfg, indice_key='res2'),
SparseBasicBlock(base_channel*2, base_channel*2, norm_cfg=norm_cfg, indice_key='res2'),
)
self.conv3 = spconv.SparseSequential(
block(base_channel*2, base_channel*4, 3, norm_cfg=norm_cfg, stride=2, padding=1, indice_key='spconv3', conv_type='spconv'),
SparseBasicBlock(base_channel*4, base_channel*4, norm_cfg=norm_cfg, indice_key='res3'),
SparseBasicBlock(base_channel*4, base_channel*4, norm_cfg=norm_cfg, indice_key='res3'),
)
self.conv_out = spconv.SparseSequential(
spconv.SubMConv3d(base_channel*4, out_channel, 3),
nn.GroupNorm(16, out_channel),
nn.ReLU(inplace=True))
def forward(self, voxel_features, coors, batch_size):
# spconv encoding
coors = coors.int()
# FIXME bs=1 hardcode
input_sp_tensor = spconv.SparseConvTensor(voxel_features, coors, self.sparse_shape_xyz[::-1], batch_size)
x = self.conv_input(input_sp_tensor)
x_conv1 = self.conv1(x)
x_conv2 = self.conv2(x_conv1)
x_conv3 = self.conv3(x_conv2)
x = self.conv_out(x_conv3)
return {'x': x.dense().permute(0,1,4,3,2), # B, C, W, H, D
'pts_feats': [x]}
@MIDDLE_ENCODERS.register_module()
class SparseLiDAREnc8x(nn.Module):
def __init__(self, input_channel, norm_cfg, base_channel, out_channel,
sparse_shape_xyz, **kwargs):
super().__init__()
block = post_act_block
self.sparse_shape_xyz = sparse_shape_xyz
self.conv_input = spconv.SparseSequential(
spconv.SubMConv3d(input_channel, base_channel, 3),
nn.GroupNorm(16, base_channel),
nn.ReLU(inplace=True))
self.conv1 = spconv.SparseSequential(
block(base_channel, base_channel*2, 3, norm_cfg=norm_cfg, stride=2, padding=1, indice_key='spconv1', conv_type='spconv'),
SparseBasicBlock(base_channel*2, base_channel*2, norm_cfg=norm_cfg, indice_key='res1'),
SparseBasicBlock(base_channel*2, base_channel*2, norm_cfg=norm_cfg, indice_key='res1'),
)
self.conv2 = spconv.SparseSequential(
block(base_channel*2, base_channel*4, 3, norm_cfg=norm_cfg, stride=2, padding=1, indice_key='spconv2', conv_type='spconv'),
SparseBasicBlock(base_channel*4, base_channel*4, norm_cfg=norm_cfg, indice_key='res2'),
SparseBasicBlock(base_channel*4, base_channel*4, norm_cfg=norm_cfg, indice_key='res2'),
)
self.conv3 = spconv.SparseSequential(
block(base_channel*4, base_channel*8, 3, norm_cfg=norm_cfg, stride=2, padding=1, indice_key='spconv3', conv_type='spconv'),
SparseBasicBlock(base_channel*8, base_channel*8, norm_cfg=norm_cfg, indice_key='res3'),
SparseBasicBlock(base_channel*8, base_channel*8, norm_cfg=norm_cfg, indice_key='res3'),
)
self.conv_out = spconv.SparseSequential(
spconv.SubMConv3d(base_channel*8, out_channel, 3),
nn.GroupNorm(16, out_channel),
nn.ReLU(inplace=True))
def forward(self, voxel_features, coors, batch_size):
# spconv encoding
coors = coors.int()
# FIXME bs=1 hardcode
input_sp_tensor = spconv.SparseConvTensor(voxel_features, coors, self.sparse_shape_xyz[::-1], batch_size)
x = self.conv_input(input_sp_tensor)
x_conv1 = self.conv1(x)
x_conv2 = self.conv2(x_conv1)
x_conv3 = self.conv3(x_conv2)
x = self.conv_out(x_conv3)
return {'x': x.dense().permute(0,1,4,3,2), # B, C, W, H, D
'pts_feats': [x]}
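# A small sanity check of the downsampling implied above: each stride-2
# sparse conv (kernel 3, padding 1) maps a spatial extent s to
# floor((s + 2 - 3) / 2) + 1, so SparseLiDAREnc4x halves the grid twice
# and SparseLiDAREnc8x three times. The input extent is an illustrative
# assumption.
if __name__ == '__main__':
    def _downsampled_extent(s, times):
        for _ in range(times):
            s = (s + 2 - 3) // 2 + 1
        return s
    assert _downsampled_extent(512, 2) == 128  # 4x encoder
    assert _downsampled_extent(512, 3) == 64   # 8x encoder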
================================================
FILE: projects/occ_plugin/ops/__init__.py
================================================
from .occ_pooling import *
================================================
FILE: projects/occ_plugin/ops/occ_pooling/OCC_Pool.py
================================================
import torch
from projects.occ_plugin.ops.occ_pooling import occ_pool_ext
__all__ = ["occ_pool"]
class QuickCumsum(torch.autograd.Function):
@staticmethod
def forward(ctx, x, geom_feats, ranks):
x = x.cumsum(0)
kept = torch.ones(x.shape[0], device=x.device, dtype=torch.bool)
kept[:-1] = ranks[1:] != ranks[:-1]
x, geom_feats = x[kept], geom_feats[kept]
x = torch.cat((x[:1], x[1:] - x[:-1]))
# save kept for backward
ctx.save_for_backward(kept)
# no gradient for geom_feats
ctx.mark_non_differentiable(geom_feats)
return x, geom_feats
@staticmethod
def backward(ctx, gradx, gradgeom):
(kept,) = ctx.saved_tensors
back = torch.cumsum(kept, 0)
back[kept] -= 1
val = gradx[back]
return val, None, None
class QuickCumsumCuda(torch.autograd.Function):
@staticmethod
def forward(ctx, x, geom_feats, ranks, B, D, H, W):
kept = torch.ones(x.shape[0], device=x.device, dtype=torch.bool)
kept[1:] = ranks[1:] != ranks[:-1]
interval_starts = torch.where(kept)[0].int()
interval_lengths = torch.zeros_like(interval_starts)
interval_lengths[:-1] = interval_starts[1:] - interval_starts[:-1]
interval_lengths[-1] = x.shape[0] - interval_starts[-1]
geom_feats = geom_feats.int()
out = occ_pool_ext.occ_pool_forward(
x,
geom_feats,
interval_lengths,
interval_starts,
B,
D,
H,
W,
)
ctx.save_for_backward(interval_starts, interval_lengths, geom_feats)
ctx.saved_shapes = B, D, H, W
return out
@staticmethod
def backward(ctx, out_grad):
interval_starts, interval_lengths, geom_feats = ctx.saved_tensors
B, D, H, W = ctx.saved_shapes
out_grad = out_grad.contiguous()
x_grad = occ_pool_ext.occ_pool_backward(
out_grad,
geom_feats,
interval_lengths,
interval_starts,
B,
D,
H,
W,
)
return x_grad, None, None, None, None, None, None
def occ_pool(feats, coords, B, D, H, W):
assert feats.shape[0] == coords.shape[0]
ranks = (
coords[:, 0] * (W * D * B)
+ coords[:, 1] * (D * B)
+ coords[:, 2] * B
+ coords[:, 3]
)
indices = ranks.argsort()
feats, coords, ranks = feats[indices], coords[indices], ranks[indices]
x = QuickCumsumCuda.apply(feats, coords, ranks, B, D, H, W)
x = x.permute(0, 4, 1, 2, 3).contiguous()
return x
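# A slow, dense reference for occ_pool, useful only as a cross-check of
# the CUDA path on toy data; it is not used by this repository. It assumes
# the coordinate layout established above: coords columns are (x, y, z, b)
# and the pooled grid is indexed [b, z(D), x(H), y(W), c], with features
# that share a voxel sum-pooled together.
def _occ_pool_reference(feats, coords, B, D, H, W):
    out = feats.new_zeros(B, D, H, W, feats.shape[1])
    out.index_put_(
        (coords[:, 3].long(), coords[:, 2].long(),
         coords[:, 0].long(), coords[:, 1].long()),
        feats, accumulate=True)
    # match occ_pool's output layout (B, C, D, H, W)
    return out.permute(0, 4, 1, 2, 3).contiguous()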
================================================
FILE: projects/occ_plugin/ops/occ_pooling/__init__.py
================================================
from .OCC_Pool import occ_pool
================================================
FILE: projects/occ_plugin/ops/occ_pooling/src/occ_pool.cpp
================================================
#include <torch/torch.h>
#include <c10/cuda/CUDAGuard.h>
// CUDA function declarations
void occ_pool(int b, int d, int h, int w, int n, int c, int n_intervals, const float* x,
const int* geom_feats, const int* interval_starts, const int* interval_lengths, float* out);
void occ_pool_grad(int b, int d, int h, int w, int n, int c, int n_intervals, const float* out_grad,
const int* geom_feats, const int* interval_starts, const int* interval_lengths, float* x_grad);
/*
Function: pillar pooling (forward, cuda)
Args:
x : input features, FloatTensor[n, c]
geom_feats : input coordinates, IntTensor[n, 4]
interval_lengths : how many points fall in each pooled point, IntTensor[n_intervals]
interval_starts : starting position of each pooled point, IntTensor[n_intervals]
Return:
out : output features, FloatTensor[b, d, h, w, c]
*/
at::Tensor occ_pool_forward(
const at::Tensor _x,
const at::Tensor _geom_feats,
const at::Tensor _interval_lengths,
const at::Tensor _interval_starts,
int b, int d, int h, int w
) {
int n = _x.size(0);
int c = _x.size(1);
int n_intervals = _interval_lengths.size(0);
const at::cuda::OptionalCUDAGuard device_guard(device_of(_x));
const float* x = _x.data_ptr<float>();
const int* geom_feats = _geom_feats.data_ptr<int>();
const int* interval_lengths = _interval_lengths.data_ptr<int>();
const int* interval_starts = _interval_starts.data_ptr<int>();
auto options =
torch::TensorOptions().dtype(_x.dtype()).device(_x.device());
at::Tensor _out = torch::zeros({b, d, h, w, c}, options);
float* out = _out.data_ptr<float>();
occ_pool(
b, d, h, w, n, c, n_intervals, x,
geom_feats, interval_starts, interval_lengths, out
);
return _out;
}
/*
Function: pillar pooling (backward, cuda)
Args:
out_grad : gradient of the output features, FloatTensor[b, d, h, w, c]
geom_feats : input coordinates, IntTensor[n, 4]
interval_lengths : how many points fall in each pooled point, IntTensor[n_intervals]
interval_starts : starting position of each pooled point, IntTensor[n_intervals]
Return:
x_grad : gradient of the input features, FloatTensor[n, c]
*/
at::Tensor occ_pool_backward(
const at::Tensor _out_grad,
const at::Tensor _geom_feats,
const at::Tensor _interval_lengths,
const at::Tensor _interval_starts,
int b, int d, int h, int w
) {
int n = _geom_feats.size(0);
int c = _out_grad.size(4);
int n_intervals = _interval_lengths.size(0);
const at::cuda::OptionalCUDAGuard device_guard(device_of(_out_grad));
const float* out_grad = _out_grad.data_ptr<float>();
const int* geom_feats = _geom_feats.data_ptr<int>();
const int* interval_lengths = _interval_lengths.data_ptr<int>();
const int* interval_starts = _interval_starts.data_ptr<int>();
auto options =
torch::TensorOptions().dtype(_out_grad.dtype()).device(_out_grad.device());
at::Tensor _x_grad = torch::zeros({n, c}, options);
float* x_grad = _x_grad.data_ptr<float>();
occ_pool_grad(
b, d, h, w, n, c, n_intervals, out_grad,
geom_feats, interval_starts, interval_lengths, x_grad
);
return _x_grad;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("occ_pool_forward", &occ_pool_forward,
"occ_pool_forward");
m.def("occ_pool_backward", &occ_pool_backward,
"occ_pool_backward");
}
================================================
FILE: projects/occ_plugin/ops/occ_pooling/src/occ_pool_cuda.cu
================================================
#include <stdio.h>
#include <stdlib.h>
/*
Function: pillar pooling
Args:
b : batch size
d : depth of the feature map
h : height of pooled feature map
w : width of pooled feature map
n : number of input points
c : number of channels
n_intervals : number of unique points
x : input features, FloatTensor[n, c]
geom_feats : input coordinates, IntTensor[n, 4]
interval_lengths : how many points fall in each pooled point, IntTensor[n_intervals]
interval_starts : starting position of each pooled point, IntTensor[n_intervals]
out : output features, FloatTensor[b, d, h, w, c]
*/
__global__ void occ_pool_kernel(int b, int d, int h, int w, int n, int c, int n_intervals,
const float *__restrict__ x,
const int *__restrict__ geom_feats,
const int *__restrict__ interval_starts,
const int *__restrict__ interval_lengths,
float* __restrict__ out) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int index = idx / c;
int cur_c = idx % c;
if (index >= n_intervals) return;
int interval_start = interval_starts[index];
int interval_length = interval_lengths[index];
const int* cur_geom_feats = geom_feats + interval_start * 4;
const float* cur_x = x + interval_start * c + cur_c;
float* cur_out = out + cur_geom_feats[3] * d * h * w * c +
cur_geom_feats[2] * h * w * c + cur_geom_feats[0] * w * c +
cur_geom_feats[1] * c + cur_c;
float psum = 0;
for(int i = 0; i < interval_length; i++){
psum += cur_x[i * c];
}
*cur_out = psum;
}
/*
Function: pillar pooling backward
Args:
b : batch size
d : depth of the feature map
h : height of pooled feature map
w : width of pooled feature map
n : number of input points
c : number of channels
n_intervals : number of unique points
out_grad : gradient of the BEV fmap from top, FloatTensor[b, d, h, w, c]
geom_feats : input coordinates, IntTensor[n, 4]
interval_lengths : how many points fall in each pooled point, IntTensor[n_intervals]
interval_starts : starting position of each pooled point, IntTensor[n_intervals]
x_grad : gradient of the image fmap, FloatTensor
*/
__global__ void occ_pool_grad_kernel(int b, int d, int h, int w, int n, int c, int n_intervals,
const float *__restrict__ out_grad,
const int *__restrict__ geom_feats,
const int *__restrict__ interval_starts,
const int *__restrict__ interval_lengths,
float* __restrict__ x_grad) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int index = idx / c;
int cur_c = idx % c;
if (index >= n_intervals) return;
int interval_start = interval_starts[index];
int interval_length = interval_lengths[index];
const int* cur_geom_feats = geom_feats + interval_start * 4;
float* cur_x_grad = x_grad + interval_start * c + cur_c;
const float* cur_out_grad = out_grad + cur_geom_feats[3] * d * h * w * c +
cur_geom_feats[2] * h * w * c + cur_geom_feats[0] * w * c +
cur_geom_feats[1] * c + cur_c;
for(int i = 0; i < interval_length; i++){
cur_x_grad[i * c] = *cur_out_grad;
}
}
void occ_pool(int b, int d, int h, int w, int n, int c, int n_intervals, const float* x,
const int* geom_feats, const int* interval_starts, const int* interval_lengths, float* out) {
occ_pool_kernel<<<(int)ceil(((double)n_intervals * c / 256)), 256>>>(
b, d, h, w, n, c, n_intervals, x, geom_feats, interval_starts, interval_lengths, out
);
}
void occ_pool_grad(int b, int d, int h, int w, int n, int c, int n_intervals, const float* out_grad,
const int* geom_feats, const int* interval_starts, const int* interval_lengths, float* x_grad) {
occ_pool_grad_kernel<<<(int)ceil(((double)n_intervals * c / 256)), 256>>>(
b, d, h, w, n, c, n_intervals, out_grad, geom_feats, interval_starts, interval_lengths, x_grad
);
}
================================================
FILE: projects/occ_plugin/utils/__init__.py
================================================
from .formating import cm_to_ious, format_results
from .metric_util import per_class_iu, fast_hist_crop
from .coordinate_transform import coarse_to_fine_coordinates, project_points_on_img
from .geometry import convert_egopose_to_matrix_numpy, invert_matrix_egopose_numpy
================================================
FILE: projects/occ_plugin/utils/coordinate_transform.py
================================================
import torch
def coarse_to_fine_coordinates(coarse_cor, ratio, topk=30000):
"""
Args:
coarse_cor (torch.Tensor): [3, N]"""
fine_cor = coarse_cor * ratio
fine_cor = fine_cor[None].repeat(ratio**3, 1, 1) # [ratio**3, 3, N]
device = fine_cor.device
value = torch.meshgrid([torch.arange(ratio).to(device), torch.arange(ratio).to(device), torch.arange(ratio).to(device)])
value = torch.stack(value, dim=3).reshape(-1, 3)
fine_cor = fine_cor + value[:,:,None]
if fine_cor.shape[-1] < topk:
return fine_cor.permute(1,0,2).reshape(3,-1)
else:
fine_cor = fine_cor[:,:,torch.randperm(fine_cor.shape[-1])[:topk]]
return fine_cor.permute(1,0,2).reshape(3,-1)
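# A tiny illustration of coarse_to_fine_coordinates with ratio=2: every
# coarse voxel index expands into its 2**3 = 8 child indices. The single
# input voxel at (1, 2, 3) is a hypothetical example.
def _demo_coarse_to_fine():
    coarse = torch.tensor([[1], [2], [3]])  # [3, N] with N=1
    fine = coarse_to_fine_coordinates(coarse, ratio=2)
    assert fine.shape == (3, 8)  # 8 children per coarse voxel
    assert (fine[:, 0] == torch.tensor([2, 4, 6])).all()
    return fine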
def project_points_on_img(points, rots, trans, intrins, post_rots, post_trans, bda_mat, pts_range,
W_img, H_img, W_occ, H_occ, D_occ):
with torch.no_grad():
voxel_size = ((pts_range[3:] - pts_range[:3]) / torch.tensor([W_occ-1, H_occ-1, D_occ-1])).to(points.device)
points = points * voxel_size[None, None] + pts_range[:3][None, None].to(points.device)
# project 3D point cloud (after bev-aug) onto multi-view images for corresponding 2D coordinates
inv_bda = bda_mat.inverse()
points = (inv_bda @ points.unsqueeze(-1)).squeeze(-1)
# from lidar to camera
points = points.view(-1, 1, 3)
points = points - trans.view(1, -1, 3)
inv_rots = rots.inverse().unsqueeze(0)
points = (inv_rots @ points.unsqueeze(-1))
# from camera to raw pixel
points = (intrins.unsqueeze(0) @ points).squeeze(-1)
points_d = points[..., 2:3]
points_uv = points[..., :2] / (points_d + 1e-5)
# from raw pixel to transformed pixel
points_uv = post_rots[..., :2, :2].unsqueeze(0) @ points_uv.unsqueeze(-1)
points_uv = points_uv.squeeze(-1) + post_trans[..., :2].unsqueeze(0)
points_uv[..., 0] = (points_uv[..., 0] / (W_img-1) - 0.5) * 2
points_uv[..., 1] = (points_uv[..., 1] / (H_img-1) - 0.5) * 2
mask = (points_d[..., 0] > 1e-5) \
& (points_uv[..., 0] > -1) & (points_uv[..., 0] < 1) \
& (points_uv[..., 1] > -1) & (points_uv[..., 1] < 1)
return points_uv.permute(2,1,0,3), mask
================================================
FILE: projects/occ_plugin/utils/formating.py
================================================
from prettytable import PrettyTable
import numpy as np
def cm_to_ious(cm):
# SC:[TN FP \n FN TP]
mean_ious = []
cls_num = len(cm)
for i in range(cls_num):
tp = cm[i, i]
p = cm[:, i].sum()
g = cm[i, :].sum()
union = p + g - tp
mean_ious.append(tp / union)
return mean_ious
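# A worked example of cm_to_ious on a hypothetical 2x2 confusion matrix
# (rows are ground truth, columns are predictions, as the tp/p/g sums
# above imply): IoU_i = tp_i / (pred_i + gt_i - tp_i).
def _demo_cm_to_ious():
    cm = np.array([[50, 10],
                   [5, 35]])
    ious = cm_to_ious(cm)
    assert abs(ious[0] - 50 / 65) < 1e-9  # 50 / (55 + 60 - 50)
    return ious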
def format_results(mean_ious, return_dic=False):
class_map = {
1: 'barrier',
2: 'bicycle',
3: 'bus',
4: 'car',
5: 'construction_vehicle',
6: 'motorcycle',
7: 'pedestrian',
8: 'traffic_cone',
9: 'trailer',
10: 'truck',
11: 'driveable_surface',
12: 'other_flat',
13: 'sidewalk',
14: 'terrain',
15: 'manmade',
16: 'vegetation',
}
x = PrettyTable()
x.field_names = ['class', 'IoU']
class_names = list(class_map.values()) + ['mean']
class_ious = mean_ious + [sum(mean_ious) / len(mean_ious)]
dic = {}
for cls_name, cls_iou in zip(class_names, class_ious):
dic[cls_name] = round(cls_iou, 3)
x.add_row([cls_name, round(cls_iou, 3)])
if return_dic:
return x, dic
else:
return x
def format_iou_results(mean_ious, return_dic=False):
if len(mean_ious) == 2:
class_map = {
0: 'free',
1: 'movable objects',
}
else:
class_map = {
0: 'free',
1: 'bicycle',
2: 'bus',
3: 'car',
4: 'construction',
5: 'motorcycle',
6: 'trailer',
7: 'truck',
8: 'pedestrian',
}
x = PrettyTable()
x.field_names = ['class', 'IoU']
class_names = list(class_map.values())
class_ious = mean_ious
dic = {}
for cls_name, cls_iou in zip(class_names, class_ious):
dic[cls_name] = np.round(cls_iou, 3)
x.add_row([cls_name, np.round(cls_iou, 3)])
mean_ious = sum(mean_ious[1:]) / len(mean_ious[1:])
dic['mean'] = np.round(mean_ious, 3)
x.add_row(['mean', np.round(mean_ious, 3)])
if return_dic:
return x, dic
else:
return x
def format_vel_results(mean_epe, return_dic=False):
class_map = {
0: 'barrier',
1: 'bicycle',
2: 'bus',
3: 'car',
4: 'construction_vehicle',
5: 'motorcycle',
6: 'pedestrian',
7: 'traffic_cone',
8: 'trailer',
9: 'truck',
}
x = PrettyTable()
x.field_names = ['class', 'EPE']
class_names = list(class_map.values())
class_epes = mean_epe
dic = {}
for cls_name, cls_iou in zip(class_names, class_epes):
dic[cls_name] = np.round(cls_iou, 3)
x.add_row([cls_name, np.round(cls_iou, 3)])
mean_all_epe = mean_epe.mean()
dic['mean'] = np.round(mean_all_epe, 3)
x.add_row(['mean', np.round(mean_all_epe, 3)])
if return_dic:
return x, dic
else:
return x
================================================
FILE: projects/occ_plugin/utils/gaussian.py
================================================
import numpy as np
import torch
import torch.nn.functional as F
from torch.distributions import Normal
import pdb
def gaussian_2d(shape, sigma=1):
"""Generate gaussian map.
Args:
shape (list[int]): Shape of the map.
sigma (float): Sigma to generate gaussian map.
Defaults to 1.
Returns:
np.ndarray: Generated gaussian map.
"""
m, n = [(ss - 1.) / 2. for ss in shape]
y, x = np.ogrid[-m:m + 1, -n:n + 1]
h = np.exp(-(x * x + y * y) / (2 * sigma * sigma))
h[h < np.finfo(h.dtype).eps * h.max()] = 0
return h
def draw_heatmap_gaussian(heatmap, center, radius, k=1):
"""Get gaussian masked heatmap.
Args:
heatmap (torch.Tensor): Heatmap to be masked.
center (torch.Tensor): Center coord of the heatmap.
radius (int): Radius of gaussian.
k (int): Multiple of masked_gaussian. Defaults to 1.
Returns:
torch.Tensor: Masked heatmap.
"""
diameter = 2 * radius + 1
gaussian = gaussian_2d((diameter, diameter), sigma=diameter / 6)
x, y = int(center[0]), int(center[1])
height, width = heatmap.shape[0:2]
left, right = min(x, radius), min(width - x, radius + 1)
top, bottom = min(y, radius), min(height - y, radius + 1)
masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right]
masked_gaussian = torch.from_numpy(
gaussian[radius - top:radius + bottom,
radius - left:radius + right]).to(heatmap.device,
torch.float32)
if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0:
torch.max(masked_heatmap, masked_gaussian * k, out=masked_heatmap)
return heatmap
def gaussian_radius(det_size, min_overlap=0.5):
"""Get radius of gaussian.
Args:
det_size (tuple[torch.Tensor]): Size of the detection result.
min_overlap (float): Minimum required overlap of gaussians. Defaults to 0.5.
Returns:
torch.Tensor: Computed radius.
"""
height, width = det_size
a1 = 1
b1 = (height + width)
c1 = width * height * (1 - min_overlap) / (1 + min_overlap)
sq1 = torch.sqrt(b1**2 - 4 * a1 * c1)
r1 = (b1 + sq1) / 2
a2 = 4
b2 = 2 * (height + width)
c2 = (1 - min_overlap) * width * height
sq2 = torch.sqrt(b2**2 - 4 * a2 * c2)
r2 = (b2 + sq2) / 2
a3 = 4 * min_overlap
b3 = -2 * min_overlap * (height + width)
c3 = (min_overlap - 1) * width * height
sq3 = torch.sqrt(b3**2 - 4 * a3 * c3)
r3 = (b3 + sq3) / 2
return min(r1, r2, r3)
def generate_guassian_depth_target(depth, stride, cam_depth_range, constant_std=None):
depth = depth.flatten(0, 1) # [bs*s, 6, 896, 1600] -> [bs*s*6, 896, 1600]
B, tH, tW = depth.shape
kernel_size = stride # [4,4,4]
center_idx = kernel_size * kernel_size // 2
H = tH // stride # 896//4 = 248
W = tW // stride # 1600//4 = 400
unfold_depth = F.unfold(depth.unsqueeze(1), kernel_size, dilation=1, padding=0, stride=stride) #B, Cxkxk, HxW
unfold_depth = unfold_depth.view(B, -1, H, W).permute(0, 2, 3, 1).contiguous() # B, H, W, kxk
valid_mask = (unfold_depth != 0) # BN, H, W, kxk
if constant_std is None:
valid_mask_f = valid_mask.float() # BN, H, W, kxk
valid_num = torch.sum(valid_mask_f, dim=-1) # BN, H, W
valid_num[valid_num == 0] = 1e10
mean = torch.sum(unfold_depth, dim=-1) / valid_num
var_sum = torch.sum(((unfold_depth - mean.unsqueeze(-1))**2) * valid_mask_f, dim=-1) # BN, H, W
std_var = torch.sqrt(var_sum / valid_num)
std_var[valid_num == 1] = 1 # set std_var to 1 when only one point in patch
else:
std_var = torch.ones((B, H, W)).type_as(depth).float() * constant_std
unfold_depth[~valid_mask] = 1e10
min_depth = torch.min(unfold_depth, dim=-1)[0] #BN, H, W
min_depth[min_depth == 1e10] = 0
# x in raw depth
x = torch.arange(cam_depth_range[0] - cam_depth_range[2] / 2, cam_depth_range[1], cam_depth_range[2])
# normalized by intervals
dist = Normal(min_depth / cam_depth_range[2], std_var / cam_depth_range[2]) # BN, H, W, D
cdfs = []
for i in x:
cdf = dist.cdf(i)
cdfs.append(cdf)
cdfs = torch.stack(cdfs, dim=-1)
depth_dist = cdfs[..., 1:] - cdfs[...,:-1]
return depth_dist, min_depth
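# A minimal numeric check of the CDF-differencing trick used above:
# successive CDF values of a Normal yield per-bin probabilities that sum
# to at most 1 and peak at the bin containing the mean. The grid and
# parameters below are illustrative assumptions.
if __name__ == '__main__':
    _edges = torch.arange(0.0, 20.0, 0.5)
    _dist = Normal(torch.tensor(10.2), torch.tensor(0.5))
    _cdfs = torch.stack([_dist.cdf(e) for e in _edges])
    _probs = _cdfs[1:] - _cdfs[:-1]
    assert _probs.sum() <= 1.0 + 1e-6
    assert _edges[_probs.argmax()].item() == 10.0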
================================================
FILE: projects/occ_plugin/utils/geometry.py
================================================
import numpy as np
import PIL
import torch
import torch.nn.functional as F
from pyquaternion import Quaternion
def convert_egopose_to_matrix_numpy(trans, rot):
transformation_matrix = np.zeros((4, 4), dtype=np.float32)
rotation = Quaternion(rot).rotation_matrix
translation = np.array(trans)
transformation_matrix[:3, :3] = rotation
transformation_matrix[:3, 3] = translation
transformation_matrix[3, 3] = 1.0
return transformation_matrix
def invert_matrix_egopose_numpy(egopose):
""" Compute the inverse transformation of a 4x4 egopose numpy matrix."""
inverse_matrix = np.zeros((4, 4), dtype=np.float32)
rotation = egopose[:3, :3]
translation = egopose[:3, 3]
inverse_matrix[:3, :3] = rotation.T
inverse_matrix[:3, 3] = -np.dot(rotation.T, translation)
inverse_matrix[3, 3] = 1.0
return inverse_matrix
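# A round-trip sanity check for the two helpers above: composing an
# egopose matrix with its inverse should give the identity. The
# translation and quaternion below are arbitrary assumptions.
if __name__ == '__main__':
    _pose = convert_egopose_to_matrix_numpy(
        trans=[1.0, -2.0, 0.5],
        rot=Quaternion(axis=[0, 0, 1], angle=0.3))
    assert np.allclose(invert_matrix_egopose_numpy(_pose) @ _pose,
                       np.eye(4), atol=1e-6)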
================================================
FILE: projects/occ_plugin/utils/metric_util.py
================================================
# -*- coding:utf-8 -*-
# author: Xinge
# @file: metric_util.py
import numpy as np
def fast_hist(pred, label, n):
k = (label >= 0) & (label < n)
bin_count = np.bincount(
n * label[k].astype(int) + pred[k], minlength=n ** 2)
return bin_count[:n ** 2].reshape(n, n)
def per_class_iu(hist):
return np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist))
def fast_hist_crop(output, target, unique_label):
hist = fast_hist(output.flatten(), target.flatten(), np.max(unique_label) + 2)
hist = hist[unique_label + 1, :]
hist = hist[:, unique_label + 1]
return hist
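# A worked example of fast_hist / per_class_iu on hypothetical label
# arrays with three classes: histogram rows are ground truth, columns
# are predictions, so per_class_iu reduces to tp / (gt + pred - tp).
def _demo_fast_hist():
    pred = np.array([0, 1, 1, 2, 2, 2])
    label = np.array([0, 1, 2, 2, 2, 1])
    hist = fast_hist(pred, label, 3)
    ious = per_class_iu(hist)
    assert hist.sum() == 6
    assert abs(ious[0] - 1.0) < 1e-9  # class 0 predicted perfectly
    return ious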
class SSCMetrics:
def __init__(self, class_names, ignore_idx=255, empty_idx=None):
self.class_names = class_names
self.n_classes = len(class_names)
self.ignore_idx = ignore_idx
self.empty_idx = empty_idx
self.reset()
def hist_info(self, n_cl, pred, gt):
assert pred.shape == gt.shape
k = (gt >= 0) & (gt < n_cl) # exclude 255
labeled = np.sum(k)
correct = np.sum((pred[k] == gt[k]))
return (
np.bincount(
n_cl * gt[k].astype(int) + pred[k].astype(int), minlength=n_cl ** 2
).reshape(n_cl, n_cl),
correct,
labeled,
)
@staticmethod
def compute_score(hist, correct, labeled):
iu = np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist))
mean_IU = np.nanmean(iu)
mean_IU_no_back = np.nanmean(iu[1:])
freq = hist.sum(1) / hist.sum()
freq_IU = (iu[freq > 0] * freq[freq > 0]).sum()
mean_pixel_acc = correct / labeled if labeled != 0 else 0
return iu, mean_IU, mean_IU_no_back, mean_pixel_acc
def add_batch(self, y_pred, y_true, nonsurface=None):
self.count += 1
mask = y_true != self.ignore_idx
if self.empty_idx is not None:
mask = mask & (y_true != self.empty_idx)
if nonsurface is not None:
mask = mask & nonsurface
tp, fp, fn = self.get_score_completion(y_pred, y_true, mask)
self.completion_tp += tp
self.completion_fp += fp
self.completion_fn += fn
mask = y_true != self.ignore_idx
if self.empty_idx is not None:
mask = mask & (y_true != self.empty_idx)
tp_sum, fp_sum, fn_sum = self.get_score_semantic_and_completion(
y_pred, y_true, mask
)
self.tps += tp_sum
self.fps += fp_sum
self.fns += fn_sum
def get_stats(self):
if self.completion_tp != 0:
precision = self.completion_tp / (self.completion_tp + self.completion_fp)
recall = self.completion_tp / (self.completion_tp + self.completion_fn)
iou = self.completion_tp / (
self.completion_tp + self.completion_fp + self.completion_fn
)
else:
precision, recall, iou = 0, 0, 0
iou_ssc = self.tps / (self.tps + self.fps + self.fns + 1e-5)
return {
"precision": precision,
"recall": recall,
"iou": iou,
"iou_ssc": iou_ssc,
"iou_ssc_mean": np.mean(iou_ssc[1:]),
}
def reset(self):
self.completion_tp = 0
self.completion_fp = 0
self.completion_fn = 0
self.tps = np.zeros(self.n_classes)
self.fps = np.zeros(self.n_classes)
self.fns = np.zeros(self.n_classes)
self.hist_ssc = np.zeros((self.n_classes, self.n_classes))
self.labeled_ssc = 0
self.correct_ssc = 0
self.precision = 0
self.recall = 0
self.iou = 0
self.count = 1e-8
self.iou_ssc = np.zeros(self.n_classes, dtype=np.float32)
self.cnt_class = np.zeros(self.n_classes, dtype=np.float32)
def get_score_completion(self, predict, target, nonempty=None):
predict = np.copy(predict)
target = np.copy(target)
"""for scene completion, treat the task as two-classes problem, just empty or occupancy"""
_bs = predict.shape[0] # batch size
# ---- ignore
predict[target == self.ignore_idx] = 0
target[target == self.ignore_idx] = 0
# ---- flatten
target = target.reshape(_bs, -1) # (_bs, 129600)
predict = predict.reshape(_bs, -1) # (_bs, 129600), 60*36*60=129600
# ---- treat all non-empty object class as one category, set them to label 1
b_pred = np.zeros(predict.shape)
b_true = np.zeros(target.shape)
b_pred[predict != self.empty_idx] = 1
b_true[target != self.empty_idx] = 1
p, r, iou = 0.0, 0.0, 0.0
tp_sum, fp_sum, fn_sum = 0, 0, 0
for idx in range(_bs):
y_true = b_true[idx, :] # GT
y_pred = b_pred[idx, :]
if nonempty is not None:
nonempty_idx = nonempty[idx, :].reshape(-1)
y_true = y_true[nonempty_idx == 1]
y_pred = y_pred[nonempty_idx == 1]
tp = np.array(np.where(np.logical_and(y_true == 1, y_pred == 1))).size
fp = np.array(np.where(np.logical_and(y_true != 1, y_pred == 1))).size
fn = np.array(np.where(np.logical_and(y_true == 1, y_pred != 1))).size
tp_sum += tp
fp_sum += fp
fn_sum += fn
return tp_sum, fp_sum, fn_sum
def get_score_semantic_and_completion(self, predict, target, nonempty=None):
target = np.copy(target)
predict = np.copy(predict)
_bs = predict.shape[0] # batch size
_C = self.n_classes # _C = 12
# ---- ignore
predict[target == self.ignore_idx] = 0
target[target == self.ignore_idx] = 0
# ---- flatten
target = target.reshape(_bs, -1) # (_bs, 129600)
predict = predict.reshape(_bs, -1) # (_bs, 129600), 60*36*60=129600
cnt_class = np.zeros(_C, dtype=np.int32) # count for each class
iou_sum = np.zeros(_C, dtype=np.float32) # sum of iou for each class
tp_sum = np.zeros(_C, dtype=np.int32) # tp
fp_sum = np.zeros(_C, dtype=np.int32) # fp
fn_sum = np.zeros(_C, dtype=np.int32) # fn
for idx in range(_bs):
y_true = target[idx, :] # GT
y_pred = predict[idx, :]
if nonempty is not None:
nonempty_idx = nonempty[idx, :].reshape(-1)
y_pred = y_pred[
np.where(np.logical_and(nonempty_idx == 1, y_true != self.ignore_idx))
]
y_true = y_true[
np.where(np.logical_and(nonempty_idx == 1, y_true != self.ignore_idx))
]
for j in range(_C): # for each class
tp = np.array(np.where(np.logical_and(y_true == j, y_pred == j))).size
fp = np.array(np.where(np.logical_and(y_true != j, y_pred == j))).size
fn = np.array(np.where(np.logical_and(y_true == j, y_pred != j))).size
tp_sum[j] += tp
fp_sum[j] += fp
fn_sum[j] += fn
return tp_sum, fp_sum, fn_sum
================================================
FILE: projects/occ_plugin/utils/nusc_param.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
nusc_class_frequencies = np.array([2242961742295, 25985376, 1561108, 28862014, 196106643, 15920504,
2158753, 26539491, 4004729, 34838681, 75173306, 2255027978, 50959399, 646022466, 869055679,
1446141335, 1724391378])
nusc_class_names = [
"empty",
"barrier",
"bicycle",
"bus",
"car",
"construction",
"motorcycle",
"pedestrian",
"trafficcone",
"trailer",
"truck",
"driveable_surface",
"other",
"sidewalk",
"terrain",
"mannade",
"vegetation",
]
classname_to_color = { # RGB.
# 0: (0, 0, 0), # Black. noise
1: (112, 128, 144), # Slategrey barrier
2: (220, 20, 60), # Crimson bicycle
3: (255, 127, 80), # Orangered bus
4: (255, 158, 0), # Orange car
5: (233, 150, 70), # Darksalmon construction
6: (255, 61, 99), # Red motorcycle
7: (0, 0, 230), # Blue pedestrian
8: (47, 79, 79), # Darkslategrey trafficcone
9: (255, 140, 0), # Darkorange trailer
10: (255, 99, 71), # Tomato truck
11: (0, 207, 191), # nuTonomy green driveable_surface
12: (175, 0, 75), # flat other
13: (75, 0, 75), # sidewalk
14: (112, 180, 60), # terrain
15: (222, 184, 135), # Burlywood manmade
16: (0, 175, 0), # Green vegetation
}
def KL_sep(p, target):
"""
KL divergence on nonzeros classes
"""
nonzeros = target != 0
nonzero_p = p[nonzeros]
kl_term = F.kl_div(torch.log(nonzero_p), target[nonzeros], reduction="sum")
return kl_term
def geo_scal_loss(pred, ssc_target):
# Get softmax probabilities
pred = F.softmax(pred, dim=1)
# Compute empty and nonempty probabilities
empty_probs = pred[:, 0, :, :, :]
nonempty_probs = 1 - empty_probs
# Remove unknown voxels
mask = ssc_target != 255
nonempty_target = ssc_target != 0
nonempty_target = nonempty_target[mask].float()
nonempty_probs = nonempty_probs[mask]
empty_probs = empty_probs[mask]
intersection = (nonempty_target * nonempty_probs).sum()
precision = intersection / nonempty_probs.sum()
recall = intersection / nonempty_target.sum()
spec = ((1 - nonempty_target) * (empty_probs)).sum() / (1 - nonempty_target).sum()
return (
F.binary_cross_entropy(precision, torch.ones_like(precision))
+ F.binary_cross_entropy(recall, torch.ones_like(recall))
+ F.binary_cross_entropy(spec, torch.ones_like(spec))
)
def sem_scal_loss(pred, ssc_target):
# Get softmax probabilities
pred = F.softmax(pred, dim=1)
loss = 0
count = 0
mask = ssc_target != 255
n_classes = pred.shape[1]
for i in range(0, n_classes):
# Get probability of class i
p = pred[:, i, :, :, :]
# Remove unknown voxels
target_ori = ssc_target
p = p[mask]
target = ssc_target[mask]
completion_target = torch.ones_like(target)
completion_target[target != i] = 0
completion_target_ori = torch.ones_like(target_ori).float()
completion_target_ori[target_ori != i] = 0
if torch.sum(completion_target) > 0:
count += 1.0
nominator = torch.sum(p * completion_target)
loss_class = 0
if torch.sum(p) > 0:
precision = nominator / (torch.sum(p))
loss_precision = F.binary_cross_entropy(
precision, torch.ones_like(precision)
)
loss_class += loss_precision
if torch.sum(completion_target) > 0:
recall = nominator / (torch.sum(completion_target))
loss_recall = F.binary_cross_entropy(recall, torch.ones_like(recall))
loss_class += loss_recall
if torch.sum(1 - completion_target) > 0:
specificity = torch.sum((1 - p) * (1 - completion_target)) / (
torch.sum(1 - completion_target)
)
loss_specificity = F.binary_cross_entropy(
specificity, torch.ones_like(specificity)
)
loss_class += loss_specificity
loss += loss_class
return loss / count
def CE_ssc_loss(pred, target, class_weights):
"""
:param: prediction: the predicted tensor, must be [BS, C, H, W, D]
"""
criterion = nn.CrossEntropyLoss(
weight=class_weights, ignore_index=255, reduction="mean"
)
loss = criterion(pred, target.long())
return loss
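# A minimal usage sketch for CE_ssc_loss with hypothetical shapes:
# logits [BS, C, H, W, D] against integer targets [BS, H, W, D]; voxels
# labelled 255 are excluded via ignore_index.
if __name__ == '__main__':
    _pred = torch.randn(1, 17, 4, 4, 2)
    _target = torch.randint(0, 17, (1, 4, 4, 2))
    _target[0, 0, 0, 0] = 255  # ignored voxel
    print(CE_ssc_loss(_pred, _target, torch.ones(17)))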
================================================
FILE: projects/occ_plugin/utils/semkitti.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
semantic_kitti_class_frequencies = np.array(
[
5.41773033e09,
1.57835390e07,
1.25136000e05,
1.18809000e05,
6.46799000e05,
8.21951000e05,
2.62978000e05,
2.83696000e05,
2.04750000e05,
6.16887030e07,
4.50296100e06,
4.48836500e07,
2.26992300e06,
5.68402180e07,
1.57196520e07,
1.58442623e08,
2.06162300e06,
3.69705220e07,
1.15198800e06,
3.34146000e05,
]
)
kitti_class_names = [
"empty",
"car",
"bicycle",
"motorcycle",
"truck",
"other-vehicle",
"person",
"bicyclist",
"motorcyclist",
"road",
"parking",
"sidewalk",
"other-ground",
"building",
"fence",
"vegetation",
"trunk",
"terrain",
"pole",
"traffic-sign",
]
def KL_sep(p, target):
"""
KL divergence on nonzeros classes
"""
nonzeros = target != 0
nonzero_p = p[nonzeros]
kl_term = F.kl_div(torch.log(nonzero_p), target[nonzeros], reduction="sum")
return kl_term
def geo_scal_loss(pred, ssc_target, ignore_index=255, non_empty_idx=0):
# Get softmax probabilities
pred = F.softmax(pred, dim=1)
# Compute empty and nonempty probabilities
empty_probs = pred[:, non_empty_idx]
nonempty_probs = 1 - empty_probs
# Remove unknown voxels
mask = ssc_target != ignore_index
nonempty_target = ssc_target != non_empty_idx
nonempty_target = nonempty_target[mask].float()
nonempty_probs = nonempty_probs[mask]
empty_probs = empty_probs[mask]
eps = 1e-5
intersection = (nonempty_target * nonempty_probs).sum()
precision = intersection / (nonempty_probs.sum()+eps)
recall = intersection / (nonempty_target.sum()+eps)
spec = ((1 - nonempty_target) * (empty_probs)).sum() / ((1 - nonempty_target).sum()+eps)
return (
F.binary_cross_entropy(precision, torch.ones_like(precision))
+ F.binary_cross_entropy(recall, torch.ones_like(recall))
+ F.binary_cross_entropy(spec, torch.ones_like(spec))
)
def sem_scal_loss(pred, ssc_target, ignore_index=255):
# Get softmax probabilities
pred = F.softmax(pred, dim=1)
loss = 0
count = 0
mask = ssc_target != ignore_index
n_classes = pred.shape[1]
for i in range(0, n_classes):
# Get probability of class i
p = pred[:, i]
# Remove unknown voxels
target_ori = ssc_target
p = p[mask]
target = ssc_target[mask]
completion_target = torch.ones_like(target)
completion_target[target != i] = 0
completion_target_ori = torch.ones_like(target_ori).float()
completion_target_ori[target_ori != i] = 0
if torch.sum(completion_target) > 0:
count += 1.0
nominator = torch.sum(p * completion_target)
loss_class = 0
if torch.sum(p) > 0:
precision = nominator / (torch.sum(p))
loss_precision = F.binary_cross_entropy(
precision, torch.ones_like(precision)
)
loss_class += loss_precision
if torch.sum(completion_target) > 0:
recall = nominator / (torch.sum(completion_target))
loss_recall = F.binary_cross_entropy(recall, torch.ones_like(recall))
loss_class += loss_recall
if torch.sum(1 - completion_target) > 0:
specificity = torch.sum((1 - p) * (1 - completion_target)) / (
torch.sum(1 - completion_target)
)
loss_specificity = F.binary_cross_entropy(
specificity, torch.ones_like(specificity)
)
loss_class += loss_specificity
loss += loss_class
return loss / count
def CE_ssc_loss(pred, target, class_weights=None, ignore_index=255):
"""
    :param pred: the predicted tensor, must be [BS, C, ...]
"""
criterion = nn.CrossEntropyLoss(
weight=class_weights, ignore_index=ignore_index, reduction="mean"
)
loss = criterion(pred, target.long())
return loss
def Smooth_L1_loss(pred, target, ignore_index=255):
# pred/target B, H, W, D, 3
kept = (target[:, :, :, :, 0] != ignore_index) & (target[:, :, :, :, 1] != ignore_index) & (target[:, :, :, :, 2] != ignore_index)
    criterion = nn.SmoothL1Loss(reduction="mean")
    loss = criterion(pred[kept], target[kept])
    if torch.isnan(loss):
        # no voxel survives the ignore mask: return a zero loss that still
        # carries gradients instead of propagating NaN
        pred = pred * 0
        target = target * 0
        loss = criterion(pred, target)
    return loss
def vel_loss(pred, gt):
return F.l1_loss(pred, gt)
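# --- Hypothetical usage sketch (not part of the original module): exercises
# geo_scal_loss on random occupancy logits and shows how Smooth_L1_loss masks
# out voxels whose target flow equals the ignore value.
if __name__ == "__main__":
    torch.manual_seed(0)
    logits = torch.randn(1, 2, 4, 4, 4)         # [BS, C, H, W, D]
    target = torch.randint(0, 2, (1, 4, 4, 4))  # per-voxel class ids
    print("geo_scal:", geo_scal_loss(logits, target).item())
    pred_flow = torch.randn(1, 2, 2, 2, 3)      # B, H, W, D, 3
    gt_flow = torch.randn(1, 2, 2, 2, 3)
    gt_flow[0, 0, 0, 0] = 255                   # this voxel is dropped by `kept`
    print("smooth_l1:", Smooth_L1_loss(pred_flow, gt_flow).item())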
================================================
FILE: projects/occ_plugin/utils/voxel_to_points.py
================================================
import open3d as o3d
import numpy as np
def query_points_from_voxels(pred, gt, img_metas):
# pred, [tensor of shape (num_class, x, y, z)]: predicted classes
# gt, [tensor of shape (batch, num_points)]: target points with semantic labels
    # convert logits to predicted class ids
pred = np.argmax(pred.detach().cpu().numpy(), axis=0)
gt_ = gt.detach().cpu().numpy()
pred_fore_mask = pred > 0
if pred_fore_mask.sum() == 0:
return None
# select foreground 3d voxel vertex
x = np.linspace(0, pred.shape[0] - 1, pred.shape[0])
y = np.linspace(0, pred.shape[1] - 1, pred.shape[1])
z = np.linspace(0, pred.shape[2] - 1, pred.shape[2])
X, Y, Z = np.meshgrid(x, y, z, indexing='ij')
vv = np.stack([X, Y, Z], axis=-1)
# foreground predictions & coordinates
pred = pred[pred_fore_mask]
vv = vv[pred_fore_mask]
vv[:, 0] = (vv[:, 0] + 0.5) * (img_metas['pc_range'][3] - img_metas['pc_range'][0]) / img_metas['occ_size'][0] + img_metas['pc_range'][0]
vv[:, 1] = (vv[:, 1] + 0.5) * (img_metas['pc_range'][4] - img_metas['pc_range'][1]) / img_metas['occ_size'][1] + img_metas['pc_range'][1]
vv[:, 2] = (vv[:, 2] + 0.5) * (img_metas['pc_range'][5] - img_metas['pc_range'][2]) / img_metas['occ_size'][2] + img_metas['pc_range'][2]
pcd = o3d.geometry.PointCloud()
pcd.points = o3d.utility.Vector3dVector(vv)
# for every lidar point, search its nearest *foreground* voxel vertex as the semantic prediction
kdtree = o3d.geometry.KDTreeFlann(pcd)
indices = []
for vert in gt_[:, :3]:
_, inds, _ = kdtree.search_knn_vector_3d(vert, 1)
indices.append(inds[0])
pred_valid = pred[np.array(indices)]
return pred_valid
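# --- Hypothetical sanity check (not part of the original module): queries
# per-point labels from a random 2-class voxel grid; assumes torch and open3d
# are installed and uses a symmetric [-1, 1] point-cloud range.
if __name__ == "__main__":
    import torch
    logits = torch.randn(2, 4, 4, 4)            # (num_class, x, y, z)
    points = torch.rand(8, 4) * 2.0 - 1.0       # lidar points, first 3 cols are xyz
    metas = {"pc_range": [-1, -1, -1, 1, 1, 1], "occ_size": [4, 4, 4]}
    print("per-point classes:", query_points_from_voxels(logits, points, metas))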
================================================
FILE: run.sh
================================================
echo "-------------"
echo "load config from local path:" $1
if [ -f $1 ]; then
config=$1
else
echo "need a config file"
exit
fi
bash tools/dist_train.sh $config $2 ${@:3}
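# Example invocation (assumed config path from this repo's config tree):
#   bash run.sh ./projects/configs/baselines/OCFNet_in_Cam4DOcc_V1.1.py 8
# where the second argument is the GPU count forwarded to tools/dist_train.sh.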
================================================
FILE: run_eval.sh
================================================
echo "-------------"
echo "load config from local path:" $1
if [ -f $1 ]; then
config=$1
else
echo "need a config file"
exit
fi
export PYTHONPATH="."
ckpt=$2
gpu=$3
bash tools/dist_test.sh $config $ckpt $gpu ${@:4}
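# Example invocation (the checkpoint path is an assumption):
#   bash run_eval.sh ./projects/configs/baselines/OCFNet_in_Cam4DOcc_V1.1.py \
#       ./work_dirs/OCFNet_in_Cam4DOcc_V1.1/epoch_20.pth 8
# i.e. config, checkpoint, and GPU count are forwarded to tools/dist_test.sh.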
================================================
FILE: setup.py
================================================
from setuptools import find_packages, setup
import os
import torch
from os import path as osp
from torch.utils.cpp_extension import (BuildExtension, CppExtension,
CUDAExtension)
def make_cuda_ext(name,
module,
sources,
sources_cuda=[],
extra_args=[],
extra_include_path=[]):
define_macros = []
extra_compile_args = {'cxx': [] + extra_args}
if torch.cuda.is_available() or os.getenv('FORCE_CUDA', '0') == '1':
define_macros += [('WITH_CUDA', None)]
extension = CUDAExtension
extra_compile_args['nvcc'] = extra_args + [
'-D__CUDA_NO_HALF_OPERATORS__',
'-D__CUDA_NO_HALF_CONVERSIONS__',
'-D__CUDA_NO_HALF2_OPERATORS__',
]
sources += sources_cuda
else:
print('Compiling {} without CUDA'.format(name))
extension = CppExtension
# raise EnvironmentError('CUDA is required to compile MMDetection!')
return extension(
name='{}.{}'.format(module, name),
sources=[os.path.join(*module.split('.'), p) for p in sources],
include_dirs=extra_include_path,
define_macros=define_macros,
extra_compile_args=extra_compile_args)
if __name__ == '__main__':
# add_mim_extention()
setup(
name='OpenOccupancy',
version='0.0',
description=("OpenOccupancy: A Large Scale Benchmark for Surrounding Semantic Occupancy Perception"),
author='OpenOccupancy Contributors',
author_email='wangxiaofeng2020@ia.ac.cn',
keywords='Occupancy Perception',
packages=find_packages(),
include_package_data=True,
package_data={'projects.occ_plugin.ops': ['*/*.so']},
classifiers=[
"Development Status :: 4 - Beta",
"License :: OSI Approved :: Apache Software License",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
],
license="Apache License 2.0",
ext_modules=[
make_cuda_ext(
name="occ_pool_ext",
module="projects.occ_plugin.ops.occ_pooling",
sources=[
"src/occ_pool.cpp",
"src/occ_pool_cuda.cu",
]),
],
cmdclass={'build_ext': BuildExtension})
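# Typical build step (an assumption based on the extension above, not a
# documented command of this repo): compile occ_pool_ext in place with
#   python setup.py develop
# Set FORCE_CUDA=1 to build the CUDA kernels on a machine where no GPU is
# visible at build time (e.g. inside a build container).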
================================================
FILE: tools/dist_test.sh
================================================
#!/usr/bin/env bash
CONFIG=$1
CHECKPOINT=$2
GPUS=$3
PORT=${PORT:-29504}
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
python -m torch.distributed.run --nproc_per_node=$GPUS --master_port=$PORT \
$(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4} --deterministic --eval bbox
================================================
FILE: tools/dist_train.sh
================================================
#!/usr/bin/env bash
CONFIG=$1
GPUS=$2
NNODES=${NNODES:-1}
NODE_RANK=${NODE_RANK:-0}
PORT=${PORT:-29501}
MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
python -m torch.distributed.run \
--nnodes=$NNODES \
--node_rank=$NODE_RANK \
--master_addr=$MASTER_ADDR \
--nproc_per_node=$GPUS \
--master_port=$PORT \
$(dirname "$0")/train.py \
$CONFIG \
--seed 2 \
    --launcher pytorch ${@:3}
    # To resume from a checkpoint, append e.g.:
    #   --resume ./work_dirs/OCFNet_in_Cam4DOcc_V1.2/epoch_15.pth
================================================
FILE: tools/gen_data/gen_depth_gt.py
================================================
import os
from multiprocessing import Pool
import mmcv
import numpy as np
from nuscenes.utils.data_classes import LidarPointCloud
from nuscenes.utils.geometry_utils import view_points
from pyquaternion import Quaternion
import copy
# https://github.com/nutonomy/nuscenes-devkit/blob/master/python-sdk/nuscenes/nuscenes.py#L834
def map_pointcloud_to_image(
pc,
im,
lidar2ego_translation,
lidar2ego_rotation,
ego2global_translation,
ego2global_rotation,
sensor2ego_translation,
sensor2ego_rotation,
cam_ego2global_translation,
cam_ego2global_rotation,
cam_intrinsic,
min_dist: float = 0.0,
):
# Points live in the point sensor frame. So they need to be
# transformed via global to the image plane.
# First step: transform the pointcloud to the ego vehicle
# frame for the timestamp of the sweep.
pc = LidarPointCloud(pc.T)
pc.rotate(Quaternion(lidar2ego_rotation).rotation_matrix)
pc.translate(np.array(lidar2ego_translation))
# Second step: transform from ego to the global frame.
pc.rotate(Quaternion(ego2global_rotation).rotation_matrix)
pc.translate(np.array(ego2global_translation))
# Third step: transform from global into the ego vehicle
# frame for the timestamp of the image.
pc.translate(-np.array(cam_ego2global_translation))
pc.rotate(Quaternion(cam_ego2global_rotation).rotation_matrix.T)
# Fourth step: transform from ego into the camera.
pc.translate(-np.array(sensor2ego_translation))
pc.rotate(Quaternion(sensor2ego_rotation).rotation_matrix.T)
# Fifth step: actually take a "picture" of the point cloud.
# Grab the depths (camera frame z axis points away from the camera).
depths = pc.points[2, :]
coloring = depths
# Take the actual picture (matrix multiplication with camera-matrix
# + renormalization).
points = view_points(pc.points[:3, :],
cam_intrinsic,
normalize=True)
# Remove points that are either outside or behind the camera.
# Leave a margin of 1 pixel for aesthetic reasons. Also make
# sure points are at least 1m in front of the camera to avoid
# seeing the lidar points on the camera casing for non-keyframes
# which are slightly out of sync.
mask = np.ones(depths.shape[0], dtype=bool)
mask = np.logical_and(mask, depths > min_dist)
mask = np.logical_and(mask, points[0, :] > 1)
mask = np.logical_and(mask, points[0, :] < im.shape[1] - 1)
mask = np.logical_and(mask, points[1, :] > 1)
mask = np.logical_and(mask, points[1, :] < im.shape[0] - 1)
points = points[:, mask]
coloring = coloring[mask]
return points, coloring
data_root = './data/nuscenes'
info_path_train = './data/nuscenes/nuscenes_occ_infos_train.pkl'
info_path_val = './data/nuscenes/nuscenes_occ_infos_val.pkl'
# data3d_nusc = NuscMVDetData()
lidar_key = 'LIDAR_TOP'
cam_keys = [
'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_RIGHT',
'CAM_BACK', 'CAM_BACK_LEFT'
]
def worker(info):
lidar_path = info['lidar_path']
points = np.fromfile(lidar_path,
dtype=np.float32,
count=-1).reshape(-1, 5)[..., :4]
lidar2ego_translation = info['lidar2ego_translation']
lidar2ego_rotation = info['lidar2ego_rotation']
ego2global_translation = info['ego2global_translation']
ego2global_rotation = info['ego2global_rotation']
for i, cam_key in enumerate(cam_keys):
sensor2ego_translation = info['cams'][cam_key]['sensor2ego_translation']
sensor2ego_rotation = info['cams'][cam_key]['sensor2ego_rotation']
cam_ego2global_translation = info['cams'][cam_key]['ego2global_translation']
cam_ego2global_rotation = info['cams'][cam_key]['ego2global_rotation']
cam_intrinsic = info['cams'][cam_key]['cam_intrinsic']
img = mmcv.imread(
os.path.join(info['cams'][cam_key]['data_path']))
pts_img, depth = map_pointcloud_to_image(
points.copy(), img,
copy.deepcopy(lidar2ego_translation),
copy.deepcopy(lidar2ego_rotation),
copy.deepcopy(ego2global_translation),
copy.deepcopy(ego2global_rotation),
copy.deepcopy(sensor2ego_translation),
copy.deepcopy(sensor2ego_rotation),
copy.deepcopy(cam_ego2global_translation),
copy.deepcopy(cam_ego2global_rotation),
copy.deepcopy(cam_intrinsic))
file_name = os.path.split(info['cams'][cam_key]['data_path'])[-1]
np.concatenate([pts_img[:2, :].T, depth[:, None]],
axis=1).astype(np.float32).flatten().tofile(
os.path.join('./data', 'depth_gt',
f'{file_name}.bin'))
if __name__ == '__main__':
po = Pool(12)
mmcv.mkdir_or_exist(os.path.join('./data', 'depth_gt'))
infos = mmcv.load(info_path_train)['infos']
for info in infos:
po.apply_async(func=worker, args=(info, ))
po.close()
po.join()
po2 = Pool(12)
infos = mmcv.load(info_path_val)['infos']
for info in infos:
po2.apply_async(func=worker, args=(info, ))
po2.close()
po2.join()
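# Output format note: each ./data/depth_gt/<image_name>.bin written by worker()
# stores the projected lidar points as flattened float32 triples (u, v, depth)
# in image coordinates, one triple per point that survives the frustum mask.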
================================================
FILE: tools/misc/browse_dataset.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import numpy as np
import warnings
from mmcv import Config, DictAction, mkdir_or_exist, track_iter_progress
from os import path as osp
from mmdet3d.core.bbox import (Box3DMode, CameraInstance3DBoxes, Coord3DMode,
DepthInstance3DBoxes, LiDARInstance3DBoxes)
from mmdet3d.core.visualizer import (show_multi_modality_result, show_result,
show_seg_result)
from mmdet3d.datasets import build_dataset
def parse_args():
parser = argparse.ArgumentParser(description='Browse a dataset')
parser.add_argument('config', help='train config file path')
parser.add_argument(
'--skip-type',
type=str,
nargs='+',
default=['Normalize'],
help='skip some useless pipeline')
parser.add_argument(
'--output-dir',
default=None,
type=str,
help='If there is no display interface, you can save it')
parser.add_argument(
'--task',
type=str,
choices=['det', 'seg', 'multi_modality-det', 'mono-det'],
help='Determine the visualization method depending on the task.')
parser.add_argument(
'--online',
action='store_true',
help='Whether to perform online visualization. Note that you often '
'need a monitor to do so.')
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. If the value to '
'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
'Note that the quotation marks are necessary and that no white space '
'is allowed.')
args = parser.parse_args()
return args
def build_data_cfg(config_path, skip_type, cfg_options):
"""Build data config for loading visualization data."""
cfg = Config.fromfile(config_path)
if cfg_options is not None:
cfg.merge_from_dict(cfg_options)
# import modules from string list.
if cfg.get('custom_imports', None):
from mmcv.utils import import_modules_from_strings
import_modules_from_strings(**cfg['custom_imports'])
# extract inner dataset of `RepeatDataset` as `cfg.data.train`
# so we don't need to worry about it later
if cfg.data.train['type'] == 'RepeatDataset':
cfg.data.train = cfg.data.train.dataset
# use only first dataset for `ConcatDataset`
if cfg.data.train['type'] == 'ConcatDataset':
cfg.data.train = cfg.data.train.datasets[0]
train_data_cfg = cfg.data.train
# eval_pipeline purely consists of loading functions
# use eval_pipeline for data loading
train_data_cfg['pipeline'] = [
x for x in cfg.eval_pipeline if x['type'] not in skip_type
]
return cfg
def to_depth_mode(points, bboxes):
"""Convert points and bboxes to Depth Coord and Depth Box mode."""
if points is not None:
points = Coord3DMode.convert_point(points.copy(), Coord3DMode.LIDAR,
Coord3DMode.DEPTH)
if bboxes is not None:
bboxes = Box3DMode.convert(bboxes.clone(), Box3DMode.LIDAR,
Box3DMode.DEPTH)
return points, bboxes
def show_det_data(idx, dataset, out_dir, filename, show=False):
"""Visualize 3D point cloud and 3D bboxes."""
example = dataset.prepare_train_data(idx)
points = example['points']._data.numpy()
gt_bboxes = dataset.get_ann_info(idx)['gt_bboxes_3d'].tensor
if dataset.box_mode_3d != Box3DMode.DEPTH:
points, gt_bboxes = to_depth_mode(points, gt_bboxes)
show_result(
points,
gt_bboxes.clone(),
None,
out_dir,
filename,
show=show,
snapshot=True)
def show_seg_data(idx, dataset, out_dir, filename, show=False):
"""Visualize 3D point cloud and segmentation mask."""
example = dataset.prepare_train_data(idx)
points = example['points']._data.numpy()
gt_seg = example['pts_semantic_mask']._data.numpy()
show_seg_result(
points,
gt_seg.copy(),
None,
out_dir,
filename,
np.array(dataset.PALETTE),
dataset.ignore_index,
show=show,
snapshot=True)
def show_proj_bbox_img(idx,
dataset,
out_dir,
filename,
show=False,
is_nus_mono=False):
"""Visualize 3D bboxes on 2D image by projection."""
try:
example = dataset.prepare_train_data(idx)
except AttributeError: # for Mono-3D datasets
example = dataset.prepare_train_img(idx)
gt_bboxes = dataset.get_ann_info(idx)['gt_bboxes_3d']
img_metas = example['img_metas']._data
img = example['img']._data.numpy()
# need to transpose channel to first dim
img = img.transpose(1, 2, 0)
# no 3D gt bboxes, just show img
if gt_bboxes.tensor.shape[0] == 0:
gt_bboxes = None
if isinstance(gt_bboxes, DepthInstance3DBoxes):
show_multi_modality_result(
img,
gt_bboxes,
None,
None,
out_dir,
filename,
box_mode='depth',
img_metas=img_metas,
show=show)
elif isinstance(gt_bboxes, LiDARInstance3DBoxes):
show_multi_modality_result(
img,
gt_bboxes,
None,
img_metas['lidar2img'],
out_dir,
filename,
box_mode='lidar',
img_metas=img_metas,
show=show)
elif isinstance(gt_bboxes, CameraInstance3DBoxes):
show_multi_modality_result(
img,
gt_bboxes,
None,
img_metas['cam2img'],
out_dir,
filename,
box_mode='camera',
img_metas=img_metas,
show=show)
else:
# can't project, just show img
warnings.warn(
f'unrecognized gt box type {type(gt_bboxes)}, only show image')
show_multi_modality_result(
img, None, None, None, out_dir, filename, show=show)
def main():
args = parse_args()
if args.output_dir is not None:
mkdir_or_exist(args.output_dir)
cfg = build_data_cfg(args.config, args.skip_type, args.cfg_options)
try:
dataset = build_dataset(
cfg.data.train, default_args=dict(filter_empty_gt=False))
except TypeError: # seg dataset doesn't have `filter_empty_gt` key
dataset = build_dataset(cfg.data.train)
data_infos = dataset.data_infos
dataset_type = cfg.dataset_type
# configure visualization mode
vis_task = args.task # 'det', 'seg', 'multi_modality-det', 'mono-det'
for idx, data_info in enumerate(track_iter_progress(data_infos)):
if dataset_type in ['KittiDataset', 'WaymoDataset']:
data_path = data_info['point_cloud']['velodyne_path']
elif dataset_type in [
'ScanNetDataset', 'SUNRGBDDataset', 'ScanNetSegDataset',
'S3DISSegDataset', 'S3DISDataset'
]:
data_path = data_info['pts_path']
elif dataset_type in ['NuScenesDataset', 'LyftDataset']:
data_path = data_info['lidar_path']
elif dataset_type in ['NuScenesMonoDataset']:
data_path = data_info['file_name']
else:
raise NotImplementedError(
f'unsupported dataset type {dataset_type}')
file_name = osp.splitext(osp.basename(data_path))[0]
if vis_task in ['det', 'multi_modality-det']:
# show 3D bboxes on 3D point clouds
show_det_data(
idx, dataset, args.output_dir, file_name, show=args.online)
if vis_task in ['multi_modality-det', 'mono-det']:
# project 3D bboxes to 2D image
show_proj_bbox_img(
idx,
dataset,
args.output_dir,
file_name,
show=args.online,
is_nus_mono=(dataset_type == 'NuScenesMonoDataset'))
elif vis_task in ['seg']:
# show 3D segmentation mask on 3D point clouds
show_seg_data(
idx, dataset, args.output_dir, file_name, show=args.online)
if __name__ == '__main__':
main()
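# Example invocation (the config path is an assumption; any train config that
# defines an eval_pipeline works):
#   python tools/misc/browse_dataset.py projects/configs/datasets/custom_nus-3d.py \
#       --task det --output-dir ./browse_vis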
================================================
FILE: tools/misc/fuse_conv_bn.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import torch
from mmcv.runner import save_checkpoint
from torch import nn as nn
from mmdet3d.apis import init_model  # init_model lives in mmdet3d, not mmdet
def fuse_conv_bn(conv, bn):
"""During inference, the functionary of batch norm layers is turned off but
only the mean and var alone channels are used, which exposes the chance to
fuse it with the preceding conv layers to save computations and simplify
network structures."""
conv_w = conv.weight
conv_b = conv.bias if conv.bias is not None else torch.zeros_like(
bn.running_mean)
factor = bn.weight / torch.sqrt(bn.running_var + bn.eps)
conv.weight = nn.Parameter(conv_w *
factor.reshape([conv.out_channels, 1, 1, 1]))
conv.bias = nn.Parameter((conv_b - bn.running_mean) * factor + bn.bias)
return conv
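# Why the fusion above is exact: BN computes
#   y = gamma * (conv(x) - mean) / sqrt(var + eps) + beta.
# Substituting conv(x) = W * x + b and letting factor = gamma / sqrt(var + eps)
# gives y = (W * factor) * x + (b - mean) * factor + beta, which is precisely
# the fused weight and bias assigned in fuse_conv_bn().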
def fuse_module(m):
last_conv = None
last_conv_name = None
for name, child in m.named_children():
if isinstance(child, (nn.BatchNorm2d, nn.SyncBatchNorm)):
if last_conv is None: # only fuse BN that is after Conv
continue
fused_conv = fuse_conv_bn(last_conv, child)
m._modules[last_conv_name] = fused_conv
# To reduce changes, set BN as Identity instead of deleting it.
m._modules[name] = nn.Identity()
last_conv = None
elif isinstance(child, nn.Conv2d):
last_conv = child
last_conv_name = name
else:
fuse_module(child)
return m
def parse_args():
parser = argparse.ArgumentParser(
description='fuse Conv and BN layers in a model')
parser.add_argument('config', help='config file path')
parser.add_argument('checkpoint', help='checkpoint file path')
parser.add_argument('out', help='output path of the converted model')
args = parser.parse_args()
return args
def main():
args = parse_args()
# build the model from a config file and a checkpoint file
model = init_model(args.config, args.checkpoint)
# fuse conv and bn layers of the model
fused_model = fuse_module(model)
save_checkpoint(fused_model, args.out)
if __name__ == '__main__':
main()
================================================
FILE: tools/misc/print_config.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
from mmcv import Config, DictAction
def parse_args():
parser = argparse.ArgumentParser(description='Print the whole config')
parser.add_argument('config', help='config file path')
parser.add_argument(
'--options', nargs='+', action=DictAction, help='arguments in dict')
args = parser.parse_args()
return args
def main():
args = parse_args()
cfg = Config.fromfile(args.config)
if args.options is not None:
cfg.merge_from_dict(args.options)
print(f'Config:\n{cfg.pretty_text}')
if __name__ == '__main__':
main()
================================================
FILE: tools/misc/visualize_results.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import mmcv
from mmcv import Config
from mmdet3d.datasets import build_dataset
def parse_args():
parser = argparse.ArgumentParser(
description='MMDet3D visualize the results')
parser.add_argument('config', help='test config file path')
parser.add_argument('--result', help='results file in pickle format')
parser.add_argument(
'--show-dir', help='directory where visualize results will be saved')
args = parser.parse_args()
return args
def main():
args = parse_args()
if args.result is not None and \
not args.result.endswith(('.pkl', '.pickle')):
raise ValueError('The results file must be a pkl file.')
cfg = Config.fromfile(args.config)
cfg.data.test.test_mode = True
# build the dataset
dataset = build_dataset(cfg.data.test)
results = mmcv.load(args.result)
if getattr(dataset, 'show', None) is not None:
# data loading pipeline for showing
eval_pipeline = cfg.get('eval_pipeline', {})
if eval_pipeline:
dataset.show(results, args.show_dir, pipeline=eval_pipeline)
else:
dataset.show(results, args.show_dir) # use default pipeline
else:
raise NotImplementedError(
'Show is not implemented for dataset {}!'.format(
type(dataset).__name__))
if __name__ == '__main__':
main()
================================================
FILE: tools/test.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
# Modified by Junyi Ma, following OpenOccupancy by Zhiqi Li
import argparse
import mmcv
import os
import torch
from mmcv import Config, DictAction
from mmcv.cnn import fuse_conv_bn
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import (get_dist_info, init_dist, load_checkpoint,
wrap_fp16_model)
from mmdet3d.apis import single_gpu_test
from mmdet3d.datasets import build_dataset
from projects.occ_plugin.datasets.builder import build_dataloader
from mmdet3d.models import build_model
from mmdet.apis import set_random_seed
from projects.occ_plugin.occupancy.apis.test import custom_single_gpu_test, custom_multi_gpu_test
from mmdet.datasets import replace_ImageToTensor
import time
import os.path as osp
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action="ignore",category=FutureWarning)
def parse_args():
parser = argparse.ArgumentParser(
description='MMDet test (and eval) a model')
parser.add_argument('config', help='test config file path')
parser.add_argument('checkpoint', help='checkpoint file')
parser.add_argument('--out', help='output result file in pickle format')
parser.add_argument(
'--fuse-conv-bn',
action='store_true',
        help='Whether to fuse conv and bn, this will slightly increase '
        'the inference speed')
parser.add_argument(
'--format-only',
action='store_true',
        help='Format the output results without performing evaluation. It is '
        'useful when you want to format the result to a specific format and '
        'submit it to the test server')
parser.add_argument(
'--eval',
type=str,
nargs='+',
help='evaluation metrics, which depends on the dataset, e.g., "bbox",'
' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC')
parser.add_argument('--show', action='store_true', help='show results')
parser.add_argument(
'--show-dir', help='directory where results will be saved')
parser.add_argument(
'--gpu-collect',
action='store_true',
help='whether to use gpu to collect results.')
parser.add_argument(
'--tmpdir',
help='tmp directory used for collecting results from multiple '
'workers, available when gpu-collect is not specified')
parser.add_argument('--seed', type=int, default=0, help='random seed')
parser.add_argument(
'--deterministic',
action='store_true',
help='whether to set deterministic options for CUDNN backend.')
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. If the value to '
'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
'Note that the quotation marks are necessary and that no white space '
'is allowed.')
parser.add_argument(
'--options',
nargs='+',
action=DictAction,
help='custom options for evaluation, the key-value pair in xxx=yyy '
        'format will be kwargs for dataset.evaluate() function (deprecated), '
'change to --eval-options instead.')
parser.add_argument(
'--eval-options',
nargs='+',
action=DictAction,
help='custom options for evaluation, the key-value pair in xxx=yyy '
'format will be kwargs for dataset.evaluate() function')
parser.add_argument(
'--launcher',
choices=['none', 'pytorch', 'slurm', 'mpi'],
default='none',
help='job launcher')
parser.add_argument('--local_rank', type=int, default=0)
args = parser.parse_args()
if 'LOCAL_RANK' not in os.environ:
os.environ['LOCAL_RANK'] = str(args.local_rank)
if args.options and args.eval_options:
raise ValueError(
'--options and --eval-options cannot be both specified, '
'--options is deprecated in favor of --eval-options')
if args.options:
warnings.warn('--options is deprecated in favor of --eval-options')
args.eval_options = args.options
return args
def main():
args = parse_args()
assert args.out or args.eval or args.format_only or args.show \
or args.show_dir, \
('Please specify at least one operation (save/eval/format/show the '
'results / save the results) with the argument "--out", "--eval"'
', "--format-only", "--show" or "--show-dir"')
if args.eval and args.format_only:
raise ValueError('--eval and --format_only cannot be both specified')
if args.out is not None and not args.out.endswith(('.pkl', '.pickle')):
raise ValueError('The output file must be a pkl file.')
cfg = Config.fromfile(args.config)
if args.cfg_options is not None:
cfg.merge_from_dict(args.cfg_options)
# import modules from string list.
if cfg.get('custom_imports', None):
from mmcv.utils import import_modules_from_strings
import_modules_from_strings(**cfg['custom_imports'])
    # import modules from plugin/xx, registry will be updated
if hasattr(cfg, 'plugin'):
if cfg.plugin:
import importlib
if hasattr(cfg, 'plugin_dir'):
plugin_dir = cfg.plugin_dir
_module_dir = os.path.dirname(plugin_dir)
_module_dir = _module_dir.split('/')
_module_path = _module_dir[0]
for m in _module_dir[1:]:
_module_path = _module_path + '.' + m
# print(_module_path)
plg_lib = importlib.import_module(_module_path)
else:
# import dir is the dirpath for the config file
_module_dir = os.path.dirname(args.config)
_module_dir = _module_dir.split('/')
_module_path = _module_dir[0]
for m in _module_dir[1:]:
_module_path = _module_path + '.' + m
# print(_module_path)
plg_lib = importlib.import_module(_module_path)
# set cudnn_benchmark
if cfg.get('cudnn_benchmark', False):
torch.backends.cudnn.benchmark = True
cfg.model.pretrained = None
# in case the test dataset is concatenated
samples_per_gpu = 1
if isinstance(cfg.data.test, dict):
cfg.data.test.test_mode = True
samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1)
if samples_per_gpu > 1:
# Replace 'ImageToTensor' to 'DefaultFormatBundle'
cfg.data.test.pipeline = replace_ImageToTensor(
cfg.data.test.pipeline)
elif isinstance(cfg.data.test, list):
for ds_cfg in cfg.data.test:
ds_cfg.test_mode = True
samples_per_gpu = max(
[ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test])
if samples_per_gpu > 1:
for ds_cfg in cfg.data.test:
ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline)
# init distributed env first, since logger depends on the dist info.
# print("args.launcher", args.launcher)
if args.launcher == 'none':
distributed = False
else:
distributed = True
init_dist(args.launcher, **cfg.dist_params)
# set random seeds
if args.seed is not None:
set_random_seed(args.seed, deterministic=args.deterministic)
# build the dataloader
dataset = build_dataset(cfg.data.test)
    data_loader = build_dataloader(
        dataset,
        samples_per_gpu=samples_per_gpu,
        workers_per_gpu=cfg.data.workers_per_gpu,
        dist=distributed,
        # keep the test set in order so collected results align with the dataset
        shuffle=False,
    )
# build the model and load checkpoint
cfg.model.train_cfg = None
model = build_model(cfg.model, test_cfg=cfg.get('test_cfg'))
fp16_cfg = cfg.get('fp16', None)
if fp16_cfg is not None:
wrap_fp16_model(model)
checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu')
if args.fuse_conv_bn:
model = fuse_conv_bn(model)
    # old versions did not save class info in checkpoints; this workaround is
    # for backward compatibility
if 'CLASSES' in checkpoint.get('meta', {}):
model.CLASSES = checkpoint['meta']['CLASSES']
else:
model.CLASSES = dataset.CLASSES
# palette for visualization in segmentation tasks
if 'PALETTE' in checkpoint.get('meta', {}):
model.PALETTE = checkpoint['meta']['PALETTE']
elif hasattr(dataset, 'PALETTE'):
# segmentation dataset has `PALETTE` attribute
model.PALETTE = dataset.PALETTE
if args.show:
if args.show_dir is None:
args.show_dir = osp.join('./work_dirs',
osp.splitext(osp.basename(args.config))[0],
'visualization')
print('save dir: ', args.show_dir)
os.makedirs(args.show_dir, exist_ok=True)
if not distributed:
model = MMDataParallel(model, device_ids=[0])
outputs = custom_single_gpu_test(model, data_loader, args.show, args.show_dir)
else:
model = MMDistributedDataParallel(
model.cuda(),
device_ids=[torch.cuda.current_device()],
broadcast_buffers=False)
outputs = custom_multi_gpu_test(model, data_loader, args.tmpdir,
args.gpu_collect, args.show, args.show_dir)
rank, _ = get_dist_info()
    if rank == 0:  # evaluate on the main process in both single- and multi-GPU runs
kwargs = {} if args.eval_options is None else args.eval_options
kwargs['jsonfile_prefix'] = osp.join('test', args.config.split(
'/')[-1].split('.')[-2], time.ctime().replace(' ', '_').replace(':', '_'))
if args.format_only:
dataset.format_results(outputs, **kwargs)
if args.eval:
eval_kwargs = cfg.get('evaluation', {}).copy()
# hard-code way to remove EvalHook args
for key in [
'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best',
'rule'
]:
eval_kwargs.pop(key, None)
eval_kwargs.update(dict(metric=args.eval, **kwargs))
print(dataset.evaluate(outputs, **eval_kwargs))
if __name__ == '__main__':
main()
================================================
FILE: tools/train.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
# Cam4DOcc builds on OpenOccupancy by Zhiqi Li
from __future__ import division
import argparse
import copy
import mmcv
import os
import time
import torch
import warnings
from mmcv import Config, DictAction
from mmcv.runner import get_dist_info, init_dist
from os import path as osp
from mmdet import __version__ as mmdet_version
from mmdet3d import __version__ as mmdet3d_version
from mmseg import __version__ as mmseg_version
from mmdet3d.datasets import build_dataset
from mmdet3d.models import build_model
from mmdet3d.utils import collect_env, get_root_logger
from mmdet.apis import set_random_seed
from mmcv.utils import TORCH_VERSION, digit_version
from projects.occ_plugin.occupancy.apis.train import custom_train_model
warnings.filterwarnings("ignore")
warnings.simplefilter(action="ignore",category=FutureWarning)
def parse_args():
parser = argparse.ArgumentParser(description='Train a detector')
parser.add_argument('config', help='train config file path')
parser.add_argument('--work-dir', help='the dir to save logs and models')
parser.add_argument(
'--resume', help='the checkpoint file to resume from')
parser.add_argument(
'--no-validate',
action='store_true',
help='whether not to evaluate the checkpoint during training')
group_gpus = parser.add_mutually_exclusive_group()
group_gpus.add_argument(
'--gpus',
type=int,
help='number of gpus to use '
'(only applicable to non-distributed training)')
group_gpus.add_argument(
'--gpu-ids',
type=int,
nargs='+',
help='ids of gpus to use '
'(only applicable to non-distributed training)')
parser.add_argument('--seed', type=int, default=0, help='random seed')
parser.add_argument(
'--deterministic',
action='store_true',
help='whether to set deterministic options for CUDNN backend.')
parser.add_argument(
'--options',
nargs='+',
action=DictAction,
help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file (deprecated), '
'change to --cfg-options instead.')
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. If the value to '
'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
'Note that the quotation marks are necessary and that no white space '
'is allowed.')
parser.add_argument(
'--launcher',
choices=['none', 'pytorch', 'slurm', 'mpi'],
default='pytorch',
help='job launcher')
parser.add_argument('--local_rank', type=int, default=0)
parser.add_argument(
'--autoscale-lr',
action='store_true',
help='automatically scale lr with the number of gpus')
args = parser.parse_args()
if 'LOCAL_RANK' not in os.environ:
os.environ['LOCAL_RANK'] = str(args.local_rank)
return args
def main():
args = parse_args()
cfg = Config.fromfile(args.config)
if args.cfg_options is not None:
cfg.merge_from_dict(args.cfg_options)
    # import modules from plugin/xx, registry will be updated
if hasattr(cfg, 'plugin') and cfg.plugin:
assert cfg.plugin_dir is not None
import importlib
plugin_dir = cfg.plugin_dir
_module_dir = os.path.dirname(plugin_dir)
_module_dir = _module_dir.split('/')
_module_path = _module_dir[0]
for m in _module_dir[1:]:
_module_path = _module_path + '.' + m
# print(_module_path)
plg_lib = importlib.import_module(_module_path)
# set cudnn_benchmark
if cfg.get('cudnn_benchmark', False):
torch.backends.cudnn.benchmark = True
# work_dir is determined in this priority: CLI > segment in file > filename
if args.work_dir is not None:
cfg.work_dir = args.work_dir
elif cfg.get('work_dir', None) is None:
cfg.work_dir = osp.join('./work_dirs', osp.splitext(osp.basename(args.config))[0])
if args.resume is not None and osp.isfile(args.resume):
cfg.resume_from = args.resume
if args.gpu_ids is not None:
cfg.gpu_ids = args.gpu_ids
else:
cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus)
if args.autoscale_lr:
cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8
# init distributed env first, since logger depends on the dist info.
if args.launcher == 'none':
distributed = False
else:
distributed = True
init_dist(args.launcher, **cfg.dist_params)
# re-set gpu_ids with distributed training mode
_, world_size = get_dist_info()
cfg.gpu_ids = range(world_size)
# create work_dir
mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
# dump config
cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config)))
# init the logger before other steps
timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
log_file = osp.join(cfg.work_dir, f'{timestamp}.log')
logger = get_root_logger(
log_file=log_file, log_level=cfg.log_level, name='mmdet')
# init the meta dict to record some important information such as
# environment info and seed, which will be logged
meta = dict()
# log env info
env_info_dict = collect_env()
env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()])
dash_line = '-' * 60 + '\n'
logger.info('Environment info:\n' + dash_line + env_info + '\n' + dash_line)
meta['env_info'] = env_info
meta['config'] = cfg.pretty_text
# log some basic info
logger.info(f'Distributed training: {distributed}')
# set random seeds
if args.seed is not None:
logger.info(f'Set random seed to {args.seed}, '
f'deterministic: {args.deterministic}')
set_random_seed(args.seed, deterministic=args.deterministic)
cfg.seed = args.seed
meta['seed'] = args.seed
meta['exp_name'] = osp.basename(args.config)
model = build_model(
cfg.model,
train_cfg=cfg.get('train_cfg'),
test_cfg=cfg.get('test_cfg'))
n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
logger.info(f'Number of params: {n_parameters}')
model.init_weights()
datasets = [build_dataset(cfg.data.train)]
# add an attribute for visualization convenience
model.CLASSES = datasets[0].CLASSES
custom_train_model(
model,
datasets,
cfg,
distributed=distributed,
validate=(not args.no_validate),
timestamp=timestamp,
meta=meta)
if __name__ == '__main__':
main()
================================================
FILE: viz/viz_gt.py
================================================
# Developed by Junyi Ma
# Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications
# https://github.com/haomo-ai/Cam4DOcc
from tqdm import tqdm
import pickle
import numpy as np
from mayavi import mlab
from tqdm import trange
import os
from xvfbwrapper import Xvfb
# export QT_QPA_PLATFORM='offscreen'
mlab.options.offscreen = True
def viz_occ(occ, occ_mo, file_name, voxel_size, show_occ, show_time_change):
vdisplay = Xvfb(width=1, height=1)
vdisplay.start()
mlab.figure(size=(800,800), bgcolor=(1,1,1))
plt_plot_occ = mlab.points3d(
occ[:, 0] * voxel_size,
occ[:, 1] * voxel_size,
occ[:, 2] * voxel_size,
occ[:, 3],
colormap="viridis",
scale_factor=voxel_size - 0.05 * voxel_size,
mode="cube",
opacity=0.9,
vmin=1,
)
colors_occ = np.array(
[
[152, 251, 152, 255],
[152, 251, 152, 255],
[152, 251, 152, 255],
[152, 251, 152, 255],
[152, 251, 152, 255],
]
).astype(np.uint8)
plt_plot_occ.glyph.scale_mode = "scale_by_vector"
plt_plot_occ.module_manager.scalar_lut_manager.lut.table = colors_occ
plt_plot_mov = mlab.points3d(
occ_mo[:, 0] * voxel_size,
occ_mo[:, 1] * voxel_size,
occ_mo[:, 2] * voxel_size,
occ_mo[:, 3],
colormap="viridis",
scale_factor=voxel_size - 0.05 * voxel_size,
mode="cube",
opacity=0.9,
vmin=1,
)
if show_time_change:
colors_occ_mo = np.array(
[
[255, 70, 255, 255],
[255, 110, 255, 255],
[255, 150, 255, 255],
[255, 190, 255, 255],
[255, 250, 250, 255],
]
).astype(np.uint8)
else:
colors_occ_mo = np.array(
[
[220, 20, 60, 255],
[255, 127, 80, 255],
[0, 0, 230, 255],
[255, 158, 0, 255],
[233, 150, 70, 255],
[47, 79, 79, 255],
[255, 99, 71, 255],
[175, 0, 75, 255],
[255, 61, 99, 255],
]
).astype(np.uint8)
plt_plot_mov.glyph.scale_mode = "scale_by_vector"
plt_plot_mov.module_manager.scalar_lut_manager.lut.table = colors_occ_mo
fig_dir = "./figs"
if not os.path.exists(fig_dir):
os.mkdir(fig_dir)
mlab.savefig(os.path.join(fig_dir, file_name[:-4]+".png"))
vdisplay.stop()
def main():
show_time_change = True
nuscocc_path = "../data/nuScenes-Occupancy/"
cam4docc_path = "../data/cam4docc/GMO/segmentation/"
segmentation_files = os.listdir(cam4docc_path)
segmentation_files.sort(key=lambda x: (x.split("_")[1]))
index = 0
for file_ in tqdm(segmentation_files):
scene_token = file_.split("_")[0]
lidar_token = file_.split("_")[1]
gt_file = nuscocc_path+"scene_"+scene_token+"/occupancy/"+lidar_token[:-4]+".npy"
gt_occ_semantic = np.load(gt_file,allow_pickle=True)
gt_occ_semantic = gt_occ_semantic[gt_occ_semantic[:, -1]!=0]
gt_occ_semantic = gt_occ_semantic[::2]
gt_occ_semantic_refine = np.zeros_like(gt_occ_semantic)
gt_occ_semantic_refine[:, 0] = gt_occ_semantic[:, 2]
gt_occ_semantic_refine[:, 1] = gt_occ_semantic[:, 1]
gt_occ_semantic_refine[:, 2] = gt_occ_semantic[:, 0]
gt_occ_semantic_refine[:, 3] = 1
gt_mo_semantic = np.load(cam4docc_path+file_,allow_pickle=True)['arr_0']
gt_mo_semantic_to_draw=np.zeros((0,4))
for t in range(0,4):
gt_mo_cur = gt_mo_semantic[t]
gt_mo_cur = np.array(gt_mo_cur)
gt_mo_cur = gt_mo_cur[::2]
if show_time_change:
gt_mo_cur[:, -1] = int(t+1)
gt_mo_semantic_to_draw = np.concatenate((gt_mo_semantic_to_draw, gt_mo_cur))
viz_occ(gt_occ_semantic_refine, gt_mo_semantic_to_draw, file_, voxel_size=0.2, show_occ=True, show_time_change=show_time_change)
index += 1
if __name__ == "__main__":
main()
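# Headless rendering note: the script starts an Xvfb virtual display and sets
# mlab.options.offscreen, so it runs on servers without a monitor; rendered
# frames are written to ./figs/<sample>.png by viz_occ().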
================================================
FILE: viz/viz_pred.py
================================================
from tqdm import tqdm
import pickle
import numpy as np
from mayavi import mlab
from tqdm import trange
import os
from xvfbwrapper import Xvfb
# export QT_QPA_PLATFORM='offscreen'
mlab.options.offscreen = True
def viz_occ(occ, occ_mo, file_name, voxel_size, show_occ, show_time_change):
vdisplay = Xvfb(width=1, height=1)
vdisplay.start()
mlab.figure(size=(800,800), bgcolor=(1,1,1))
plt_plot_occ = mlab.points3d(
occ[:, 0] * voxel_size,
occ[:, 1] * voxel_size,
occ[:, 2] * voxel_size,
occ[:, 3],
colormap="viridis",
scale_factor=voxel_size - 0.05 * voxel_size,
mode="cube",
opacity=0.9,
vmin=1,
)
colors_occ = np.array(
[
[152, 251, 152, 255],
[152, 251, 152, 255],
[152, 251, 152, 255],
[152, 251, 152, 255],
[152, 251, 152, 255],
]
).astype(np.uint8)
plt_plot_occ.glyph.scale_mode = "scale_by_vector"
plt_plot_occ.module_manager.scalar_lut_manager.lut.table = colors_occ
plt_plot_mov = mlab.points3d(
occ_mo[:, 0] * voxel_size,
occ_mo[:, 1] * voxel_size,
occ_mo[:, 2] * voxel_size,
occ_mo[:, 3],
colormap="viridis",
scale_factor=voxel_size - 0.05 * voxel_size,
mode="cube",
opacity=0.9,
vmin=1,
)
if show_time_change:
colors_occ_mo = np.array(
[
[255, 70, 255, 255],
[255, 110, 255, 255],
[255, 150, 255, 255],
[255, 190, 255, 255],
[255, 250, 250, 255],
]
).astype(np.uint8)
else:
colors_occ_mo = np.array(
[
[220, 20, 60, 255],
[255, 127, 80, 255],
[0, 0, 230, 255],
[255, 158, 0, 255],
[233, 150, 70, 255],
[47, 79, 79, 255],
[255, 99, 71, 255],
[175, 0, 75, 255],
[255, 61, 99, 255],
]
).astype(np.uint8)
plt_plot_mov.glyph.scale_mode = "scale_by_vector"
plt_plot_mov.module_manager.scalar_lut_manager.lut.table = colors_occ_mo
fig_dir = "./figs"
if not os.path.exists(fig_dir):
os.mkdir(fig_dir)
mlab.savefig(os.path.join(fig_dir, file_name[:-4]+".png"))
vdisplay.stop()
def main():
show_time_change = True
nuscocc_path = "../data/nuScenes-Occupancy/"
pred_path = "../data/cam4docc/results/"
segmentation_files = os.listdir(pred_path)
segmentation_files.sort(key=lambda x: (x.split("_")[1]))
index = 0
segmentation_files = segmentation_files[::10]
for file_ in tqdm(segmentation_files):
scene_token = file_.split("_")[0]
lidar_token = file_.split("_")[1]
gt_file = nuscocc_path+"scene_"+scene_token+"/occupancy/"+lidar_token[:-4]+".npy"
gt_occ_semantic = np.load(gt_file,allow_pickle=True)
gt_occ_semantic = gt_occ_semantic[gt_occ_semantic[:, -1]!=0]
gt_occ_semantic = gt_occ_semantic[::2]
gt_occ_semantic_refine = np.zeros_like(gt_occ_semantic)
gt_occ_semantic_refine[:, 0] = gt_occ_semantic[:, 2]
gt_occ_semantic_refine[:, 1] = gt_occ_semantic[:, 1]
gt_occ_semantic_refine[:, 2] = gt_occ_semantic[:, 0]
gt_occ_semantic_refine[:, 3] = 1
pred_mo_semantic = np.load(pred_path+file_,allow_pickle=True)['arr_0']
pred_mo_semantic_to_draw=np.zeros((0,4))
for t in range(0,4):
pred_mo_cur = pred_mo_semantic[t]
pred_mo_cur = np.array(pred_mo_cur)
pred_mo_cur = pred_mo_cur[::2]
if show_time_change:
pred_mo_cur[:, -1] = int(t+1)
pred_mo_semantic_to_draw = np.concatenate((pred_mo_semantic_to_draw, pred_mo_cur))
viz_occ(gt_occ_semantic_refine, pred_mo_semantic_to_draw, file_, voxel_size=0.2, show_occ=True, show_time_change=show_time_change)
index += 1
if __name__ == "__main__":
main()