Repository: haomo-ai/Cam4DOcc Branch: main Commit: 542f14a9d9e1 Files: 110 Total size: 543.7 KB Directory structure: gitextract_hgf40bk9/ ├── LICENSE ├── README.md ├── data/ │ ├── README.md │ ├── cam4docc/ │ │ ├── .gitkeep │ │ ├── GMO/ │ │ │ └── .gitkeep │ │ ├── GMO_lyft/ │ │ │ └── .gitkeep │ │ ├── MMO/ │ │ │ └── .gitkeep │ │ └── MMO_lyft/ │ │ └── .gitkeep │ └── nuscenes/ │ └── .gitkeep ├── other_baselines/ │ ├── README.md │ ├── lifted_2d/ │ │ └── eval_lifted_2d.py │ ├── static_world/ │ │ └── eval_static_world.py │ └── voxel_pcp/ │ └── eval_voxel_pcp.py ├── projects/ │ ├── __init__.py │ ├── configs/ │ │ ├── _base_/ │ │ │ ├── datasets/ │ │ │ │ ├── custom_lyft-3d.py │ │ │ │ ├── custom_nus-3d.py │ │ │ │ └── custom_waymo-3d.py │ │ │ ├── default_runtime.py │ │ │ └── schedules/ │ │ │ ├── cosine.py │ │ │ ├── cyclic_20e.py │ │ │ ├── cyclic_40e.py │ │ │ ├── mmdet_schedule_1x.py │ │ │ ├── schedule_2x.py │ │ │ ├── schedule_3x.py │ │ │ ├── seg_cosine_150e.py │ │ │ ├── seg_cosine_200e.py │ │ │ └── seg_cosine_50e.py │ │ ├── baselines/ │ │ │ ├── OCFNet_in_Cam4DOcc_V1.1.py │ │ │ ├── OCFNet_in_Cam4DOcc_V1.1_lyft.py │ │ │ ├── OCFNet_in_Cam4DOcc_V1.2.py │ │ │ └── OCFNet_in_Cam4DOcc_V1.2_lyft.py │ │ └── datasets/ │ │ └── custom_nus-3d.py │ └── occ_plugin/ │ ├── __init__.py │ ├── core/ │ │ ├── __init__.py │ │ ├── evaluation/ │ │ │ ├── __init__.py │ │ │ ├── efficiency_hooks.py │ │ │ └── eval_hooks.py │ │ └── visualizer/ │ │ ├── __init__.py │ │ └── show_occ.py │ ├── datasets/ │ │ ├── __init__.py │ │ ├── builder.py │ │ ├── cam4docc_dataset.py │ │ ├── cam4docc_lyft_dataset.py │ │ ├── nuscenes_dataset.py │ │ ├── pipelines/ │ │ │ ├── __init__.py │ │ │ ├── formating.py │ │ │ ├── loading_bevdet.py │ │ │ ├── loading_instance.py │ │ │ ├── loading_occupancy.py │ │ │ └── transform_3d.py │ │ └── samplers/ │ │ ├── __init__.py │ │ ├── distributed_sampler.py │ │ ├── group_sampler.py │ │ └── sampler.py │ ├── occupancy/ │ │ ├── __init__.py │ │ ├── apis/ │ │ │ ├── __init__.py │ │ │ ├── mmdet_train.py │ │ │ ├── test.py │ │ │ └── train.py │ │ ├── backbones/ │ │ │ ├── __init__.py │ │ │ ├── pred_block.py │ │ │ └── resnet3d.py │ │ ├── dense_heads/ │ │ │ ├── __init__.py │ │ │ ├── flow_head.py │ │ │ ├── lovasz_softmax.py │ │ │ ├── occ_head.py │ │ │ └── utils.py │ │ ├── detectors/ │ │ │ ├── __init__.py │ │ │ ├── bevdepth.py │ │ │ └── ocfnet.py │ │ ├── fuser/ │ │ │ ├── __init__.py │ │ │ ├── addfuse.py │ │ │ ├── convfuse.py │ │ │ └── visfuse.py │ │ ├── image2bev/ │ │ │ ├── ViewTransformerLSSBEVDepth.py │ │ │ ├── ViewTransformerLSSVoxel.py │ │ │ └── __init__.py │ │ ├── necks/ │ │ │ ├── __init__.py │ │ │ ├── fpn3d.py │ │ │ └── second_fpn_3d.py │ │ └── voxel_encoder/ │ │ ├── __init__.py │ │ └── sparse_lidar_enc.py │ ├── ops/ │ │ ├── __init__.py │ │ └── occ_pooling/ │ │ ├── OCC_Pool.py │ │ ├── __init__.py │ │ └── src/ │ │ ├── occ_pool.cpp │ │ └── occ_pool_cuda.cu │ └── utils/ │ ├── __init__.py │ ├── coordinate_transform.py │ ├── formating.py │ ├── gaussian.py │ ├── geometry.py │ ├── metric_util.py │ ├── nusc_param.py │ ├── semkitti.py │ └── voxel_to_points.py ├── run.sh ├── run_eval.sh ├── setup.py ├── tools/ │ ├── dist_test.sh │ ├── dist_train.sh │ ├── gen_data/ │ │ └── gen_depth_gt.py │ ├── misc/ │ │ ├── browse_dataset.py │ │ ├── fuse_conv_bn.py │ │ ├── print_config.py │ │ └── visualize_results.py │ ├── test.py │ └── train.py └── viz/ ├── viz_gt.py └── viz_pred.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: LICENSE 
================================================
MIT License

Copyright (c) 2023 HAOMO.AI

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

================================================
FILE: README.md
================================================
# Cam4DOcc

The official code and data for the benchmark with baselines for our paper: [Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications](https://arxiv.org/abs/2311.17663)

This work has been accepted by CVPR 2024 :tada:

[Junyi Ma#](https://github.com/BIT-MJY), [Xieyuanli Chen#](https://github.com/Chen-Xieyuanli), Jiawei Huang, [Jingyi Xu](https://github.com/BIT-XJY), [Zhen Luo](https://github.com/Blurryface0814), Jintao Xu, Weihao Gu, Rui Ai, [Hesheng Wang*](https://scholar.google.com/citations?hl=en&user=q6AY9XsAAAAJ)

## Citation

If you use Cam4DOcc in an academic work, please cite our paper:

    @inproceedings{ma2024cvpr,
      author = {Junyi Ma and Xieyuanli Chen and Jiawei Huang and Jingyi Xu and Zhen Luo and Jintao Xu and Weihao Gu and Rui Ai and Hesheng Wang},
      title = {{Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications}},
      booktitle = {Proc.~of the IEEE/CVF Conf.~on Computer Vision and Pattern Recognition (CVPR)},
      year = 2024
    }

## Installation
We follow the installation instructions of our codebase OpenOccupancy, which are also posted here.

* Create a conda virtual environment and activate it
```bash
conda create -n cam4docc python=3.7 -y
conda activate cam4docc
```
* Install PyTorch and torchvision (tested on torch==1.10.1 & cuda=11.3)
```bash
conda install pytorch==1.10.1 torchvision==0.11.2 torchaudio==0.10.1 cudatoolkit=11.3 -c pytorch -c conda-forge
```
* Install gcc>=5 in the conda env
```bash
conda install -c omgarcia gcc-6
```
* Install mmcv, mmdet, and mmseg
```bash
pip install mmcv-full==1.4.0
pip install mmdet==2.14.0
pip install mmsegmentation==0.14.1
```
* Install mmdet3d from the source code
```bash
git clone https://github.com/open-mmlab/mmdetection3d.git
cd mmdetection3d
git checkout v0.17.1 # Other versions may not be compatible.
python setup.py install
```
* Install other dependencies
```bash
pip install timm
pip install open3d-python
pip install PyMCubes
pip install spconv-cu113
pip install fvcore
pip install setuptools==59.5.0
pip install lyft_dataset_sdk # for lyft dataset
```
* Install occupancy pooling
```bash
git clone git@github.com:haomo-ai/Cam4DOcc.git
cd Cam4DOcc
export PYTHONPATH="."
python setup.py develop
```
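After installation, a quick sanity check can save time before launching training. The snippet below is a minimal sketch (not part of the official instructions) that only prints the installed versions pinned above and confirms CUDA is visible.

```python
# Minimal environment sanity check for the pinned versions above (a sketch,
# not an official script): print library versions and confirm CUDA is visible.
import torch
import mmcv
import mmdet
import mmseg
import mmdet3d

print("torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())
print("mmcv-full:", mmcv.__version__)        # expected 1.4.0
print("mmdet:", mmdet.__version__)           # expected 2.14.0
print("mmsegmentation:", mmseg.__version__)  # expected 0.14.1
print("mmdet3d:", mmdet3d.__version__)       # expected 0.17.1
```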
## Data Structure ### nuScenes dataset * Please link your [nuScenes V1.0 full dataset](https://www.nuscenes.org/nuscenes#download) to the data folder. * [nuScenes-Occupancy](https://drive.google.com/file/d/1vTbgddMzUN6nLyWSsCZMb9KwihS7nPoH/view?usp=sharing), [nuscenes_occ_infos_train.pkl](https://github.com/JeffWang987/OpenOccupancy/releases/tag/train_pkl), and [nuscenes_occ_infos_val.pkl](https://github.com/JeffWang987/OpenOccupancy/releases/tag/val_pkl) are also provided by the previous work. If you only want to reproduce the forecasting results with "inflated" form, nuScenes dataset and Cam4DOcc are all you need. ### Lyft dataset * Please link your [Lyft dataset](https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles/data) to the data folder. * The required folders are listed below. Note that the folders under `cam4docc` will be generated automatically once you first run our training or evaluation scripts. ```bash Cam4DOcc ├── data/ │ ├── nuscenes/ │ │ ├── maps/ │ │ ├── samples/ │ │ ├── sweeps/ │ │ ├── lidarseg/ │ │ ├── v1.0-test/ │ │ ├── v1.0-trainval/ │ │ ├── nuscenes_occ_infos_train.pkl │ │ ├── nuscenes_occ_infos_val.pkl │ ├── nuScenes-Occupancy/ │ ├── lyft/ │ │ ├── maps/ │ │ ├── train_data/ │ │ ├── images/ # from train images, containing xxx.jpeg │ ├── cam4docc │ │ ├── GMO/ │ │ │ ├── segmentation/ │ │ │ ├── instance/ │ │ │ ├── flow/ │ │ ├── MMO/ │ │ │ ├── segmentation/ │ │ │ ├── instance/ │ │ │ ├── flow/ │ │ ├── GMO_lyft/ │ │ │ ├── ... │ │ ├── MMO_lyft/ │ │ │ ├── ... ``` Alternatively, you could manually modify the path parameters in the [config files](https://github.com/haomo-ai/Cam4DOcc/tree/main/projects/configs/baselines) instead of using the default data structure, which are also listed here: ``` occ_path = "./data/nuScenes-Occupancy" depth_gt_path = './data/depth_gt' train_ann_file = "./data/nuscenes/nuscenes_occ_infos_train.pkl" val_ann_file = "./data/nuscenes/nuscenes_occ_infos_val.pkl" cam4docc_dataset_path = "./data/cam4docc/" nusc_root = './data/nuscenes/' ``` ## Training and Evaluation We directly integrate the Cam4DOcc dataset generation pipeline into the dataloader, so you can directly run training or evaluate scripts and just wait :smirk: Optionally, you can set `only_generate_dataset=True` in the [config files](https://github.com/haomo-ai/Cam4DOcc/tree/main/projects/configs/baselines) to only generate the Cam4DOcc data without model training and inference. ### Train OCFNetV1.1 with 8 GPUs OCFNetV1.1 can forecast inflated GMO and others. In this case, _vehicle_ and _human_ are considered as one unified category. For the nuScenes dataset, please run ```bash bash run.sh ./projects/configs/baselines/OCFNet_in_Cam4DOcc_V1.1.py 8 ``` For the Lyft dataset, please run ```bash bash run.sh ./projects/configs/baselines/OCFNet_in_Cam4DOcc_V1.1_lyft.py 8 ``` ### Train OCFNetV1.2 with 8 GPUs OCFNetV1.2 can forecast inflated GMO including _bicycle_, _bus_, _car_, _construction_, _motorcycle_, _trailer_, _truck_, _pedestrian_, and others. In this case, _vehicle_ and _human_ are divided into multiple categories for clearer evaluation on forecasting performance. For the nuScenes dataset, please run ```bash bash run.sh ./projects/configs/baselines/OCFNet_in_Cam4DOcc_V1.2.py 8 ``` For the Lyft dataset, please run ```bash bash run.sh ./projects/configs/baselines/OCFNet_in_Cam4DOcc_V1.2_lyft.py 8 ``` * The training/test process will be accelerated several times after you generate datasets by the first epoch. 
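Once the first epoch has written the Cam4DOcc files, you can inspect one generated sequence offline. The sketch below is a minimal example based on how the evaluation scripts in `other_baselines/` read these files back (each `.npz` stores, under `arr_0`, one array per frame whose rows are `[x_idx, y_idx, z_idx, label]` voxels); the file name shown is a placeholder following the `{scene_token}_{lidar_token}.npz` naming used by those scripts.

```python
# A minimal sketch for inspecting one generated Cam4DOcc GMO sequence.
# Assumption (from other_baselines/ eval scripts): 'arr_0' holds one array per
# frame whose rows are [x_idx, y_idx, z_idx, label] sparse voxels.
import numpy as np

occ_size = (512, 512, 40)  # matches occ_size in the baseline configs
seq_file = "./data/cam4docc/GMO/segmentation/{scene_token}_{lidar_token}.npz"  # placeholder name

def sparse_to_dense(frame_voxels, grid_size=occ_size):
    """Scatter sparse [x, y, z, label] rows into a dense voxel grid."""
    frame_voxels = np.asarray(frame_voxels)  # tolerate torch tensors stored in the archive
    dense = np.zeros(grid_size, dtype=np.uint8)
    idx = frame_voxels[:, :3].astype(int)
    dense[idx[:, 0], idx[:, 1], idx[:, 2]] = frame_voxels[:, -1].astype(np.uint8)
    return dense

seq = np.load(seq_file, allow_pickle=True)["arr_0"]
print("frames in sequence:", len(seq))
dense_t0 = sparse_to_dense(seq[0])
print("occupied GMO voxels at t=0:", int((dense_t0 > 0).sum()))
```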
### Test OCFNet for different tasks If you only want to test the performance of occupancy prediction for the present frame (current observation), please set `test_present=True` in the [config files](https://github.com/haomo-ai/Cam4DOcc/tree/main/projects/configs/baselines). Otherwise, forecasting performance on the future interval is evaluated. ```bash bash run_eval.sh $PATH_TO_CFG $PATH_TO_CKPT $GPU_NUM # e.g. bash run_eval.sh ./projects/configs/baselines/OCFNet_in_Cam4DOcc_V1.1.py ./work_dirs/OCFNet_in_Cam4DOcc_V1.1/epoch_20.pth 8 ``` Please set `save_pred` and `save_path` in the config files once saving prediction results is needed. `VPQ` evaluation of 3D instance prediction will be refined in the future. ### Visualization Please install the dependencies as follows: ```bash sudo apt-get install Xvfb pip install xvfbwrapper pip install mayavi ``` where `Xvfb` may be needed for visualization in your server. **Visualize ground-truth occupancy labels**. Set `show_time_change = True` if you want to show the changing state of occupancy in time intervals. ```bash cd viz python viz_gt.py ``` **Visualize occupancy forecasting results**. Set `show_time_change = True` if you want to show the changing state of occupancy in time intervals. ```bash cd viz python viz_pred.py ``` There is still room for improvement. Camera-only 4D occupancy forecasting remains challenging, especially for predicting over longer time intervals with many moving objects. We envision this benchmark as a valuable evaluation tool, and our OCFNet can serve as a foundational codebase for future research on 4D occupancy forecasting. ## Basic Information Some basic information as well as key parameters for our current version. | Type | Info | Parameter | | :----: | :----: | :----: | | train | 23,930 sequences | train_capacity | | val | 5,119 frames | test_capacity | | voxel size | 0.2m | voxel_x/y/z | | range | [-51.2m, -51.2m, -5m, 51.2m, 51.2m, 3m]| point_cloud_range | | volume size | [512, 512, 40]| occ_size | | classes | 2 for V1.1 / 9 for V1.2 | num_cls | | observation frames | 3 | time_receptive_field | | future frames | 4 | n_future_frames | | extension frames | 6 | n_future_frames_plus | Our proposed OCFNet can still perform well while being trained with partial data. Please try to decrease `train_capacity` if you want to explore more details with sparser supervision signals. In addition, please make sure that `n_future_frames_plus <= time_receptive_field + n_future_frames` because `n_future_frames_plus` means the real prediction number. We estimate more frames including the past ones rather than only `n_future_frames`. ## Pretrained Models We will provide our pretrained models of the erratum version. Your patience is appreciated. 
**Deprecated:** ~~Please download our pretrained models (for epoch=20) to resume training or reproduce results.~~

| Version | Google Drive | Baidu Cloud | Config |
| :---: | :---: | :---: | :---: |
| ~~V1.0~~ | ~~link~~ | ~~link~~ | ~~only vehicle~~ |
| V1.1 | [link](https://drive.google.com/file/d/1IXRqOQk3RKpIjGgBBqV9D9vgSt58QDr8/view?usp=sharing) | [link](https://pan.baidu.com/s/18gODsVnBAXEJ4pzv2-LqGA?pwd=m99b) | [OCFNet_in_Cam4DOcc_V1.1.py](https://github.com/haomo-ai/Cam4DOcc/blob/main/projects/configs/baselines/OCFNet_in_Cam4DOcc_V1.1.py) |
| V1.2 | [link](https://drive.google.com/file/d/1q1XnRt0wYE3oq6YBMBnagpGL7h2I46uN/view?usp=sharing) | [link](https://pan.baidu.com/s/1OPc1-a2McOO_0QPX63J7WQ?pwd=adic) | [OCFNet_in_Cam4DOcc_V1.2.py](https://github.com/haomo-ai/Cam4DOcc/blob/main/projects/configs/baselines/OCFNet_in_Cam4DOcc_V1.2.py) |

## Other Baselines

We also provide the evaluation of the forecasting performance of [other baselines](https://github.com/haomo-ai/Cam4DOcc/tree/main/other_baselines) in Cam4DOcc.

## TODO

The tutorial is being updated ...

We will release our pretrained models as soon as possible. OCFNetV1.3 and OCFNetV2 are on their way ...

### Acknowledgement

We thank the fantastic works [OpenOccupancy](https://github.com/JeffWang987/OpenOccupancy), [PowerBEV](https://github.com/EdwardLeeLPZ/PowerBEV), and [FIERY](https://anthonyhu.github.io/fiery) for their pioneering code releases, which provide the codebase for this benchmark.

================================================
FILE: data/README.md
================================================
### Data Structure

Please link your [nuScenes V1.0 full dataset](https://www.nuscenes.org/nuscenes#download) to the data folder.

[nuScenes-Occupancy](https://drive.google.com/file/d/1vTbgddMzUN6nLyWSsCZMb9KwihS7nPoH/view?usp=sharing), [nuscenes_occ_infos_train.pkl](https://github.com/JeffWang987/OpenOccupancy/releases/tag/train_pkl), and [nuscenes_occ_infos_val.pkl](https://github.com/JeffWang987/OpenOccupancy/releases/tag/val_pkl) are also provided by the previous work. If you only want to reproduce the forecasting results in the "inflated" form, the nuScenes dataset and Cam4DOcc are all you need.

Note that the folders under `cam4docc` will be generated automatically the first time you run our training or evaluation scripts.

```bash
Cam4DOcc
├── data/
│   ├── nuscenes/
│   │   ├── maps/
│   │   ├── samples/
│   │   ├── sweeps/
│   │   ├── lidarseg/
│   │   ├── v1.0-test/
│   │   ├── v1.0-trainval/
│   │   ├── nuscenes_occ_infos_train.pkl
│   │   ├── nuscenes_occ_infos_val.pkl
│   ├── nuScenes-Occupancy/
│   ├── cam4docc/
│   │   ├── GMO/
│   │   │   ├── segmentation/
│   │   │   ├── instance/
│   │   │   ├── flow/
│   │   ├── MMO/
│   │   │   ├── segmentation/
│   │   │   ├── instance/
│   │   │   ├── flow/
```

The GMO folder contains the data where vehicle and human are treated as one unified category. The MMO folder contains the data where vehicle and human are divided into multiple categories for a clearer evaluation of forecasting performance. In the near future, we will unify GMO and MMO for easier usage.
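Until GMO and MMO are unified, the relation between the two label sets can be illustrated with a short sketch. It assumes the MMO convention from the V1.2 config (label 0 is the background/empty index, labels 1..8 are the separate movable classes), so collapsing every non-zero label reproduces the unified GMO label.

```python
# A sketch of the GMO/MMO relation, assuming MMO uses 0 for background
# (empty_idx) and 1..8 for the separate movable classes, while GMO keeps a
# single unified "movable" label.
import numpy as np

mmo_labels = np.array([0, 3, 8, 0, 1])          # toy per-voxel MMO labels
gmo_labels = (mmo_labels > 0).astype(np.uint8)  # collapse to the unified GMO label
print(gmo_labels)  # [0 1 1 0 1]
```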
================================================ FILE: data/cam4docc/.gitkeep ================================================ ================================================ FILE: data/cam4docc/GMO/.gitkeep ================================================ ================================================ FILE: data/cam4docc/GMO_lyft/.gitkeep ================================================ ================================================ FILE: data/cam4docc/MMO/.gitkeep ================================================ ================================================ FILE: data/cam4docc/MMO_lyft/.gitkeep ================================================ ================================================ FILE: data/nuscenes/.gitkeep ================================================ ================================================ FILE: other_baselines/README.md ================================================ ## I. Static World The static world model is built based on the identity hypothesis. ```bash cd other_baselines/static_world python ./eval_static_world.py ``` #### Parameters: * **test_idx_dir**: Path of test indexes, which is generated by the standard OCFNet evaluation process. * **test_results_dir**: Path of occupancy prediction results. Here we simply set it to the path of OCFNet forecasting results and use the present occupancy prediction results for evaluation. You can also replace them with [OpenOccupancy](https://github.com/JeffWang987/OpenOccupancy) estimation results. * **gt_dir**: Path of ground-truth segmentations. ## II. Voxelization of PCP Voxelization of point cloud prediction requires the outputs of [PCPNet](https://github.com/Blurryface0814/PCPNet). Here we use nuScenes-Occupancy as ground-truth since predicted points are limited by sparsity. ```bash cd other_baselines/voxel_pcp python ./eval_voxel_pcp.py ``` #### Parameters: * **test_idx_dir**: Path of test indexes, which is generated by the standard OCFNet evaluation process. * **occ_path**: Path of nuScenes-Occupancy. * **test_results_dir**: Path of point cloud prediction results. The data is organized as follows: ```bash Cam4DOcc ├── data/ │ ├── cam4docc/ │ │ ├── pcpnet_results/ │ │ │ ├── point_clouds/ │ │ │ │ ├── past/ │ │ │ │ │ ├── 000000.ply │ │ │ │ │ ├── 000001.ply │ │ │ │ │ ├── 000002.ply │ │ │ │ │ ├── 000003.ply │ │ │ │ ├── pred/ │ │ │ │ │ ├── 000000.ply │ │ │ │ │ ├── ... │ │ │ ├── saved_labels/ │ │ │ │ ├── past/ │ │ │ │ │ ├── 000000.label │ │ │ │ │ ├── 000001.label │ │ │ │ │ ├── 000002.label │ │ │ │ │ ├── 000003.label │ │ │ │ ├── pred/ │ │ │ │ │ ├── 000000.ply │ │ │ │ │ ├── ... ``` We will provide our PCPNet predictions soon and please open an issue [here](https://github.com/Blurryface0814/PCPNet) if you have questions about how PCPNet is implemented for points forecasting. ## III. 2D-3D Lifted Prediction 2D-3D lifted prediction requires the outputs of [PowerBEV](https://github.com/EdwardLeeLPZ/PowerBEV). ```bash cd other_baselines/lifted_2d python ./eval_lifted_2d.py ``` #### Parameters: * **test_idx_dir**: Path of test indexes, which is generated by the standard OCFNet evaluation process. * **gt_dir**: Path of ground-truth segmentations. * **hmin**: minimum height for lifting operation. * **hmax**: maximum height for lifting operation. * **test_results_dir**: Path of point cloud prediction results. The data is organized as follows: ```bash Cam4DOcc ├── data/ │ ├── cam4docc/ │ │ ├── powerbev_results/ │ │ │ ├── {scene_token}_{lidar_token}.npz │ │ │ ├── ... 
``` We have provided our [PowerBEV predictions](https://drive.google.com/file/d/1X_N-GwU2ZB65UI9-EYpeQrb2BzS44VVX/view?usp=sharing) and please open an issue [here](https://github.com/EdwardLeeLPZ/PowerBEV) if you have questions about how PowerBEV is implemented for BEV-based instance prediction. More refinement strategies for the baselines will be released ... Before that, please simply use the scripts here for fast evaluation. ## Publications If you use our proposed baselines in your work, please cite as: * Cam4DOcc ``` @inproceedings{ma2024cvpr, author = {Junyi Ma and Xieyuanli Chen and Jiawei Huang and Jingyi Xu and Zhen Luo and Jintao Xu and Weihao Gu and Rui Ai and Hesheng Wang}, title = {{Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications}}, booktitle = {Proc.~of the IEEE/CVF Conf.~on Computer Vision and Pattern Recognition (CVPR)}, year = 2024 } ``` * OpenOccupancy ``` @article{wang2023openoccupancy, title={Openoccupancy: A large scale benchmark for surrounding semantic occupancy perception}, author={Wang, Xiaofeng and Zhu, Zheng and Xu, Wenbo and Zhang, Yunpeng and Wei, Yi and Chi, Xu and Ye, Yun and Du, Dalong and Lu, Jiwen and Wang, Xingang}, journal={arXiv preprint arXiv:2303.03991}, year={2023} } ``` * PCPNet ``` @ARTICLE{10141631, author={Luo, Zhen and Ma, Junyi and Zhou, Zijie and Xiong, Guangming}, journal={IEEE Robotics and Automation Letters}, title={PCPNet: An Efficient and Semantic-Enhanced Transformer Network for Point Cloud Prediction}, year={2023}, volume={8}, number={7}, pages={4267-4274}, doi={10.1109/LRA.2023.3281937}} ``` * PowerBEV ``` @inproceedings{ijcai2023p120, title = {PowerBEV: A Powerful Yet Lightweight Framework for Instance Prediction in Bird’s-Eye View}, author = {Li, Peizheng and Ding, Shuxiao and Chen, Xieyuanli and Hanselmann, Niklas and Cordts, Marius and Gall, Juergen}, booktitle = {Proceedings of the Thirty-Second International Joint Conference on Artificial Intelligence, {IJCAI-23}}, pages = {1080--1088}, year = {2023}, month = {8}, doi = {10.24963/ijcai.2023/120}, } ``` ================================================ FILE: other_baselines/lifted_2d/eval_lifted_2d.py ================================================ # Developed by Junyi Ma # Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications # https://github.com/haomo-ai/Cam4DOcc from tqdm import trange import numpy as np from nuscenes import NuScenes import os import torch import torch.nn.functional as F import copy from pyquaternion import Quaternion # Setups ================================================================================================= test_idx_dir = "../../data/cam4docc/test_ids/" test_results_dir = "../../data/cam4docc/powerbev_results/" gt_dir = "../../data/cam4docc/MMO/segmentation/" test_seqs = os.listdir(test_idx_dir) test_segmentations = os.listdir(test_results_dir) dimension = [512, 512, 40] future_ious = [0, 0, 0, 0] voxel_size = np.array([0.2,0.2,0.2]) pc_range = np.array([-50, -50, 0, 50, 50, 0]) voxel_size_new = np.array([0.2,0.2,0.2]) pc_range_new = np.array([-51.2, -51.2, -5, 51.2, 51.2, 3]) # 10*0.2=2m # You can modify the parameters to show the changes with variable heights for lifting hmin = -1 hmax = 9 nusc = NuScenes(version='v1.0-trainval', dataroot="../../data/nuscenes", verbose=False) # ======================================================================================================== def cm_to_ious(cm): mean_ious = [] cls_num = len(cm) for i in 
range(cls_num): tp = cm[i, i] p = cm[:, i].sum() g = cm[i, :].sum() union = p + g - tp mean_ious.append(tp / union) return mean_ious def fast_hist(pred, label, max_label=18): pred = copy.deepcopy(pred.flatten()) label = copy.deepcopy(label.flatten()) bin_count = np.bincount(max_label * label.astype(int) + pred, minlength=max_label ** 2) iou_per_pred = (bin_count[-1]/(bin_count[-1]+bin_count[1]+bin_count[2])) return bin_count[:max_label ** 2].reshape(max_label, max_label),iou_per_pred for i in trange(len(test_seqs)): segmentation_file = test_results_dir + test_seqs[i] instance_seq = np.load(segmentation_file)['arr_0'] instance_seq = torch.from_numpy(instance_seq) test_seqs_idxs = np.load(test_idx_dir+test_seqs[i])["arr_0"] gt_segmentation_file = os.path.join(gt_dir, test_seqs[i]) gt_segmentation_seqs = np.load(gt_segmentation_file, allow_pickle=True)['arr_0'] for t in range(3, 7): scene_token_cur = test_seqs_idxs[t].split("_")[0] lidar_token_cur = test_seqs_idxs[t].split("_")[1] instance_ = instance_seq[0,(t-1)].unsqueeze(0) # t-1 -> t instance_ = instance_.unsqueeze(0) instance_ = F.interpolate(instance_.float(), size=[500, 500], mode='nearest').contiguous() # Note: default PowerBEV has different ranges with OCFNet instance_ = instance_.squeeze(0) x_grid = torch.linspace(0, 500-1, 500, dtype=torch.float) x_grid = x_grid.view(500, 1).expand(500,500) y_grid = torch.linspace(0, 500-1,500, dtype=torch.float) y_grid = y_grid.view(1, 500).expand(500,500) mesh_grid_2d = torch.stack((x_grid, y_grid), -1) mesh_grid_2d = mesh_grid_2d.view(-1, 2) instance_ = instance_.view(-1, 1) semantics_lifted = [] for ii in range(hmin, hmax): semantics_lifted_ = torch.cat((mesh_grid_2d, ii*torch.ones_like(mesh_grid_2d[:,0:1])),dim=-1) semantics_lifted_ = torch.cat((semantics_lifted_, instance_),dim=-1) semantics_lifted.append(semantics_lifted_) semantics_lifted = np.array(torch.cat(semantics_lifted, dim=0)) kept = semantics_lifted[:,-1]!=0 semantics_lifted = semantics_lifted[kept] if semantics_lifted.shape[0] == 0: semantics_lifted = np.zeros((1,4)) lidar_sample = nusc.get('sample_data', lidar_token_cur) lidar_sample_calib = nusc.get('calibrated_sensor', lidar_sample['calibrated_sensor_token']) lidar_sensor_rotation = Quaternion(lidar_sample_calib['rotation']) lidar_sensor_translation = np.array(lidar_sample_calib['translation'])[:, None] lidar_to_lidarego = np.vstack([ np.hstack((lidar_sensor_rotation.rotation_matrix, lidar_sensor_translation)), np.array([0, 0, 0, 1]) ]) lidarego_to_lidar = np.linalg.inv(lidar_to_lidarego) points = np.ones_like(semantics_lifted) points[:,:3] = semantics_lifted[:,:3] points[:,:3] = points[:,:3] * voxel_size[None, :] + pc_range[:3][None, :] points = lidarego_to_lidar @ points.T semantics_lifted_transformed = np.ones_like(semantics_lifted) semantics_lifted_transformed[:,:3] = (points.T)[:,:3] semantics_lifted_transformed[:,-1] = semantics_lifted[:,-1] semantics_lifted_transformed[:,:3] = (semantics_lifted_transformed[:,:3] - pc_range_new[:3][None, :]) / voxel_size_new[None, :] pred_segmentation = np.zeros((dimension[0], dimension[1], dimension[2])) for j in range(semantics_lifted_transformed.shape[0]): cur_ind = semantics_lifted_transformed[j, :3].astype(int) cur_label = semantics_lifted_transformed[j, -1] if cur_label != 0: pred_segmentation[cur_ind[0],cur_ind[1],cur_ind[2]] = 1 gt_segmentation = np.zeros((dimension[0], dimension[1], dimension[2])) gt_segmentation_raw = gt_segmentation_seqs[t].cpu().numpy() 
gt_segmentation[gt_segmentation_raw[:,0].astype(int),gt_segmentation_raw[:,1].astype(int),gt_segmentation_raw[:,2].astype(int)] = gt_segmentation_raw[:, -1] hist_cur, iou_per_pred = fast_hist(pred_segmentation.astype(int), gt_segmentation.astype(int), max_label=2) if t <= 3: future_ious[0] = future_ious[0] + hist_cur if t <= 4: future_ious[1] = future_ious[1] + hist_cur if t <= 5: future_ious[2] = future_ious[2] + hist_cur if t <= 6: future_ious[3] = future_ious[3] + hist_cur for t in range(len(future_ious)): print("iou for step "+str(t), cm_to_ious(future_ious[t])) ================================================ FILE: other_baselines/static_world/eval_static_world.py ================================================ # Developed by Junyi Ma # Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications # https://github.com/haomo-ai/Cam4DOcc import numpy as np import os import copy from tqdm import trange # Setups ================================================================================================= test_idx_dir = "../../data/cam4docc/test_ids/" test_results_dir = "../../data/cam4docc/results/" gt_dir = "../../data/cam4docc/MMO/segmentation/" objects_max_label = 9 test_seqs = os.listdir(test_idx_dir) test_segmentations = os.listdir(test_results_dir) dimension = [512, 512, 40] future_ious = [0, 0, 0, 0] # ======================================================================================================== def cm_to_ious(cm): mean_ious = [] cls_num = len(cm) for i in range(cls_num): tp = cm[i, i] p = cm[:, i].sum() g = cm[i, :].sum() union = p + g - tp mean_ious.append(tp / union) return mean_ious def fast_hist(pred, label, max_label=18): pred = copy.deepcopy(pred.flatten()) label = copy.deepcopy(label.flatten()) bin_count = np.bincount(max_label * label.astype(int) + pred, minlength=max_label ** 2) iou_per_pred = (bin_count[-1]/(bin_count[-1]+bin_count[1]+bin_count[2])) return bin_count[:max_label ** 2].reshape(max_label, max_label),iou_per_pred for i in trange(len(test_seqs)): segmentation_file = test_results_dir + test_seqs[i] if not os.path.exists(segmentation_file): continue segmentation = np.load(segmentation_file,allow_pickle=True)['arr_0'] test_seqs_idxs = np.load(os.path.join(test_idx_dir, test_seqs[i]))["arr_0"] gt_segmentation_file = os.path.join(gt_dir, test_seqs[i]) gt_segmentation_seqs = np.load(gt_segmentation_file,allow_pickle=True)['arr_0'] # hard coding for input:3 output:4 for t in range(3,7): # static world using present predictions segmentation_t = segmentation[0] pred_segmentation = np.zeros((dimension[0], dimension[1], dimension[2])) for j in range(segmentation_t.shape[0]): cur_ind = segmentation_t[j, :3].astype(int) cur_label = segmentation_t[j, -1] pred_segmentation[cur_ind[0],cur_ind[1],cur_ind[2]] = cur_label gt_segmentation = np.zeros((dimension[0], dimension[1], dimension[2])) gt_segmentation_raw = gt_segmentation_seqs[t] for k in range(gt_segmentation_raw.shape[0]): cur_ind = gt_segmentation_raw[k, :3].astype(int) cur_label = gt_segmentation_raw[k, -1] gt_segmentation[cur_ind[0],cur_ind[1],cur_ind[2]] = cur_label hist_cur, iou_per_pred = fast_hist(pred_segmentation.astype(int), gt_segmentation.astype(int), max_label=objects_max_label) if t <= 3: future_ious[0] = future_ious[0] + hist_cur if t <= 4: future_ious[1] = future_ious[1] + hist_cur if t <= 5: future_ious[2] = future_ious[2] + hist_cur if t <= 6: future_ious[3] = future_ious[3] + hist_cur for t in range(len(future_ious)): print("iou for step "+str(t), 
cm_to_ious(future_ious[t])) ================================================ FILE: other_baselines/voxel_pcp/eval_voxel_pcp.py ================================================ # Developed by Junyi Ma # Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications # https://github.com/haomo-ai/Cam4DOcc import numpy as np import os import copy from tqdm import trange import open3d as o3d from nuscenes import NuScenes from pyquaternion import Quaternion # Setups ================================================================================================= test_idx_dir = "../../data/cam4docc/test_ids/" test_results_dir = "../../data/cam4docc/pcpnet_results/" occ_path = "../../data/nuScenes-Occupancy" test_seqs = os.listdir(test_idx_dir) test_segmentations = os.listdir(test_results_dir) pc_range= np.array([-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]) dimension = [512, 512, 40] grid_size= np.array(dimension) voxel_size = (pc_range[3:] -pc_range[:3]) / grid_size future_ious = [0, 0, 0, 0] nusc = NuScenes(version='v1.0-trainval', dataroot="../../data/nuscenes", verbose=False) # ======================================================================================================== lidar_token2sample_token = {} for i in range(len(nusc.sample)): my_sample = nusc.sample[i] frame_token = my_sample['token'] lidar_token = my_sample['data']['LIDAR_TOP'] lidar_token2sample_token[lidar_token] = frame_token def voxel2world(voxel): """ voxel: [N, 3] """ return voxel *voxel_size[None, :] + pc_range[:3][None, :] def world2voxel(world): """ world: [N, 3] """ return (world - pc_range[:3][None, :]) / voxel_size[None, :] def cm_to_ious(cm): mean_ious = [] cls_num = len(cm) for i in range(cls_num): tp = cm[i, i] p = cm[:, i].sum() g = cm[i, :].sum() union = p + g - tp mean_ious.append(tp / union) return mean_ious def fast_hist(pred, label, max_label=18): pred = copy.deepcopy(pred.flatten()) label = copy.deepcopy(label.flatten()) bin_count = np.bincount(max_label * label.astype(int) + pred, minlength=max_label ** 2) iou_per_pred = (bin_count[-1]/(bin_count[-1]+bin_count[1]+bin_count[2])) return bin_count[:max_label ** 2].reshape(max_label, max_label),iou_per_pred def nb_process_label(processed_label, sorted_label_voxel_pair): label_size = 256 counter = np.zeros((label_size,), dtype=np.uint16) counter[sorted_label_voxel_pair[0, 3]] = 1 cur_sear_ind = sorted_label_voxel_pair[0, :3] for i in range(1, sorted_label_voxel_pair.shape[0]): cur_ind = sorted_label_voxel_pair[i, :3] if not np.all(np.equal(cur_ind, cur_sear_ind)): processed_label[cur_sear_ind[0], cur_sear_ind[1], cur_sear_ind[2]] = np.argmax(counter) counter = np.zeros((label_size,), dtype=np.uint16) cur_sear_ind = cur_ind counter[sorted_label_voxel_pair[i, 3]] += 1 processed_label[cur_sear_ind[0], cur_sear_ind[1], cur_sear_ind[2]] = np.argmax(counter) return processed_label def get_ego2lidar_pose(rec): lidar_top_data = nusc.get('sample_data', rec['data']['LIDAR_TOP']) lidar2ego_translation = nusc.get('calibrated_sensor', lidar_top_data['calibrated_sensor_token'])['translation'] lidar2ego_rotation = nusc.get('calibrated_sensor', lidar_top_data['calibrated_sensor_token'])['rotation'] trans = -np.array(lidar2ego_translation) rot = Quaternion(lidar2ego_rotation).inverse return trans, rot def get_lidar_pose(rec): current_sample = nusc.get('sample', rec['token']) egopose = nusc.get('ego_pose', nusc.get('sample_data', current_sample['data']['LIDAR_TOP'])['ego_pose_token']) trans = -np.array(egopose['translation']) rot = 
Quaternion(egopose['rotation']).inverse return trans, rot for i in trange(len(test_seqs)): test_seqs_idxs = np.load(os.path.join(test_idx_dir, test_seqs[i]))['arr_0'] scene_token_present = test_seqs[i].split("_")[0] lidar_token_present = test_seqs[i].split("_")[1][:-4] # transform past point clouds to the present frame # point cloud prediction baseline is limited by sparsity of laser points, so we aggregate # past point clouds to mitigate in this version # More reasonable versions will be released past_voxels = [] for t in range(1, 4): scene_token_ = test_seqs_idxs[t-1].split("_")[0] lidar_token_ = test_seqs_idxs[t-1].split("_")[1] point_file = test_results_dir+"point_clouds/"+scene_token_present+"_"+lidar_token_present+"/past/00000"+str(t)+".ply" label_file = test_results_dir+"saved_labels/"+scene_token_present+"_"+lidar_token_present+"/past/00000"+str(t)+".label" pcd_load = o3d.io.read_point_cloud(point_file) xyz_load = np.asarray(pcd_load.points) sample_token_present = lidar_token2sample_token[lidar_token_present] rec_present = nusc.get('sample', sample_token_present) translation_present, rotation_present = get_lidar_pose(rec_present) ego2lidar_translation_present, ego2lidar_rotation_present = get_ego2lidar_pose(rec_present) sample_token_ = lidar_token2sample_token[lidar_token_] rec_ = nusc.get('sample', sample_token_) translation_, rotation_ = get_lidar_pose(rec_) ego2lidar_translation_, ego2lidar_rotation_ = get_ego2lidar_pose(rec_) present_global2ego = [translation_present, rotation_present] present_ego2lidar = [ego2lidar_translation_present, ego2lidar_rotation_present] cur_global2ego = [translation_, rotation_] cur_ego2lidar = [ego2lidar_translation_, ego2lidar_rotation_] pcd_np_cor = np.dot(cur_ego2lidar[1].inverse.rotation_matrix, xyz_load.T) pcd_np_cor = pcd_np_cor.T pcd_np_cor = pcd_np_cor - cur_ego2lidar[0] pcd_np_cor = np.dot(cur_global2ego[1].inverse.rotation_matrix, pcd_np_cor.T) pcd_np_cor = pcd_np_cor.T pcd_np_cor = pcd_np_cor - cur_global2ego[0] pcd_np_cor = pcd_np_cor + present_global2ego[0] pcd_np_cor = np.dot(present_global2ego[1].rotation_matrix, pcd_np_cor.T) pcd_np_cor = pcd_np_cor.T pcd_np_cor = pcd_np_cor + present_ego2lidar[0] # trans pcd_np_cor = np.dot(present_ego2lidar[1].rotation_matrix, pcd_np_cor.T) xyz_load = pcd_np_cor.T xyz_load = world2voxel(xyz_load) label = np.fromfile(label_file, dtype=np.uint32) label = label.reshape((-1,1)) segmentation_t = np.concatenate((xyz_load, label), axis=-1) kept = (segmentation_t[:,0]>0) & (segmentation_t[:,0]0) & (segmentation_t[:,1]0) & (segmentation_t[:,2]0) & (segmentation_t[:,0]0) & (segmentation_t[:,1]0) & (segmentation_t[:,2] n_future_frames has to be set time_receptive_field = 3 n_future_frames = 4 n_future_frames_plus = 6 iou_thresh_for_vpq = 0.2 test_present = False # Occupancy-related params ****************************************** point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] occ_size = [512, 512, 40] lss_downsample = [4, 4, 4] voxel_x = (point_cloud_range[3] - point_cloud_range[0]) / occ_size[0] voxel_y = (point_cloud_range[4] - point_cloud_range[1]) / occ_size[1] voxel_z = (point_cloud_range[5] - point_cloud_range[2]) / occ_size[2] empty_idx = 0 if use_separate_classes: num_cls = len(class_names) + 1 else: num_cls = 2 img_norm_cfg = None # Save params ****************************************** save_pred = False save_path = "./data/cam4docc/results" # Data-generation and pipeline params ****************************************** dataset_type = 'Cam4DOccDataset' file_client_args = 
dict(backend='disk') data_config={ 'cams': ['CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT'], 'Ncams': 6, 'input_size': (896, 1600), 'src_size': (900, 1600), # image-view augmentation 'resize': (-0.06, 0.11), 'rot': (-5.4, 5.4), 'flip': False, 'crop_h': (0.0, 0.0), 'resize_test': 0.00, } bda_aug_conf = dict( rot_lim=(-0, 0), scale_lim=(0.95, 1.05), flip_dx_ratio=0.5, flip_dy_ratio=0.5) train_capacity = 23930 # default: use all sequences test_capacity = 5119 # default: use all sequences train_pipeline = [ dict(type='LoadInstanceWithFlow', cam4docc_dataset_path=cam4docc_dataset_path, grid_size=occ_size, use_flow=True, background=empty_idx, pc_range=point_cloud_range, use_separate_classes=use_separate_classes), dict(type='LoadMultiViewImageFromFiles_BEVDet', is_train=True, data_config=data_config, sequential=False, aligned=True, trans_only=False, depth_gt_path=depth_gt_path, data_root=nusc_root, mmlabnorm=True, load_depth=True, img_norm_cfg=img_norm_cfg), dict(type='LoadOccupancy', to_float32=True, occ_path=occ_path, grid_size=occ_size, unoccupied=empty_idx, pc_range=point_cloud_range, use_fine_occ=use_fine_occ, test_mode=False), dict(type='OccDefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['img_inputs_seq', 'gt_occ', 'segmentation', 'instance', 'flow', 'future_egomotion']), ] test_pipeline = [ dict(type='LoadInstanceWithFlow', cam4docc_dataset_path=cam4docc_dataset_path, grid_size=occ_size, use_flow=True, background=empty_idx, pc_range=point_cloud_range, use_separate_classes=use_separate_classes), dict(type='LoadMultiViewImageFromFiles_BEVDet', data_config=data_config, depth_gt_path=depth_gt_path, data_root=nusc_root, sequential=False, aligned=True, trans_only=False, mmlabnorm=True, img_norm_cfg=img_norm_cfg, test_mode=True), dict(type='LoadOccupancy', to_float32=True, occ_path=occ_path, grid_size=occ_size, unoccupied=empty_idx, pc_range=point_cloud_range, use_fine_occ=use_fine_occ, test_mode=True), dict(type='OccDefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['img_inputs_seq', 'gt_occ', 'segmentation', 'instance', 'flow', 'future_egomotion'], meta_keys=['pc_range', 'occ_size', 'scene_token', 'lidar_token']), ] train_config=dict( type=dataset_type, data_root=nusc_root, occ_root=occ_path, idx_root=cam4docc_dataset_path, ori_data_root=cam4docc_dataset_path, ann_file=train_ann_file, pipeline=train_pipeline, classes=class_names, use_separate_classes=use_separate_classes, modality=input_modality, test_mode=False, use_valid_flag=True, occ_size=occ_size, pc_range=point_cloud_range, box_type_3d='LiDAR', time_receptive_field=time_receptive_field, n_future_frames=n_future_frames, train_capacity=train_capacity, test_capacity=test_capacity, ) test_config=dict( type=dataset_type, occ_root=occ_path, data_root=nusc_root, idx_root=cam4docc_dataset_path, ori_data_root=cam4docc_dataset_path, ann_file=val_ann_file, pipeline=test_pipeline, classes=class_names, use_separate_classes=use_separate_classes, modality=input_modality, occ_size=occ_size, pc_range=point_cloud_range, time_receptive_field=time_receptive_field, n_future_frames=n_future_frames, train_capacity=train_capacity, test_capacity=test_capacity, ) # in our work we use 8 NVIDIA A100 GPUs data = dict( samples_per_gpu=1, workers_per_gpu=1, train=train_config, val=test_config, test=test_config, shuffler_sampler=dict(type='DistributedGroupSampler'), nonshuffler_sampler=dict(type='DistributedSampler'), ) # Model params 
****************************************** grid_config = { 'xbound': [point_cloud_range[0], point_cloud_range[3], voxel_x*lss_downsample[0]], 'ybound': [point_cloud_range[1], point_cloud_range[4], voxel_y*lss_downsample[1]], 'zbound': [point_cloud_range[2], point_cloud_range[5], voxel_z*lss_downsample[2]], 'dbound': [2.0, 58.0, 0.5], } voxel_channels = [32*(time_receptive_field), 32*2*(time_receptive_field), 32*4*(time_receptive_field), 32*8*(time_receptive_field)] pred_channels = [32, 32*2, 32*4, 32*8] decoder_channels = [32*(n_future_frames_plus), 32*2*(n_future_frames_plus), 32*4*(n_future_frames_plus), 32*8*(n_future_frames_plus)] numC_Trans = 64 occ_encoder_input_channel = (numC_Trans+6)*time_receptive_field voxel_out_channel = 32*(n_future_frames_plus) flow_out_channel = 32*(n_future_frames_plus) voxel_out_channel_per_frame = 32 voxel_out_indices = (0, 1, 2, 3) my_voxel_out_indices = (0, 1, 2, 3) model = dict( type='OCFNet', only_generate_dataset=only_generate_dataset, loss_norm=False, disable_loss_depth=True, point_cloud_range=point_cloud_range, time_receptive_field=time_receptive_field, n_future_frames=n_future_frames, n_future_frames_plus=n_future_frames_plus, max_label=num_cls, iou_thresh_for_vpq=iou_thresh_for_vpq, test_present=test_present, record_time=False, save_pred=save_pred, save_path=save_path, img_backbone=dict( pretrained='torchvision://resnet50', type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=0, with_cp=False, norm_cfg=dict(type='SyncBN', requires_grad=True), norm_eval=False, style='pytorch'), img_neck=dict( type='SECONDFPN', in_channels=[256, 512, 1024, 2048], upsample_strides=[0.25, 0.5, 1, 2], out_channels=[128, 128, 128, 128]), img_view_transformer=dict(type='ViewTransformerLiftSplatShootVoxel', norm_cfg=dict(type='SyncBN', requires_grad=True), loss_depth_weight=3., loss_depth_type='kld', grid_config=grid_config, data_config=data_config, numC_Trans=numC_Trans, vp_megvii=False), occ_encoder_backbone=dict( type='CustomResNet3D', depth=18, n_input_channels=occ_encoder_input_channel, block_inplanes=voxel_channels, out_indices=my_voxel_out_indices, norm_cfg=dict(type='SyncBN', requires_grad=True), ), occ_predictor=dict( type='Predictor', n_input_channels=pred_channels, in_timesteps=time_receptive_field, out_timesteps=n_future_frames_plus, norm_cfg=dict(type='SyncBN', requires_grad=True), ), occ_encoder_neck=dict( type='FPN3D', with_cp=False, in_channels=decoder_channels, out_channels=voxel_out_channel, norm_cfg=dict(type='SyncBN', requires_grad=True), ), flow_encoder_backbone=dict( type='CustomResNet3D', depth=18, n_input_channels=occ_encoder_input_channel, block_inplanes=voxel_channels, out_indices=my_voxel_out_indices, norm_cfg=dict(type='SyncBN', requires_grad=True), ), flow_predictor=dict( type='Predictor', n_input_channels=pred_channels, in_timesteps=time_receptive_field, out_timesteps=n_future_frames_plus, norm_cfg=dict(type='SyncBN', requires_grad=True), ), flow_encoder_neck=dict( type='FPN3D', with_cp=False, in_channels=decoder_channels, out_channels=flow_out_channel, norm_cfg=dict(type='SyncBN', requires_grad=True), ), flow_head=dict( type='FlowHead', norm_cfg=dict(type='SyncBN', requires_grad=True), soft_weights=True, final_occ_size=occ_size, fine_topk=15000, empty_idx=empty_idx, num_level=len(my_voxel_out_indices), in_channels=[voxel_out_channel_per_frame] * len(my_voxel_out_indices), out_channel=3, # 3-dim flow point_cloud_range=point_cloud_range, ), pts_bbox_head=dict( type='OccHead', norm_cfg=dict(type='SyncBN', 
requires_grad=True), soft_weights=True, final_occ_size=occ_size, fine_topk=15000, empty_idx=empty_idx, num_level=len(my_voxel_out_indices), in_channels=[voxel_out_channel_per_frame] * len(my_voxel_out_indices), out_channel=num_cls, point_cloud_range=point_cloud_range, loss_weight_cfg=dict( loss_voxel_ce_weight=1.0, loss_voxel_sem_scal_weight=1.0, loss_voxel_geo_scal_weight=1.0, loss_voxel_lovasz_weight=1.0, ), ), empty_idx=empty_idx, ) # Learning policy params ****************************************** optimizer = dict( type='AdamW', lr=3e-4, paramwise_cfg=dict( custom_keys={ 'img_backbone': dict(lr_mult=0.1), }), weight_decay=0.01) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=1e-3) runner = dict(type='EpochBasedRunner', max_epochs=24) evaluation = dict( interval=1, pipeline=test_pipeline, save_best='SSC_mean', rule='greater', ) custom_hooks = [ dict(type='OccEfficiencyHook'), ] ================================================ FILE: projects/configs/baselines/OCFNet_in_Cam4DOcc_V1.1_lyft.py ================================================ # Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications # https://github.com/haomo-ai/Cam4DOcc # 2 classes: inflated GMO and others # Basic params ****************************************** _base_ = [ '../datasets/custom_nus-3d.py', '../_base_/default_runtime.py' ] find_unused_parameters = True # whether training and test together with dataset generation only_generate_dataset = False # we only consider use_camera in Cam4DOcc in the current version input_modality = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False) plugin = True plugin_dir = "projects/occ_plugin/" # path unused for lyft occ_path = " " depth_gt_path = " " train_ann_file = " " val_ann_file = " " cam4docc_dataset_path = "./data/cam4docc/" nusc_root = './data/lyft/' # GMO class names class_names = ['vehicle', 'human'] use_separate_classes = False use_fine_occ = False # Forecasting-related params ****************************************** # we use *time_receptive_field* past frames to forecast future *n_future_frames* frames # for 3D instance prediction, n_future_frames_plus > n_future_frames has to be set time_receptive_field = 3 n_future_frames = 4 n_future_frames_plus = 6 iou_thresh_for_vpq = 0.2 test_present = False # Occupancy-related params ****************************************** point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] occ_size = [512, 512, 40] lss_downsample = [4, 4, 4] voxel_x = (point_cloud_range[3] - point_cloud_range[0]) / occ_size[0] voxel_y = (point_cloud_range[4] - point_cloud_range[1]) / occ_size[1] voxel_z = (point_cloud_range[5] - point_cloud_range[2]) / occ_size[2] empty_idx = 0 if use_separate_classes: num_cls = len(class_names) + 1 else: num_cls = 2 img_norm_cfg = None # Save params ****************************************** save_pred = False save_path = "./data/cam4docc/results" # Data-generation and pipeline params ****************************************** dataset_type = 'Cam4DOccLyftDataset' file_client_args = dict(backend='disk') data_config={ 'cams': ['CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT'], 'Ncams': 6, 'input_size': (896, 1600), 'src_size': (900, 1600), # image-view augmentation 'resize': (-0.06, 0.11), 'rot': (-5.4, 5.4), 'flip': False, 'crop_h': (0.0, 0.0), 'resize_test': 
0.00, } bda_aug_conf = dict( rot_lim=(-0, 0), scale_lim=(0.95, 1.05), flip_dx_ratio=0.5, flip_dy_ratio=0.5) train_capacity = 15720 # default: use all sequences test_capacity = 5880 # default: use all sequences train_pipeline = [ dict(type='LoadInstanceWithFlow', cam4docc_dataset_path=cam4docc_dataset_path, grid_size=occ_size, use_flow=True, background=empty_idx, pc_range=point_cloud_range, use_separate_classes=use_separate_classes, use_lyft=True), dict(type='LoadMultiViewImageFromFiles_BEVDet', is_train=True, data_config=data_config, sequential=False, aligned=True, trans_only=False, depth_gt_path=depth_gt_path, data_root=nusc_root, mmlabnorm=True, load_depth=True, img_norm_cfg=img_norm_cfg, use_lyft=True), dict(type='LoadOccupancy', to_float32=True, occ_path=occ_path, grid_size=occ_size, unoccupied=empty_idx, pc_range=point_cloud_range, use_fine_occ=use_fine_occ, test_mode=False), dict(type='OccDefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['img_inputs_seq', 'gt_occ', 'segmentation', 'instance', 'flow', 'future_egomotion']), ] test_pipeline = [ dict(type='LoadInstanceWithFlow', cam4docc_dataset_path=cam4docc_dataset_path, grid_size=occ_size, use_flow=True, background=empty_idx, pc_range=point_cloud_range, use_separate_classes=use_separate_classes, use_lyft=True), dict(type='LoadMultiViewImageFromFiles_BEVDet', data_config=data_config, depth_gt_path=depth_gt_path, data_root=nusc_root, sequential=False, aligned=True, trans_only=False, mmlabnorm=True, img_norm_cfg=img_norm_cfg, test_mode=True, use_lyft=True), dict(type='LoadOccupancy', to_float32=True, occ_path=occ_path, grid_size=occ_size, unoccupied=empty_idx, pc_range=point_cloud_range, use_fine_occ=use_fine_occ, test_mode=True), dict(type='OccDefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['img_inputs_seq', 'gt_occ', 'segmentation', 'instance', 'flow', 'future_egomotion'], meta_keys=['pc_range', 'occ_size', 'scene_token', 'lidar_token']), ] train_config=dict( type=dataset_type, data_root=nusc_root, occ_root=occ_path, idx_root=cam4docc_dataset_path, ori_data_root=cam4docc_dataset_path, ann_file=train_ann_file, pipeline=train_pipeline, classes=class_names, use_separate_classes=use_separate_classes, modality=input_modality, test_mode=False, use_valid_flag=True, occ_size=occ_size, pc_range=point_cloud_range, box_type_3d='LiDAR', time_receptive_field=time_receptive_field, n_future_frames=n_future_frames, train_capacity=train_capacity, test_capacity=test_capacity, ) test_config=dict( type=dataset_type, occ_root=occ_path, data_root=nusc_root, idx_root=cam4docc_dataset_path, ori_data_root=cam4docc_dataset_path, ann_file=val_ann_file, pipeline=test_pipeline, classes=class_names, use_separate_classes=use_separate_classes, modality=input_modality, occ_size=occ_size, pc_range=point_cloud_range, time_receptive_field=time_receptive_field, n_future_frames=n_future_frames, train_capacity=train_capacity, test_capacity=test_capacity, ) # in our work we use 8 NVIDIA A100 GPUs data = dict( samples_per_gpu=1, workers_per_gpu=1, train=train_config, val=test_config, test=test_config, shuffler_sampler=dict(type='DistributedGroupSampler'), nonshuffler_sampler=dict(type='DistributedSampler'), ) # Model params ****************************************** grid_config = { 'xbound': [point_cloud_range[0], point_cloud_range[3], voxel_x*lss_downsample[0]], 'ybound': [point_cloud_range[1], point_cloud_range[4], voxel_y*lss_downsample[1]], 'zbound': [point_cloud_range[2], point_cloud_range[5], 
voxel_z*lss_downsample[2]], 'dbound': [2.0, 58.0, 0.5], } voxel_channels = [32*(time_receptive_field), 32*2*(time_receptive_field), 32*4*(time_receptive_field), 32*8*(time_receptive_field)] pred_channels = [32, 32*2, 32*4, 32*8] decoder_channels = [32*(n_future_frames_plus), 32*2*(n_future_frames_plus), 32*4*(n_future_frames_plus), 32*8*(n_future_frames_plus)] numC_Trans = 64 occ_encoder_input_channel = (numC_Trans+6)*time_receptive_field voxel_out_channel = 32*(n_future_frames_plus) flow_out_channel = 32*(n_future_frames_plus) voxel_out_channel_per_frame = 32 voxel_out_indices = (0, 1, 2, 3) my_voxel_out_indices = (0, 1, 2, 3) model = dict( type='OCFNet', only_generate_dataset=only_generate_dataset, loss_norm=False, disable_loss_depth=True, point_cloud_range=point_cloud_range, time_receptive_field=time_receptive_field, n_future_frames=n_future_frames, n_future_frames_plus=n_future_frames_plus, max_label=num_cls, iou_thresh_for_vpq=iou_thresh_for_vpq, test_present=test_present, record_time=False, save_pred=save_pred, save_path=save_path, img_backbone=dict( pretrained='torchvision://resnet50', type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=0, with_cp=False, norm_cfg=dict(type='SyncBN', requires_grad=True), norm_eval=False, style='pytorch'), img_neck=dict( type='SECONDFPN', in_channels=[256, 512, 1024, 2048], upsample_strides=[0.25, 0.5, 1, 2], out_channels=[128, 128, 128, 128]), img_view_transformer=dict(type='ViewTransformerLiftSplatShootVoxel', norm_cfg=dict(type='SyncBN', requires_grad=True), loss_depth_weight=3., loss_depth_type='kld', grid_config=grid_config, data_config=data_config, numC_Trans=numC_Trans, vp_megvii=False), occ_encoder_backbone=dict( type='CustomResNet3D', depth=18, n_input_channels=occ_encoder_input_channel, block_inplanes=voxel_channels, out_indices=my_voxel_out_indices, norm_cfg=dict(type='SyncBN', requires_grad=True), ), occ_predictor=dict( type='Predictor', n_input_channels=pred_channels, in_timesteps=time_receptive_field, out_timesteps=n_future_frames_plus, norm_cfg=dict(type='SyncBN', requires_grad=True), ), occ_encoder_neck=dict( type='FPN3D', with_cp=False, in_channels=decoder_channels, out_channels=voxel_out_channel, norm_cfg=dict(type='SyncBN', requires_grad=True), ), flow_encoder_backbone=dict( type='CustomResNet3D', depth=18, n_input_channels=occ_encoder_input_channel, block_inplanes=voxel_channels, out_indices=my_voxel_out_indices, norm_cfg=dict(type='SyncBN', requires_grad=True), ), flow_predictor=dict( type='Predictor', n_input_channels=pred_channels, in_timesteps=time_receptive_field, out_timesteps=n_future_frames_plus, norm_cfg=dict(type='SyncBN', requires_grad=True), ), flow_encoder_neck=dict( type='FPN3D', with_cp=False, in_channels=decoder_channels, out_channels=flow_out_channel, norm_cfg=dict(type='SyncBN', requires_grad=True), ), flow_head=dict( type='FlowHead', norm_cfg=dict(type='SyncBN', requires_grad=True), soft_weights=True, final_occ_size=occ_size, fine_topk=15000, empty_idx=empty_idx, num_level=len(my_voxel_out_indices), in_channels=[voxel_out_channel_per_frame] * len(my_voxel_out_indices), out_channel=3, # 3-dim flow point_cloud_range=point_cloud_range, ), pts_bbox_head=dict( type='OccHead', norm_cfg=dict(type='SyncBN', requires_grad=True), soft_weights=True, final_occ_size=occ_size, fine_topk=15000, empty_idx=empty_idx, num_level=len(my_voxel_out_indices), in_channels=[voxel_out_channel_per_frame] * len(my_voxel_out_indices), out_channel=num_cls, point_cloud_range=point_cloud_range, loss_weight_cfg=dict( 
loss_voxel_ce_weight=1.0, loss_voxel_sem_scal_weight=1.0, loss_voxel_geo_scal_weight=1.0, loss_voxel_lovasz_weight=1.0, ), ), empty_idx=empty_idx, ) # Learning policy params ****************************************** optimizer = dict( type='AdamW', lr=3e-4, paramwise_cfg=dict( custom_keys={ 'img_backbone': dict(lr_mult=0.1), }), weight_decay=0.01) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=1e-3) runner = dict(type='EpochBasedRunner', max_epochs=24) evaluation = dict( interval=1, pipeline=test_pipeline, save_best='SSC_mean', rule='greater', ) custom_hooks = [ dict(type='OccEfficiencyHook'), ] ================================================ FILE: projects/configs/baselines/OCFNet_in_Cam4DOcc_V1.2.py ================================================ # Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications # https://github.com/haomo-ai/Cam4DOcc # multiple classes: inflated multiple MO classes # Basic params ****************************************** _base_ = [ '../datasets/custom_nus-3d.py', '../_base_/default_runtime.py' ] find_unused_parameters = True # whether training and test together with dataset generation only_generate_dataset = False # we only consider use_camera in Cam4DOcc in the current version input_modality = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False) plugin = True plugin_dir = "projects/occ_plugin/" occ_path = "./data/nuScenes-Occupancy" depth_gt_path = './data/depth_gt' train_ann_file = "./data/nuscenes/nuscenes_occ_infos_train.pkl" val_ann_file = "./data/nuscenes/nuscenes_occ_infos_val.pkl" cam4docc_dataset_path = "./data/cam4docc/" nusc_root = './data/nuscenes/' # GMO class names class_names = [ 'vehicle.bicycle', 'bus', 'car', 'construction', 'motorcycle', 'trailer', 'truck', 'pedestrian' ] use_separate_classes = True use_fine_occ = False # Forecasting-related params ****************************************** # we use *time_receptive_field* past frames to forecast future *n_future_frames* frames # for 3D instance prediction, n_future_frames_plus > n_future_frames has to be set time_receptive_field = 3 n_future_frames = 4 n_future_frames_plus = 6 iou_thresh_for_vpq = 0.2 test_present = False # Occupancy-related params ****************************************** point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] occ_size = [512, 512, 40] lss_downsample = [4, 4, 4] voxel_x = (point_cloud_range[3] - point_cloud_range[0]) / occ_size[0] voxel_y = (point_cloud_range[4] - point_cloud_range[1]) / occ_size[1] voxel_z = (point_cloud_range[5] - point_cloud_range[2]) / occ_size[2] empty_idx = 0 if use_separate_classes: num_cls = len(class_names) + 1 else: num_cls = 2 img_norm_cfg = None # Save params ****************************************** save_pred = False save_path = "./data/cam4docc/results" # Data-generation and pipeline params ****************************************** dataset_type = 'Cam4DOccDataset' file_client_args = dict(backend='disk') data_config={ 'cams': ['CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT'], 'Ncams': 6, 'input_size': (896, 1600), 'src_size': (900, 1600), # image-view augmentation 'resize': (-0.06, 0.11), 'rot': (-5.4, 5.4), 'flip': False, 'crop_h': (0.0, 0.0), 'resize_test': 0.00, } bda_aug_conf = dict( rot_lim=(-0, 0), scale_lim=(0.95, 1.05), flip_dx_ratio=0.5, flip_dy_ratio=0.5) 
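# bda_aug_conf above sets the BEV-space augmentation ranges (rotation disabled here,
# mild random scaling, random flips along x and y); train_capacity / test_capacity
# below default to all 23,930 training sequences and 5,119 validation samples listed
# in the README's Basic Information table, and can be lowered to train with sparser
# supervision.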
train_capacity = 23930 # default: use all sequences test_capacity = 5119 # default: use all sequences train_pipeline = [ dict(type='LoadInstanceWithFlow', cam4docc_dataset_path=cam4docc_dataset_path, grid_size=occ_size, use_flow=True, background=empty_idx, pc_range=point_cloud_range, use_separate_classes=use_separate_classes), dict(type='LoadMultiViewImageFromFiles_BEVDet', is_train=True, data_config=data_config, sequential=False, aligned=True, trans_only=False, depth_gt_path=depth_gt_path, data_root=nusc_root, mmlabnorm=True, load_depth=True, img_norm_cfg=img_norm_cfg), dict(type='LoadOccupancy', to_float32=True, occ_path=occ_path, grid_size=occ_size, unoccupied=empty_idx, pc_range=point_cloud_range, use_fine_occ=use_fine_occ, test_mode=False), dict(type='OccDefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['img_inputs_seq', 'gt_occ', 'segmentation', 'instance', 'flow', 'future_egomotion']), ] test_pipeline = [ dict(type='LoadInstanceWithFlow', cam4docc_dataset_path=cam4docc_dataset_path, grid_size=occ_size, use_flow=True, background=empty_idx, pc_range=point_cloud_range, use_separate_classes=use_separate_classes), dict(type='LoadMultiViewImageFromFiles_BEVDet', data_config=data_config, depth_gt_path=depth_gt_path, data_root=nusc_root, sequential=False, aligned=True, trans_only=False, mmlabnorm=True, img_norm_cfg=img_norm_cfg, test_mode=True), dict(type='LoadOccupancy', to_float32=True, occ_path=occ_path, grid_size=occ_size, unoccupied=empty_idx, pc_range=point_cloud_range, use_fine_occ=use_fine_occ, test_mode=True), dict(type='OccDefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['img_inputs_seq', 'gt_occ', 'segmentation', 'instance', 'flow', 'future_egomotion'], meta_keys=['pc_range', 'occ_size', 'scene_token', 'lidar_token']), ] train_config=dict( type=dataset_type, data_root=nusc_root, occ_root=occ_path, idx_root=cam4docc_dataset_path, ori_data_root=cam4docc_dataset_path, ann_file=train_ann_file, pipeline=train_pipeline, classes=class_names, use_separate_classes=use_separate_classes, modality=input_modality, test_mode=False, use_valid_flag=True, occ_size=occ_size, pc_range=point_cloud_range, box_type_3d='LiDAR', time_receptive_field=time_receptive_field, n_future_frames=n_future_frames, train_capacity=train_capacity, test_capacity=test_capacity, ) test_config=dict( type=dataset_type, occ_root=occ_path, data_root=nusc_root, idx_root=cam4docc_dataset_path, ori_data_root=cam4docc_dataset_path, ann_file=val_ann_file, pipeline=test_pipeline, classes=class_names, use_separate_classes=use_separate_classes, modality=input_modality, occ_size=occ_size, pc_range=point_cloud_range, time_receptive_field=time_receptive_field, n_future_frames=n_future_frames, train_capacity=train_capacity, test_capacity=test_capacity, ) # in our work we use 8 NVIDIA A100 GPUs data = dict( samples_per_gpu=1, workers_per_gpu=1, train=train_config, val=test_config, test=test_config, shuffler_sampler=dict(type='DistributedGroupSampler'), nonshuffler_sampler=dict(type='DistributedSampler'), ) # Model params ****************************************** grid_config = { 'xbound': [point_cloud_range[0], point_cloud_range[3], voxel_x*lss_downsample[0]], 'ybound': [point_cloud_range[1], point_cloud_range[4], voxel_y*lss_downsample[1]], 'zbound': [point_cloud_range[2], point_cloud_range[5], voxel_z*lss_downsample[2]], 'dbound': [2.0, 58.0, 0.5], } voxel_channels = [32*(time_receptive_field), 32*2*(time_receptive_field), 32*4*(time_receptive_field), 
32*8*(time_receptive_field)] pred_channels = [32, 32*2, 32*4, 32*8] decoder_channels = [32*(n_future_frames_plus), 32*2*(n_future_frames_plus), 32*4*(n_future_frames_plus), 32*8*(n_future_frames_plus)] numC_Trans = 64 occ_encoder_input_channel = (numC_Trans+6)*time_receptive_field voxel_out_channel = 32*(n_future_frames_plus) flow_out_channel = 32*(n_future_frames_plus) voxel_out_channel_per_frame = 32 voxel_out_indices = (0, 1, 2, 3) my_voxel_out_indices = (0, 1, 2, 3) model = dict( type='OCFNet', only_generate_dataset=only_generate_dataset, loss_norm=False, disable_loss_depth=True, point_cloud_range=point_cloud_range, time_receptive_field=time_receptive_field, n_future_frames=n_future_frames, n_future_frames_plus=n_future_frames_plus, max_label=num_cls, iou_thresh_for_vpq=iou_thresh_for_vpq, test_present=test_present, record_time=False, save_pred=save_pred, save_path=save_path, img_backbone=dict( pretrained='torchvision://resnet50', type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=0, with_cp=False, norm_cfg=dict(type='SyncBN', requires_grad=True), norm_eval=False, style='pytorch'), img_neck=dict( type='SECONDFPN', in_channels=[256, 512, 1024, 2048], upsample_strides=[0.25, 0.5, 1, 2], out_channels=[128, 128, 128, 128]), img_view_transformer=dict(type='ViewTransformerLiftSplatShootVoxel', norm_cfg=dict(type='SyncBN', requires_grad=True), loss_depth_weight=3., loss_depth_type='kld', grid_config=grid_config, data_config=data_config, numC_Trans=numC_Trans, vp_megvii=False), occ_encoder_backbone=dict( type='CustomResNet3D', depth=18, n_input_channels=occ_encoder_input_channel, block_inplanes=voxel_channels, out_indices=my_voxel_out_indices, norm_cfg=dict(type='SyncBN', requires_grad=True), ), occ_predictor=dict( type='Predictor', n_input_channels=pred_channels, in_timesteps=time_receptive_field, out_timesteps=n_future_frames_plus, norm_cfg=dict(type='SyncBN', requires_grad=True), ), occ_encoder_neck=dict( type='FPN3D', with_cp=False, in_channels=decoder_channels, out_channels=voxel_out_channel, norm_cfg=dict(type='SyncBN', requires_grad=True), ), flow_encoder_backbone=dict( type='CustomResNet3D', depth=18, n_input_channels=occ_encoder_input_channel, block_inplanes=voxel_channels, out_indices=my_voxel_out_indices, norm_cfg=dict(type='SyncBN', requires_grad=True), ), flow_predictor=dict( type='Predictor', n_input_channels=pred_channels, in_timesteps=time_receptive_field, out_timesteps=n_future_frames_plus, norm_cfg=dict(type='SyncBN', requires_grad=True), ), flow_encoder_neck=dict( type='FPN3D', with_cp=False, in_channels=decoder_channels, out_channels=flow_out_channel, norm_cfg=dict(type='SyncBN', requires_grad=True), ), flow_head=dict( type='FlowHead', norm_cfg=dict(type='SyncBN', requires_grad=True), soft_weights=True, final_occ_size=occ_size, fine_topk=15000, empty_idx=empty_idx, num_level=len(my_voxel_out_indices), in_channels=[voxel_out_channel_per_frame] * len(my_voxel_out_indices), out_channel=3, # 3-dim flow point_cloud_range=point_cloud_range, ), pts_bbox_head=dict( type='OccHead', norm_cfg=dict(type='SyncBN', requires_grad=True), soft_weights=True, final_occ_size=occ_size, fine_topk=15000, empty_idx=empty_idx, num_level=len(my_voxel_out_indices), in_channels=[voxel_out_channel_per_frame] * len(my_voxel_out_indices), out_channel=num_cls, point_cloud_range=point_cloud_range, loss_weight_cfg=dict( loss_voxel_ce_weight=1.0, loss_voxel_sem_scal_weight=1.0, loss_voxel_geo_scal_weight=1.0, loss_voxel_lovasz_weight=1.0, ), ), empty_idx=empty_idx, ) # Learning 
policy params ****************************************** optimizer = dict( type='AdamW', lr=3e-4, paramwise_cfg=dict( custom_keys={ 'img_backbone': dict(lr_mult=0.1), }), weight_decay=0.01) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=1e-3) runner = dict(type='EpochBasedRunner', max_epochs=24) evaluation = dict( interval=1, pipeline=test_pipeline, save_best='SSC_mean', rule='greater', ) custom_hooks = [ dict(type='OccEfficiencyHook'), ] ================================================ FILE: projects/configs/baselines/OCFNet_in_Cam4DOcc_V1.2_lyft.py ================================================ # Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications # https://github.com/haomo-ai/Cam4DOcc # multiple classes: inflated multiple MO classes # Basic params ****************************************** _base_ = [ '../datasets/custom_nus-3d.py', '../_base_/default_runtime.py' ] find_unused_parameters = True # whether training and test together with dataset generation only_generate_dataset = False # we only consider use_camera in Cam4DOcc in the current version input_modality = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False) plugin = True plugin_dir = "projects/occ_plugin/" # path unused for lyft occ_path = " " depth_gt_path = " " train_ann_file = " " val_ann_file = " " cam4docc_dataset_path = "./data/cam4docc/" nusc_root = './data/lyft/' # GMO class names # refine the classes for lyft datasets according to your needs class_names = [ 'bicycle', 'bus', 'car', 'construction', 'motorcycle', 'trailer', 'truck', 'pedestrian' ] use_separate_classes = True use_fine_occ = False # Forecasting-related params ****************************************** # we use *time_receptive_field* past frames to forecast future *n_future_frames* frames # for 3D instance prediction, n_future_frames_plus > n_future_frames has to be set time_receptive_field = 3 n_future_frames = 4 n_future_frames_plus = 6 iou_thresh_for_vpq = 0.2 test_present = False # Occupancy-related params ****************************************** point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] occ_size = [512, 512, 40] lss_downsample = [4, 4, 4] voxel_x = (point_cloud_range[3] - point_cloud_range[0]) / occ_size[0] voxel_y = (point_cloud_range[4] - point_cloud_range[1]) / occ_size[1] voxel_z = (point_cloud_range[5] - point_cloud_range[2]) / occ_size[2] empty_idx = 0 if use_separate_classes: num_cls = len(class_names) + 1 else: num_cls = 2 img_norm_cfg = None # Save params ****************************************** save_pred = False save_path = "./data/cam4docc/results" # Data-generation and pipeline params ****************************************** dataset_type = 'Cam4DOccLyftDataset' file_client_args = dict(backend='disk') data_config={ 'cams': ['CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT'], 'Ncams': 6, 'input_size': (896, 1600), 'src_size': (900, 1600), # image-view augmentation 'resize': (-0.06, 0.11), 'rot': (-5.4, 5.4), 'flip': False, 'crop_h': (0.0, 0.0), 'resize_test': 0.00, } bda_aug_conf = dict( rot_lim=(-0, 0), scale_lim=(0.95, 1.05), flip_dx_ratio=0.5, flip_dy_ratio=0.5) train_capacity = 15720 # default: use all sequences test_capacity = 5880 # default: use all sequences train_pipeline = [ dict(type='LoadInstanceWithFlow', cam4docc_dataset_path=cam4docc_dataset_path, 
grid_size=occ_size, use_flow=True, background=empty_idx, pc_range=point_cloud_range, use_separate_classes=use_separate_classes, use_lyft=True), dict(type='LoadMultiViewImageFromFiles_BEVDet', is_train=True, data_config=data_config, sequential=False, aligned=True, trans_only=False, depth_gt_path=depth_gt_path, data_root=nusc_root, mmlabnorm=True, load_depth=True, img_norm_cfg=img_norm_cfg, use_lyft=True), dict(type='LoadOccupancy', to_float32=True, occ_path=occ_path, grid_size=occ_size, unoccupied=empty_idx, pc_range=point_cloud_range, use_fine_occ=use_fine_occ, test_mode=False), dict(type='OccDefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['img_inputs_seq', 'gt_occ', 'segmentation', 'instance', 'flow', 'future_egomotion']), ] test_pipeline = [ dict(type='LoadInstanceWithFlow', cam4docc_dataset_path=cam4docc_dataset_path, grid_size=occ_size, use_flow=True, background=empty_idx, pc_range=point_cloud_range, use_separate_classes=use_separate_classes, use_lyft=True), dict(type='LoadMultiViewImageFromFiles_BEVDet', data_config=data_config, depth_gt_path=depth_gt_path, data_root=nusc_root, sequential=False, aligned=True, trans_only=False, mmlabnorm=True, img_norm_cfg=img_norm_cfg, test_mode=True, use_lyft=True), dict(type='LoadOccupancy', to_float32=True, occ_path=occ_path, grid_size=occ_size, unoccupied=empty_idx, pc_range=point_cloud_range, use_fine_occ=use_fine_occ, test_mode=True), dict(type='OccDefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['img_inputs_seq', 'gt_occ', 'segmentation', 'instance', 'flow', 'future_egomotion'], meta_keys=['pc_range', 'occ_size', 'scene_token', 'lidar_token']), ] train_config=dict( type=dataset_type, data_root=nusc_root, occ_root=occ_path, idx_root=cam4docc_dataset_path, ori_data_root=cam4docc_dataset_path, ann_file=train_ann_file, pipeline=train_pipeline, classes=class_names, use_separate_classes=use_separate_classes, modality=input_modality, test_mode=False, use_valid_flag=True, occ_size=occ_size, pc_range=point_cloud_range, box_type_3d='LiDAR', time_receptive_field=time_receptive_field, n_future_frames=n_future_frames, train_capacity=train_capacity, test_capacity=test_capacity, ) test_config=dict( type=dataset_type, occ_root=occ_path, data_root=nusc_root, idx_root=cam4docc_dataset_path, ori_data_root=cam4docc_dataset_path, ann_file=val_ann_file, pipeline=test_pipeline, classes=class_names, use_separate_classes=use_separate_classes, modality=input_modality, occ_size=occ_size, pc_range=point_cloud_range, time_receptive_field=time_receptive_field, n_future_frames=n_future_frames, train_capacity=train_capacity, test_capacity=test_capacity, ) # in our work we use 8 NVIDIA A100 GPUs data = dict( samples_per_gpu=1, workers_per_gpu=1, train=train_config, val=test_config, test=test_config, shuffler_sampler=dict(type='DistributedGroupSampler'), nonshuffler_sampler=dict(type='DistributedSampler'), ) # Model params ****************************************** grid_config = { 'xbound': [point_cloud_range[0], point_cloud_range[3], voxel_x*lss_downsample[0]], 'ybound': [point_cloud_range[1], point_cloud_range[4], voxel_y*lss_downsample[1]], 'zbound': [point_cloud_range[2], point_cloud_range[5], voxel_z*lss_downsample[2]], 'dbound': [2.0, 58.0, 0.5], } voxel_channels = [32*(time_receptive_field), 32*2*(time_receptive_field), 32*4*(time_receptive_field), 32*8*(time_receptive_field)] pred_channels = [32, 32*2, 32*4, 32*8] decoder_channels = [32*(n_future_frames_plus), 32*2*(n_future_frames_plus), 
32*4*(n_future_frames_plus), 32*8*(n_future_frames_plus)] numC_Trans = 64 occ_encoder_input_channel = (numC_Trans+6)*time_receptive_field voxel_out_channel = 32*(n_future_frames_plus) flow_out_channel = 32*(n_future_frames_plus) voxel_out_channel_per_frame = 32 voxel_out_indices = (0, 1, 2, 3) my_voxel_out_indices = (0, 1, 2, 3) model = dict( type='OCFNet', only_generate_dataset=only_generate_dataset, loss_norm=False, disable_loss_depth=True, point_cloud_range=point_cloud_range, time_receptive_field=time_receptive_field, n_future_frames=n_future_frames, n_future_frames_plus=n_future_frames_plus, max_label=num_cls, iou_thresh_for_vpq=iou_thresh_for_vpq, test_present=test_present, record_time=False, save_pred=save_pred, save_path=save_path, img_backbone=dict( pretrained='torchvision://resnet50', type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=0, with_cp=False, norm_cfg=dict(type='SyncBN', requires_grad=True), norm_eval=False, style='pytorch'), img_neck=dict( type='SECONDFPN', in_channels=[256, 512, 1024, 2048], upsample_strides=[0.25, 0.5, 1, 2], out_channels=[128, 128, 128, 128]), img_view_transformer=dict(type='ViewTransformerLiftSplatShootVoxel', norm_cfg=dict(type='SyncBN', requires_grad=True), loss_depth_weight=3., loss_depth_type='kld', grid_config=grid_config, data_config=data_config, numC_Trans=numC_Trans, vp_megvii=False), occ_encoder_backbone=dict( type='CustomResNet3D', depth=18, n_input_channels=occ_encoder_input_channel, block_inplanes=voxel_channels, out_indices=my_voxel_out_indices, norm_cfg=dict(type='SyncBN', requires_grad=True), ), occ_predictor=dict( type='Predictor', n_input_channels=pred_channels, in_timesteps=time_receptive_field, out_timesteps=n_future_frames_plus, norm_cfg=dict(type='SyncBN', requires_grad=True), ), occ_encoder_neck=dict( type='FPN3D', with_cp=False, in_channels=decoder_channels, out_channels=voxel_out_channel, norm_cfg=dict(type='SyncBN', requires_grad=True), ), flow_encoder_backbone=dict( type='CustomResNet3D', depth=18, n_input_channels=occ_encoder_input_channel, block_inplanes=voxel_channels, out_indices=my_voxel_out_indices, norm_cfg=dict(type='SyncBN', requires_grad=True), ), flow_predictor=dict( type='Predictor', n_input_channels=pred_channels, in_timesteps=time_receptive_field, out_timesteps=n_future_frames_plus, norm_cfg=dict(type='SyncBN', requires_grad=True), ), flow_encoder_neck=dict( type='FPN3D', with_cp=False, in_channels=decoder_channels, out_channels=flow_out_channel, norm_cfg=dict(type='SyncBN', requires_grad=True), ), flow_head=dict( type='FlowHead', norm_cfg=dict(type='SyncBN', requires_grad=True), soft_weights=True, final_occ_size=occ_size, fine_topk=15000, empty_idx=empty_idx, num_level=len(my_voxel_out_indices), in_channels=[voxel_out_channel_per_frame] * len(my_voxel_out_indices), out_channel=3, # 3-dim flow point_cloud_range=point_cloud_range, ), pts_bbox_head=dict( type='OccHead', norm_cfg=dict(type='SyncBN', requires_grad=True), soft_weights=True, final_occ_size=occ_size, fine_topk=15000, empty_idx=empty_idx, num_level=len(my_voxel_out_indices), in_channels=[voxel_out_channel_per_frame] * len(my_voxel_out_indices), out_channel=num_cls, point_cloud_range=point_cloud_range, loss_weight_cfg=dict( loss_voxel_ce_weight=1.0, loss_voxel_sem_scal_weight=1.0, loss_voxel_geo_scal_weight=1.0, loss_voxel_lovasz_weight=1.0, ), ), empty_idx=empty_idx, ) # Learning policy params ****************************************** optimizer = dict( type='AdamW', lr=3e-4, paramwise_cfg=dict( custom_keys={ 
'img_backbone': dict(lr_mult=0.1), }), weight_decay=0.01) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=1e-3) runner = dict(type='EpochBasedRunner', max_epochs=24) evaluation = dict( interval=1, pipeline=test_pipeline, save_best='SSC_mean', rule='greater', ) custom_hooks = [ dict(type='OccEfficiencyHook'), ] ================================================ FILE: projects/configs/datasets/custom_nus-3d.py ================================================ # If point cloud range is changed, the models should also change their point # cloud range accordingly point_cloud_range = [-50, -50, -5, 50, 50, 3] # For nuScenes we usually do 10-class detection class_names = [ 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' ] dataset_type = 'NuScenesDataset_eval_modified' data_root = 'data/nuscenes/' # Input modality for nuScenes dataset, this is consistent with the submission # format which requires the information in input_modality. input_modality = dict( use_lidar=True, use_camera=False, use_radar=False, use_map=False, use_external=False) file_client_args = dict(backend='disk') # Uncomment the following if use ceph or other file clients. # See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient # for more details. # file_client_args = dict( # backend='petrel', # path_mapping=dict({ # './data/nuscenes/': 's3://nuscenes/nuscenes/', # 'data/nuscenes/': 's3://nuscenes/nuscenes/' # })) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), dict( type='GlobalRotScaleTrans', rot_range=[-0.3925, 0.3925], scale_ratio_range=[0.95, 1.05], translation_std=[0, 0, 0]), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectNameFilter', classes=class_names), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. 
client) eval_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ] data = dict( samples_per_gpu=4, workers_per_gpu=4, train=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'nuscenes_infos_train.pkl', pipeline=train_pipeline, classes=class_names, modality=input_modality, test_mode=False, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. box_type_3d='LiDAR'), val=dict( type=dataset_type, ann_file=data_root + 'nuscenes_infos_val.pkl', pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True, box_type_3d='LiDAR'), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'nuscenes_infos_val.pkl', pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True, box_type_3d='LiDAR')) # For nuScenes dataset, we usually evaluate the model at the end of training. # Since the models are trained by 24 epochs by default, we set evaluation # interval to be 24. Please change the interval accordingly if you do not # use a default schedule. evaluation = dict(interval=24, pipeline=eval_pipeline) ================================================ FILE: projects/occ_plugin/__init__.py ================================================ from .core.evaluation.eval_hooks import OccDistEvalHook, OccEvalHook from .core.evaluation.efficiency_hooks import OccEfficiencyHook from .core.visualizer import save_occ from .datasets.pipelines import ( PhotoMetricDistortionMultiViewImage, PadMultiViewImage, NormalizeMultiviewImage, CustomCollect3D) from .occupancy import * ================================================ FILE: projects/occ_plugin/core/__init__.py ================================================ from .evaluation import * from .visualizer import * ================================================ FILE: projects/occ_plugin/core/evaluation/__init__.py ================================================ from .eval_hooks import OccDistEvalHook, OccEvalHook from .efficiency_hooks import OccEfficiencyHook ================================================ FILE: projects/occ_plugin/core/evaluation/efficiency_hooks.py ================================================ import copy from mmcv.runner import HOOKS, Hook import time try: from mmcv.cnn import get_model_complexity_info except ImportError: raise ImportError('Please upgrade mmcv to >0.6.2') import torch import torch.distributed as dist @HOOKS.register_module() class OccEfficiencyHook(Hook): def __init__(self, dataloader, **kwargs): self.dataloader = dataloader self.warm_up = 5 def construct_input(self, DUMMY_SHAPE=None, m_info=None): if m_info is None: m_info = next(iter(self.dataloader)) img_metas = m_info['img_metas'].data input = dict( img_metas=img_metas, ) if 'img_inputs' in m_info.keys(): img_inputs = m_info['img_inputs'] for i in range(len(img_inputs)): if isinstance(img_inputs[i], list): for j in range(len(img_inputs[i])): img_inputs[i][j] = img_inputs[i][j].cuda() else: img_inputs[i] = img_inputs[i].cuda() input['img_inputs'] = img_inputs if 'points' in m_info.keys(): points = m_info['points'].data[0] points[0] = points[0].cuda() input['points'] = points return input def before_run(self, runner): torch.cuda.reset_peak_memory_stats() # 
model = copy.deepcopy(runner.model) # if hasattr(model, 'module'): # model = model.module # if hasattr(model, 'forward_dummy'): # model.forward_train = model.forward_dummy # model.forward_test = model.forward_dummy # model.eval() # else: # raise NotImplementedError( # 'FLOPs counter is currently not supported for {}'.format( # model.__class__.__name__)) # # inf time # pure_inf_time = 0 # itv_sample = 10 # for i, data in enumerate(self.dataloader): # torch.cuda.synchronize() # start_time = time.perf_counter() # with torch.no_grad(): # model(return_loss=False, rescale=True, **self.construct_input(m_info=data)) # torch.cuda.synchronize() # elapsed = time.perf_counter() - start_time # if i >= self.warm_up: # pure_inf_time += elapsed # if (i + 1) % itv_sample == 0: # fps = (i + 1 - self.warm_up) / pure_inf_time # if runner.rank == 0: # runner.logger.info(f'Done sample [{i + 1:<3}/ {itv_sample*5}], ' # f'fps: {fps:.1f} sample / s') # if (i + 1) == itv_sample*5: # pure_inf_time += elapsed # fps = (i + 1 - self.warm_up) / pure_inf_time # if runner.rank == 0: # runner.logger.info(f'Overall fps: {fps:.1f} sample / s') # break # # flops and params # if runner.rank == 0: # flops, params = get_model_complexity_info( # model, (None, None), input_constructor=self.construct_input) # split_line = '=' * 30 # gpu_measure = torch.cuda.max_memory_allocated() / 1024. / 1024. /1024. # runner.logger.info(f'{split_line}\n' f'Flops: {flops}\nParams: {params}\nGPU memory: {gpu_measure:.2f}GB\n{split_line}') if dist.is_available() and dist.is_initialized(): dist.barrier() def after_run(self, runner): pass def before_epoch(self, runner): pass def after_epoch(self, runner): pass def before_iter(self, runner): pass def after_iter(self, runner): pass ================================================ FILE: projects/occ_plugin/core/evaluation/eval_hooks.py ================================================ # Note: Considering that MMCV's EvalHook updated its interface in V1.3.16, # in order to avoid strong version dependency, we did not directly # inherit EvalHook but BaseDistEvalHook. import os.path as osp import torch.distributed as dist from mmcv.runner import DistEvalHook as BaseDistEvalHook from torch.nn.modules.batchnorm import _BatchNorm from mmcv.runner import EvalHook as BaseEvalHook class OccEvalHook(BaseEvalHook): def __init__(self, *args, **kwargs): super(OccEvalHook, self).__init__(*args, **kwargs) def _do_evaluate(self, runner): """perform evaluation and save ckpt.""" if not self._should_evaluate(runner): return from projects.occ_plugin.occupancy.apis.test import custom_single_gpu_test results = custom_single_gpu_test(runner.model, self.dataloader, show=False) runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) key_score = self.evaluate(runner, results) if self.save_best: self._save_ckpt(runner, key_score) class OccDistEvalHook(BaseDistEvalHook): def __init__(self, *args, **kwargs): super(OccDistEvalHook, self).__init__(*args, **kwargs) def _do_evaluate(self, runner): """perform evaluation and save ckpt.""" # Synchronization of BatchNorm's buffer (running_mean # and running_var) is not supported in the DDP of pytorch, # which may cause the inconsistent performance of models in # different ranks, so we broadcast BatchNorm's buffers # of rank 0 to other ranks to avoid this. 
if self.broadcast_bn_buffer: model = runner.model for name, module in model.named_modules(): if isinstance(module, _BatchNorm) and module.track_running_stats: dist.broadcast(module.running_var, 0) dist.broadcast(module.running_mean, 0) if not self._should_evaluate(runner): return tmpdir = self.tmpdir if tmpdir is None: tmpdir = osp.join(runner.work_dir, '.eval_hook') from projects.occ_plugin.occupancy.apis.test import custom_multi_gpu_test # to solve circlur import results = custom_multi_gpu_test( runner.model, self.dataloader, tmpdir=tmpdir, gpu_collect=self.gpu_collect) if runner.rank == 0: print('\n') runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) key_score = self.evaluate(runner, results) if self.save_best: self._save_ckpt(runner, key_score) ================================================ FILE: projects/occ_plugin/core/visualizer/__init__.py ================================================ from .show_occ import save_occ ================================================ FILE: projects/occ_plugin/core/visualizer/show_occ.py ================================================ import torch.nn.functional as F import torch import numpy as np from os import path as osp import os def save_occ(pred_c, pred_f, img_metas, path, visible_mask=None, gt_occ=None, free_id=0, thres_low=0.4, thres_high=0.99): """ visualization saving for paper: 1. gt 2. pred_f pred_c 3. gt visible 4. pred_f visible """ pred_f = F.softmax(pred_f, dim=1) pred_f = pred_f[0].cpu().numpy() # C W H D pred_c = F.softmax(pred_c, dim=1) pred_c = pred_c[0].cpu().numpy() # C W H D visible_mask = visible_mask[0].cpu().numpy().reshape(-1) > 0 # WHD gt_occ = gt_occ.data[0][0].cpu().numpy() # W H D gt_occ[gt_occ==255] = 0 _, W, H, D = pred_f.shape coordinates_3D_f = np.stack(np.meshgrid(np.arange(W), np.arange(H), np.arange(D), indexing='ij'), axis=-1).reshape(-1, 3) # (W*H*D, 3) _, W, H, D = pred_c.shape coordinates_3D_c = np.stack(np.meshgrid(np.arange(W), np.arange(H), np.arange(D), indexing='ij'), axis=-1).reshape(-1, 3) # (W*H*D, 3) pred_f = np.argmax(pred_f, axis=0) # (W, H, D) pred_c = np.argmax(pred_c, axis=0) # (W, H, D) occ_pred_f_mask = (pred_f.reshape(-1))!=free_id occ_pred_c_mask = (pred_c.reshape(-1))!=free_id occ_gt_mask = (gt_occ.reshape(-1))!=free_id pred_f_save = np.concatenate([coordinates_3D_f[occ_pred_f_mask], pred_f.reshape(-1)[occ_pred_f_mask].reshape(-1, 1)], axis=1)[:, [2,1,0,3]] # zyx cls pred_c_save = np.concatenate([coordinates_3D_c[occ_pred_c_mask], pred_c.reshape(-1)[occ_pred_c_mask].reshape(-1, 1)], axis=1)[:, [2,1,0,3]] # zyx cls pred_f_visible_save = np.concatenate([coordinates_3D_f[occ_pred_f_mask&visible_mask], pred_f.reshape(-1)[occ_pred_f_mask&visible_mask].reshape(-1, 1)], axis=1)[:, [2,1,0,3]] # zyx cls gt_save = np.concatenate([coordinates_3D_f[occ_gt_mask], gt_occ.reshape(-1)[occ_gt_mask].reshape(-1, 1)], axis=1)[:, [2,1,0,3]] # zyx cls gt_visible_save = np.concatenate([coordinates_3D_f[occ_gt_mask&visible_mask], gt_occ.reshape(-1)[occ_gt_mask&visible_mask].reshape(-1, 1)], axis=1)[:, [2,1,0,3]] # zyx cls scene_token = img_metas.data[0][0]['scene_token'] lidar_token = img_metas.data[0][0]['lidar_token'] save_path = osp.join(path, scene_token, lidar_token) if not osp.exists(save_path): os.makedirs(save_path) save_pred_f_path = osp.join(save_path, 'pred_f.npy') save_pred_c_path = osp.join(save_path, 'pred_c.npy') save_pred_f_v_path = osp.join(save_path, 'pred_f_visible.npy') save_gt_path = osp.join(save_path, 'gt.npy') save_gt_v_path = osp.join(save_path, 'gt_visible.npy') 
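# (Added note) Each array saved below has shape (N, 4): the indices of occupied voxels in
# the (z, y, x) order noted above, followed by the semantic class id.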
np.save(save_pred_f_path, pred_f_save) np.save(save_pred_c_path, pred_c_save) np.save(save_pred_f_v_path, pred_f_visible_save) np.save(save_gt_path, gt_save) np.save(save_gt_v_path, gt_visible_save) ================================================ FILE: projects/occ_plugin/datasets/__init__.py ================================================ from .nuscenes_dataset import CustomNuScenesDataset from .cam4docc_dataset import Cam4DOccDataset from .cam4docc_lyft_dataset import Cam4DOccLyftDataset from .builder import custom_build_dataset __all__ = [ 'CustomNuScenesDataset', 'NuscOCCDataset' ] ================================================ FILE: projects/occ_plugin/datasets/builder.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import copy import platform import random from functools import partial import numpy as np from mmcv.parallel import collate from mmcv.runner import get_dist_info from mmcv.utils import Registry, build_from_cfg from torch.utils.data import DataLoader from mmdet.datasets.samplers import GroupSampler from projects.occ_plugin.datasets.samplers.group_sampler import DistributedGroupSampler from projects.occ_plugin.datasets.samplers.distributed_sampler import DistributedSampler from projects.occ_plugin.datasets.samplers.sampler import build_sampler def build_dataloader(dataset, samples_per_gpu, workers_per_gpu, num_gpus=1, dist=True, shuffle=True, seed=None, shuffler_sampler=None, nonshuffler_sampler=None, **kwargs): """Build PyTorch DataLoader. In distributed training, each GPU/process has a dataloader. In non-distributed training, there is only one dataloader for all GPUs. Args: dataset (Dataset): A PyTorch dataset. samples_per_gpu (int): Number of training samples on each GPU, i.e., batch size of each GPU. workers_per_gpu (int): How many subprocesses to use for data loading for each GPU. num_gpus (int): Number of GPUs. Only used in non-distributed training. dist (bool): Distributed training/test or not. Default: True. shuffle (bool): Whether to shuffle the data at every epoch. Default: True. kwargs: any keyword argument to be used to initialize DataLoader Returns: DataLoader: A PyTorch dataloader. 
""" rank, world_size = get_dist_info() if dist: # DistributedGroupSampler will definitely shuffle the data to satisfy # that images on each GPU are in the same group if shuffle: sampler = build_sampler(shuffler_sampler if shuffler_sampler is not None else dict(type='DistributedGroupSampler'), dict( dataset=dataset, samples_per_gpu=samples_per_gpu, num_replicas=world_size, rank=rank, seed=seed) ) else: sampler = build_sampler(nonshuffler_sampler if nonshuffler_sampler is not None else dict(type='DistributedSampler'), dict( dataset=dataset, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=seed) ) batch_size = samples_per_gpu num_workers = workers_per_gpu else: print('WARNING!!!!, Only can be used for obtain inference speed!!!!') sampler = GroupSampler(dataset, samples_per_gpu) if shuffle else None batch_size = num_gpus * samples_per_gpu num_workers = num_gpus * workers_per_gpu init_fn = partial( worker_init_fn, num_workers=num_workers, rank=rank, seed=seed) if seed is not None else None data_loader = DataLoader( dataset, batch_size=batch_size, sampler=sampler, num_workers=num_workers, collate_fn=partial(collate, samples_per_gpu=samples_per_gpu), pin_memory=False, worker_init_fn=init_fn, **kwargs) return data_loader def worker_init_fn(worker_id, num_workers, rank, seed): # The seed of each worker equals to # num_worker * rank + worker_id + user_seed worker_seed = num_workers * rank + worker_id + seed np.random.seed(worker_seed) random.seed(worker_seed) # Copyright (c) OpenMMLab. All rights reserved. import platform from mmcv.utils import Registry, build_from_cfg from mmdet.datasets import DATASETS from mmdet.datasets.builder import _concat_dataset if platform.system() != 'Windows': # https://github.com/pytorch/pytorch/issues/973 import resource rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) base_soft_limit = rlimit[0] hard_limit = rlimit[1] soft_limit = min(max(4096, base_soft_limit), hard_limit) resource.setrlimit(resource.RLIMIT_NOFILE, (soft_limit, hard_limit)) OBJECTSAMPLERS = Registry('Object sampler') def custom_build_dataset(cfg, default_args=None): from mmdet3d.datasets.dataset_wrappers import CBGSDataset from mmdet.datasets.dataset_wrappers import (ClassBalancedDataset, ConcatDataset, RepeatDataset) if isinstance(cfg, (list, tuple)): dataset = ConcatDataset([custom_build_dataset(c, default_args) for c in cfg]) elif cfg['type'] == 'ConcatDataset': dataset = ConcatDataset( [custom_build_dataset(c, default_args) for c in cfg['datasets']], cfg.get('separate_eval', True)) elif cfg['type'] == 'RepeatDataset': dataset = RepeatDataset( custom_build_dataset(cfg['dataset'], default_args), cfg['times']) elif cfg['type'] == 'ClassBalancedDataset': dataset = ClassBalancedDataset( custom_build_dataset(cfg['dataset'], default_args), cfg['oversample_thr']) elif cfg['type'] == 'CBGSDataset': dataset = CBGSDataset(custom_build_dataset(cfg['dataset'], default_args)) elif isinstance(cfg.get('ann_file'), (list, tuple)): dataset = _concat_dataset(cfg, default_args) else: dataset = build_from_cfg(cfg, DATASETS, default_args) return dataset ================================================ FILE: projects/occ_plugin/datasets/cam4docc_dataset.py ================================================ # Developed by Junyi Ma based on the codebase of OpenOccupancy and PowerBEV # Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications # https://github.com/haomo-ai/Cam4DOcc import numpy as np from mmcv.runner import get_dist_info from mmdet.datasets import DATASETS 
from mmdet3d.datasets import NuScenesDataset import os from nuscenes.eval.common.utils import quaternion_yaw, Quaternion from projects.occ_plugin.utils.formating import cm_to_ious, format_iou_results from projects.occ_plugin.utils.geometry import convert_egopose_to_matrix_numpy, invert_matrix_egopose_numpy from nuscenes import NuScenes from pyquaternion import Quaternion import torch import random import time @DATASETS.register_module() class Cam4DOccDataset(NuScenesDataset): def __init__(self, occ_size, pc_range, occ_root, idx_root, ori_data_root, data_root, time_receptive_field, n_future_frames, classes, use_separate_classes, train_capacity, test_capacity, **kwargs): ''' Cam4DOccDataset contains sequential occupancy states as well as instance flow for training occupancy forecasting models. We unify the related operations in the LiDAR coordinate system following OpenOccupancy. occ_size: number of grids along H W L, default: [512, 512, 40] pc_range: predefined ranges along H W L, default: [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] occ_root: data path of nuScenes-Occupancy idx_root: save path of test indexes time_receptive_field: number of historical frames used for forecasting (including the present one), default: 3 n_future_frames: number of forecasted future frames, default: 4 classes: predefiend categories in GMO use_separate_classes: separate movable objects instead of the general one train_capacity: number of sequences used for training, default: 23930 test_capacity: number of sequences used for testing, default: 5119 ''' self.train_capacity = train_capacity self.test_capacity = test_capacity super().__init__(**kwargs) rank, world_size = get_dist_info() self.time_receptive_field = time_receptive_field self.n_future_frames = n_future_frames self.sequence_length = time_receptive_field + n_future_frames if rank == 0: print("-------------") print("use past " + str(self.time_receptive_field) + " frames to forecast future " + str(self.n_future_frames) + " frames") print("-------------") self.data_infos = list(sorted(self.data_infos, key=lambda e: e['timestamp'])) self.data_infos = self.data_infos[::self.load_interval] self.occ_size = occ_size self.pc_range = pc_range self.occ_root = occ_root self.idx_root = idx_root self.ori_data_root = ori_data_root self.data_root = data_root self.classes = classes self.use_separate_classes = use_separate_classes self.indices = self.get_indices() self.present_scene_lidar_token = " " self._set_group_flag() # load origin nusc dataset for instance annotation self.nusc = NuScenes(version='v1.0-trainval', dataroot=self.data_root, verbose=False) if self.test_mode: self.chosen_list = random.sample(range(0, self.test_capacity) , self.test_capacity) self.chosen_list_num = len(self.chosen_list) else: self.chosen_list = random.sample(range(0, self.train_capacity) , self.train_capacity) self.chosen_list_num = len(self.chosen_list) def _set_group_flag(self): if self.test_mode: self.flag = np.zeros(self.test_capacity, dtype=np.uint8) else: self.flag = np.zeros(self.train_capacity, dtype=np.uint8) def __len__(self): if self.test_mode: return self.test_capacity else: return self.train_capacity def __getitem__(self, idx): idx = int(self.chosen_list[idx]) self.egopose_list = [] self.ego2lidar_list = [] self.visible_instance_set = set() self.instance_dict = {} if self.test_mode: return self.prepare_test_data(idx) while True: data = self.prepare_train_data(idx) if data is None: idx = self._rand_another(idx) idx = int(self.chosen_list[idx]) continue return data def 
get_indices(self): ''' Generate sequential indexes for training and testing ''' indices = [] for index in range(len(self.data_infos)): is_valid_data = True previous_rec = None current_indices = [] for t in range(self.sequence_length): index_t = index + t # Going over the dataset size limit. if index_t >= len(self.data_infos): is_valid_data = False break rec = self.data_infos[index_t] # Check if scene is the same if (previous_rec is not None) and (rec['scene_token'] != previous_rec['scene_token']): is_valid_data = False break current_indices.append(index_t) previous_rec = rec if is_valid_data: indices.append(current_indices) return np.asarray(indices) def get_lidar_pose(self, rec): ''' Get global poses for following bbox transforming ''' ego2global_translation = rec['ego2global_translation'] ego2global_rotation = rec['ego2global_rotation'] trans = -np.array(ego2global_translation) rot = Quaternion(ego2global_rotation).inverse return trans, rot def get_ego2lidar_pose(self, rec): ''' Get LiDAR poses in ego system ''' lidar2ego_translation = rec['lidar2ego_translation'] lidar2ego_rotation = rec['lidar2ego_rotation'] trans = -np.array(lidar2ego_translation) rot = Quaternion(lidar2ego_rotation).inverse return trans, rot def record_instance(self, idx, instance_map): """ Record information about each visible instance in the sequence and assign a unique ID to it """ rec = self.data_infos[idx] translation, rotation = self.get_lidar_pose(rec) self.egopose_list.append([translation, rotation]) ego2lidar_translation, ego2lidar_rotation = self.get_ego2lidar_pose(rec) self.ego2lidar_list.append([ego2lidar_translation, ego2lidar_rotation]) current_sample = self.nusc.get('sample', rec['token']) for annotation_token in current_sample['anns']: annotation = self.nusc.get('sample_annotation', annotation_token) # Instance extraction for Cam4DOcc-V1 # Filter out all non vehicle instances # if 'vehicle' not in annotation['category_name']: # continue gmo_flag = False for class_name in self.classes: if class_name in annotation['category_name']: gmo_flag = True break if not gmo_flag: continue # Specify semantic id if use_separate_classes semantic_id = 1 if self.use_separate_classes: if 'vehicle.bicycle' in annotation['category_name']: # rm static_object.bicycle_rack semantic_id = 1 elif 'bus' in annotation['category_name']: semantic_id = 2 elif 'car' in annotation['category_name']: semantic_id = 3 elif 'construction' in annotation['category_name']: semantic_id = 4 elif 'motorcycle' in annotation['category_name']: semantic_id = 5 elif 'trailer' in annotation['category_name']: semantic_id = 6 elif 'truck' in annotation['category_name']: semantic_id = 7 elif 'pedestrian' in annotation['category_name']: semantic_id = 8 # Filter out invisible vehicles FILTER_INVISIBLE_VEHICLES = True if FILTER_INVISIBLE_VEHICLES and int(annotation['visibility_token']) == 1 and annotation['instance_token'] not in self.visible_instance_set: continue # Filter out vehicles that have not been seen in the past if self.counter >= self.time_receptive_field and annotation['instance_token'] not in self.visible_instance_set: continue self.visible_instance_set.add(annotation['instance_token']) if annotation['instance_token'] not in instance_map: instance_map[annotation['instance_token']] = len(instance_map) + 1 instance_id = instance_map[annotation['instance_token']] instance_attribute = int(annotation['visibility_token']) if annotation['instance_token'] not in self.instance_dict: # For the first occurrence of an instance 
self.instance_dict[annotation['instance_token']] = { 'timestep': [self.counter], 'translation': [annotation['translation']], 'rotation': [annotation['rotation']], 'size': annotation['size'], 'instance_id': instance_id, 'semantic_id': semantic_id, 'attribute_label': [instance_attribute], } else: # For the instance that have appeared before self.instance_dict[annotation['instance_token']]['timestep'].append(self.counter) self.instance_dict[annotation['instance_token']]['translation'].append(annotation['translation']) self.instance_dict[annotation['instance_token']]['rotation'].append(annotation['rotation']) self.instance_dict[annotation['instance_token']]['attribute_label'].append(instance_attribute) return instance_map def get_future_egomotion(self, idx): ''' Calculate LiDAR pose updates between idx and idx+1 ''' rec_t0 = self.data_infos[idx] future_egomotion = np.eye(4, dtype=np.float32) if idx < len(self.data_infos) - 1: rec_t1 = self.data_infos[idx + 1] if rec_t0['scene_token'] == rec_t1['scene_token']: egopose_t0_trans = rec_t0['ego2global_translation'] egopose_t0_rot = rec_t0['ego2global_rotation'] egopose_t1_trans = rec_t1['ego2global_translation'] egopose_t1_rot = rec_t1['ego2global_rotation'] egopose_t0 = convert_egopose_to_matrix_numpy(egopose_t0_trans, egopose_t0_rot) egopose_t1 = convert_egopose_to_matrix_numpy(egopose_t1_trans, egopose_t1_rot) lidar2ego_t0_trans = rec_t0['lidar2ego_translation'] lidar2ego_t0_rot = rec_t0['lidar2ego_rotation'] lidar2ego_t1_trans = rec_t1['lidar2ego_translation'] lidar2ego_t1_rot = rec_t1['lidar2ego_rotation'] lidar2ego_t0 = convert_egopose_to_matrix_numpy(lidar2ego_t0_trans, lidar2ego_t0_rot) lidar2ego_t1 = convert_egopose_to_matrix_numpy(lidar2ego_t1_trans, lidar2ego_t1_rot) future_egomotion = invert_matrix_egopose_numpy(lidar2ego_t1).dot(invert_matrix_egopose_numpy(egopose_t1)).dot(egopose_t0).dot(lidar2ego_t0) future_egomotion = torch.Tensor(future_egomotion).float() # Convert to 6DoF vector return future_egomotion.unsqueeze(0) @staticmethod def _check_consistency(translation, prev_translation, threshold=1.0): """ Check for significant displacement of the instance adjacent moments """ x, y = translation[:2] prev_x, prev_y = prev_translation[:2] if abs(x - prev_x) > threshold or abs(y - prev_y) > threshold: return False return True def refine_instance_poly(self, instance): """ Fix the missing frames and disturbances of ground truth caused by noise """ pointer = 1 for i in range(instance['timestep'][0] + 1, self.sequence_length): # Fill in the missing frames if i not in instance['timestep']: instance['timestep'].insert(pointer, i) instance['translation'].insert(pointer, instance['translation'][pointer-1]) instance['rotation'].insert(pointer, instance['rotation'][pointer-1]) instance['attribute_label'].insert(pointer, instance['attribute_label'][pointer-1]) pointer += 1 continue # Eliminate observation disturbances if self._check_consistency(instance['translation'][pointer], instance['translation'][pointer-1]): instance['translation'][pointer] = instance['translation'][pointer-1] instance['rotation'][pointer] = instance['rotation'][pointer-1] instance['attribute_label'][pointer] = instance['attribute_label'][pointer-1] pointer += 1 return instance def prepare_train_data(self, index): ''' Generate a training sequence ''' input_dict = self.get_data_info(index) if input_dict is None: return None example = self.prepare_sequential_data(index) return example def prepare_test_data(self, index): ''' Generate a test sequence TODO: Give additional 
functions here such as visualization ''' input_dict = self.get_data_info(index) if input_dict is None: return None example = self.prepare_sequential_data(index) # TODO: visualize example data return example def prepare_sequential_data(self, index): ''' Use the predefined pipeline to generate inputs of the baseline network and ground truth for the standard evaluation protocol in Cam4DOcc ''' instance_map = {} input_seq_data = {} keys = ['input_dict','future_egomotion', 'sample_token'] for key in keys: input_seq_data[key] = [] scene_lidar_token = [] for self.counter, index_t in enumerate(self.indices[index]): input_dict_per_frame = self.get_data_info(index_t) if input_dict_per_frame is None: return None input_seq_data['input_dict'].append(input_dict_per_frame) input_seq_data['sample_token'].append(input_dict_per_frame['sample_idx']) instance_map = self.record_instance(index_t, instance_map) future_egomotion = self.get_future_egomotion(index_t) input_seq_data['future_egomotion'].append(future_egomotion) scene_lidar_token.append(input_dict_per_frame['scene_token']+"_"+input_dict_per_frame['lidar_token']) if self.counter == self.time_receptive_field - 1: self.present_scene_lidar_token = input_dict_per_frame['scene_token']+"_"+input_dict_per_frame['lidar_token'] # save sequential test indexes for possible evaluation if self.test_mode: test_idx_path = os.path.join(self.idx_root, "test_ids") if not os.path.exists(test_idx_path): os.mkdir(test_idx_path) np.savez(os.path.join(test_idx_path, self.present_scene_lidar_token), scene_lidar_token) for token in self.instance_dict.keys(): self.instance_dict[token] = self.refine_instance_poly(self.instance_dict[token]) input_seq_data.update( dict( time_receptive_field=self.time_receptive_field, sequence_length=self.sequence_length, egopose_list=self.egopose_list, ego2lidar_list=self.ego2lidar_list, instance_dict=self.instance_dict, instance_map=instance_map, indices=self.indices[index], scene_token=self.present_scene_lidar_token, )) example = self.pipeline(input_seq_data) return example def get_data_info(self, index): ''' get_data_info from .pkl also used by OpenOccupancy ''' info = self.data_infos[index] # standard protocal modified from SECOND.Pytorch input_dict = dict( sample_idx=info['token'], pts_filename=info['lidar_path'], sweeps=info['sweeps'], lidar2ego_translation=info['lidar2ego_translation'], lidar2ego_rotation=info['lidar2ego_rotation'], ego2global_translation=info['ego2global_translation'], ego2global_rotation=info['ego2global_rotation'], prev_idx=info['prev'], next_idx=info['next'], scene_token=info['scene_token'], can_bus=info['can_bus'], # frame_idx=info['frame_idx'], timestamp=info['timestamp'] / 1e6, occ_size = np.array(self.occ_size), pc_range = np.array(self.pc_range), lidar_token=info['lidar_token'], lidarseg=info['lidarseg'], curr=info, ) if self.modality['use_camera']: image_paths = [] lidar2img_rts = [] lidar2cam_rts = [] cam_intrinsics = [] lidar2cam_dic = {} for cam_type, cam_info in info['cams'].items(): image_paths.append(cam_info['data_path']) # obtain lidar to image transformation matrix lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation']) lidar2cam_t = cam_info[ 'sensor2lidar_translation'] @ lidar2cam_r.T lidar2cam_rt = np.eye(4) lidar2cam_rt[:3, :3] = lidar2cam_r.T lidar2cam_rt[3, :3] = -lidar2cam_t intrinsic = cam_info['cam_intrinsic'] viewpad = np.eye(4) viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic lidar2img_rt = (viewpad @ lidar2cam_rt.T) lidar2img_rts.append(lidar2img_rt) 
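# (Added note) lidar2img_rt composes the zero-padded camera intrinsics (viewpad) with the
# LiDAR-to-camera extrinsics, so it maps homogeneous LiDAR coordinates directly to image pixels.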
cam_intrinsics.append(viewpad) lidar2cam_rts.append(lidar2cam_rt.T) lidar2cam_dic[cam_type] = lidar2cam_rt.T input_dict.update( dict( img_filename=image_paths, lidar2img=lidar2img_rts, cam_intrinsic=cam_intrinsics, lidar2cam=lidar2cam_rts, lidar2cam_dic=lidar2cam_dic, )) return input_dict def evaluate(self, results, logger=None, **kawrgs): ''' Evaluate by IOU and VPQ metrics for model evaluation ''' eval_results = {} ''' calculate IOU ''' hist_for_iou = sum(results['hist_for_iou']) ious = cm_to_ious(hist_for_iou) res_table, res_dic = format_iou_results(ious, return_dic=True) for key, val in res_dic.items(): eval_results['IOU_{}'.format(key)] = val if logger is not None: logger.info('IOU Evaluation') logger.info(res_table) ''' calculate VPQ ''' if 'vpq_metric' in results.keys() and 'vpq_len' in results.keys(): vpq_sum = sum(results['vpq_metric']) eval_results['VPQ'] = vpq_sum/results['vpq_len'] return eval_results ================================================ FILE: projects/occ_plugin/datasets/cam4docc_lyft_dataset.py ================================================ # Developed by Junyi Ma based on the codebase of OpenOccupancy and PowerBEV # Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications # https://github.com/haomo-ai/Cam4DOcc import numpy as np from mmcv.runner import get_dist_info from mmdet.datasets import DATASETS from mmdet3d.datasets import NuScenesDataset from mmdet3d.datasets.pipelines import Compose from torch.utils.data import Dataset from lyft_dataset_sdk.lyftdataset import LyftDataset import os from nuscenes.eval.common.utils import quaternion_yaw, Quaternion from projects.occ_plugin.utils.formating import cm_to_ious, format_iou_results from projects.occ_plugin.utils.geometry import convert_egopose_to_matrix_numpy, invert_matrix_egopose_numpy from nuscenes import NuScenes from pyquaternion import Quaternion import torch import random import time @DATASETS.register_module() class Cam4DOccLyftDataset(Dataset): def __init__(self, occ_size, pc_range, occ_root, idx_root, ori_data_root, data_root, time_receptive_field, n_future_frames, classes, use_separate_classes, train_capacity, test_capacity, test_mode=False, pipeline=None, **kwargs): ''' Cam4DOccLyftDataset contains sequential occupancy states as well as instance flow for training occupancy forecasting models. We unify the related operations in the LiDAR coordinate system following OpenOccupancy. 
occ_size: number of grids along H W L, default: [512, 512, 40] pc_range: predefined ranges along H W L, default: [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] occ_root: data path of nuScenes-Occupancy idx_root: save path of test indexes time_receptive_field: number of historical frames used for forecasting (including the present one), default: 3 n_future_frames: number of forecasted future frames, default: 4 classes: predefiend categories in GMO use_separate_classes: separate movable objects instead of the general one train_capacity: number of sequences used for training, default: 23930 test_capacity: number of sequences used for testing, default: 5119 ''' self.test_mode = test_mode self.CLASSES = classes self.train_capacity = train_capacity self.test_capacity = test_capacity super().__init__() # training and test indexes following PowerBEV self.TRAIN_LYFT_INDICES = [1, 3, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 27, 28, 29, 30, 31, 32, 33, 35, 36, 37, 39, 41, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 55, 56, 59, 60, 62, 63, 65, 68, 69, 70, 71, 72, 73, 74, 75, 76, 78, 79, 81, 82, 83, 84, 86, 87, 88, 89, 93, 95, 97, 98, 99, 103, 104, 107, 108, 109, 110, 111, 113, 114, 115, 116, 117, 118, 119, 121, 122, 124, 127, 128, 130, 131, 132, 134, 135, 136, 137, 138, 139, 143, 144, 146, 147, 148, 149, 150, 151, 152, 153, 154, 156, 157, 158, 159, 161, 162, 165, 166, 167, 171, 172, 173, 174, 175, 176, 177, 178, 179] self.VAL_LYFT_INDICES = [0, 2, 4, 13, 22, 25, 26, 34, 38, 40, 42, 54, 57, 58, 61, 64, 66, 67, 77, 80, 85, 90, 91, 92, 94, 96, 100, 101, 102, 105, 106, 112, 120, 123, 125, 126, 129, 133, 140, 141, 142, 145, 155, 160, 163, 164, 168, 169, 170] rank, world_size = get_dist_info() self.time_receptive_field = time_receptive_field self.n_future_frames = n_future_frames self.sequence_length = time_receptive_field + n_future_frames if rank == 0: print("-------------") print("use past " + str(self.time_receptive_field) + " frames to forecast future " + str(self.n_future_frames) + " frames") print("-------------") self.occ_size = occ_size self.pc_range = pc_range self.occ_root = occ_root self.idx_root = idx_root self.ori_data_root = ori_data_root self.data_root = data_root self.classes = classes self.use_separate_classes = use_separate_classes self.pipeline = Compose(pipeline) # load origin nusc dataset for instance annotation self.lyft = LyftDataset(data_path=self.data_root, json_path=os.path.join(self.data_root, 'train_data'), verbose=False) self.scenes = self.get_scenes() self.ixes = self.get_samples() self.indices = self.get_indices() self.present_scene_lidar_token = " " self._set_group_flag() if self.test_mode: self.chosen_list = random.sample(range(0, self.test_capacity) , self.test_capacity) self.chosen_list_num = len(self.chosen_list) else: self.chosen_list = random.sample(range(0, self.train_capacity) , self.train_capacity) self.chosen_list_num = len(self.chosen_list) def _set_group_flag(self): if self.test_mode: self.flag = np.zeros(self.test_capacity, dtype=np.uint8) else: self.flag = np.zeros(self.train_capacity, dtype=np.uint8) def __len__(self): if self.test_mode: return self.test_capacity else: return self.train_capacity def __getitem__(self, idx): idx = int(self.chosen_list[idx]) self.egopose_list = [] self.ego2lidar_list = [] self.visible_instance_set = set() self.instance_dict = {} if self.test_mode: return self.prepare_test_data(idx) while True: data = self.prepare_train_data(idx) if data is None: idx = self._rand_another(idx) idx = int(self.chosen_list[idx]) 
continue return data def get_scenes(self): """ Obtain the list of scenes names in the given split. """ scenes = [row['name'] for row in self.lyft.scene] # split in train/val indices = self.VAL_LYFT_INDICES if self.test_mode else self.TRAIN_LYFT_INDICES scenes = [scenes[i] for i in indices] return scenes def get_samples(self): """ Find and sort the samples in the given split by scene. """ samples = [sample for sample in self.lyft.sample] # remove samples that aren't in this split samples = [sample for sample in samples if self.lyft.get('scene', sample['scene_token'])['name'] in self.scenes] # sort by scene, timestamp (only to make chronological viz easier) samples.sort(key=lambda x: (x['scene_token'], x['timestamp'])) return samples def get_indices(self): ''' Generate sequential indexes for training and testing ''' indices = [] for index in range(len(self.ixes)): is_valid_data = True previous_rec = None current_indices = [] for t in range(self.sequence_length): index_t = index + t # Going over the dataset size limit. if index_t >= len(self.ixes): is_valid_data = False break rec = self.ixes[index_t] # Check if scene is the same if (previous_rec is not None) and (rec['scene_token'] != previous_rec['scene_token']): is_valid_data = False break current_indices.append(index_t) previous_rec = rec if is_valid_data: indices.append(current_indices) return np.asarray(indices) def get_lidar_pose(self, rec): ''' Get global poses for following bbox transforming ''' current_sample = self.lyft.get('sample', rec['token']) egopose = self.lyft.get('ego_pose', self.lyft.get('sample_data', current_sample['data']['LIDAR_TOP'])['ego_pose_token']) ego2global_translation = egopose['translation'] ego2global_rotation = egopose['rotation'] trans = -np.array(ego2global_translation) rot = Quaternion(ego2global_rotation).inverse return trans, rot def get_ego2lidar_pose(self, rec): ''' Get LiDAR poses in ego system ''' current_sample = self.lyft.get('sample', rec['token']) lidar_top_data = self.lyft.get('sample_data', current_sample['data']['LIDAR_TOP']) lidar2ego_translation = self.lyft.get('calibrated_sensor', lidar_top_data['calibrated_sensor_token'])['translation'] lidar2ego_rotation = self.lyft.get('calibrated_sensor', lidar_top_data['calibrated_sensor_token'])['rotation'] trans = -np.array(lidar2ego_translation) rot = Quaternion(lidar2ego_rotation).inverse return trans, rot def record_instance(self, idx, instance_map): """ Record information about each visible instance in the sequence and assign a unique ID to it """ rec = self.ixes[idx] translation, rotation = self.get_lidar_pose(rec) self.egopose_list.append([translation, rotation]) ego2lidar_translation, ego2lidar_rotation = self.get_ego2lidar_pose(rec) self.ego2lidar_list.append([ego2lidar_translation, ego2lidar_rotation]) current_sample = self.lyft.get('sample', rec['token']) for annotation_token in current_sample['anns']: annotation = self.lyft.get('sample_annotation', annotation_token) # Instance extraction for Cam4DOcc-V1 # Filter out all non vehicle instances # if 'vehicle' not in annotation['category_name']: # continue gmo_flag = False for class_name in self.classes: if class_name in annotation['category_name']: gmo_flag = True break if not gmo_flag: continue # Specify semantic id if use_separate_classes semantic_id = 1 if self.use_separate_classes: if 'bicycle' in annotation['category_name']: semantic_id = 1 elif 'bus' in annotation['category_name']: semantic_id = 2 elif 'car' in annotation['category_name']: semantic_id = 3 elif 'construction' in 
annotation['category_name']: semantic_id = 4 elif 'motorcycle' in annotation['category_name']: semantic_id = 5 elif 'trailer' in annotation['category_name']: semantic_id = 6 elif 'truck' in annotation['category_name']: semantic_id = 7 elif 'pedestrian' in annotation['category_name']: semantic_id = 8 if annotation['instance_token'] not in instance_map: instance_map[annotation['instance_token']] = len(instance_map) + 1 instance_id = instance_map[annotation['instance_token']] instance_attribute = 1 # deprecated if annotation['instance_token'] not in self.instance_dict: # For the first occurrence of an instance self.instance_dict[annotation['instance_token']] = { 'timestep': [self.counter], 'translation': [annotation['translation']], 'rotation': [annotation['rotation']], 'size': annotation['size'], 'instance_id': instance_id, 'semantic_id': semantic_id, 'attribute_label': [instance_attribute], } else: # For the instance that have appeared before self.instance_dict[annotation['instance_token']]['timestep'].append(self.counter) self.instance_dict[annotation['instance_token']]['translation'].append(annotation['translation']) self.instance_dict[annotation['instance_token']]['rotation'].append(annotation['rotation']) self.instance_dict[annotation['instance_token']]['attribute_label'].append(instance_attribute) return instance_map def get_future_egomotion(self, idx): ''' Calculate LiDAR pose updates between idx and idx+1 ''' rec_t0 = self.ixes[idx] future_egomotion = np.eye(4, dtype=np.float32) if idx < len(self.ixes) - 1: rec_t1 = self.ixes[idx + 1] if rec_t0['scene_token'] == rec_t1['scene_token']: egopose_t0 = self.lyft.get('ego_pose', self.lyft.get('sample_data', rec_t0['data']['LIDAR_TOP'])['ego_pose_token']) egopose_t0_trans = egopose_t0['translation'] egopose_t0_rot = egopose_t0['rotation'] egopose_t1 = self.lyft.get('ego_pose', self.lyft.get('sample_data', rec_t1['data']['LIDAR_TOP'])['ego_pose_token']) egopose_t1_trans = egopose_t1['translation'] egopose_t1_rot = egopose_t1['rotation'] egopose_t0 = convert_egopose_to_matrix_numpy(egopose_t0_trans, egopose_t0_rot) egopose_t1 = convert_egopose_to_matrix_numpy(egopose_t1_trans, egopose_t1_rot) lidar_top_data_t0 = self.lyft.get('sample_data', rec_t0['data']['LIDAR_TOP']) lidar2ego_t0_trans = self.lyft.get('calibrated_sensor', lidar_top_data_t0['calibrated_sensor_token'])['translation'] lidar2ego_t0_rot = self.lyft.get('calibrated_sensor', lidar_top_data_t0['calibrated_sensor_token'])['rotation'] lidar_top_data_t1 = self.lyft.get('sample_data', rec_t1['data']['LIDAR_TOP']) lidar2ego_t1_trans = self.lyft.get('calibrated_sensor', lidar_top_data_t1['calibrated_sensor_token'])['translation'] lidar2ego_t1_rot = self.lyft.get('calibrated_sensor', lidar_top_data_t1['calibrated_sensor_token'])['rotation'] lidar2ego_t0 = convert_egopose_to_matrix_numpy(lidar2ego_t0_trans, lidar2ego_t0_rot) lidar2ego_t1 = convert_egopose_to_matrix_numpy(lidar2ego_t1_trans, lidar2ego_t1_rot) future_egomotion = invert_matrix_egopose_numpy(lidar2ego_t1).dot(invert_matrix_egopose_numpy(egopose_t1)).dot(egopose_t0).dot(lidar2ego_t0) future_egomotion = torch.Tensor(future_egomotion).float() return future_egomotion.unsqueeze(0) @staticmethod def _check_consistency(translation, prev_translation, threshold=1.0): """ Check for significant displacement of the instance adjacent moments """ x, y = translation[:2] prev_x, prev_y = prev_translation[:2] if abs(x - prev_x) > threshold or abs(y - prev_y) > threshold: return False return True def refine_instance_poly(self, instance): """ 
Fix the missing frames and disturbances of ground truth caused by noise """ pointer = 1 for i in range(instance['timestep'][0] + 1, self.sequence_length): # Fill in the missing frames if i not in instance['timestep']: instance['timestep'].insert(pointer, i) instance['translation'].insert(pointer, instance['translation'][pointer-1]) instance['rotation'].insert(pointer, instance['rotation'][pointer-1]) instance['attribute_label'].insert(pointer, instance['attribute_label'][pointer-1]) pointer += 1 continue # Eliminate observation disturbances if self._check_consistency(instance['translation'][pointer], instance['translation'][pointer-1]): instance['translation'][pointer] = instance['translation'][pointer-1] instance['rotation'][pointer] = instance['rotation'][pointer-1] instance['attribute_label'][pointer] = instance['attribute_label'][pointer-1] pointer += 1 return instance def prepare_train_data(self, index): ''' Generate a training sequence ''' example = self.prepare_sequential_data(index) return example def prepare_test_data(self, index): ''' Generate a test sequence TODO: Give additional functions here such as visualization ''' example = self.prepare_sequential_data(index) # TODO: visualize example data return example def prepare_sequential_data(self, index): ''' Use the predefined pipeline to generate inputs of the baseline network and ground truth for the standard evaluation protocol in Cam4DOcc ''' instance_map = {} input_seq_data = {} keys = ['input_dict','future_egomotion', 'sample_token'] for key in keys: input_seq_data[key] = [] scene_lidar_token = [] for self.counter, index_t in enumerate(self.indices[index]): input_dict_per_frame = {} rec = self.ixes[index_t] # sample lidar_top_data = self.lyft.get('sample_data', rec['data']['LIDAR_TOP']) lidar2ego_translation = self.lyft.get('calibrated_sensor', lidar_top_data['calibrated_sensor_token'])['translation'] lidar2ego_rotation = self.lyft.get('calibrated_sensor', lidar_top_data['calibrated_sensor_token'])['rotation'] egopose = self.lyft.get('ego_pose', self.lyft.get('sample_data', rec['data']['LIDAR_TOP'])['ego_pose_token']) ego2global_translation = egopose['translation'] ego2global_rotation = egopose['rotation'] input_dict_per_frame['lidar2ego_translation'] = lidar2ego_translation input_dict_per_frame['lidar2ego_rotation'] = lidar2ego_rotation input_dict_per_frame['ego2global_translation'] = ego2global_translation input_dict_per_frame['ego2global_rotation'] = ego2global_rotation input_dict_per_frame['scene_token'] = rec['scene_token'] input_dict_per_frame['lidar_token'] = rec['data']['LIDAR_TOP'] input_dict_per_frame['occ_size'] = np.array(self.occ_size) input_dict_per_frame['pc_range'] = np.array(self.pc_range) input_dict_per_frame['sample_idx'] = rec['token'] image_paths = [] lidar2img_rts = [] lidar2cam_rts = [] cam_intrinsics = [] cam_intrinsics_ori = [] lidar2cam_dic = {} lidar_sample = self.lyft.get('sample_data', rec['data']['LIDAR_TOP']) lidar_pose = self.lyft.get('ego_pose', lidar_sample['ego_pose_token']) lidar_rotation = Quaternion(lidar_pose['rotation']) lidar_translation = np.array(lidar_pose['translation'])[:, None] lidar_to_world = np.vstack([ np.hstack((lidar_rotation.rotation_matrix, lidar_translation)), np.array([0, 0, 0, 1]) ]) lidar_sample_calib = self.lyft.get('calibrated_sensor', lidar_sample['calibrated_sensor_token']) lidar_sensor_rotation = Quaternion(lidar_sample_calib['rotation']) lidar_sensor_translation = np.array(lidar_sample_calib['translation'])[:, None] lidar_to_lidarego = np.vstack([ 
np.hstack((lidar_sensor_rotation.rotation_matrix, lidar_sensor_translation)), np.array([0, 0, 0, 1]) ]) cameras = ['CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT'] for cam in cameras: camera_sample = self.lyft.get('sample_data', rec['data'][cam]) image_paths.append(os.path.join("/tos://haomo-algorithms/c6089dc67ff976615510d22b5eaaaa4e/mjy/cam4docc/data/lyft/", camera_sample['filename'])) car_egopose = self.lyft.get('ego_pose', camera_sample['ego_pose_token']) egopose_rotation = Quaternion(car_egopose['rotation']).inverse egopose_translation = -np.array(car_egopose['translation'])[:, None] world_to_car_egopose = np.vstack([ np.hstack((egopose_rotation.rotation_matrix, egopose_rotation.rotation_matrix @ egopose_translation)), np.array([0, 0, 0, 1]) ]) sensor_sample = self.lyft.get('calibrated_sensor', camera_sample['calibrated_sensor_token']) intrinsic = torch.Tensor(sensor_sample['camera_intrinsic']) cam_intrinsics_ori.append(intrinsic) sensor_rotation = Quaternion(sensor_sample['rotation']) sensor_translation = np.array(sensor_sample['translation'])[:, None] car_egopose_to_sensor = np.vstack([ np.hstack((sensor_rotation.rotation_matrix, sensor_translation)), np.array([0, 0, 0, 1]) ]) car_egopose_to_sensor = np.linalg.inv(car_egopose_to_sensor) lidar_to_sensor = car_egopose_to_sensor @ world_to_car_egopose @ lidar_to_world @ lidar_to_lidarego sensor_to_lidar =np.linalg.inv(lidar_to_sensor) lidar2cam_r = lidar_to_sensor[:3, :3] lidar2cam_t = sensor_to_lidar[:3, -1].reshape(1,3) @ lidar2cam_r.T lidar2cam_rt = np.eye(4) lidar2cam_rt[:3, :3] = lidar2cam_r.T lidar2cam_rt[3, :3] = -lidar2cam_t viewpad = np.eye(4) viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic lidar2img_rt = (viewpad @ lidar2cam_rt.T) lidar2img_rts.append(lidar2img_rt) cam_intrinsics.append(viewpad) lidar2cam_rts.append(lidar2cam_rt.T) lidar2cam_dic[cam] = lidar2cam_rt.T input_dict_per_frame.update( dict( img_filename=image_paths, lidar2img=lidar2img_rts, cam_intrinsic=cam_intrinsics, cam_intrinsics=cam_intrinsics_ori, lidar2cam=lidar2cam_rts, lidar2cam_dic=lidar2cam_dic, )) input_seq_data['input_dict'].append(input_dict_per_frame) instance_map = self.record_instance(index_t, instance_map) future_egomotion = self.get_future_egomotion(index_t) input_seq_data['future_egomotion'].append(future_egomotion) input_seq_data['sample_token'].append(input_dict_per_frame['sample_idx']) scene_lidar_token.append(input_dict_per_frame['scene_token']+"_"+input_dict_per_frame['lidar_token']) if self.counter == self.time_receptive_field - 1: self.present_scene_lidar_token = input_dict_per_frame['scene_token']+"_"+input_dict_per_frame['lidar_token'] for token in self.instance_dict.keys(): self.instance_dict[token] = self.refine_instance_poly(self.instance_dict[token]) input_seq_data.update( dict( time_receptive_field=self.time_receptive_field, sequence_length=self.sequence_length, egopose_list=self.egopose_list, ego2lidar_list=self.ego2lidar_list, instance_dict=self.instance_dict, instance_map=instance_map, indices=self.indices[index], scene_token=self.present_scene_lidar_token, )) example = self.pipeline(input_seq_data) return example def evaluate(self, results, logger=None, **kawrgs): ''' Evaluate by IOU and VPQ metrics for model evaluation ''' eval_results = {} ''' calculate IOU ''' hist_for_iou = sum(results['hist_for_iou']) ious = cm_to_ious(hist_for_iou) res_table, res_dic = format_iou_results(ious, return_dic=True) for key, val in res_dic.items(): eval_results['IOU_{}'.format(key)] 
= val if logger is not None: logger.info('IOU Evaluation') logger.info(res_table) ''' calculate VPQ ''' if 'vpq_metric' in results.keys() and 'vpq_len' in results.keys(): vpq_sum = sum(results['vpq_metric']) eval_results['VPQ'] = vpq_sum/results['vpq_len'] return eval_results ================================================ FILE: projects/occ_plugin/datasets/nuscenes_dataset.py ================================================ import copy import numpy as np from mmdet.datasets import DATASETS from mmdet3d.datasets import NuScenesDataset import mmcv from os import path as osp from mmdet.datasets import DATASETS import torch import numpy as np from nuscenes.eval.common.utils import quaternion_yaw, Quaternion from mmcv.parallel import DataContainer as DC import random @DATASETS.register_module() class CustomNuScenesDataset(NuScenesDataset): r"""NuScenes Dataset. This datset only add camera intrinsics and extrinsics to the results. """ def __init__(self, queue_length=4, bev_size=(200, 200), overlap_test=False, *args, **kwargs): super().__init__(*args, **kwargs) self.queue_length = queue_length self.overlap_test = overlap_test self.bev_size = bev_size def prepare_train_data(self, index): """ Training data preparation. Args: index (int): Index for accessing the target data. Returns: dict: Training data dict of the corresponding index. """ queue = [] index_list = list(range(index-self.queue_length, index)) random.shuffle(index_list) index_list = sorted(index_list[1:]) index_list.append(index) for i in index_list: i = max(0, i) input_dict = self.get_data_info(i) if input_dict is None: return None self.pre_pipeline(input_dict) example = self.pipeline(input_dict) if self.filter_empty_gt and \ (example is None or ~(example['gt_labels_3d']._data != -1).any()): return None queue.append(example) return self.union2one(queue) def union2one(self, queue): imgs_list = [each['img'].data for each in queue] metas_map = {} prev_scene_token = None prev_pos = None prev_angle = None for i, each in enumerate(queue): metas_map[i] = each['img_metas'].data if metas_map[i]['scene_token'] != prev_scene_token: metas_map[i]['prev_bev_exists'] = False prev_scene_token = metas_map[i]['scene_token'] prev_pos = copy.deepcopy(metas_map[i]['can_bus'][:3]) prev_angle = copy.deepcopy(metas_map[i]['can_bus'][-1]) metas_map[i]['can_bus'][:3] = 0 metas_map[i]['can_bus'][-1] = 0 else: metas_map[i]['prev_bev_exists'] = True tmp_pos = copy.deepcopy(metas_map[i]['can_bus'][:3]) tmp_angle = copy.deepcopy(metas_map[i]['can_bus'][-1]) metas_map[i]['can_bus'][:3] -= prev_pos metas_map[i]['can_bus'][-1] -= prev_angle prev_pos = copy.deepcopy(tmp_pos) prev_angle = copy.deepcopy(tmp_angle) queue[-1]['img'] = DC(torch.stack(imgs_list), cpu_only=False, stack=True) queue[-1]['img_metas'] = DC(metas_map, cpu_only=True) queue = queue[-1] return queue def get_data_info(self, index): """Get data info according to the given index. Args: index (int): Index of the sample data to get. Returns: dict: Data information that will be passed to the data \ preprocessing pipelines. It includes the following keys: - sample_idx (str): Sample index. - pts_filename (str): Filename of point clouds. - sweeps (list[dict]): Infos of sweeps. - timestamp (float): Sample timestamp. - img_filename (str, optional): Image filename. - lidar2img (list[np.ndarray], optional): Transformations \ from lidar to different cameras. - ann_info (dict): Annotation info. 
""" info = self.data_infos[index] # standard protocal modified from SECOND.Pytorch input_dict = dict( sample_idx=info['token'], pts_filename=info['lidar_path'], sweeps=info['sweeps'], ego2global_translation=info['ego2global_translation'], ego2global_rotation=info['ego2global_rotation'], prev_idx=info['prev'], next_idx=info['next'], scene_token=info['scene_token'], can_bus=info['can_bus'], frame_idx=info['frame_idx'], timestamp=info['timestamp'] / 1e6, ) if self.modality['use_camera']: image_paths = [] lidar2img_rts = [] lidar2cam_rts = [] cam_intrinsics = [] for cam_type, cam_info in info['cams'].items(): image_paths.append(cam_info['data_path']) # obtain lidar to image transformation matrix lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation']) lidar2cam_t = cam_info[ 'sensor2lidar_translation'] @ lidar2cam_r.T lidar2cam_rt = np.eye(4) lidar2cam_rt[:3, :3] = lidar2cam_r.T lidar2cam_rt[3, :3] = -lidar2cam_t intrinsic = cam_info['cam_intrinsic'] viewpad = np.eye(4) viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic lidar2img_rt = (viewpad @ lidar2cam_rt.T) lidar2img_rts.append(lidar2img_rt) cam_intrinsics.append(viewpad) lidar2cam_rts.append(lidar2cam_rt.T) input_dict.update( dict( img_filename=image_paths, lidar2img=lidar2img_rts, cam_intrinsic=cam_intrinsics, lidar2cam=lidar2cam_rts, )) if not self.test_mode: annos = self.get_ann_info(index) input_dict['ann_info'] = annos rotation = Quaternion(input_dict['ego2global_rotation']) translation = input_dict['ego2global_translation'] can_bus = input_dict['can_bus'] can_bus[:3] = translation can_bus[3:7] = rotation patch_angle = quaternion_yaw(rotation) / np.pi * 180 if patch_angle < 0: patch_angle += 360 can_bus[-2] = patch_angle / 180 * np.pi can_bus[-1] = patch_angle return input_dict def __getitem__(self, idx): """Get item from infos according to the given index. Returns: dict: Data dictionary of the corresponding index. """ if self.test_mode: return self.prepare_test_data(idx) while True: data = self.prepare_train_data(idx) if data is None: idx = self._rand_another(idx) continue return data ================================================ FILE: projects/occ_plugin/datasets/pipelines/__init__.py ================================================ from .transform_3d import ( PadMultiViewImage, NormalizeMultiviewImage, PhotoMetricDistortionMultiViewImage, CustomCollect3D, CustomOccCollect3D, RandomScaleImageMultiViewImage) from .formating import OccDefaultFormatBundle3D from .loading_occupancy import LoadOccupancy from .loading_bevdet import LoadAnnotationsBEVDepth, LoadMultiViewImageFromFiles_BEVDet from .loading_instance import LoadInstanceWithFlow __all__ = [ 'PadMultiViewImage', 'NormalizeMultiviewImage', 'CustomOccCollect3D', 'LoadAnnotationsBEVDepth', 'LoadMultiViewImageFromFiles_BEVDet', 'LoadOccupancy', 'PhotoMetricDistortionMultiViewImage', 'OccDefaultFormatBundle3D', 'CustomCollect3D', 'RandomScaleImageMultiViewImage', "LoadInstanceWithFlow", ] ================================================ FILE: projects/occ_plugin/datasets/pipelines/formating.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import numpy as np from mmcv.parallel import DataContainer as DC from mmdet.datasets.builder import PIPELINES from mmdet.datasets.pipelines import to_tensor from mmdet3d.datasets.pipelines import DefaultFormatBundle3D @PIPELINES.register_module() class OccDefaultFormatBundle3D(DefaultFormatBundle3D): """Default formatting bundle. 
It simplifies the pipeline of formatting common fields for voxels, including "proposals", "gt_bboxes", "gt_labels", "gt_masks" and "gt_semantic_seg". These fields are formatted as follows. - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) - proposals: (1)to tensor, (2)to DataContainer - gt_bboxes: (1)to tensor, (2)to DataContainer - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer - gt_labels: (1)to tensor, (2)to DataContainer """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) def __call__(self, results): """Call function to transform and format common fields in results. Args: results (dict): Result dict contains the data to convert. Returns: dict: The result dict contains the data that is formatted with default bundle. """ # Format 3D data results = super(OccDefaultFormatBundle3D, self).__call__(results) if 'gt_occ' in results.keys(): results['gt_occ'] = DC(to_tensor(results['gt_occ']), stack=True) if 'gt_occ' in results.keys(): results['segmentation'] = DC(to_tensor(results['segmentation']), stack=True) if 'gt_occ' in results.keys(): results['instance'] = DC(to_tensor(results['instance']), stack=True) if 'gt_occ' in results.keys(): results['attribute_label'] = DC(to_tensor(results['attribute_label']), stack=True) if 'gt_occ' in results.keys(): results['flow'] = DC(to_tensor(results['flow']), stack=True) if 'gt_vel' in results.keys(): results['gt_vel'] = DC(to_tensor(results['gt_vel']), stack=False) return results ================================================ FILE: projects/occ_plugin/datasets/pipelines/loading_bevdet.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import mmcv import numpy as np from mmdet.datasets.builder import PIPELINES import os import torch from PIL import Image from pyquaternion import Quaternion from mmdet3d.core.bbox import LiDARInstance3DBoxes from numpy import random import pdb def mmlabNormalize(img, img_norm_cfg=None): from mmcv.image.photometric import imnormalize if img_norm_cfg is None: mean = np.array([123.675, 116.28, 103.53], dtype=np.float32) std = np.array([58.395, 57.12, 57.375], dtype=np.float32) to_rgb = True else: mean = np.array(img_norm_cfg['mean'], dtype=np.float32) std = np.array(img_norm_cfg['std'], dtype=np.float32) to_rgb = img_norm_cfg['to_rgb'] img = imnormalize(np.array(img), mean, std, to_rgb) img = torch.tensor(img).float().permute(2, 0, 1).contiguous() return img def depth_transform(cam_depth, resize, resize_dims, crop, flip, rotate): """Transform depth based on ida augmentation configuration. Args: cam_depth (np array): Nx3, 3: x,y,d. resize (float): Resize factor. resize_dims (list): Final dimension. crop (list): x1, y1, x2, y2 flip (bool): Whether to flip. rotate (float): Rotation value. 
Returns: np array: [h/down_ratio, w/down_ratio, d] """ H, W = resize_dims cam_depth[:, :2] = cam_depth[:, :2] * resize cam_depth[:, 0] -= crop[0] cam_depth[:, 1] -= crop[1] if flip: cam_depth[:, 0] = resize_dims[1] - cam_depth[:, 0] cam_depth[:, 0] -= W / 2.0 cam_depth[:, 1] -= H / 2.0 h = rotate / 180 * np.pi rot_matrix = [ [np.cos(h), np.sin(h)], [-np.sin(h), np.cos(h)], ] cam_depth[:, :2] = np.matmul(rot_matrix, cam_depth[:, :2].T).T cam_depth[:, 0] += W / 2.0 cam_depth[:, 1] += H / 2.0 depth_coords = cam_depth[:, :2].astype(np.int16) depth_map = np.zeros(resize_dims) valid_mask = ((depth_coords[:, 1] < resize_dims[0]) & (depth_coords[:, 0] < resize_dims[1]) & (depth_coords[:, 1] >= 0) & (depth_coords[:, 0] >= 0)) depth_map[depth_coords[valid_mask, 1], depth_coords[valid_mask, 0]] = cam_depth[valid_mask, 2] return torch.Tensor(depth_map) @PIPELINES.register_module() class LoadMultiViewImageFromFiles_BEVDet(object): """Load multi channel images from a list of separate channel files. Expects results['img_filename'] to be a list of filenames. Args: to_float32 (bool): Whether to convert the img to float32. Defaults to False. color_type (str): Color type of the file. Defaults to 'unchanged'. """ def __init__(self, data_config, is_train=False, using_ego=True, colorjitter=False, sequential=False, aligned=False, trans_only=True, img_norm_cfg=None, mmlabnorm=False, load_depth=False, depth_gt_path=None, data_root=None, test_mode=False, use_lyft=False): self.is_train = is_train self.data_config = data_config # using mean camera ego frame, rather than the lidar coordinates self.using_ego = using_ego self.normalize_img = mmlabNormalize self.img_norm_cfg = img_norm_cfg self.sequential = sequential self.aligned = aligned self.trans_only = trans_only self.load_depth = load_depth self.depth_gt_path = depth_gt_path self.data_root = data_root self.colorjitter = colorjitter self.pipeline_colorjitter = PhotoMetricDistortionMultiViewImage() self.test_mode = test_mode self.use_lyft = use_lyft def get_rot(self,h): return torch.Tensor([ [np.cos(h), np.sin(h)], [-np.sin(h), np.cos(h)], ]) def img_transform(self, img, post_rot, post_tran, resize, resize_dims, crop, flip, rotate): # adjust image img = self.img_transform_core(img, resize_dims, crop, flip, rotate) # post-homography transformation post_rot *= resize post_tran -= torch.Tensor(crop[:2]) if flip: A = torch.Tensor([[-1, 0], [0, 1]]) b = torch.Tensor([crop[2] - crop[0], 0]) post_rot = A.matmul(post_rot) post_tran = A.matmul(post_tran) + b A = self.get_rot(rotate / 180 * np.pi) b = torch.Tensor([crop[2] - crop[0], crop[3] - crop[1]]) / 2 b = A.matmul(-b) + b post_rot = A.matmul(post_rot) post_tran = A.matmul(post_tran) + b return img, post_rot, post_tran def img_transform_core(self, img, resize_dims, crop, flip, rotate): # adjust image img = img.resize(resize_dims) img = img.crop(crop) if flip: img = img.transpose(method=Image.FLIP_LEFT_RIGHT) img = img.rotate(rotate) return img def choose_cams(self): if self.is_train and self.data_config['Ncams'] < len(self.data_config['cams']): cam_names = np.random.choice(self.data_config['cams'], self.data_config['Ncams'], replace=False) else: cam_names = self.data_config['cams'] return cam_names def sample_augmentation(self, H , W, flip=None, scale=None): fH, fW = self.data_config['input_size'] if self.is_train: resize = float(fW)/float(W) resize += np.random.uniform(*self.data_config['resize']) resize_dims = (int(W * resize), int(H * resize)) newW, newH = resize_dims crop_h = int((1 - 
np.random.uniform(*self.data_config['crop_h'])) * newH) - fH crop_w = int(np.random.uniform(0, max(0, newW - fW))) crop = (crop_w, crop_h, crop_w + fW, crop_h + fH) # We do not use flip here to keep right forecasting flip = None rotate = 0 else: resize = float(fW)/float(W) resize += self.data_config.get('resize_test', 0.0) if scale is not None: resize = scale resize_dims = (int(W * resize), int(H * resize)) newW, newH = resize_dims crop_h = int((1 - np.mean(self.data_config['crop_h'])) * newH) - fH crop_w = int(max(0, newW - fW) / 2) crop = (crop_w, crop_h, crop_w + fW, crop_h + fH) flip = None rotate = 0 return resize, resize_dims, crop, flip, rotate def get_sensor2ego_transformation(self, cam_info, key_info, cam_name): w, x, y, z = cam_info['cams'][cam_name]['sensor2ego_rotation'] # sweep sensor to sweep ego sweepsensor2sweepego_rot = torch.Tensor( Quaternion(w, x, y, z).rotation_matrix) sweepsensor2sweepego_tran = torch.Tensor( cam_info['cams'][cam_name]['sensor2ego_translation']) sweepsensor2sweepego = sweepsensor2sweepego_rot.new_zeros( (4, 4)) sweepsensor2sweepego[3, 3] = 1 sweepsensor2sweepego[:3, :3] = sweepsensor2sweepego_rot sweepsensor2sweepego[:3, -1] = sweepsensor2sweepego_tran # sweep ego to global w, x, y, z = cam_info['cams'][cam_name]['ego2global_rotation'] sweepego2global_rot = torch.Tensor( Quaternion(w, x, y, z).rotation_matrix) sweepego2global_tran = torch.Tensor( cam_info['cams'][cam_name]['ego2global_translation']) sweepego2global = sweepego2global_rot.new_zeros((4, 4)) sweepego2global[3, 3] = 1 sweepego2global[:3, :3] = sweepego2global_rot sweepego2global[:3, -1] = sweepego2global_tran # global sensor to cur ego w, x, y, z = key_info['cams'][cam_name]['ego2global_rotation'] keyego2global_rot = torch.Tensor( Quaternion(w, x, y, z).rotation_matrix) keyego2global_tran = torch.Tensor( key_info['cams'][cam_name]['ego2global_translation']) keyego2global = keyego2global_rot.new_zeros((4, 4)) keyego2global[3, 3] = 1 keyego2global[:3, :3] = keyego2global_rot keyego2global[:3, -1] = keyego2global_tran global2keyego = keyego2global.inverse() # cur ego to sensor w, x, y, z = key_info['cams'][cam_name]['sensor2ego_rotation'] keysensor2keyego_rot = torch.Tensor( Quaternion(w, x, y, z).rotation_matrix) keysensor2keyego_tran = torch.Tensor( key_info['cams'][cam_name]['sensor2ego_translation']) keysensor2keyego = keysensor2keyego_rot.new_zeros((4, 4)) keysensor2keyego[3, 3] = 1 keysensor2keyego[:3, :3] = keysensor2keyego_rot keysensor2keyego[:3, -1] = keysensor2keyego_tran keyego2keysensor = keysensor2keyego.inverse() keysensor2sweepsensor = ( keyego2keysensor @ global2keyego @ sweepego2global @ sweepsensor2sweepego).inverse() sweepsensor2keyego = global2keyego @ sweepego2global @ \ sweepsensor2sweepego return sweepsensor2keyego, keysensor2sweepsensor def get_sensor2lidar_transformation(self, cam_info, cam_name, sample_info): w, x, y, z = cam_info['cams'][cam_name]['sensor2ego_rotation'] # sweep sensor to sweep ego sweepsensor2sweepego_rot = torch.Tensor( Quaternion(w, x, y, z).rotation_matrix) sweepsensor2sweepego_tran = torch.Tensor( cam_info['cams'][cam_name]['sensor2ego_translation']) sweepsensor2sweepego = sweepsensor2sweepego_rot.new_zeros( (4, 4)) sweepsensor2sweepego[3, 3] = 1 sweepsensor2sweepego[:3, :3] = sweepsensor2sweepego_rot sweepsensor2sweepego[:3, -1] = sweepsensor2sweepego_tran # sweep ego to global w, x, y, z = cam_info['cams'][cam_name]['ego2global_rotation'] sweepego2global_rot = torch.Tensor( Quaternion(w, x, y, z).rotation_matrix) sweepego2global_tran = 
torch.Tensor( cam_info['cams'][cam_name]['ego2global_translation']) sweepego2global = sweepego2global_rot.new_zeros((4, 4)) sweepego2global[3, 3] = 1 sweepego2global[:3, :3] = sweepego2global_rot sweepego2global[:3, -1] = sweepego2global_tran # global to lidar ego w, x, y, z = sample_info['ego2global_rotation'] lidarego2global_rot = torch.Tensor( Quaternion(w, x, y, z).rotation_matrix) lidarego2global_tran = torch.Tensor(sample_info['ego2global_translation']) lidarego2global = lidarego2global_rot.new_zeros((4, 4)) lidarego2global[3, 3] = 1 lidarego2global[:3, :3] = lidarego2global_rot lidarego2global[:3, -1] = lidarego2global_tran global2lidarego = lidarego2global.inverse() # lidar ego to lidar w, x, y, z = sample_info['lidar2ego_rotation'] lidar2ego_rot = torch.Tensor( Quaternion(w, x, y, z).rotation_matrix) lidar2ego_tran = torch.Tensor(sample_info['lidar2ego_translation']) lidar2ego = lidar2ego_rot.new_zeros((4, 4)) lidar2ego[3, 3] = 1 lidar2ego[:3, :3] = lidar2ego_rot lidar2ego[:3, -1] = lidar2ego_tran ego2lidar = lidar2ego.inverse() # camera to lidar sweepsensor2lidar = ego2lidar @ global2lidarego @ sweepego2global @ sweepsensor2sweepego return sweepsensor2lidar def get_seq_inputs(self, results, flip=None, scale=None): cam_names = self.choose_cams() results['cam_names'] = cam_names if self.use_lyft: filename = results['input_dict'][0]['img_filename'][0] else: cam_data = results['input_dict'][0]['curr']['cams'][cam_names[0]] filename = cam_data['data_path'] filename = os.path.join(self.data_root, filename.split('/')[-3], filename.split('/')[-2], filename.split('/')[-1]) img = Image.open(filename) img_augs = self.sample_augmentation(H=img.height, W=img.width, flip=flip, scale=scale) resize, resize_dims, crop, flip, rotate = img_augs sequence_length = results['sequence_length'] imgs_seq = [] rots_seq = [] trans_seq = [] intrins_seq = [] post_rots_seq = [] post_trans_seq = [] gt_depths_seq = list() canvas_seq = [] sensor2sensors_seq = [] for counter in range(sequence_length): input_dict_curr = results['input_dict'][counter] imgs = [] rots = [] trans = [] intrins = [] post_rots = [] post_trans = [] gt_depths = list() canvas = [] sensor2sensors = [] for cam_idx, cam_name in enumerate(cam_names): if self.use_lyft: cam_data = None filename = input_dict_curr['img_filename'][cam_idx] else: cam_data = input_dict_curr['curr']['cams'][cam_name] filename = cam_data['data_path'] filename = os.path.join(self.data_root, filename.split('/')[-3], filename.split('/')[-2], filename.split('/')[-1]) img = Image.open(filename) post_rot = torch.eye(2) post_tran = torch.zeros(2) if self.use_lyft: intrin = torch.Tensor(input_dict_curr['cam_intrinsics'][cam_idx]) else: intrin = torch.Tensor(cam_data['cam_intrinsic']) # from camera to lidar sensor2lidar = torch.tensor(input_dict_curr['lidar2cam_dic'][cam_name]).inverse().float() rot = sensor2lidar[:3, :3] tran = sensor2lidar[:3, 3] img, post_rot2, post_tran2 = \ self.img_transform(img, post_rot, post_tran, resize=resize, resize_dims=resize_dims, crop=crop, flip=flip, rotate=rotate) # for convenience, make augmentation matrices 3x3 post_tran = torch.zeros(3) post_rot = torch.eye(3) post_tran[:2] = post_tran2 post_rot[:2, :2] = post_rot2 # TODO: open source depth enhancement gt_depths.append(torch.zeros(1)) canvas.append(np.array(img)) if self.colorjitter and self.is_train: img = self.pipeline_colorjitter(img) imgs.append(self.normalize_img(img, img_norm_cfg=self.img_norm_cfg)) intrins.append(intrin) rots.append(rot) trans.append(tran) post_rots.append(post_rot) 
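# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original pipeline): the post_rot /
# post_tran pair built in this loop records the image-space augmentation so
# that a pixel in the raw image maps into the resized-and-cropped network
# input. Since flip and rotation are disabled in this codebase, the transform
# reduces to a scale plus a shift; the numbers below are hypothetical.
def _toy_post_transform(resize, crop):
    """Return the 2x2 scale and 2D shift used by img_transform() above (illustration only)."""
    import torch
    post_rot = torch.eye(2) * resize                            # post_rot *= resize
    post_tran = -torch.tensor(crop[:2], dtype=torch.float32)    # post_tran -= crop[:2]
    return post_rot, post_tran
# Example: post_rot, post_tran = _toy_post_transform(0.44, (0, 140, 704, 396))
# post_rot @ torch.tensor([800., 450.]) + post_tran  -> tensor([352., 58.])
# --------------------------------------------------------------------------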
post_trans.append(post_tran) sensor2sensors.append(sensor2lidar) imgs = torch.stack(imgs) rots = torch.stack(rots) trans = torch.stack(trans) intrins = torch.stack(intrins) post_rots = torch.stack(post_rots) post_trans = torch.stack(post_trans) gt_depths = torch.stack(gt_depths) sensor2sensors = torch.stack(sensor2sensors) imgs_seq.append(imgs) rots_seq.append(rots) trans_seq.append(trans) intrins_seq.append(intrins) post_rots_seq.append(post_rots) post_trans_seq.append(post_trans) gt_depths_seq.append(gt_depths) canvas_seq.append(canvas) sensor2sensors_seq.append(sensor2sensors) imgs_seq = torch.stack(imgs_seq) rots_seq = torch.stack(rots_seq) trans_seq = torch.stack(trans_seq) intrins_seq = torch.stack(intrins_seq) post_rots_seq = torch.stack(post_rots_seq) post_trans_seq = torch.stack(post_trans_seq) gt_depths_seq = torch.stack(gt_depths_seq) sensor2sensors_seq = torch.stack(sensor2sensors_seq) results['canvas'] = canvas return imgs_seq, rots_seq, trans_seq, intrins_seq, post_rots_seq, post_trans_seq, gt_depths_seq, sensor2sensors_seq def __call__(self, results): results['img_inputs_seq'] = self.get_seq_inputs(results) return results def bev_transform(rotate_angle, scale_ratio, flip_dx, flip_dy): rotate_angle = torch.tensor(rotate_angle / 180 * np.pi) rot_sin = torch.sin(rotate_angle) rot_cos = torch.cos(rotate_angle) rot_mat = torch.Tensor([[rot_cos, -rot_sin, 0], [rot_sin, rot_cos, 0], [0, 0, 1]]) scale_mat = torch.Tensor([[scale_ratio, 0, 0], [0, scale_ratio, 0], [0, 0, scale_ratio]]) flip_mat = torch.Tensor([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) if flip_dx: flip_mat = flip_mat @ torch.Tensor([[-1, 0, 0], [0, 1, 0], [0, 0, 1]]) if flip_dy: flip_mat = flip_mat @ torch.Tensor([[1, 0, 0], [0, -1, 0], [0, 0, 1]]) rot_mat = flip_mat @ (scale_mat @ rot_mat) return rot_mat @PIPELINES.register_module() class LoadAnnotationsBEVDepth(): def __init__(self, bda_aug_conf, classes, is_train=True, input_modality=None): self.bda_aug_conf = bda_aug_conf self.is_train = is_train self.classes = classes if input_modality == None: input_modality = dict( use_lidar=True, use_camera=True, use_radar=False, use_map=False, use_external=False) self.input_modality = input_modality def sample_bda_augmentation(self): """Generate bda augmentation values based on bda_config.""" if self.is_train: rotate_bda = np.random.uniform(*self.bda_aug_conf['rot_lim']) scale_bda = np.random.uniform(*self.bda_aug_conf['scale_lim']) flip_dx = np.random.uniform() < self.bda_aug_conf['flip_dx_ratio'] flip_dy = np.random.uniform() < self.bda_aug_conf['flip_dy_ratio'] else: rotate_bda = 0 scale_bda = 1.0 flip_dx = False flip_dy = False return rotate_bda, scale_bda, flip_dx, flip_dy def __call__(self, results): rotate_bda, scale_bda, flip_dx, flip_dy = self.sample_bda_augmentation() bda_mat = torch.zeros(4, 4) bda_mat[3, 3] = 1 bda_rot = bev_transform(rotate_bda, scale_bda, flip_dx, flip_dy) bda_mat[:3, :3] = bda_rot results['bda_mat'] = bda_rot if 'points' in results.keys(): results['points'].rotate(bda_rot) if self.input_modality['use_camera']: assert len(results['img_inputs']) == 8 imgs, rots, trans, intrins, post_rots, post_trans, gt_depths, sensor2sensors = results['img_inputs'] results['img_inputs'] = (imgs, rots, trans, intrins, post_rots, post_trans, bda_rot, imgs.shape[-2:], gt_depths, sensor2sensors) return results class PhotoMetricDistortionMultiViewImage(object): """Apply photometric distortion to image sequentially, every transformation is applied with a probability of 0.5. 
The position of random contrast is in second or second to last. 1. random brightness 2. random contrast (mode 0) 3. convert color from BGR to HSV 4. random saturation 5. random hue 6. convert color from HSV to BGR 7. random contrast (mode 1) 8. randomly swap channels Args: brightness_delta (int): delta of brightness. contrast_range (tuple): range of contrast. saturation_range (tuple): range of saturation. hue_delta (int): delta of hue. """ def __init__(self, brightness_delta=32, contrast_range=(0.5, 1.5), saturation_range=(0.5, 1.5), hue_delta=18): self.brightness_delta = brightness_delta self.contrast_lower, self.contrast_upper = contrast_range self.saturation_lower, self.saturation_upper = saturation_range self.hue_delta = hue_delta def __call__(self, img): """Call function to perform photometric distortion on images. Args: results (dict): Result dict from loading pipeline. Returns: dict: Result dict with images distorted. """ # convert PIL Image to Ndarray float32 img = np.array(img, dtype=np.float32) assert img.dtype == np.float32, \ 'PhotoMetricDistortion needs the input image of dtype np.float32,'\ ' please set "to_float32=True" in "LoadImageFromFile" pipeline' # random brightness if random.randint(2): delta = random.uniform(-self.brightness_delta, self.brightness_delta) img += delta # mode == 0 --> do random contrast first # mode == 1 --> do random contrast last mode = random.randint(2) if mode == 1: if random.randint(2): alpha = random.uniform(self.contrast_lower, self.contrast_upper) img *= alpha # convert color from BGR to HSV img = mmcv.bgr2hsv(img) # random saturation if random.randint(2): img[..., 1] *= random.uniform(self.saturation_lower, self.saturation_upper) # random hue if random.randint(2): img[..., 0] += random.uniform(-self.hue_delta, self.hue_delta) img[..., 0][img[..., 0] > 360] -= 360 img[..., 0][img[..., 0] < 0] += 360 # convert color from HSV to BGR img = mmcv.hsv2bgr(img) # random contrast if mode == 0: if random.randint(2): alpha = random.uniform(self.contrast_lower, self.contrast_upper) img *= alpha # randomly swap channels if random.randint(2): img = img[..., random.permutation(3)] img = Image.fromarray(img.astype(np.uint8)) return img ================================================ FILE: projects/occ_plugin/datasets/pipelines/loading_instance.py ================================================ # Developed by Junyi Ma based on the codebase of PowerBEV # Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications # https://github.com/haomo-ai/Cam4DOcc import numpy as np from mmdet.datasets.builder import PIPELINES import os import torch from pyquaternion import Quaternion from nuscenes.utils.data_classes import Box import time @PIPELINES.register_module() class LoadInstanceWithFlow(object): def __init__(self, cam4docc_dataset_path, grid_size=[512, 512, 40], pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0], background=0, use_flow=True, use_separate_classes=False, use_lyft=False): ''' Loading sequential occupancy labels and instance flows for training and testing cam4docc_dataset_path: data path of Cam4DOcc dataset, including 'segmentation', 'instance', and 'flow' grid_size: number of grids along H W L, default: [512, 512, 40] pc_range: predefined ranges along H W L, default: [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] background: background pixel value for segmentation/instance/flow maps, default: 0 use_flow: whether use flow for training schemes, default: True ''' self.cam4docc_dataset_path = cam4docc_dataset_path self.pc_range = 
pc_range self.resolution = [(self.pc_range[3+i] - self.pc_range[i])/grid_size[i] for i in range(len(self.pc_range[:3]))] self.start_position = [self.pc_range[i] + self.resolution[i] / 2.0 for i in range(len(self.pc_range[:3]))] self.dimension = grid_size self.pc_range = np.array(self.pc_range) self.resolution = np.array(self.resolution) self.start_position = np.array(self.start_position) self.dimension = np.array(self.dimension) self.background = background self.use_flow = use_flow self.use_separate_classes = use_separate_classes self.use_lyft = use_lyft def get_poly_region(self, instance_annotation, present_egopose, present_ego2lidar): """ Obtain the bounding box polygon of the instance """ present_ego_translation, present_ego_rotation = present_egopose present_ego2lidar_translation, present_ego2lidar_rotation = present_ego2lidar box = Box( instance_annotation['translation'], instance_annotation['size'], Quaternion(instance_annotation['rotation']) ) box.translate(present_ego_translation) box.rotate(present_ego_rotation) box.translate(present_ego2lidar_translation) box.rotate(present_ego2lidar_rotation) pts=box.corners().T X_min_box = pts.min(axis=0)[0] X_max_box = pts.max(axis=0)[0] Y_min_box = pts.min(axis=0)[1] Y_max_box = pts.max(axis=0)[1] Z_min_box = pts.min(axis=0)[2] Z_max_box = pts.max(axis=0)[2] if self.pc_range[0] <= X_min_box and X_max_box <= self.pc_range[3] \ and self.pc_range[1] <= Y_min_box and Y_max_box <= self.pc_range[4] \ and self.pc_range[2] <= Z_min_box and Z_max_box <= self.pc_range[5]: pts = np.round((pts - self.start_position[:3] + self.resolution[:3] / 2.0) / self.resolution[:3]).astype(np.int32) return pts else: return None def fill_occupancy(self, occ_instance, occ_segmentation, occ_attribute_label, instance_fill_info): x_grid = torch.linspace(0, self.dimension[0]-1, self.dimension[0], dtype=torch.float) x_grid = x_grid.view(self.dimension[0], 1, 1).expand(self.dimension[0], self.dimension[1], self.dimension[2]) y_grid = torch.linspace(0, self.dimension[1]-1, self.dimension[1], dtype=torch.float) y_grid = y_grid.view(1, self.dimension[1], 1).expand(self.dimension[0], self.dimension[1], self.dimension[2]) z_grid = torch.linspace(0, self.dimension[2]-1, self.dimension[2], dtype=torch.float) z_grid = z_grid.view(1, 1, self.dimension[2]).expand(self.dimension[0], self.dimension[1], self.dimension[2]) mesh_grid_3d = torch.stack((x_grid, y_grid, z_grid), -1) mesh_grid_3d = mesh_grid_3d.view(-1, 3) occ_instance = torch.from_numpy(occ_instance).view(-1, 1) occ_segmentation = torch.from_numpy(occ_segmentation).view(-1, 1) occ_attribute_label = torch.from_numpy(occ_attribute_label).view(-1, 1) for instance_info in instance_fill_info: poly_region_pts = instance_info['poly_region'] semantic_id = instance_info['semantic_id'] instance_id = instance_info['instance_id'] attribute_label=instance_info['attribute_label'] X_min_box = poly_region_pts.min(axis=0)[0] X_max_box = poly_region_pts.max(axis=0)[0] Y_min_box = poly_region_pts.min(axis=0)[1] Y_max_box = poly_region_pts.max(axis=0)[1] Z_min_box = poly_region_pts.min(axis=0)[2] Z_max_box = poly_region_pts.max(axis=0)[2] mask_cur_instance = (mesh_grid_3d[:,0] >= X_min_box) & (X_max_box >= mesh_grid_3d[:,0]) \ & (mesh_grid_3d[:,1] >= Y_min_box) & (Y_max_box >= mesh_grid_3d[:,1]) \ & (mesh_grid_3d[:,2] >= Z_min_box) & (Z_max_box >= mesh_grid_3d[:,2]) occ_instance[mask_cur_instance] = instance_id occ_segmentation[mask_cur_instance] = semantic_id occ_attribute_label[mask_cur_instance] = attribute_label occ_instance = 
occ_instance.view(self.dimension[0], self.dimension[1], self.dimension[2]).long() occ_segmentation = occ_segmentation.view(self.dimension[0], self.dimension[1], self.dimension[2]).long() occ_attribute_label = occ_attribute_label.view(self.dimension[0], self.dimension[1], self.dimension[2]).long() return occ_instance, occ_segmentation, occ_attribute_label def get_label(self, input_seq_data): """ Generate labels for semantic segmentation, instance segmentation, z position, attribute from the raw data of nuScenes """ timestep = self.counter # Background is ID 0 segmentation = np.ones((self.dimension[0], self.dimension[1], self.dimension[2])) * self.background instance = np.ones((self.dimension[0], self.dimension[1], self.dimension[2])) * self.background attribute_label = np.ones((self.dimension[0], self.dimension[1], self.dimension[2])) * self.background instance_dict = input_seq_data['instance_dict'] egopose_list = input_seq_data['egopose_list'] ego2lidar_list = input_seq_data['ego2lidar_list'] time_receptive_field = input_seq_data['time_receptive_field'] instance_fill_info = [] for instance_token, instance_annotation in instance_dict.items(): if timestep not in instance_annotation['timestep']: continue pointer = instance_annotation['timestep'].index(timestep) annotation = { 'translation': instance_annotation['translation'][pointer], 'rotation': instance_annotation['rotation'][pointer], 'size': instance_annotation['size'], } poly_region = self.get_poly_region(annotation, egopose_list[time_receptive_field - 1], ego2lidar_list[time_receptive_field - 1]) if isinstance(poly_region, np.ndarray): if self.counter >= time_receptive_field and instance_token not in self.visible_instance_set: continue self.visible_instance_set.add(instance_token) prepare_for_fill = dict( poly_region=poly_region, instance_id=instance_annotation['instance_id'], attribute_label=instance_annotation['attribute_label'][pointer], semantic_id=instance_annotation['semantic_id'], ) instance_fill_info.append(prepare_for_fill) instance, segmentation, attribute_label = self.fill_occupancy(instance, segmentation, attribute_label, instance_fill_info) segmentation = segmentation.unsqueeze(0) instance = instance.unsqueeze(0) attribute_label = attribute_label.unsqueeze(0).unsqueeze(0) return segmentation, instance, attribute_label @staticmethod def generate_flow(flow, occ_instance_seq, instance, instance_id): """ Generate ground truth for the flow of each instance based on instance segmentation """ seg_len, wx, wy, wz = occ_instance_seq.shape ratio = 4 occ_instance_seq = occ_instance_seq.reshape(seg_len, wx//ratio, ratio, wy//ratio, ratio, wz//ratio, ratio).permute(0,1,3,5,2,4,6).reshape(seg_len, wx//ratio, wy//ratio, wz//ratio, ratio**3) empty_mask = occ_instance_seq.sum(-1) == 0 occ_instance_seq = occ_instance_seq.to(torch.int64) occ_space = occ_instance_seq[~empty_mask] occ_space[occ_space==0] = -torch.arange(len(occ_space[occ_space==0])).to(occ_space.device) - 1 occ_instance_seq[~empty_mask] = occ_space occ_instance_seq = torch.mode(occ_instance_seq, dim=-1)[0] occ_instance_seq[occ_instance_seq<0] = 0 occ_instance_seq = occ_instance_seq.long() _, wx, wy, wz = occ_instance_seq.shape x, y, z = torch.meshgrid(torch.arange(wx, dtype=torch.float), torch.arange(wy, dtype=torch.float), torch.arange(wz, dtype=torch.float)) grid = torch.stack((x, y, z), dim=0) # Set the first frame init_pointer = instance['timestep'][0] instance_mask = (occ_instance_seq[init_pointer] == instance_id) flow[init_pointer, 0, instance_mask] = grid[0, 
instance_mask].mean(dim=0, keepdim=True).round() - grid[0, instance_mask] flow[init_pointer, 1, instance_mask] = grid[1, instance_mask].mean(dim=0, keepdim=True).round() - grid[1, instance_mask] flow[init_pointer, 2, instance_mask] = grid[2, instance_mask].mean(dim=0, keepdim=True).round() - grid[2, instance_mask] for i, timestep in enumerate(instance['timestep']): if i == 0: continue instance_mask = (occ_instance_seq[timestep] == instance_id) prev_instance_mask = (occ_instance_seq[timestep-1] == instance_id) if instance_mask.sum() == 0 or prev_instance_mask.sum() == 0: continue flow[timestep, 0, instance_mask] = grid[0, prev_instance_mask].mean(dim=0, keepdim=True).round() - grid[0, instance_mask] flow[timestep, 1, instance_mask] = grid[1, prev_instance_mask].mean(dim=0, keepdim=True).round() - grid[1, instance_mask] flow[timestep, 2, instance_mask] = grid[2, prev_instance_mask].mean(dim=0, keepdim=True).round() - grid[2, instance_mask] return flow def get_flow_label(self, input_seq_data, ignore_index=255): """ Generate the global map of the flow ground truth """ occ_instance = input_seq_data['instance'] instance_dict = input_seq_data['instance_dict'] instance_map = input_seq_data['instance_map'] seq_len, wx, wy, wz = occ_instance.shape ratio = 4 flow = ignore_index * torch.ones(seq_len, 3, wx//ratio, wy//ratio, wz//ratio) # ignore flow generation for faster pipelines if not self.use_flow: return flow for token, instance in instance_dict.items(): flow = self.generate_flow(flow, occ_instance, instance, instance_map[token]) return flow.float() # set ignore index to 0 for vis @staticmethod def convert_instance_mask_to_center_and_offset_label(input_seq_data, ignore_index=255, sigma=3): occ_instance = input_seq_data['instance'] num_instances=len(input_seq_data['instance_map']) seq_len, wx, wy, wz = occ_instance.shape center_label = torch.zeros(seq_len, 1, wx, wy, wz) offset_label = ignore_index * torch.ones(seq_len, 3, wx, wy, wz) # x is vertical displacement, y is horizontal displacement x, y, z = torch.meshgrid(torch.arange(wx, dtype=torch.float), torch.arange(wy, dtype=torch.float), torch.arange(wz, dtype=torch.float)) # Ignore id 0 which is the background for instance_id in range(1, num_instances+1): for t in range(seq_len): instance_mask = (occ_instance[t] == instance_id) xc = x[instance_mask].mean().round().long() yc = y[instance_mask].mean().round().long() zc = z[instance_mask].mean().round().long() off_x = xc - x off_y = yc - y off_z = zc - z g = torch.exp(-(off_x ** 2 + off_y ** 2 + off_z ** 2) / sigma ** 2) center_label[t, 0] = torch.maximum(center_label[t, 0], g) offset_label[t, 0, instance_mask] = off_x[instance_mask] offset_label[t, 1, instance_mask] = off_y[instance_mask] offset_label[t, 2, instance_mask] = off_z[instance_mask] return center_label, offset_label def __call__(self, results): assert 'segmentation' not in results.keys() assert 'instance' not in results.keys() assert 'attribute_label' not in results.keys() time_receptive_field = results['time_receptive_field'] prefix = "MMO" if self.use_separate_classes else "GMO" if self.use_lyft: prefix = prefix + "_lyft" seg_label_dir = os.path.join(self.cam4docc_dataset_path, prefix, "segmentation") if not os.path.exists(seg_label_dir): os.mkdir(seg_label_dir) seg_label_path = os.path.join(seg_label_dir, \ results['input_dict'][time_receptive_field-1]['scene_token']+"_"+results['input_dict'][time_receptive_field-1]['lidar_token']) instance_label_dir = os.path.join(self.cam4docc_dataset_path, prefix, "instance") if not 
os.path.exists(instance_label_dir): os.mkdir(instance_label_dir) instance_label_path = os.path.join(instance_label_dir, \ results['input_dict'][time_receptive_field-1]['scene_token']+"_"+results['input_dict'][time_receptive_field-1]['lidar_token']) flow_label_dir = os.path.join(self.cam4docc_dataset_path, prefix, "flow") if not os.path.exists(flow_label_dir): os.mkdir(flow_label_dir) flow_label_path = os.path.join(flow_label_dir, \ results['input_dict'][time_receptive_field-1]['scene_token']+"_"+results['input_dict'][time_receptive_field-1]['lidar_token']) segmentation_list = [] if os.path.exists(seg_label_path+".npz"): gt_segmentation_arr = np.load(seg_label_path+".npz",allow_pickle=True)['arr_0'] for j in range(len(gt_segmentation_arr)): segmentation = np.zeros((self.dimension[0], self.dimension[1], self.dimension[2])) * self.background gt_segmentation = gt_segmentation_arr[j] gt_segmentation = torch.from_numpy(gt_segmentation) # for i in range(gt_segmentation.shape[0]): # cur_ind = gt_segmentation[i, :3].long() # cur_label = gt_segmentation[i, -1] # segmentation[cur_ind[0],cur_ind[1],cur_ind[2]] = cur_label segmentation[gt_segmentation[:, 0].long(), gt_segmentation[:, 1].long(), gt_segmentation[:, 2].long()] = gt_segmentation[:, -1] segmentation = torch.from_numpy(segmentation).unsqueeze(0) segmentation_list.append(segmentation) instance_list = [] if os.path.exists(instance_label_path+".npz"): gt_instance_arr = np.load(instance_label_path+".npz",allow_pickle=True)['arr_0'] for j in range(len(gt_instance_arr)): instance = np.ones((self.dimension[0], self.dimension[1], self.dimension[2])) * self.background gt_instance = gt_instance_arr[j] gt_instance = torch.from_numpy(gt_instance) # for i in range(gt_instance.shape[0]): # cur_ind = gt_instance[i, :3].long() # cur_label = gt_instance[i, -1] # instance[cur_ind[0],cur_ind[1],cur_ind[2]] = cur_label instance[gt_instance[:, 0].long(), gt_instance[:, 1].long(), gt_instance[:, 2].long()] = gt_instance[:, -1] instance = torch.from_numpy(instance).unsqueeze(0) instance_list.append(instance) flow_list = [] if os.path.exists(flow_label_path+".npz"): gt_flow_arr = np.load(flow_label_path+".npz",allow_pickle=True)['arr_0'] for j in range(len(gt_flow_arr)): flow = np.ones((3, self.dimension[0]//4, self.dimension[1]//4, self.dimension[2]//4)) * 255 gt_flow = gt_flow_arr[j] gt_flow = torch.from_numpy(gt_flow) # for i in range(gt_flow.shape[0]): # cur_ind = gt_flow[i, :3].long() # cur_label = gt_flow[i, 3:] # flow[0, cur_ind[0],cur_ind[1],cur_ind[2]] = cur_label[0] # flow[1, cur_ind[0],cur_ind[1],cur_ind[2]] = cur_label[1] # flow[2, cur_ind[0],cur_ind[1],cur_ind[2]] = cur_label[2] flow[:, gt_flow[:, 0].long(), gt_flow[:, 1].long(), gt_flow[:, 2].long()] = gt_flow[:, 3:].permute(1, 0) flow = torch.from_numpy(flow).unsqueeze(0) flow_list.append(flow) if os.path.exists(seg_label_path+".npz") and os.path.exists(instance_label_path+".npz") and os.path.exists(flow_label_path+".npz"): results['segmentation'] = torch.cat(segmentation_list, dim=0) results['instance'] = torch.cat(instance_list, dim=0) results['attribute_label'] = torch.from_numpy(np.zeros((self.dimension[0], self.dimension[1], self.dimension[2]))).unsqueeze(0) results['flow'] = torch.cat(flow_list, dim=0).float() for key, value in results.items(): if key in ['sample_token', 'centerness', 'offset', 'flow', 'time_receptive_field', "indices", \ 'segmentation','instance','attribute_label','sequence_length', 'instance_dict', 'instance_map', 'input_dict', 'egopose_list','ego2lidar_list','scene_token']: 
continue results[key] = torch.cat(value, dim=0) return results else: results['segmentation'] = [] results['instance'] = [] results['attribute_label'] = [] segmentation_saved_list = [] instance_saved_list = [] sequence_length = results['sequence_length'] self.visible_instance_set = set() for self.counter in range(sequence_length): segmentation, instance, attribute_label = self.get_label(results) results['segmentation'].append(segmentation) results['instance'].append(instance) results['attribute_label'].append(attribute_label) x_grid = torch.linspace(0, self.dimension[0]-1, self.dimension[0], dtype=torch.long) x_grid = x_grid.view(self.dimension[0], 1, 1).expand(self.dimension[0], self.dimension[1], self.dimension[2]) y_grid = torch.linspace(0, self.dimension[1]-1, self.dimension[1], dtype=torch.long) y_grid = y_grid.view(1, self.dimension[1], 1).expand(self.dimension[0], self.dimension[1], self.dimension[2]) z_grid = torch.linspace(0, self.dimension[2]-1, self.dimension[2], dtype=torch.long) z_grid = z_grid.view(1, 1, self.dimension[2]).expand(self.dimension[0], self.dimension[1], self.dimension[2]) segmentation_for_save = torch.stack((x_grid, y_grid, z_grid), -1) segmentation_for_save = segmentation_for_save.view(-1, 3) segmentation_label = segmentation.squeeze(0).view(-1,1) segmentation_for_save = torch.cat((segmentation_for_save, segmentation_label), dim=-1) kept = segmentation_for_save[:,-1]!=0 segmentation_for_save= segmentation_for_save[kept] segmentation_saved_list.append(segmentation_for_save) x_grid = torch.linspace(0, self.dimension[0]-1, self.dimension[0], dtype=torch.long) x_grid = x_grid.view(self.dimension[0], 1, 1).expand(self.dimension[0], self.dimension[1], self.dimension[2]) y_grid = torch.linspace(0, self.dimension[1]-1, self.dimension[1], dtype=torch.long) y_grid = y_grid.view(1, self.dimension[1], 1).expand(self.dimension[0], self.dimension[1], self.dimension[2]) z_grid = torch.linspace(0, self.dimension[2]-1, self.dimension[2], dtype=torch.long) z_grid = z_grid.view(1, 1, self.dimension[2]).expand(self.dimension[0], self.dimension[1], self.dimension[2]) instance_for_save = torch.stack((x_grid, y_grid, z_grid), -1) instance_for_save = instance_for_save.view(-1, 3) instance_label = instance.squeeze(0).view(-1,1) instance_for_save = torch.cat((instance_for_save, instance_label), dim=-1) kept = instance_for_save[:,-1]!=0 instance_for_save= instance_for_save[kept] instance_saved_list.append(instance_for_save) segmentation_saved_list2 = [item.cpu().detach().numpy() for item in segmentation_saved_list] instance_saved_list2 = [item.cpu().detach().numpy() for item in instance_saved_list] np.savez(seg_label_path, segmentation_saved_list2) np.savez(instance_label_path, instance_saved_list2) results['segmentation'] = torch.cat(results['segmentation'], dim=0) results['instance'] = torch.cat(results['instance'], dim=0) results['attribute_label'] = torch.from_numpy(np.zeros((self.dimension[0], self.dimension[1], self.dimension[2]))).unsqueeze(0) results['flow'] = self.get_flow_label(results, ignore_index=255) flow_saved_list = [] sequence_length = results['sequence_length'] d0 = self.dimension[0]//4 d1 = self.dimension[1]//4 d2 = self.dimension[2]//4 for cnt in range(sequence_length): flow = results['flow'][cnt, ...] 
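# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original pipeline): the loop above
# stores flow labels sparsely as (x, y, z, fx, fy, fz) rows, keeping only
# voxels whose flow components differ from the ignore value (255). The helper
# below mirrors the loading branch earlier in this __call__, which scatters
# such rows back into a dense [3, X, Y, Z] volume; the toy grid size is
# hypothetical.
def _toy_densify_flow(sparse_rows, dims, ignore_index=255):
    """Scatter sparse flow rows into a dense volume (illustration only)."""
    import torch
    dense = ignore_index * torch.ones(3, *dims)
    idx = sparse_rows[:, :3].long()
    dense[:, idx[:, 0], idx[:, 1], idx[:, 2]] = sparse_rows[:, 3:].t()
    return dense
# Example with a hypothetical 4x4x2 grid and two labelled voxels:
# rows = torch.tensor([[0., 1., 0., 0.5, -1.0, 0.0], [2., 3., 1., 1.0, 0.0, 0.0]])
# _toy_densify_flow(rows, (4, 4, 2)).shape -> torch.Size([3, 4, 4, 2])
# --------------------------------------------------------------------------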
x_grid = torch.linspace(0, d0-1, d0, dtype=torch.long) x_grid = x_grid.view(d0, 1, 1).expand(d0, d1, d2) y_grid = torch.linspace(0, d1-1, d1, dtype=torch.long) y_grid = y_grid.view(1, d1, 1).expand(d0, d1, d2) z_grid = torch.linspace(0, d2-1, d2, dtype=torch.long) z_grid = z_grid.view(1, 1, d2).expand(d0, d1, d2) flow_for_save = torch.stack((x_grid, y_grid, z_grid), -1) flow_for_save = flow_for_save.view(-1, 3) flow_label = flow.permute(1,2,3,0).view(-1,3) flow_for_save = torch.cat((flow_for_save, flow_label), dim=-1) kept = (flow_for_save[:,-1]!=255) & (flow_for_save[:,-2]!=255) & (flow_for_save[:,-3]!=255) flow_for_save= flow_for_save[kept] flow_saved_list.append(flow_for_save) flow_saved_list2 = [item.cpu().detach().numpy() for item in flow_saved_list] np.savez(flow_label_path, flow_saved_list2) for key, value in results.items(): if key in ['sample_token', 'centerness', 'offset', 'flow', 'time_receptive_field', "indices", \ 'segmentation','instance','attribute_label','sequence_length', 'instance_dict', 'instance_map', 'input_dict', 'egopose_list','ego2lidar_list','scene_token']: continue results[key] = torch.cat(value, dim=0) return results ================================================ FILE: projects/occ_plugin/datasets/pipelines/loading_occupancy.py ================================================ # Developed by Junyi Ma based on the codebase of OpenOccupancy # Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications # https://github.com/haomo-ai/Cam4DOcc import numpy as np import numba as nb from mmdet.datasets.builder import PIPELINES import yaml, os import torch import torch.nn.functional as F import copy @PIPELINES.register_module() class LoadOccupancy(object): def __init__(self, to_float32=True, occ_path=None, grid_size=[512, 512, 40], pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0], unoccupied=0, gt_resize_ratio=1, use_fine_occ=False, test_mode=False): ''' Read sequential fine-grained occupancy labels from nuScenes-Occupancy if use_fine_occ=True ''' self.to_float32 = to_float32 self.occ_path = occ_path self.grid_size = np.array(grid_size) self.unoccupied = unoccupied self.pc_range = np.array(pc_range) self.voxel_size = (self.pc_range[3:] - self.pc_range[:3]) / self.grid_size self.gt_resize_ratio = gt_resize_ratio self.use_fine_occ = use_fine_occ self.test_mode = test_mode def get_seq_pseudo_occ(self, results): sequence_length = results['sequence_length'] gt_occ_seq = [] for count in range(sequence_length): processed_label = np.ones(self.grid_size, dtype=np.uint8) * self.unoccupied processed_label = torch.from_numpy(processed_label) gt_occ_seq.append(processed_label) gt_occ_seq = torch.stack(gt_occ_seq) return gt_occ_seq def get_seq_occ(self, results): sequence_length = results['sequence_length'] gt_occ_seq = [] for count in range(sequence_length): scene_token_cur = results['input_dict'][count]['scene_token'] lidar_token_cur = results['input_dict'][count]['lidar_token'] rel_path = 'scene_{0}/occupancy/{1}.npy'.format(scene_token_cur, lidar_token_cur) # [z y x cls] or [z y x vx vy vz cls] pcd = np.load(os.path.join(self.occ_path, rel_path)) pcd_label = pcd[..., -1:] pcd_label[pcd_label==0] = 255 pcd_np_cor = self.voxel2world(pcd[..., [2,1,0]] + 0.5) untransformed_occ = copy.deepcopy(pcd_np_cor) egopose_list = results['egopose_list'] ego2lidar_list = results['ego2lidar_list'] time_receptive_field = results['time_receptive_field'] present_global2ego = egopose_list[time_receptive_field - 1] present_ego2lidar = 
ego2lidar_list[time_receptive_field - 1] cur_global2ego = egopose_list[count] cur_ego2lidar = ego2lidar_list[count] pcd_np_cor = np.dot(cur_ego2lidar[1].inverse.rotation_matrix, pcd_np_cor.T) pcd_np_cor = pcd_np_cor.T pcd_np_cor = pcd_np_cor - cur_ego2lidar[0] # trans # cur_ego -> global pcd_np_cor = np.dot(cur_global2ego[1].inverse.rotation_matrix, pcd_np_cor.T) # rot pcd_np_cor = pcd_np_cor.T pcd_np_cor = pcd_np_cor - cur_global2ego[0] # trans # global -> present_ego pcd_np_cor = pcd_np_cor + present_global2ego[0] # trans pcd_np_cor = np.dot(present_global2ego[1].rotation_matrix, pcd_np_cor.T) pcd_np_cor = pcd_np_cor.T # present_ego -> present_lidar pcd_np_cor = pcd_np_cor + present_ego2lidar[0] # trans pcd_np_cor = np.dot(present_ego2lidar[1].rotation_matrix, pcd_np_cor.T) # rot pcd_np_cor = pcd_np_cor.T pcd_np_cor = self.world2voxel(pcd_np_cor) # make sure the point is in the grid pcd_np_cor = np.clip(pcd_np_cor, np.array([0,0,0]), self.grid_size - 1) transformed_occ = copy.deepcopy(pcd_np_cor) pcd_np = np.concatenate([pcd_np_cor, pcd_label], axis=-1) # 255: noise, 1-16 normal classes, 0 unoccupied pcd_np = pcd_np[np.lexsort((pcd_np_cor[:, 0], pcd_np_cor[:, 1], pcd_np_cor[:, 2])), :] pcd_np = pcd_np.astype(np.int64) processed_label = np.ones(self.grid_size, dtype=np.uint8) * self.unoccupied processed_label = nb_process_label(processed_label, pcd_np) processed_label = torch.from_numpy(processed_label) # TODO: hard coding for otheridx in [0,1,7,8,11,12,13,14,15,16,17,18,255]: processed_label[processed_label==otheridx] = 0 for vehidx in [2,3,4,5,6,9,10]: processed_label[processed_label==vehidx] = 1 gt_occ_seq.append(processed_label) gt_occ_seq = torch.stack(gt_occ_seq) return gt_occ_seq def __call__(self, results): if self.use_fine_occ: results['gt_occ'] = self.get_seq_occ(results) else: results['gt_occ'] = self.get_seq_pseudo_occ(results) return results def voxel2world(self, voxel): """ voxel: [N, 3] """ return voxel * self.voxel_size[None, :] + self.pc_range[:3][None, :] def world2voxel(self, world): """ world: [N, 3] """ return (world - self.pc_range[:3][None, :]) / self.voxel_size[None, :] def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ repr_str += f'(to_float32={self.to_float32}' return repr_str def project_points(self, points, rots, trans, intrins, post_rots, post_trans): # from lidar to camera points = points.reshape(-1, 1, 3) points = points - trans.reshape(1, -1, 3) inv_rots = rots.inverse().unsqueeze(0) points = (inv_rots @ points.unsqueeze(-1)) # from camera to raw pixel points = (intrins.unsqueeze(0) @ points).squeeze(-1) points_d = points[..., 2:3] points_uv = points[..., :2] / points_d # from raw pixel to transformed pixel points_uv = post_rots[:, :2, :2].unsqueeze(0) @ points_uv.unsqueeze(-1) points_uv = points_uv.squeeze(-1) + post_trans[..., :2].unsqueeze(0) points_uvd = torch.cat((points_uv, points_d), dim=2) return points_uvd # b1:boolean, u1: uint8, i2: int16, u2: uint16 @nb.jit('b1[:](i2[:,:],u2[:,:],b1[:])', nopython=True, cache=True, parallel=False) def nb_process_img_points(basic_valid_occ, depth_canva, nb_valid_mask): # basic_valid_occ M 3 # depth_canva H W # label_size = M # for original occ, small: 2w mid: ~8w base: ~30w canva_idx = -1 * np.ones_like(depth_canva, dtype=np.int16) for i in range(basic_valid_occ.shape[0]): occ = basic_valid_occ[i] if occ[2] < depth_canva[occ[1], occ[0]]: if canva_idx[occ[1], occ[0]] != -1: nb_valid_mask[canva_idx[occ[1], occ[0]]] = False canva_idx[occ[1], occ[0]] = i 
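# z-buffer update: this point is now the closest hit for its pixel, so any point previously
# kept at the same pixel has just had its valid flag cleared via canva_idx; the depth buffer
# and the valid mask for the current index are refreshed below.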
depth_canva[occ[1], occ[0]] = occ[2] nb_valid_mask[i] = True return nb_valid_mask # u1: uint8, u8: uint16, i8: int64 @nb.jit('u1[:,:,:](u1[:,:,:],i8[:,:])', nopython=True, cache=True, parallel=False) def nb_process_label_withvel(processed_label, sorted_label_voxel_pair): label_size = 256 counter = np.zeros((label_size,), dtype=np.uint16) counter[sorted_label_voxel_pair[0, 3]] = 1 cur_sear_ind = sorted_label_voxel_pair[0, :3] for i in range(1, sorted_label_voxel_pair.shape[0]): cur_ind = sorted_label_voxel_pair[i, :3] if not np.all(np.equal(cur_ind, cur_sear_ind)): processed_label[cur_sear_ind[0], cur_sear_ind[1], cur_sear_ind[2]] = np.argmax(counter) counter = np.zeros((label_size,), dtype=np.uint16) cur_sear_ind = cur_ind counter[sorted_label_voxel_pair[i, 3]] += 1 processed_label[cur_sear_ind[0], cur_sear_ind[1], cur_sear_ind[2]] = np.argmax(counter) return processed_label # u1: uint8, u8: uint16, i8: int64 @nb.jit('u1[:,:,:](u1[:,:,:],i8[:,:])', nopython=True, cache=True, parallel=False) def nb_process_label(processed_label, sorted_label_voxel_pair): label_size = 256 counter = np.zeros((label_size,), dtype=np.uint16) counter[sorted_label_voxel_pair[0, 3]] = 1 cur_sear_ind = sorted_label_voxel_pair[0, :3] for i in range(1, sorted_label_voxel_pair.shape[0]): cur_ind = sorted_label_voxel_pair[i, :3] if not np.all(np.equal(cur_ind, cur_sear_ind)): processed_label[cur_sear_ind[0], cur_sear_ind[1], cur_sear_ind[2]] = np.argmax(counter) counter = np.zeros((label_size,), dtype=np.uint16) cur_sear_ind = cur_ind counter[sorted_label_voxel_pair[i, 3]] += 1 processed_label[cur_sear_ind[0], cur_sear_ind[1], cur_sear_ind[2]] = np.argmax(counter) return processed_label ================================================ FILE: projects/occ_plugin/datasets/pipelines/transform_3d.py ================================================ import numpy as np from numpy import random import mmcv from mmdet.datasets.builder import PIPELINES from mmcv.parallel import DataContainer as DC @PIPELINES.register_module() class PadMultiViewImage(object): """Pad the multi-view image. There are two padding modes: (1) pad to a fixed size and (2) pad to the minimum size that is divisible by some number. Added keys are "pad_shape", "pad_fixed_size", "pad_size_divisor", Args: size (tuple, optional): Fixed padding size. size_divisor (int, optional): The divisor of padded size. pad_val (float, optional): Padding value, 0 by default. """ def __init__(self, size=None, size_divisor=None, pad_val=0): self.size = size self.size_divisor = size_divisor self.pad_val = pad_val # only one of size and size_divisor should be valid assert size is not None or size_divisor is not None assert size is None or size_divisor is None def _pad_img(self, results): """Pad images according to ``self.size``.""" if self.size is not None: padded_img = [mmcv.impad( img, shape=self.size, pad_val=self.pad_val) for img in results['img']] elif self.size_divisor is not None: padded_img = [mmcv.impad_to_multiple( img, self.size_divisor, pad_val=self.pad_val) for img in results['img']] results['ori_shape'] = [img.shape for img in results['img']] results['img'] = padded_img results['img_shape'] = [img.shape for img in padded_img] results['pad_shape'] = [img.shape for img in padded_img] results['pad_fixed_size'] = self.size results['pad_size_divisor'] = self.size_divisor def __call__(self, results): """Call function to pad images, masks, semantic segmentation maps. Args: results (dict): Result dict from loading pipeline. Returns: dict: Updated result dict. 
""" self._pad_img(results) return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(size={self.size}, ' repr_str += f'size_divisor={self.size_divisor}, ' repr_str += f'pad_val={self.pad_val})' return repr_str @PIPELINES.register_module() class NormalizeMultiviewImage(object): """Normalize the image. Added key is "img_norm_cfg". Args: mean (sequence): Mean values of 3 channels. std (sequence): Std values of 3 channels. to_rgb (bool): Whether to convert the image from BGR to RGB, default is true. """ def __init__(self, mean, std, to_rgb=True): self.mean = np.array(mean, dtype=np.float32) self.std = np.array(std, dtype=np.float32) self.to_rgb = to_rgb def __call__(self, results): """Call function to normalize images. Args: results (dict): Result dict from loading pipeline. Returns: dict: Normalized results, 'img_norm_cfg' key is added into result dict. """ results['img'] = [mmcv.imnormalize(img, self.mean, self.std, self.to_rgb) for img in results['img']] results['img_norm_cfg'] = dict( mean=self.mean, std=self.std, to_rgb=self.to_rgb) return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})' return repr_str @PIPELINES.register_module() class PhotoMetricDistortionMultiViewImage: """Apply photometric distortion to image sequentially, every transformation is applied with a probability of 0.5. The position of random contrast is in second or second to last. 1. random brightness 2. random contrast (mode 0) 3. convert color from BGR to HSV 4. random saturation 5. random hue 6. convert color from HSV to BGR 7. random contrast (mode 1) 8. randomly swap channels Args: brightness_delta (int): delta of brightness. contrast_range (tuple): range of contrast. saturation_range (tuple): range of saturation. hue_delta (int): delta of hue. """ def __init__(self, brightness_delta=32, contrast_range=(0.5, 1.5), saturation_range=(0.5, 1.5), hue_delta=18): self.brightness_delta = brightness_delta self.contrast_lower, self.contrast_upper = contrast_range self.saturation_lower, self.saturation_upper = saturation_range self.hue_delta = hue_delta def __call__(self, results): """Call function to perform photometric distortion on images. Args: results (dict): Result dict from loading pipeline. Returns: dict: Result dict with images distorted. 
""" imgs = results['img'] new_imgs = [] for img in imgs: assert img.dtype == np.float32, \ 'PhotoMetricDistortion needs the input image of dtype np.float32,'\ ' please set "to_float32=True" in "LoadImageFromFile" pipeline' # random brightness if random.randint(2): delta = random.uniform(-self.brightness_delta, self.brightness_delta) img += delta # mode == 0 --> do random contrast first # mode == 1 --> do random contrast last mode = random.randint(2) if mode == 1: if random.randint(2): alpha = random.uniform(self.contrast_lower, self.contrast_upper) img *= alpha # convert color from BGR to HSV img = mmcv.bgr2hsv(img) # random saturation if random.randint(2): img[..., 1] *= random.uniform(self.saturation_lower, self.saturation_upper) # random hue if random.randint(2): img[..., 0] += random.uniform(-self.hue_delta, self.hue_delta) img[..., 0][img[..., 0] > 360] -= 360 img[..., 0][img[..., 0] < 0] += 360 # convert color from HSV to BGR img = mmcv.hsv2bgr(img) # random contrast if mode == 0: if random.randint(2): alpha = random.uniform(self.contrast_lower, self.contrast_upper) img *= alpha # randomly swap channels if random.randint(2): img = img[..., random.permutation(3)] new_imgs.append(img) results['img'] = new_imgs return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(\nbrightness_delta={self.brightness_delta},\n' repr_str += 'contrast_range=' repr_str += f'{(self.contrast_lower, self.contrast_upper)},\n' repr_str += 'saturation_range=' repr_str += f'{(self.saturation_lower, self.saturation_upper)},\n' repr_str += f'hue_delta={self.hue_delta})' return repr_str @PIPELINES.register_module() class CustomCollect3D(object): """Collect data from the loader relevant to the specific task. This is usually the last stage of the data loader pipeline. Typically keys is set to some subset of "img", "proposals", "gt_bboxes", "gt_bboxes_ignore", "gt_labels", and/or "gt_masks". The "img_meta" item is always populated. The contents of the "img_meta" dictionary depends on "meta_keys". By default this includes: - 'img_shape': shape of the image input to the network as a tuple \ (h, w, c). Note that images may be zero padded on the \ bottom/right if the batch tensor is larger than this shape. - 'scale_factor': a float indicating the preprocessing scale - 'flip': a boolean indicating if image flip transform was used - 'filename': path to the image file - 'ori_shape': original shape of the image as a tuple (h, w, c) - 'pad_shape': image shape after padding - 'lidar2img': transform from lidar to image - 'depth2img': transform from depth to image - 'cam2img': transform from camera to image - 'pcd_horizontal_flip': a boolean indicating if point cloud is \ flipped horizontally - 'pcd_vertical_flip': a boolean indicating if point cloud is \ flipped vertically - 'box_mode_3d': 3D box mode - 'box_type_3d': 3D box type - 'img_norm_cfg': a dict of normalization information: - mean: per channel mean subtraction - std: per channel std divisor - to_rgb: bool indicating if bgr was converted to rgb - 'pcd_trans': point cloud transformations - 'sample_idx': sample index - 'pcd_scale_factor': point cloud scale factor - 'pcd_rotation': rotation applied to point cloud - 'pts_filename': path to point cloud file. Args: keys (Sequence[str]): Keys of results to be collected in ``data``. meta_keys (Sequence[str], optional): Meta keys to be converted to ``mmcv.DataContainer`` and collected in ``data[img_metas]``. 
Default: ('filename', 'ori_shape', 'img_shape', 'lidar2img', 'depth2img', 'cam2img', 'pad_shape', 'scale_factor', 'flip', 'pcd_horizontal_flip', 'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d', 'img_norm_cfg', 'pcd_trans', 'sample_idx', 'pcd_scale_factor', 'pcd_rotation', 'pts_filename') """ def __init__(self, keys, meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img', 'depth2img', 'cam2img', 'pad_shape', 'scale_factor', 'flip', 'pcd_horizontal_flip', 'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d', 'img_norm_cfg', 'pcd_trans', 'sample_idx', 'prev_idx', 'next_idx', 'pcd_scale_factor', 'pcd_rotation', 'pts_filename', 'transformation_3d_flow', 'scene_token', 'can_bus' )): self.keys = keys self.meta_keys = meta_keys def __call__(self, results): """Call function to collect keys in results. The keys in ``meta_keys`` will be converted to :obj:`mmcv.DataContainer`. Args: results (dict): Result dict contains the data to collect. Returns: dict: The result dict contains the following keys - keys in ``self.keys`` - ``img_metas`` """ data = {} img_metas = {} for key in self.meta_keys: if key in results: img_metas[key] = results[key] data['img_metas'] = DC(img_metas, cpu_only=True) for key in self.keys: data[key] = results[key] return data def __repr__(self): """str: Return a string that describes the module.""" return self.__class__.__name__ + \ f'(keys={self.keys}, meta_keys={self.meta_keys})' @PIPELINES.register_module() class CustomOccCollect3D(object): """Collect data from the loader relevant to the specific task. This is usually the last stage of the data loader pipeline. Typically keys is set to some subset of "img", "proposals", "gt_bboxes", "gt_bboxes_ignore", "gt_labels", and/or "gt_masks". The "img_meta" item is always populated. The contents of the "img_meta" dictionary depends on "meta_keys". By default this includes: - 'img_shape': shape of the image input to the network as a tuple \ (h, w, c). Note that images may be zero padded on the \ bottom/right if the batch tensor is larger than this shape. - 'scale_factor': a float indicating the preprocessing scale - 'flip': a boolean indicating if image flip transform was used - 'filename': path to the image file - 'ori_shape': original shape of the image as a tuple (h, w, c) - 'pad_shape': image shape after padding - 'lidar2img': transform from lidar to image - 'depth2img': transform from depth to image - 'cam2img': transform from camera to image - 'pcd_horizontal_flip': a boolean indicating if point cloud is \ flipped horizontally - 'pcd_vertical_flip': a boolean indicating if point cloud is \ flipped vertically - 'box_mode_3d': 3D box mode - 'box_type_3d': 3D box type - 'img_norm_cfg': a dict of normalization information: - mean: per channel mean subtraction - std: per channel std divisor - to_rgb: bool indicating if bgr was converted to rgb - 'pcd_trans': point cloud transformations - 'sample_idx': sample index - 'pcd_scale_factor': point cloud scale factor - 'pcd_rotation': rotation applied to point cloud - 'pts_filename': path to point cloud file. Args: keys (Sequence[str]): Keys of results to be collected in ``data``. meta_keys (Sequence[str], optional): Meta keys to be converted to ``mmcv.DataContainer`` and collected in ``data[img_metas]``. 
Default: ('filename', 'ori_shape', 'img_shape', 'lidar2img', 'depth2img', 'cam2img', 'pad_shape', 'scale_factor', 'flip', 'pcd_horizontal_flip', 'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d', 'img_norm_cfg', 'pcd_trans', 'sample_idx', 'pcd_scale_factor', 'pcd_rotation', 'pts_filename') """ def __init__(self, keys, meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img', 'depth2img', 'cam2img', 'pad_shape', 'scale_factor', 'flip', 'pcd_horizontal_flip', 'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d', 'img_norm_cfg', 'pcd_trans', 'sample_idx', 'prev_idx', 'next_idx', 'pcd_scale_factor', 'pcd_rotation', 'pts_filename', 'transformation_3d_flow', 'scene_token', 'can_bus', 'pc_range', 'occ_size', 'lidar_token' )): self.keys = keys self.meta_keys = meta_keys def __call__(self, results): """Call function to collect keys in results. The keys in ``meta_keys`` will be converted to :obj:`mmcv.DataContainer`. Args: results (dict): Result dict contains the data to collect. Returns: dict: The result dict contains the following keys - keys in ``self.keys`` - ``img_metas`` """ data = {} img_metas = {} for key in self.meta_keys: if key in results: img_metas[key] = results[key] data['img_metas'] = DC(img_metas, cpu_only=True) for key in self.keys: if key in results.keys(): data[key] = results[key] print("self.keys", self.keys) # if 'gt_occ' in results.keys(): # data['gt_occ'] = results['gt_occ'] return data def __repr__(self): """str: Return a string that describes the module.""" return self.__class__.__name__ + \ f'(keys={self.keys}, meta_keys={self.meta_keys})' @PIPELINES.register_module() class RandomScaleImageMultiViewImage(object): """Random scale the image Args: scales """ def __init__(self, scales=[]): self.scales = scales assert len(self.scales)==1 def __call__(self, results): """Call function to pad images, masks, semantic segmentation maps. Args: results (dict): Result dict from loading pipeline. Returns: dict: Updated result dict. 
""" rand_ind = np.random.permutation(range(len(self.scales)))[0] rand_scale = self.scales[rand_ind] y_size = [int(img.shape[0] * rand_scale) for img in results['img']] x_size = [int(img.shape[1] * rand_scale) for img in results['img']] scale_factor = np.eye(4) scale_factor[0, 0] *= rand_scale scale_factor[1, 1] *= rand_scale results['img'] = [mmcv.imresize(img, (x_size[idx], y_size[idx]), return_scale=False) for idx, img in enumerate(results['img'])] lidar2img = [scale_factor @ l2i for l2i in results['lidar2img']] results['lidar2img'] = lidar2img results['img_shape'] = [img.shape for img in results['img']] results['ori_shape'] = [img.shape for img in results['img']] return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(size={self.scales}, ' return repr_str ================================================ FILE: projects/occ_plugin/datasets/samplers/__init__.py ================================================ from .group_sampler import DistributedGroupSampler from .distributed_sampler import DistributedSampler from .sampler import SAMPLER, build_sampler ================================================ FILE: projects/occ_plugin/datasets/samplers/distributed_sampler.py ================================================ import math import torch from torch.utils.data import DistributedSampler as _DistributedSampler from .sampler import SAMPLER @SAMPLER.register_module() class DistributedSampler(_DistributedSampler): def __init__(self, dataset=None, num_replicas=None, rank=None, shuffle=True, seed=0): super().__init__( dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle) # for the compatibility from PyTorch 1.3+ self.seed = seed if seed is not None else 0 def __iter__(self): # deterministically shuffle based on epoch if self.shuffle: assert False else: indices = torch.arange(len(self.dataset)).tolist() # add extra samples to make it evenly divisible # in case that indices is shorter than half of total_size indices = (indices * math.ceil(self.total_size / len(indices)))[:self.total_size] assert len(indices) == self.total_size # subsample per_replicas = self.total_size//self.num_replicas # indices = indices[self.rank:self.total_size:self.num_replicas] indices = indices[self.rank*per_replicas:(self.rank+1)*per_replicas] assert len(indices) == self.num_samples return iter(indices) ================================================ FILE: projects/occ_plugin/datasets/samplers/group_sampler.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import math import numpy as np import torch from mmcv.runner import get_dist_info from torch.utils.data import Sampler from .sampler import SAMPLER import random from IPython import embed @SAMPLER.register_module() class DistributedGroupSampler(Sampler): """Sampler that restricts data loading to a subset of the dataset. It is especially useful in conjunction with :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each process can pass a DistributedSampler instance as a DataLoader sampler, and load a subset of the original dataset that is exclusive to it. .. note:: Dataset is assumed to be of constant size. Arguments: dataset: Dataset used for sampling. num_replicas (optional): Number of processes participating in distributed training. rank (optional): Rank of the current process within num_replicas. seed (int, optional): random seed used to shuffle the sampler if ``shuffle=True``. This number should be identical across all processes in the distributed group. 
Default: 0. """ def __init__(self, dataset, samples_per_gpu=1, num_replicas=None, rank=None, seed=0): _rank, _num_replicas = get_dist_info() if num_replicas is None: num_replicas = _num_replicas if rank is None: rank = _rank self.dataset = dataset self.samples_per_gpu = samples_per_gpu self.num_replicas = num_replicas self.rank = rank self.epoch = 0 self.seed = seed if seed is not None else 0 assert hasattr(self.dataset, 'flag') self.flag = self.dataset.flag self.group_sizes = np.bincount(self.flag) self.num_samples = 0 for i, j in enumerate(self.group_sizes): self.num_samples += int( math.ceil(self.group_sizes[i] * 1.0 / self.samples_per_gpu / self.num_replicas)) * self.samples_per_gpu self.total_size = self.num_samples * self.num_replicas def __iter__(self): # deterministically shuffle based on epoch g = torch.Generator() g.manual_seed(self.epoch + self.seed) indices = [] for i, size in enumerate(self.group_sizes): if size > 0: indice = np.where(self.flag == i)[0] assert len(indice) == size # add .numpy() to avoid bug when selecting indice in parrots. # TODO: check whether torch.randperm() can be replaced by # numpy.random.permutation(). indice = indice[list( torch.randperm(int(size), generator=g).numpy())].tolist() extra = int( math.ceil( size * 1.0 / self.samples_per_gpu / self.num_replicas) ) * self.samples_per_gpu * self.num_replicas - len(indice) # pad indice tmp = indice.copy() for _ in range(extra // size): indice.extend(tmp) indice.extend(tmp[:extra % size]) indices.extend(indice) assert len(indices) == self.total_size indices = [ indices[j] for i in list( torch.randperm( len(indices) // self.samples_per_gpu, generator=g)) for j in range(i * self.samples_per_gpu, (i + 1) * self.samples_per_gpu) ] # subsample offset = self.num_samples * self.rank indices = indices[offset:offset + self.num_samples] assert len(indices) == self.num_samples return iter(indices) def __len__(self): return self.num_samples def set_epoch(self, epoch): self.epoch = epoch ================================================ FILE: projects/occ_plugin/datasets/samplers/sampler.py ================================================ from mmcv.utils.registry import Registry, build_from_cfg SAMPLER = Registry('sampler') def build_sampler(cfg, default_args): return build_from_cfg(cfg, SAMPLER, default_args) ================================================ FILE: projects/occ_plugin/occupancy/__init__.py ================================================ from .dense_heads import * from .detectors import * from .backbones import * from .image2bev import * from .voxel_encoder import * from .necks import * from .fuser import * ================================================ FILE: projects/occ_plugin/occupancy/apis/__init__.py ================================================ from .train import custom_train_model from .mmdet_train import custom_train_detector # from .test import custom_multi_gpu_test ================================================ FILE: projects/occ_plugin/occupancy/apis/mmdet_train.py ================================================ # --------------------------------------------- # Copyright (c) OpenMMLab. All rights reserved. 
# --------------------------------------------- # Modified by Junyi Ma, following OpenOccupancy of Xiaofeng Wang # --------------------------------------------- import random import warnings import numpy as np import torch import torch.distributed as dist from mmcv.parallel import MMDataParallel, MMDistributedDataParallel from mmcv.runner import (HOOKS, DistSamplerSeedHook, EpochBasedRunner, Fp16OptimizerHook, OptimizerHook, build_optimizer, build_runner, get_dist_info) from mmcv.utils import build_from_cfg from mmdet.core import EvalHook from mmdet.datasets import (build_dataset, replace_ImageToTensor) from mmdet.utils import get_root_logger import time import os.path as osp from projects.occ_plugin.datasets.builder import build_dataloader from projects.occ_plugin.core.evaluation.eval_hooks import OccDistEvalHook, OccEvalHook from projects.occ_plugin.datasets import custom_build_dataset def custom_train_detector(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): logger = get_root_logger(cfg.log_level) dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader( ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, # cfg.gpus will be ignored if distributed len(cfg.gpu_ids), dist=distributed, seed=cfg.seed, shuffler_sampler=cfg.data.shuffler_sampler, nonshuffler_sampler=cfg.data.nonshuffler_sampler, ) for ds in dataset ] # torch.distributed.init_process_group(backend='nccl') if distributed: find_unused_parameters = cfg.get('find_unused_parameters', False) model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: model = MMDataParallel( model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) # build runner optimizer = build_optimizer(model, cfg.optimizer) assert 'runner' in cfg runner = build_runner( cfg.runner, default_args=dict( model=model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta)) # an ugly workaround to make .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting TODO fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook( **cfg.optimizer_config, **fp16_cfg, distributed=distributed) elif distributed and 'type' not in cfg.optimizer_config: optimizer_config = OptimizerHook(**cfg.optimizer_config) else: optimizer_config = cfg.optimizer_config # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None)) if distributed: if isinstance(runner, EpochBasedRunner): runner.register_hook(DistSamplerSeedHook()) rank, world_size = get_dist_info() if cfg.resume_from: if rank == 0: print("-------------") print("resume from " + cfg.resume_from) print("-------------") runner.resume(cfg.resume_from) elif cfg.load_from: if rank == 0: print("-------------") print("load from " + cfg.load_from) print("-------------") runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow) ================================================ FILE: projects/occ_plugin/occupancy/apis/test.py ================================================ # --------------------------------------------- # Copyright (c) OpenMMLab. All rights reserved. 
# --------------------------------------------- # Modified by Zhiqi Li # --------------------------------------------- import os.path as osp import pickle import shutil import tempfile import time import mmcv import torch import torch.distributed as dist from mmcv.image import tensor2imgs from mmcv.runner import get_dist_info from mmdet.utils import get_root_logger from mmdet.core import encode_mask_results import numpy as np import pycocotools.mask as mask_util from fvcore.nn import FlopCountAnalysis, parameter_count_table def custom_encode_mask_results(mask_results): """Encode bitmap mask to RLE code. Semantic Masks only Args: mask_results (list | tuple[list]): bitmap mask results. In mask scoring rcnn, mask_results is a tuple of (segm_results, segm_cls_score). Returns: list | tuple: RLE encoded mask. """ cls_segms = mask_results num_classes = len(cls_segms) encoded_mask_results = [] for i in range(len(cls_segms)): encoded_mask_results.append( mask_util.encode( np.array( cls_segms[i][:, :, np.newaxis], order='F', dtype='uint8'))[0]) # encoded with RLE return [encoded_mask_results] def custom_single_gpu_test(model, data_loader, show=False, out_dir=None, show_score_thr=0.3): model.eval() iou_metric = 0 vpq_metric = 0 dataset = data_loader.dataset prog_bar = mmcv.ProgressBar(len(dataset)) logger = get_root_logger() logger.info(parameter_count_table(model)) for i, data in enumerate(data_loader): with torch.no_grad(): result = model(return_loss=False, rescale=True, **data) if 'hist_for_iou' in result.keys(): iou_metric += result['hist_for_iou'] vpq_metric += result['vpq'] prog_bar.update() res = { 'hist_for_iou': iou_metric, 'vpq_len': len(dataset), 'vpq_metric': vpq_metric, } return res def custom_multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False, show=False, out_dir=None): """Test model with multiple gpus. This method tests model with multiple gpus and collects the results under two different modes: gpu and cpu modes. By setting 'gpu_collect=True' it encodes results to gpu tensors and use gpu communication for results collection. On cpu mode it saves the results on different gpus to 'tmpdir' and collects them by the rank 0 worker. Args: model (nn.Module): Model to be tested. data_loader (nn.Dataloader): Pytorch data loader. tmpdir (str): Path of directory to save the temporary results from different gpus under cpu mode. gpu_collect (bool): Option to use either gpu or cpu to collect results. Returns: list: The prediction results. """ model.eval() # init predictions iou_metric = [] vpq_metric = [] dataset = data_loader.dataset rank, world_size = get_dist_info() if rank == 0: prog_bar = mmcv.ProgressBar(len(dataset)) time.sleep(2) # This line can prevent deadlock problem in some cases. 
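# Each rank below accumulates its own IoU histogram and VPQ sum over its shard of the dataset;
# after the loop the per-rank partials are reduced to a single element and gathered on rank 0
# by collect_results_cpu through a shared temporary directory (or the given tmpdir).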
logger = get_root_logger() logger.info(parameter_count_table(model)) for i, data in enumerate(data_loader): with torch.no_grad(): result = model(return_loss=False, rescale=True, **data) if 'hist_for_iou' in result.keys(): iou_metric.append(result['hist_for_iou']) if 'vpq' in result.keys(): vpq_metric.append(result['vpq']) batch_size = 1 if rank == 0: for _ in range(batch_size * world_size): prog_bar.update() # collect lists from multi-GPUs res = {} if 'hist_for_iou' in result.keys(): iou_metric = [sum(iou_metric)] iou_metric = collect_results_cpu(iou_metric, len(dataset), tmpdir) res['hist_for_iou'] = iou_metric if 'vpq' in result.keys(): res['vpq_len'] = len(dataset) vpq_metric = [sum(vpq_metric)] vpq_metric = collect_results_cpu(vpq_metric, len(dataset), tmpdir) res['vpq_metric'] = vpq_metric return res def collect_results_cpu(result_part, size, tmpdir=None, type='list'): rank, world_size = get_dist_info() # create a tmp dir if it is not specified if tmpdir is None: MAX_LEN = 512 # 32 is whitespace dir_tensor = torch.full((MAX_LEN,), 32, dtype=torch.uint8, device='cuda') if rank == 0: mmcv.mkdir_or_exist('.dist_test') tmpdir = tempfile.mkdtemp(dir='.dist_test') tmpdir = torch.tensor( bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda') dir_tensor[:len(tmpdir)] = tmpdir dist.broadcast(dir_tensor, 0) tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() else: mmcv.mkdir_or_exist(tmpdir) # dump the part result to the dir mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl')) dist.barrier() # collect all parts if rank == 0: # load results of all parts from tmp dir part_list = [] for i in range(world_size): part_file = osp.join(tmpdir, f'part_{i}.pkl') part_list.append(mmcv.load(part_file)) # sort the results if type == 'list': ordered_results = [] for res in part_list: ordered_results.extend(list(res)) # the dataloader may pad some samples ordered_results = ordered_results[:size] else: raise NotImplementedError # remove tmp dir shutil.rmtree(tmpdir) dist.barrier() if rank != 0: return None return ordered_results ================================================ FILE: projects/occ_plugin/occupancy/apis/train.py ================================================ from .mmdet_train import custom_train_detector from mmseg.apis import train_segmentor from mmdet.apis import train_detector def custom_train_model(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): """A function wrapper for launching model training according to cfg. Because we need different eval_hook in runner. Should be deprecated in the future. """ if cfg.model.type in ['EncoderDecoder3D']: assert False else: custom_train_detector( model, dataset, cfg, distributed=distributed, validate=validate, timestamp=timestamp, meta=meta) def train_model(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): """A function wrapper for launching model training according to cfg. Because we need different eval_hook in runner. Should be deprecated in the future. 
""" if cfg.model.type in ['EncoderDecoder3D']: train_segmentor( model, dataset, cfg, distributed=distributed, validate=validate, timestamp=timestamp, meta=meta) else: train_detector( model, dataset, cfg, distributed=distributed, validate=validate, timestamp=timestamp, meta=meta) ================================================ FILE: projects/occ_plugin/occupancy/backbones/__init__.py ================================================ from .resnet3d import CustomResNet3D from .pred_block import Predictor ================================================ FILE: projects/occ_plugin/occupancy/backbones/pred_block.py ================================================ # Developed by Junyi Ma # Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications # https://github.com/haomo-ai/Cam4DOcc import copy import numpy as np import torch import torch.nn as nn import torch.nn.functional as F from mmdet3d.models.builder import BACKBONES from collections import OrderedDict from mmcv.cnn import build_norm_layer class Residual(nn.Module): def __init__( self, in_channels, out_channels, kernel_size=(3,3,1), dilation=1, norm_cfg=None ): super().__init__() out_channels = out_channels or in_channels # padding_size = ((kernel_size - 1) * dilation + 1) // 2 padding_size = [0,0,0] if dilation!=0: padding_size[0] = ((kernel_size[0] - 1) * dilation + 1) // 2 padding_size[1] = ((kernel_size[1] - 1) * dilation + 1) // 2 padding_size[2] = ((kernel_size[2] - 1) * dilation + 1) // 2 padding_size = tuple(padding_size) conv = nn.Conv3d(in_channels, out_channels, kernel_size=kernel_size, bias=False, dilation=dilation, padding=padding_size) self.layers = nn.Sequential(conv, build_norm_layer(norm_cfg, out_channels)[1], nn.LeakyReLU(inplace=True)) if out_channels == in_channels : self.projection = None else: projection = OrderedDict() projection.update( { 'conv_skip_proj': nn.Conv3d(in_channels, out_channels, kernel_size=1, bias=False), 'bn_skip_proj': build_norm_layer(norm_cfg, out_channels)[1], } ) self.projection = nn.Sequential(projection) def forward(self, x): x_residual = self.layers(x) if self.projection is not None: x_projected = self.projection(x) return x_residual + x_projected return x_residual + x @BACKBONES.register_module() class Predictor(nn.Module): def __init__( self, n_input_channels=None, in_timesteps=None, out_timesteps=None, norm_cfg=None, ): super(Predictor, self).__init__() self.predictor = nn.ModuleList() for nf in n_input_channels: self.predictor.append(nn.Sequential( Residual(nf * in_timesteps, nf * in_timesteps, norm_cfg=norm_cfg), Residual(nf * in_timesteps, nf * in_timesteps, norm_cfg=norm_cfg), Residual(nf * in_timesteps, nf * out_timesteps, norm_cfg=norm_cfg), Residual(nf * out_timesteps, nf * out_timesteps, norm_cfg=norm_cfg), Residual(nf * out_timesteps, nf * out_timesteps, norm_cfg=norm_cfg), )) def forward(self, x): assert len(x) == len(self.predictor), f'The number of input feature tensors ({len(x)}) must be the same as the number of STPredictor blocks {len(self.predictor)}.' 
y = [] for i in range(len(x)): b, c, _, _, _ = x[i].shape y.append(self.predictor[i](x[i])) return y ================================================ FILE: projects/occ_plugin/occupancy/backbones/resnet3d.py ================================================ import math from functools import partial from mmdet3d.models.builder import BACKBONES from mmcv.cnn import build_conv_layer, build_norm_layer, build_upsample_layer from mmcv.runner import BaseModule import torch import torch.nn as nn import torch.nn.functional as F import pdb def get_inplanes(): return [64, 128, 256, 512] def conv3x3x3(in_planes, out_planes, stride=1): return nn.Conv3d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False) def conv1x1x1(in_planes, out_planes, stride=1): return nn.Conv3d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) class BasicBlock(nn.Module): expansion = 1 def __init__(self, in_planes, planes, stride=1, downsample=None, norm_cfg=None): super().__init__() self.conv1 = conv3x3x3(in_planes, planes, stride) self.bn1 = build_norm_layer(norm_cfg, planes)[1] self.relu = nn.ReLU(inplace=True) self.conv2 = conv3x3x3(planes, planes) self.bn2 = build_norm_layer(norm_cfg, planes)[1] self.downsample = downsample self.stride = stride def forward(self, x): residual = x out = self.conv1(x) out = self.bn1(out) out = self.relu(out) out = self.conv2(out) out = self.bn2(out) if self.downsample is not None: residual = self.downsample(x) out += residual out = self.relu(out) return out class Bottleneck(nn.Module): expansion = 4 def __init__(self, in_planes, planes, stride=1, downsample=None, norm_cfg=None): super().__init__() self.conv1 = conv1x1x1(in_planes, planes) self.bn1 = build_norm_layer(norm_cfg, planes)[1] self.conv2 = conv3x3x3(planes, planes, stride) self.bn2 = build_norm_layer(norm_cfg, planes)[1] self.conv3 = conv1x1x1(planes, planes * self.expansion) self.bn3 = build_norm_layer(norm_cfg, planes * self.expansion)[1] self.relu = nn.ReLU(inplace=True) self.downsample = downsample self.stride = stride def forward(self, x): residual = x out = self.conv1(x) out = self.bn1(out) out = self.relu(out) out = self.conv2(out) out = self.bn2(out) out = self.relu(out) out = self.conv3(out) out = self.bn3(out) if self.downsample is not None: residual = self.downsample(x) out += residual out = self.relu(out) return out @BACKBONES.register_module() class CustomResNet3D(BaseModule): def __init__(self, depth, block_inplanes=[64, 128, 256, 512], block_strides=[1, 2, 2, 2], out_indices=(0, 1, 2, 3), n_input_channels=3, shortcut_type='B', norm_cfg=dict(type='BN3d', requires_grad=True), widen_factor=1.0): super().__init__() layer_metas = { 10: [1, 1, 1, 1], 18: [2, 2, 2, 2], 34: [3, 4, 6, 3], 50: [3, 4, 6, 3], 101: [3, 4, 23, 3], } if depth in [10, 18, 34]: block = BasicBlock else: assert depth in [50, 101] block = Bottleneck layers = layer_metas[depth] block_inplanes = [int(x * widen_factor) for x in block_inplanes] self.in_planes = block_inplanes[0] self.out_indices = out_indices # replace the first several downsampling layers with the channel-squeeze layers self.input_proj = nn.Sequential( nn.Conv3d(n_input_channels, self.in_planes, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False), build_norm_layer(norm_cfg, self.in_planes)[1], nn.ReLU(inplace=True), ) self.layers = nn.ModuleList() for i in range(len(block_inplanes)): self.layers.append(self._make_layer(block, block_inplanes[i], layers[i], shortcut_type, block_strides[i], norm_cfg=norm_cfg)) for m in self.modules(): if isinstance(m, 
nn.Conv3d): nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') elif isinstance(m, nn.BatchNorm3d): nn.init.constant_(m.weight, 1) nn.init.constant_(m.bias, 0) def _downsample_basic_block(self, x, planes, stride): out = F.avg_pool3d(x, kernel_size=1, stride=stride) zero_pads = torch.zeros(out.size(0), planes - out.size(1), out.size(2), out.size(3), out.size(4)) if isinstance(out.data, torch.cuda.FloatTensor): zero_pads = zero_pads.cuda() out = torch.cat([out.data, zero_pads], dim=1) return out def _make_layer(self, block, planes, blocks, shortcut_type, stride=1, norm_cfg=None): downsample = None if stride != 1 or self.in_planes != planes * block.expansion: if shortcut_type == 'A': downsample = partial(self._downsample_basic_block, planes=planes * block.expansion, stride=stride) else: downsample = nn.Sequential( conv1x1x1(self.in_planes, planes * block.expansion, stride), build_norm_layer(norm_cfg, planes * block.expansion)[1]) layers = [] layers.append( block(in_planes=self.in_planes, planes=planes, stride=stride, downsample=downsample, norm_cfg=norm_cfg)) self.in_planes = planes * block.expansion for i in range(1, blocks): layers.append(block(self.in_planes, planes, norm_cfg=norm_cfg)) return nn.Sequential(*layers) def forward(self, x): x = self.input_proj(x) res = [] for index, layer in enumerate(self.layers): x = layer(x) if index in self.out_indices: res.append(x) return res def generate_model(model_depth, **kwargs): assert model_depth in [10, 18, 34, 50, 101, 152, 200] if model_depth == 10: model = ResNet(BasicBlock, [1, 1, 1, 1], get_inplanes(), **kwargs) elif model_depth == 18: model = ResNet(BasicBlock, [2, 2, 2, 2], get_inplanes(), **kwargs) elif model_depth == 34: model = ResNet(BasicBlock, [3, 4, 6, 3], get_inplanes(), **kwargs) elif model_depth == 50: model = ResNet(Bottleneck, [3, 4, 6, 3], get_inplanes(), **kwargs) elif model_depth == 101: model = ResNet(Bottleneck, [3, 4, 23, 3], get_inplanes(), **kwargs) elif model_depth == 152: model = ResNet(Bottleneck, [3, 8, 36, 3], get_inplanes(), **kwargs) elif model_depth == 200: model = ResNet(Bottleneck, [3, 24, 36, 3], get_inplanes(), **kwargs) return model ================================================ FILE: projects/occ_plugin/occupancy/dense_heads/__init__.py ================================================ from .occ_head import OccHead from .flow_head import FlowHead ================================================ FILE: projects/occ_plugin/occupancy/dense_heads/flow_head.py ================================================ # Developed by Junyi Ma # Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications # https://github.com/haomo-ai/Cam4DOcc import copy import numpy as np import torch import torch.nn as nn import torch.nn.functional as F from mmdet.core import reduce_mean from mmdet.models import HEADS from mmcv.cnn import build_conv_layer, build_norm_layer from .lovasz_softmax import lovasz_softmax from projects.occ_plugin.utils.nusc_param import nusc_class_names from projects.occ_plugin.utils.semkitti import Smooth_L1_loss @HEADS.register_module() class FlowHead(nn.Module): def __init__( self, in_channels, out_channel, num_level=1, num_img_level=1, soft_weights=False, loss_weight_cfg=None, conv_cfg=dict(type='Conv3d', bias=False), norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), fine_topk=20000, point_cloud_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0], final_occ_size=[256, 256, 20], empty_idx=0, visible_loss=False, balance_cls_weight=True, 
train_cfg=None, test_cfg=None, ): super(FlowHead, self).__init__() if type(in_channels) is not list: in_channels = [in_channels] self.in_channels = in_channels self.out_channel = out_channel self.num_level = num_level self.fine_topk = fine_topk self.point_cloud_range = torch.tensor(np.array(point_cloud_range)).float() self.final_occ_size = final_occ_size self.visible_loss = visible_loss # voxel-level prediction self.occ_convs = nn.ModuleList() for i in range(self.num_level): mid_channel = self.in_channels[i] occ_conv = nn.Sequential( build_conv_layer(conv_cfg, in_channels=self.in_channels[i], out_channels=mid_channel, kernel_size=3, stride=1, padding=1), build_norm_layer(norm_cfg, mid_channel)[1], nn.ReLU(inplace=True)) self.occ_convs.append(occ_conv) self.occ_pred_conv = nn.Sequential( build_conv_layer(conv_cfg, in_channels=mid_channel, out_channels=mid_channel//2, kernel_size=1, stride=1, padding=0), build_norm_layer(norm_cfg, mid_channel//2)[1], nn.ReLU(inplace=True),) self.last_conv = build_conv_layer(conv_cfg, in_channels=mid_channel//2, out_channels=out_channel, kernel_size=1, stride=1, padding=0) self.last_conv.bias = nn.parameter.Parameter(torch.tensor([0.0, 0.0, 0.0], requires_grad=True)) self.soft_weights = soft_weights self.num_img_level = num_img_level self.num_point_sampling_feat = self.num_level if self.soft_weights: soft_in_channel = mid_channel self.voxel_soft_weights = nn.Sequential( build_conv_layer(conv_cfg, in_channels=soft_in_channel, out_channels=soft_in_channel//2, kernel_size=1, stride=1, padding=0), build_norm_layer(norm_cfg, soft_in_channel//2)[1], nn.ReLU(inplace=True), build_conv_layer(conv_cfg, in_channels=soft_in_channel//2, out_channels=self.num_point_sampling_feat, kernel_size=1, stride=1, padding=0)) self.class_names = nusc_class_names self.empty_idx = empty_idx def forward_coarse_voxel(self, voxel_feats): output_occs = [] output = {} for feats, occ_conv in zip(voxel_feats, self.occ_convs): output_occs.append(occ_conv(feats)) if self.soft_weights: voxel_soft_weights = self.voxel_soft_weights(output_occs[0]) voxel_soft_weights = torch.softmax(voxel_soft_weights, dim=1) else: voxel_soft_weights = torch.ones([output_occs[0].shape[0], self.num_point_sampling_feat, 1, 1, 1],).to(output_occs[0].device) / self.num_point_sampling_feat out_voxel_feats = 0 _, _, H, W, D= output_occs[0].shape for feats, weights in zip(output_occs, torch.unbind(voxel_soft_weights, dim=1)): feats = F.interpolate(feats, size=[H, W, D], mode='trilinear', align_corners=False).contiguous() out_voxel_feats += feats * weights.unsqueeze(1) output['out_voxel_feats'] = [out_voxel_feats] out_voxel = self.occ_pred_conv(out_voxel_feats) out_voxel = self.last_conv(out_voxel) output['occ'] = [out_voxel] return output def forward(self, voxel_feats, img_feats=None, transform=None, **kwargs): assert type(voxel_feats) is list and len(voxel_feats) == self.num_level # forward voxel output = self.forward_coarse_voxel(voxel_feats) res = { 'output_voxels': output['occ'], } return res def loss_voxel(self, output_voxels, target_voxels, tag): B, C, H, W, D = output_voxels.shape tB, tC, tF, tH, tW, tD = target_voxels.shape target_voxels = target_voxels.view(tB*tC, tF, tH, tW, tD) assert torch.isnan(output_voxels).sum().item() == 0 output_voxels = output_voxels.permute(0,2,3,4,1) target_voxels = target_voxels.permute(0,2,3,4,1) loss_dict = {} loss_dict['loss_flow_l1_{}'.format(tag)] = (0.5) * (0.1) * Smooth_L1_loss(output_voxels, target_voxels, ignore_index=255) return loss_dict def loss_point(self, fine_coord, 
fine_output, target_voxels, tag): selected_gt = target_voxels[:, fine_coord[0,:], fine_coord[1,:], fine_coord[2,:]].long()[0] assert torch.isnan(selected_gt).sum().item() == 0, torch.isnan(selected_gt).sum().item() assert torch.isnan(fine_output).sum().item() == 0, torch.isnan(fine_output).sum().item() loss_dict = {} # igore 255 = ignore noise. we keep the loss bascward for the label=0 (free voxels) loss_dict['loss_voxel_ce_{}'.format(tag)] = self.loss_voxel_ce_weight * CE_ssc_loss(fine_output, selected_gt, ignore_index=255) loss_dict['loss_voxel_sem_scal_{}'.format(tag)] = self.loss_voxel_sem_scal_weight * sem_scal_loss(fine_output, selected_gt, ignore_index=255) loss_dict['loss_voxel_geo_scal_{}'.format(tag)] = self.loss_voxel_geo_scal_weight * geo_scal_loss(fine_output, selected_gt, ignore_index=255, non_empty_idx=self.empty_idx) loss_dict['loss_voxel_lovasz_{}'.format(tag)] = self.loss_voxel_lovasz_weight * lovasz_softmax(torch.softmax(fine_output, dim=1), selected_gt, ignore=255) return loss_dict def loss(self, output_voxels=None, output_coords_fine=None, output_voxels_fine=None, target_voxels=None, **kwargs): loss_dict = {} for index, output_voxel in enumerate(output_voxels): loss_dict.update(self.loss_voxel(output_voxel, target_voxels, tag='c_{}'.format(index))) return loss_dict ================================================ FILE: projects/occ_plugin/occupancy/dense_heads/lovasz_softmax.py ================================================ # -*- coding:utf-8 -*- # author: Xinge """ Lovasz-Softmax and Jaccard hinge loss in PyTorch Maxim Berman 2018 ESAT-PSI KU Leuven (MIT License) """ from __future__ import print_function, division import torch from torch.autograd import Variable import torch.nn.functional as F import numpy as np try: from itertools import ifilterfalse except ImportError: # py3k from itertools import filterfalse as ifilterfalse def lovasz_grad(gt_sorted): """ Computes gradient of the Lovasz extension w.r.t sorted errors See Alg. 1 in paper """ p = len(gt_sorted) gts = gt_sorted.sum() intersection = gts - gt_sorted.float().cumsum(0) union = gts + (1 - gt_sorted).float().cumsum(0) jaccard = 1. 
- intersection / union if p > 1: # cover 1-pixel case jaccard[1:p] = jaccard[1:p] - jaccard[0:-1] return jaccard def iou_binary(preds, labels, EMPTY=1., ignore=None, per_image=True): """ IoU for foreground class binary: 1 foreground, 0 background """ if not per_image: preds, labels = (preds,), (labels,) ious = [] for pred, label in zip(preds, labels): intersection = ((label == 1) & (pred == 1)).sum() union = ((label == 1) | ((pred == 1) & (label != ignore))).sum() if not union: iou = EMPTY else: iou = float(intersection) / float(union) ious.append(iou) iou = mean(ious) # mean accross images if per_image return 100 * iou def iou(preds, labels, C, EMPTY=1., ignore=None, per_image=False): """ Array of IoU for each (non ignored) class """ if not per_image: preds, labels = (preds,), (labels,) ious = [] for pred, label in zip(preds, labels): iou = [] for i in range(C): if i != ignore: # The ignored label is sometimes among predicted classes (ENet - CityScapes) intersection = ((label == i) & (pred == i)).sum() union = ((label == i) | ((pred == i) & (label != ignore))).sum() if not union: iou.append(EMPTY) else: iou.append(float(intersection) / float(union)) ious.append(iou) ious = [mean(iou) for iou in zip(*ious)] # mean accross images if per_image return 100 * np.array(ious) # --------------------------- BINARY LOSSES --------------------------- def lovasz_hinge(logits, labels, per_image=True, ignore=None): """ Binary Lovasz hinge loss logits: [B, H, W] Variable, logits at each pixel (between -\infty and +\infty) labels: [B, H, W] Tensor, binary ground truth masks (0 or 1) per_image: compute the loss per image instead of per batch ignore: void class id """ if per_image: loss = mean(lovasz_hinge_flat(*flatten_binary_scores(log.unsqueeze(0), lab.unsqueeze(0), ignore)) for log, lab in zip(logits, labels)) else: loss = lovasz_hinge_flat(*flatten_binary_scores(logits, labels, ignore)) return loss def lovasz_hinge_flat(logits, labels): """ Binary Lovasz hinge loss logits: [P] Variable, logits at each prediction (between -\infty and +\infty) labels: [P] Tensor, binary ground truth labels (0 or 1) ignore: label to ignore """ if len(labels) == 0: # only void pixels, the gradients should be 0 return logits.sum() * 0. signs = 2. * labels.float() - 1. errors = (1. 
- logits * Variable(signs)) errors_sorted, perm = torch.sort(errors, dim=0, descending=True) perm = perm.data gt_sorted = labels[perm] grad = lovasz_grad(gt_sorted) loss = torch.dot(F.relu(errors_sorted), Variable(grad)) return loss def flatten_binary_scores(scores, labels, ignore=None): """ Flattens predictions in the batch (binary case) Remove labels equal to 'ignore' """ scores = scores.view(-1) labels = labels.view(-1) if ignore is None: return scores, labels valid = (labels != ignore) vscores = scores[valid] vlabels = labels[valid] return vscores, vlabels class StableBCELoss(torch.nn.modules.Module): def __init__(self): super(StableBCELoss, self).__init__() def forward(self, input, target): neg_abs = - input.abs() loss = input.clamp(min=0) - input * target + (1 + neg_abs.exp()).log() return loss.mean() def binary_xloss(logits, labels, ignore=None): """ Binary Cross entropy loss logits: [B, H, W] Variable, logits at each pixel (between -\infty and +\infty) labels: [B, H, W] Tensor, binary ground truth masks (0 or 1) ignore: void class id """ logits, labels = flatten_binary_scores(logits, labels, ignore) loss = StableBCELoss()(logits, Variable(labels.float())) return loss # --------------------------- MULTICLASS LOSSES --------------------------- def lovasz_softmax(probas, labels, classes='present', per_image=False, ignore=None): """ Multi-class Lovasz-Softmax loss probas: [B, C, H, W] Variable, class probabilities at each prediction (between 0 and 1). Interpreted as binary (sigmoid) output with outputs of size [B, H, W]. labels: [B, H, W] Tensor, ground truth labels (between 0 and C - 1) classes: 'all' for all, 'present' for classes present in labels, or a list of classes to average. per_image: compute the loss per image instead of per batch ignore: void class labels """ if per_image: loss = mean(lovasz_softmax_flat(*flatten_probas(prob.unsqueeze(0), lab.unsqueeze(0), ignore), classes=classes) for prob, lab in zip(probas, labels)) else: loss = lovasz_softmax_flat(*flatten_probas(probas, labels, ignore), classes=classes) return loss def lovasz_softmax_flat(probas, labels, classes='present'): """ Multi-class Lovasz-Softmax loss probas: [P, C] Variable, class probabilities at each prediction (between 0 and 1) labels: [P] Tensor, ground truth labels (between 0 and C - 1) classes: 'all' for all, 'present' for classes present in labels, or a list of classes to average. """ if probas.numel() == 0: # only void pixels, the gradients should be 0 return probas * 0. 
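# For every class c present in the labels, the loop below sorts the absolute errors |fg - p_c|
# in descending order and takes their dot product with the Lovasz gradient of the sorted ground
# truth, a convex surrogate of the per-class Jaccard loss; the class losses are then averaged.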
C = probas.size(1) losses = [] class_to_sum = list(range(C)) if classes in ['all', 'present'] else classes for c in class_to_sum: fg = (labels == c).float() # foreground for class c if (classes is 'present' and fg.sum() == 0): continue if C == 1: if len(classes) > 1: raise ValueError('Sigmoid output possible only with 1 class') class_pred = probas[:, 0] else: class_pred = probas[:, c] errors = (Variable(fg) - class_pred).abs() errors_sorted, perm = torch.sort(errors, 0, descending=True) perm = perm.data fg_sorted = fg[perm] losses.append(torch.dot(errors_sorted, Variable(lovasz_grad(fg_sorted)))) return mean(losses) def flatten_probas(probas, labels, ignore=None): """ Flattens predictions in the batch """ if probas.dim() == 2: if ignore is not None: valid = (labels != ignore) probas = probas[valid] labels = labels[valid] return probas, labels elif probas.dim() == 3: # assumes output of a sigmoid layer B, H, W = probas.size() probas = probas.view(B, 1, H, W) elif probas.dim() == 5: #3D segmentation B, C, L, H, W = probas.size() probas = probas.contiguous().view(B, C, L, H*W) B, C, H, W = probas.size() probas = probas.permute(0, 2, 3, 1).contiguous().view(-1, C) # B * H * W, C = P, C labels = labels.view(-1) if ignore is None: return probas, labels valid = (labels != ignore) vprobas = probas[valid.nonzero().squeeze()] vlabels = labels[valid] return vprobas, vlabels def xloss(logits, labels, ignore=None): """ Cross entropy loss """ return F.cross_entropy(logits, Variable(labels), ignore_index=255) def jaccard_loss(probas, labels,ignore=None, smooth = 100, bk_class = None): """ Something wrong with this loss Multi-class Lovasz-Softmax loss probas: [B, C, H, W] Variable, class probabilities at each prediction (between 0 and 1). Interpreted as binary (sigmoid) output with outputs of size [B, H, W]. labels: [B, H, W] Tensor, ground truth labels (between 0 and C - 1) classes: 'all' for all, 'present' for classes present in labels, or a list of classes to average. per_image: compute the loss per image instead of per batch ignore: void class labels """ vprobas, vlabels = flatten_probas(probas, labels, ignore) true_1_hot = torch.eye(vprobas.shape[1])[vlabels] if bk_class: one_hot_assignment = torch.ones_like(vlabels) one_hot_assignment[vlabels == bk_class] = 0 one_hot_assignment = one_hot_assignment.float().unsqueeze(1) true_1_hot = true_1_hot*one_hot_assignment true_1_hot = true_1_hot.to(vprobas.device) intersection = torch.sum(vprobas * true_1_hot) cardinality = torch.sum(vprobas + true_1_hot) loss = (intersection + smooth / (cardinality - intersection + smooth)).mean() return (1-loss)*smooth def hinge_jaccard_loss(probas, labels,ignore=None, classes = 'present', hinge = 0.1, smooth =100): """ Multi-class Hinge Jaccard loss probas: [B, C, H, W] Variable, class probabilities at each prediction (between 0 and 1). Interpreted as binary (sigmoid) output with outputs of size [B, H, W]. labels: [B, H, W] Tensor, ground truth labels (between 0 and C - 1) classes: 'all' for all, 'present' for classes present in labels, or a list of classes to average. 
ignore: void class labels """ vprobas, vlabels = flatten_probas(probas, labels, ignore) C = vprobas.size(1) losses = [] class_to_sum = list(range(C)) if classes in ['all', 'present'] else classes for c in class_to_sum: if c in vlabels: c_sample_ind = vlabels == c cprobas = vprobas[c_sample_ind,:] non_c_ind =np.array([a for a in class_to_sum if a != c]) class_pred = cprobas[:,c] max_non_class_pred = torch.max(cprobas[:,non_c_ind],dim = 1)[0] TP = torch.sum(torch.clamp(class_pred - max_non_class_pred, max = hinge)+1.) + smooth FN = torch.sum(torch.clamp(max_non_class_pred - class_pred, min = -hinge)+hinge) if (~c_sample_ind).sum() == 0: FP = 0 else: nonc_probas = vprobas[~c_sample_ind,:] class_pred = nonc_probas[:,c] max_non_class_pred = torch.max(nonc_probas[:,non_c_ind],dim = 1)[0] FP = torch.sum(torch.clamp(class_pred - max_non_class_pred, max = hinge)+1.) losses.append(1 - TP/(TP+FP+FN)) if len(losses) == 0: return 0 return mean(losses) # --------------------------- HELPER FUNCTIONS --------------------------- def isnan(x): return x != x def mean(l, ignore_nan=False, empty=0): """ nanmean compatible with generators. """ l = iter(l) if ignore_nan: l = ifilterfalse(isnan, l) try: n = 1 acc = next(l) except StopIteration: if empty == 'raise': raise ValueError('Empty mean') return empty for n, v in enumerate(l, 2): acc += v if n == 1: return acc return acc / n ================================================ FILE: projects/occ_plugin/occupancy/dense_heads/occ_head.py ================================================ # Developed by Junyi Ma based on the codebase of OpenOccupancy # Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications # https://github.com/haomo-ai/Cam4DOcc import numpy as np import torch import torch.nn as nn import torch.nn.functional as F from mmdet.core import reduce_mean from mmdet.models import HEADS from mmcv.cnn import build_conv_layer, build_norm_layer from .lovasz_softmax import lovasz_softmax from projects.occ_plugin.utils.nusc_param import nusc_class_names from projects.occ_plugin.utils.semkitti import geo_scal_loss, sem_scal_loss, CE_ssc_loss @HEADS.register_module() class OccHead(nn.Module): def __init__( self, in_channels, out_channel, num_level=1, num_img_level=1, soft_weights=False, loss_weight_cfg=None, conv_cfg=dict(type='Conv3d', bias=False), norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), fine_topk=20000, point_cloud_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0], final_occ_size=[256, 256, 20], empty_idx=0, visible_loss=False, balance_cls_weight=True, train_cfg=None, test_cfg=None, ): super(OccHead, self).__init__() if type(in_channels) is not list: in_channels = [in_channels] self.in_channels = in_channels self.out_channel = out_channel self.num_level = num_level self.fine_topk = fine_topk self.point_cloud_range = torch.tensor(np.array(point_cloud_range)).float() self.final_occ_size = final_occ_size self.visible_loss = visible_loss if loss_weight_cfg is None: self.loss_weight_cfg = { "loss_voxel_ce_weight": 1.0, "loss_voxel_sem_scal_weight": 1.0, "loss_voxel_geo_scal_weight": 1.0, "loss_voxel_lovasz_weight": 1.0, } else: self.loss_weight_cfg = loss_weight_cfg # voxel losses self.loss_voxel_ce_weight = self.loss_weight_cfg.get('loss_voxel_ce_weight', 1.0) self.loss_voxel_sem_scal_weight = self.loss_weight_cfg.get('loss_voxel_sem_scal_weight', 1.0) self.loss_voxel_geo_scal_weight = self.loss_weight_cfg.get('loss_voxel_geo_scal_weight', 1.0) self.loss_voxel_lovasz_weight = 
self.loss_weight_cfg.get('loss_voxel_lovasz_weight', 1.0) # voxel-level prediction self.occ_convs = nn.ModuleList() for i in range(self.num_level): mid_channel = self.in_channels[i] occ_conv = nn.Sequential( build_conv_layer(conv_cfg, in_channels=self.in_channels[i], out_channels=mid_channel, kernel_size=3, stride=1, padding=1), build_norm_layer(norm_cfg, mid_channel)[1], nn.ReLU(inplace=True)) self.occ_convs.append(occ_conv) self.occ_pred_conv = nn.Sequential( build_conv_layer(conv_cfg, in_channels=mid_channel, out_channels=mid_channel//2, kernel_size=1, stride=1, padding=0), build_norm_layer(norm_cfg, mid_channel//2)[1], nn.ReLU(inplace=True), build_conv_layer(conv_cfg, in_channels=mid_channel//2, out_channels=out_channel, kernel_size=1, stride=1, padding=0)) self.soft_weights = soft_weights self.num_img_level = num_img_level self.num_point_sampling_feat = self.num_level if self.soft_weights: soft_in_channel = mid_channel self.voxel_soft_weights = nn.Sequential( build_conv_layer(conv_cfg, in_channels=soft_in_channel, out_channels=soft_in_channel//2, kernel_size=1, stride=1, padding=0), build_norm_layer(norm_cfg, soft_in_channel//2)[1], nn.ReLU(inplace=True), build_conv_layer(conv_cfg, in_channels=soft_in_channel//2, out_channels=self.num_point_sampling_feat, kernel_size=1, stride=1, padding=0)) # num_point_sampling_feat=4 if balance_cls_weight: # out_channel self.class_weights = np.ones((out_channel,)) self.class_weights[1:] = 5 self.class_weights = torch.from_numpy(self.class_weights) else: self.class_weights = np.ones((out_channel,)) self.class_names = nusc_class_names self.empty_idx = empty_idx def forward_coarse_voxel(self, voxel_feats): output_occs = [] output = {} for feats, occ_conv in zip(voxel_feats, self.occ_convs): output_occs.append(occ_conv(feats)) if self.soft_weights: voxel_soft_weights = self.voxel_soft_weights(output_occs[0]) voxel_soft_weights = torch.softmax(voxel_soft_weights, dim=1) else: voxel_soft_weights = torch.ones([output_occs[0].shape[0], self.num_point_sampling_feat, 1, 1, 1],).to(output_occs[0].device) / self.num_point_sampling_feat out_voxel_feats = 0 _, _, H, W, D= output_occs[0].shape for feats, weights in zip(output_occs, torch.unbind(voxel_soft_weights, dim=1)): feats = F.interpolate(feats, size=[H, W, D], mode='trilinear', align_corners=False).contiguous() out_voxel_feats += feats * weights.unsqueeze(1) output['out_voxel_feats'] = [out_voxel_feats] out_voxel = self.occ_pred_conv(out_voxel_feats) output['occ'] = [out_voxel] return output def forward(self, voxel_feats, img_feats=None, transform=None, **kwargs): assert type(voxel_feats) is list and len(voxel_feats) == self.num_level # forward voxel output = self.forward_coarse_voxel(voxel_feats) res = { 'output_voxels': output['occ'], } return res def loss_voxel(self, output_voxels, target_voxels, tag): B, C, H, W, D = output_voxels.shape tB, tC, tH, tW, tD = target_voxels.shape target_voxels = target_voxels.view(tB*tC, tH, tW, tD) ratio = target_voxels.shape[2] // H if ratio != 1: target_voxels = target_voxels.reshape(B, H, ratio, W, ratio, D, ratio).permute(0,1,3,5,2,4,6).reshape(B, H, W, D, ratio**3) empty_mask = target_voxels.sum(-1) == self.empty_idx target_voxels = target_voxels.to(torch.int64) occ_space = target_voxels[~empty_mask] occ_space[occ_space==0] = -torch.arange(len(occ_space[occ_space==0])).to(occ_space.device) - 1 target_voxels[~empty_mask] = occ_space target_voxels = torch.mode(target_voxels, dim=-1)[0] target_voxels[target_voxels<0] = 255 target_voxels = target_voxels.long() assert 
torch.isnan(output_voxels).sum().item() == 0 assert torch.isnan(target_voxels).sum().item() == 0 loss_dict = {} loss_dict['loss_voxel_ce_{}'.format(tag)] = (0.5) * CE_ssc_loss(output_voxels, target_voxels, self.class_weights.type_as(output_voxels), ignore_index=255) return loss_dict def loss_point(self, fine_coord, fine_output, target_voxels, tag): selected_gt = target_voxels[:, fine_coord[0,:], fine_coord[1,:], fine_coord[2,:]].long()[0] assert torch.isnan(selected_gt).sum().item() == 0, torch.isnan(selected_gt).sum().item() assert torch.isnan(fine_output).sum().item() == 0, torch.isnan(fine_output).sum().item() loss_dict = {} # igore 255 = ignore noise. we keep the loss bascward for the label=0 (free voxels) loss_dict['loss_voxel_ce_{}'.format(tag)] = self.loss_voxel_ce_weight * CE_ssc_loss(fine_output, selected_gt, ignore_index=255) loss_dict['loss_voxel_sem_scal_{}'.format(tag)] = self.loss_voxel_sem_scal_weight * sem_scal_loss(fine_output, selected_gt, ignore_index=255) loss_dict['loss_voxel_geo_scal_{}'.format(tag)] = self.loss_voxel_geo_scal_weight * geo_scal_loss(fine_output, selected_gt, ignore_index=255, non_empty_idx=self.empty_idx) loss_dict['loss_voxel_lovasz_{}'.format(tag)] = self.loss_voxel_lovasz_weight * lovasz_softmax(torch.softmax(fine_output, dim=1), selected_gt, ignore=255) return loss_dict def loss(self, output_voxels=None, output_coords_fine=None, output_voxels_fine=None, target_voxels=None, **kwargs): loss_dict = {} for index, output_voxel in enumerate(output_voxels): loss_dict.update(self.loss_voxel(output_voxel, target_voxels, tag='c_{}'.format(index))) return loss_dict ================================================ FILE: projects/occ_plugin/occupancy/dense_heads/utils.py ================================================ # borrowed from https://github.com/GuoPingPan/RPVNet/blob/main/core/models/utils/utils.py import time import numpy as np import torch from torch.nn.functional import grid_sample import torchsparse.nn.functional as F from torchsparse import PointTensor, SparseTensor from torchsparse.nn.utils import get_kernel_offsets __all__ = ['initial_voxelize', 'point_to_voxel', 'voxel_to_point', 'range_to_point','point_to_range'] def initial_voxelize(z: PointTensor, after_res) -> SparseTensor: new_float_coord = torch.cat( [z.C[:, :3] / after_res, z.C[:, -1].view(-1, 1)], 1) pc_hash = F.sphash(torch.round(new_float_coord).int()) sparse_hash = torch.unique(pc_hash) idx_query = F.sphashquery(pc_hash, sparse_hash) counts = F.spcount(idx_query.int(), len(sparse_hash)) inserted_coords = F.spvoxelize(torch.round(new_float_coord), idx_query,counts) inserted_coords = torch.round(inserted_coords).int() inserted_feat = F.spvoxelize(z.F, idx_query, counts) new_tensor = SparseTensor(inserted_feat, inserted_coords, 1) new_tensor.cmaps.setdefault((1,1,1), new_tensor.coords) z.additional_features['idx_query'][(1,1,1)] = idx_query z.additional_features['counts'][(1,1,1)] = counts return new_tensor.to(z.F.device) def point_to_voxel(x: SparseTensor, z: PointTensor) -> SparseTensor: if z.additional_features is None or z.additional_features['idx_query'] is None \ or z.additional_features['idx_query'].get(x.s) is None: pc_hash = F.sphash( torch.cat([ torch.round(z.C[:, :3] / x.s[0]).int(), z.C[:, -1].int().view(-1, 1) ], 1)) sparse_hash = F.sphash(x.C) idx_query = F.sphashquery(pc_hash, sparse_hash) counts = F.spcount(idx_query.int(), x.C.shape[0]) else: idx_query = z.additional_features['idx_query'][x.s] counts = z.additional_features['counts'][x.s] inserted_feat = 
F.spvoxelize(z.F, idx_query, counts) new_tensor = SparseTensor(inserted_feat, x.C, x.s) new_tensor.cmaps = x.cmaps new_tensor.kmaps = x.kmaps return new_tensor def voxel_to_point(x: SparseTensor, z: PointTensor, nearest=False) -> torch.Tensor: if z.idx_query is None or z.weights is None or z.idx_query.get(x.s) is None \ or z.weights.get(x.s) is None: off = get_kernel_offsets(2, x.s, 1, device=z.F.device) old_hash = F.sphash( torch.cat([ torch.round(z.C[:, :3] / x.s[0]).int(), z.C[:, -1].int().view(-1, 1) ], 1), off) pc_hash = F.sphash(x.C.to(z.F.device)) idx_query = F.sphashquery(old_hash, pc_hash) weights = F.calc_ti_weights(z.C, idx_query, scale=x.s[0]).transpose(0, 1).contiguous() idx_query = idx_query.transpose(0, 1).contiguous() if nearest: weights[:, 1:] = 0. idx_query[:, 1:] = -1 new_feat = F.spdevoxelize(x.F, idx_query, weights) if x.s == (1,1,1): z.idx_query[x.s] = idx_query z.weights[x.s] = weights else: new_feat = F.spdevoxelize(x.F, z.idx_query.get(x.s), z.weights.get(x.s)) return new_feat def range_to_point(x,px,py): r2p = [] for batch,(p_x,p_y) in enumerate(zip(px,py)): pypx = torch.stack([p_x,p_y],dim=2).to(px[0].device) resampled = grid_sample(x[batch].unsqueeze(0),pypx.unsqueeze(0)) r2p.append(resampled.squeeze().permute(1,0)) return torch.concat(r2p,dim=0) def point_to_range(range_shape,pF,px,py): H, W = range_shape cnt = 0 r = [] # t1 = time.time() for batch,(p_x,p_y) in enumerate(zip(px,py)): image = torch.zeros(size=(H,W,pF.shape[1])).to(px[0].device) image_cumsum = torch.zeros(size=(H,W,pF.shape[1])) + 1e-5 p_x = torch.floor((p_x/2. + 0.5) * W).long() p_y = torch.floor((p_y/2. + 0.5) * H).long() ''' v1: directly assign ''' # image[p_y,p_x] = pF[cnt:cnt+p_x.shape[1]] ''' v2: use average ''' image[p_y,p_x] += pF[cnt:cnt+p_x.shape[1]] image_cumsum[p_y,p_x] += torch.ones(pF.shape[1]) image = image/image_cumsum.to(px[0].device) r.append(image.permute(2,0,1)) cnt += p_x.shape[1] return torch.stack(r,dim=0).to(px[0].device) ================================================ FILE: projects/occ_plugin/occupancy/detectors/__init__.py ================================================ from .ocfnet import OCFNet ================================================ FILE: projects/occ_plugin/occupancy/detectors/bevdepth.py ================================================ # Copyright (c) Phigent Robotics. All rights reserved. 
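# Descriptive note: this file defines the BEVDet-family detectors (BEVDet, BEVDet4D, BEVDepth, BEVDepth4D, BEVStereo) used as the image-to-BEV stack; OCFNet in ocfnet.py inherits from the BEVDepth detector defined here.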
import math import torch from mmcv.runner import force_fp32 import torch.nn.functional as F from mmdet.models import DETECTORS from mmdet3d.models import builder from torch.utils.checkpoint import checkpoint from mmdet3d.models.detectors import CenterPoint import pdb @DETECTORS.register_module() class BEVDet(CenterPoint): def __init__(self, img_view_transformer=None, img_bev_encoder_backbone=None, img_bev_encoder_neck=None, **kwargs): super(BEVDet, self).__init__(**kwargs) if img_view_transformer is not None: self.img_view_transformer = builder.build_neck(img_view_transformer) else: self.img_view_transformer = None if img_bev_encoder_backbone is not None: self.img_bev_encoder_backbone = builder.build_backbone(img_bev_encoder_backbone) else: self.img_bev_encoder_backbone = torch.nn.Identity() if img_bev_encoder_neck is not None: self.img_bev_encoder_neck = builder.build_neck(img_bev_encoder_neck) else: self.img_bev_encoder_neck = torch.nn.Identity() def image_encoder(self, img): imgs = img B, N, C, imH, imW = imgs.shape imgs = imgs.view(B * N, C, imH, imW) x = self.img_backbone(imgs) if self.with_img_neck: x = self.img_neck(x) if type(x) in [list, tuple]: x = x[0] _, output_dim, ouput_H, output_W = x.shape x = x.view(B, N, output_dim, ouput_H, output_W) return x @force_fp32() def bev_encoder(self, x): x = self.img_bev_encoder_backbone(x) x = self.img_bev_encoder_neck(x) if type(x) in [list, tuple]: x = x[0] return x def extract_img_feat(self, img, img_metas): """Extract features of images.""" x = self.image_encoder(img[0]) x = self.img_view_transformer([x] + img[1:7]) x = self.bev_encoder(x) return [x] def extract_feat(self, points, img, img_metas): """Extract features from images and points.""" img_feats = self.extract_img_feat(img, img_metas) pts_feats = None return (img_feats, pts_feats) def forward_train(self, points=None, img_metas=None, gt_bboxes_3d=None, gt_labels_3d=None, gt_labels=None, gt_bboxes=None, img_inputs=None, proposals=None, gt_bboxes_ignore=None): """Forward training function. Args: points (list[torch.Tensor], optional): Points of each sample. Defaults to None. img_metas (list[dict], optional): Meta information of each sample. Defaults to None. gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional): Ground truth 3D boxes. Defaults to None. gt_labels_3d (list[torch.Tensor], optional): Ground truth labels of 3D boxes. Defaults to None. gt_labels (list[torch.Tensor], optional): Ground truth labels of 2D boxes in images. Defaults to None. gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in images. Defaults to None. img (torch.Tensor optional): Images of each sample with shape (N, C, H, W). Defaults to None. proposals ([list[torch.Tensor], optional): Predicted proposals used for training Fast RCNN. Defaults to None. gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth 2D boxes in images to be ignored. Defaults to None. Returns: dict: Losses of different branches. """ img_feats, pts_feats = self.extract_feat( points, img=img_inputs, img_metas=img_metas) assert self.with_pts_bbox losses = dict() losses_pts = self.forward_pts_train(img_feats, gt_bboxes_3d, gt_labels_3d, img_metas, gt_bboxes_ignore) losses.update(losses_pts) return losses def forward_test(self, points=None, img_metas=None, img_inputs=None, **kwargs): """ Args: points (list[torch.Tensor]): the outer list indicates test-time augmentations and inner torch.Tensor should have a shape NxC, which contains all points in the batch. 
img_metas (list[list[dict]]): the outer list indicates test-time augs (multiscale, flip, etc.) and the inner list indicates images in a batch img (list[torch.Tensor], optional): the outer list indicates test-time augmentations and inner torch.Tensor should have a shape NxCxHxW, which contains all images in the batch. Defaults to None. """ for var, name in [(img_inputs, 'img_inputs'), (img_metas, 'img_metas')]: if not isinstance(var, list): raise TypeError('{} must be a list, but got {}'.format( name, type(var))) num_augs = len(img_inputs) if num_augs != len(img_metas): raise ValueError( 'num of augmentations ({}) != num of image meta ({})'.format( len(img_inputs), len(img_metas))) if not isinstance(img_inputs[0][0],list): img_inputs = [img_inputs] if img_inputs is None else img_inputs points = [points] if points is None else points return self.simple_test(points[0], img_metas[0], img_inputs[0], **kwargs) else: return self.aug_test(None, img_metas[0], img_inputs[0], **kwargs) def aug_test(self, points, img_metas, img=None, rescale=False): """Test function without augmentaiton.""" combine_type = self.test_cfg.get('combine_type','output') if combine_type=='output': return self.aug_test_combine_output(points, img_metas, img, rescale) elif combine_type=='feature': return self.aug_test_combine_feature(points, img_metas, img, rescale) else: assert False def simple_test(self, points, img_metas, img=None, rescale=False): """Test function without augmentaiton.""" img_feats, _ = self.extract_feat(points, img=img, img_metas=img_metas) bbox_list = [dict() for _ in range(len(img_metas))] bbox_pts = self.simple_test_pts(img_feats, img_metas, rescale=rescale) for result_dict, pts_bbox in zip(bbox_list, bbox_pts): result_dict['pts_bbox'] = pts_bbox return bbox_list def forward_dummy(self, points=None, img_metas=None, img_inputs=None, **kwargs): img_feats, _ = self.extract_feat(points, img=img_inputs, img_metas=img_metas) from mmdet3d.core.bbox.structures.box_3d_mode import LiDARInstance3DBoxes img_metas=[dict(box_type_3d=LiDARInstance3DBoxes)] bbox_list = [dict() for _ in range(1)] assert self.with_pts_bbox bbox_pts = self.simple_test_pts( img_feats, img_metas, rescale=False) for result_dict, pts_bbox in zip(bbox_list, bbox_pts): result_dict['pts_bbox'] = pts_bbox return bbox_list @DETECTORS.register_module() class BEVDet4D(BEVDet): def __init__(self, pre_process=None, align_after_view_transfromation=False, detach=True, detach_pre_process=False, **kwargs): super(BEVDet4D, self).__init__(**kwargs) self.pre_process = pre_process is not None if self.pre_process: self.pre_process_net = builder.build_backbone(pre_process) self.align_after_view_transfromation = align_after_view_transfromation self.detach = detach self.detach_pre_process = detach_pre_process @force_fp32() def shift_feature(self, input, trans, rots): n, c, h, w = input.shape _, v, _ = trans[0].shape # generate grid xs = torch.linspace(0, w - 1, w, dtype=input.dtype, device=input.device).view(1, w).expand(h, w) ys = torch.linspace(0, h - 1, h, dtype=input.dtype, device=input.device).view(h, 1).expand(h, w) grid = torch.stack((xs, ys, torch.ones_like(xs)), -1) grid = grid.view(1, h, w, 3).expand(n,h,w,3).view(n, h, w, 3, 1) # get transformation from current lidar frame to adjacent lidar frame # transformation from current camera frame to current lidar frame c02l0 = torch.zeros((n, v, 4, 4), dtype=grid.dtype).to(grid) c02l0[:, :, :3, :3] = rots[0] c02l0[:, :, :3, 3] = trans[0] c02l0[:, :, 3, 3] = 1 # transformation from adjacent camera frame to 
current lidar frame c12l0 = torch.zeros((n, v, 4, 4), dtype=grid.dtype).to(grid) c12l0[:, :, :3, :3] = rots[1] c12l0[:, :, :3, 3] = trans[1] c12l0[:, :, 3, 3] = 1 # transformation from current lidar frame to adjacent lidar frame l02l1 = c02l0.matmul(torch.inverse(c12l0))[:, 0, :, :].view(n, 1, 1, 4, 4) ''' c02l0 * inv(c12l0) = c02l0 * inv(l12l0 * c12l1) = c02l0 * inv(c12l1) * inv(l12l0) = l02l1 # c02l0==c12l1 ''' l02l1 = l02l1[:, :, :, [True, True, False, True], :][:, :, :, :, [True, True, False, True]] feat2bev = torch.zeros((3, 3), dtype=grid.dtype).to(grid) feat2bev[0, 0] = self.img_view_transformer.dx[0] feat2bev[1, 1] = self.img_view_transformer.dx[1] feat2bev[0, 2] = self.img_view_transformer.bx[0] - \ self.img_view_transformer.dx[0] / 2. feat2bev[1, 2] = self.img_view_transformer.bx[1] - \ self.img_view_transformer.dx[1] / 2. feat2bev[2, 2] = 1 feat2bev = feat2bev.view(1, 3, 3) tf = torch.inverse(feat2bev).matmul(l02l1).matmul(feat2bev) # transform and normalize grid = tf.matmul(grid) normalize_factor = torch.tensor([w - 1.0, h - 1.0], dtype=input.dtype, device=input.device) grid = grid[:, :, :, :2, 0] / normalize_factor.view(1, 1, 1, 2) * 2.0 - 1.0 output = F.grid_sample(input, grid.to(input.dtype), align_corners=True) return output def prepare_bev_feat(self, img, rot, tran, intrin, post_rot, post_tran, bda): x = self.image_encoder(img) bev_feat = self.img_view_transformer([x, rot, tran, intrin, post_rot, post_tran, bda]) if self.pre_process: bev_feat = self.pre_process_net(bev_feat)[0] return bev_feat def extract_img_feat(self, img, img_metas): inputs = img """Extract features of images.""" B, N, _, H, W = inputs[0].shape N = N//2 imgs = inputs[0].view(B,N,2,3,H,W) imgs = torch.split(imgs,1,2) imgs = [t.squeeze(2) for t in imgs] rots, trans, intrins, post_rots, post_trans, bda = inputs[1:7] extra = [rots.view(B,2,N,3,3), trans.view(B,2,N,3), intrins.view(B,2,N,3,3), post_rots.view(B,2,N,3,3), post_trans.view(B,2,N,3)] extra = [torch.split(t, 1, 1) for t in extra] extra = [[p.squeeze(1) for p in t] for t in extra] rots, trans, intrins, post_rots, post_trans = extra bev_feat_list = [] key_frame=True # back propagation for key frame only for img, rot, tran, intrin, post_rot, \ post_tran in zip(imgs, rots, trans, intrins, post_rots, post_trans): if self.align_after_view_transfromation: rot, tran = rots[0], trans[0] inputs_curr = (img, rot, tran, intrin, post_rot, post_tran, bda) if not key_frame and self.detach: with torch.no_grad(): bev_feat = self.prepare_bev_feat(*inputs_curr) else: bev_feat = self.prepare_bev_feat(*inputs_curr) bev_feat_list.append(bev_feat) key_frame = False if self.align_after_view_transfromation: bev_feat_list[1] = self.shift_feature(bev_feat_list[1], trans, rots) bev_feat = torch.cat(bev_feat_list, dim=1) x = self.bev_encoder(bev_feat) return [x] class BEVDepth_Base(object): def extract_feat(self, points, img, img_metas): """Extract features from images and points.""" img_feats, depth = self.extract_img_feat(img, img_metas) pts_feats = None return (img_feats, pts_feats, depth) def simple_test(self, points, img_metas, img=None, rescale=False): """Test function without augmentaiton.""" img_feats, _, _ = self.extract_feat(points, img=img, img_metas=img_metas) bbox_list = [dict() for _ in range(len(img_metas))] bbox_pts = self.simple_test_pts(img_feats, img_metas, rescale=rescale) for result_dict, pts_bbox in zip(bbox_list, bbox_pts): result_dict['pts_bbox'] = pts_bbox return bbox_list def forward_train(self, points=None, img_metas=None, gt_bboxes_3d=None, 
gt_labels_3d=None, gt_labels=None, gt_bboxes=None, img_inputs=None, proposals=None, gt_bboxes_ignore=None): """Forward training function. Args: points (list[torch.Tensor], optional): Points of each sample. Defaults to None. img_metas (list[dict], optional): Meta information of each sample. Defaults to None. gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional): Ground truth 3D boxes. Defaults to None. gt_labels_3d (list[torch.Tensor], optional): Ground truth labels of 3D boxes. Defaults to None. gt_labels (list[torch.Tensor], optional): Ground truth labels of 2D boxes in images. Defaults to None. gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in images. Defaults to None. img (torch.Tensor optional): Images of each sample with shape (N, C, H, W). Defaults to None. proposals ([list[torch.Tensor], optional): Predicted proposals used for training Fast RCNN. Defaults to None. gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth 2D boxes in images to be ignored. Defaults to None. Returns: dict: Losses of different branches. """ img_feats, pts_feats, depth = self.extract_feat( points, img=img_inputs, img_metas=img_metas) assert self.with_pts_bbox # assert len(img_inputs) == 8 depth_gt = img_inputs[7] loss_depth = self.img_view_transformer.get_depth_loss(depth_gt, depth) losses = dict(loss_depth=loss_depth) losses_pts = self.forward_pts_train(img_feats, gt_bboxes_3d, gt_labels_3d, img_metas, gt_bboxes_ignore) losses.update(losses_pts) # some modifications if hasattr(self.img_view_transformer, 'loss_depth_reg_weight') and self.img_view_transformer.loss_depth_reg_weight > 0: losses['loss_depth_reg'] = self.img_view_transformer.get_depth_reg_loss(depth_gt, depth) return losses @DETECTORS.register_module() class BEVDepth(BEVDepth_Base, BEVDet): def extract_img_feat(self, img, img_metas): """Extract features of images.""" x = self.image_encoder(img[0]) # img: imgs, rots, trans, intrins, post_rots, post_trans, gt_depths, sensor2sensors rots, trans, intrins, post_rots, post_trans, bda = img[1:7] mlp_input = self.img_view_transformer.get_mlp_input(rots, trans, intrins, post_rots, post_trans, bda) geo_inputs = [rots, trans, intrins, post_rots, post_trans, bda, mlp_input] x, depth = self.img_view_transformer([x] + geo_inputs) x = self.bev_encoder(x) return [x], depth @DETECTORS.register_module() class BEVDepth4D(BEVDepth_Base, BEVDet4D): def prepare_bev_feat(self, img, rot, tran, intrin, post_rot, post_tran, bda, mlp_input): x = self.image_encoder(img) bev_feat, depth = self.img_view_transformer([x, rot, tran, intrin, post_rot, post_tran, bda, mlp_input]) if self.detach_pre_process and self.pre_process: bev_feat = self.pre_process_net(bev_feat)[0] return bev_feat, depth def extract_img_feat(self, img, img_metas): inputs = img """Extract features of images.""" B, N, _, H, W = inputs[0].shape N = N//2 imgs = inputs[0].view(B,N,2,3,H,W) imgs = torch.split(imgs,1,2) imgs = [t.squeeze(2) for t in imgs] rots, trans, intrins, post_rots, post_trans, bda = inputs[1:7] extra = [rots.view(B,2,N,3,3), trans.view(B,2,N,3), intrins.view(B,2,N,3,3), post_rots.view(B,2,N,3,3), post_trans.view(B,2,N,3)] extra = [torch.split(t, 1, 1) for t in extra] extra = [[p.squeeze(1) for p in t] for t in extra] rots, trans, intrins, post_rots, post_trans = extra bev_feat_list = [] depth_list = [] key_frame=True # back propagation for key frame only for img, rot, tran, intrin, post_rot, \ post_tran in zip(imgs, rots, trans, intrins, post_rots, post_trans): if self.align_after_view_transfromation: rot, tran = 
rots[0], trans[0] mlp_input = self.img_view_transformer.get_mlp_input( rots[0], trans[0], intrin,post_rot, post_tran, bda) inputs_curr = (img, rot, tran, intrin, post_rot, post_tran, bda, mlp_input) if not key_frame and self.detach: with torch.no_grad(): bev_feat, depth = self.prepare_bev_feat(*inputs_curr) else: bev_feat, depth = self.prepare_bev_feat(*inputs_curr) if not self.detach_pre_process and self.pre_process: bev_feat = self.pre_process_net(bev_feat)[0] bev_feat_list.append(bev_feat) depth_list.append(depth) key_frame = False if self.align_after_view_transfromation: bev_feat_list[1] = self.shift_feature(bev_feat_list[1], trans, rots) bev_feat = torch.cat(bev_feat_list, dim=1) x = self.bev_encoder(bev_feat) return [x], depth_list[0] @DETECTORS.register_module() class BEVStereo(BEVDepth4D): def __init__(self, bevdet_model=False, **kwargs): super(BEVStereo, self).__init__(**kwargs) self.bevdet_model = bevdet_model def image_encoder(self, img): imgs = img B, N, C, imH, imW = imgs.shape imgs = imgs.view(B * N, C, imH, imW) x = self.img_backbone(imgs) stereo_feat = x[0].detach() # if isinstance(self.img_backbone, CustomSwin): # stereo_feat = stereo_feat.permute(0,2,3,1) # stereo_feat = self.img_backbone.norm0(stereo_feat) # stereo_feat = stereo_feat.permute(0,3,1,2) if self.bevdet_model: x = x[-2:] if self.with_img_neck: x = self.img_neck(x) if type(x) in [list, tuple]: x = x[0] _, output_dim, ouput_H, output_W = x.shape x = x.view(B, N, output_dim, ouput_H, output_W) return x, stereo_feat def extract_img_feat(self, img, img_metas): inputs = img """Extract features of images.""" B, N, _, H, W = inputs[0].shape N = N//2 imgs = inputs[0].view(B,N,2,3,H,W) imgs = torch.split(imgs,1,2) imgs = [t.squeeze(2) for t in imgs] rots, trans, intrins, post_rots, post_trans, bda, _, sensor2sensors = inputs[1:9] extra = [rots.view(B,2,N,3,3), trans.view(B,2,N,3), intrins.view(B,2,N,3,3), post_rots.view(B,2,N,3,3), post_trans.view(B,2,N,3), sensor2sensors.view(B,2,N,4,4)] sensor2ego_mats = torch.eye(4).view(1,1,1,4,4).repeat(B,2,N,1,1).to(rots) sensor2ego_mats[:,:,:,:3,:3] = extra[0] sensor2ego_mats[:,:,:,:3,3] = extra[1] intrin_mats = torch.eye(4).view(1,1,1,4,4).repeat(B,2,N,1,1).to(rots) intrin_mats[:,:,:,:3,:3] = extra[2] ida_mats = torch.eye(4).view(1,1,1,4,4).repeat(B,2,N,1,1).to(rots) ida_mats[:,:,:,:3,:3] = extra[3] ida_mats[:,:,:,:3,3] = extra[4] mats_dict = dict(sensor2ego_mats=sensor2ego_mats, intrin_mats=intrin_mats, ida_mats=ida_mats, sensor2sensor_mats=extra[5], bda_mat=bda) extra = [torch.split(t, 1, 1) for t in extra] extra = [[p.squeeze(1) for p in t] for t in extra] rots, trans, intrins, post_rots, post_trans, sensor2sensors = extra # forward stereo depth context_all_sweeps = list() depth_feat_all_sweeps = list() img_feats_all_sweeps = list() stereo_feats_all_sweeps = list() mu_all_sweeps = list() sigma_all_sweeps = list() mono_depth_all_sweeps = list() range_score_all_sweeps = list() key_frame=True # back propagation for key frame only for img, rot, tran, intrin, post_rot, post_tran in zip(imgs, rots, trans, intrins, post_rots, post_trans): if not key_frame: with torch.no_grad(): img_feats, stereo_feats = self.image_encoder(img) img_feats = img_feats.view(B * N, *img_feats.shape[2:]) mlp_input = \ self.img_view_transformer.get_mlp_input(rots[0], trans[0], intrin, post_rot, post_tran, bda) depth_feat, context, mu, sigma, range_score, mono_depth = \ self.img_view_transformer.depth_net(img_feats, mlp_input) context = self.img_view_transformer.context_downsample_net( context) else: 
img_feats, stereo_feats = self.image_encoder(img) img_feats = img_feats.view(B * N, *img_feats.shape[2:]) mlp_input = \ self.img_view_transformer.get_mlp_input(rots[0], trans[0], intrin, post_rot, post_tran, bda) depth_feat, context, mu, sigma, range_score, mono_depth = \ self.img_view_transformer.depth_net(img_feats, mlp_input) context = self.img_view_transformer.context_downsample_net( context) img_feats_all_sweeps.append(img_feats) stereo_feats_all_sweeps.append(stereo_feats) depth_feat_all_sweeps.append(depth_feat) context_all_sweeps.append(context) mu_all_sweeps.append(mu) sigma_all_sweeps.append(sigma) mono_depth_all_sweeps.append(mono_depth) range_score_all_sweeps.append(range_score) key_frame = False depth_score_all_sweeps = list() num_sweeps = 2 for ref_idx in range(num_sweeps): sensor2sensor_mats = list() for src_idx in range(num_sweeps): ref2keysensor_mats = sensor2sensors[ref_idx].inverse() key2srcsensor_mats = sensor2sensors[src_idx] ref2srcsensor_mats = key2srcsensor_mats @ ref2keysensor_mats sensor2sensor_mats.append(ref2srcsensor_mats) if ref_idx == 0: # last iteration on stage 1 does not have propagation # (photometric consistency filtering) if self.img_view_transformer.use_mask: stereo_depth, mask = self.img_view_transformer._forward_stereo( ref_idx, stereo_feats_all_sweeps, mono_depth_all_sweeps, mats_dict, sensor2sensor_mats, mu_all_sweeps, sigma_all_sweeps, range_score_all_sweeps, depth_feat_all_sweeps, ) else: stereo_depth = self.img_view_transformer._forward_stereo( ref_idx, stereo_feats_all_sweeps, mono_depth_all_sweeps, mats_dict, sensor2sensor_mats, mu_all_sweeps, sigma_all_sweeps, range_score_all_sweeps, depth_feat_all_sweeps, ) else: with torch.no_grad(): # last iteration on stage 1 does not have # propagation (photometric consistency filtering) if self.img_view_transformer.use_mask: stereo_depth, mask = self.img_view_transformer._forward_stereo( ref_idx, stereo_feats_all_sweeps, mono_depth_all_sweeps, mats_dict, sensor2sensor_mats, mu_all_sweeps, sigma_all_sweeps, range_score_all_sweeps, depth_feat_all_sweeps, ) else: stereo_depth = self.img_view_transformer._forward_stereo( ref_idx, stereo_feats_all_sweeps, mono_depth_all_sweeps, mats_dict, sensor2sensor_mats, mu_all_sweeps, sigma_all_sweeps, range_score_all_sweeps, depth_feat_all_sweeps, ) if self.img_view_transformer.use_mask: depth_score = ( mono_depth_all_sweeps[ref_idx] + self.img_view_transformer.depth_downsample_net( stereo_depth) * mask).softmax(1) else: depth_score = ( mono_depth_all_sweeps[ref_idx] + self.img_view_transformer.depth_downsample_net(stereo_depth)).softmax(1) depth_score_all_sweeps.append(depth_score) # forward view transformation bev_feat_list = [] key_frame=True # back propagation for key frame only for image_feat, depth_prob, rot, tran, intrin, post_rot, post_tran in \ zip(context_all_sweeps, depth_score_all_sweeps, rots, trans, intrins, post_rots, post_trans): if not key_frame: with torch.no_grad(): input_curr = (image_feat.view(B,N,*image_feat.shape[1:]), depth_prob, rot, tran, intrin, post_rot, post_tran, bda) bev_feat = self.img_view_transformer(input_curr) else: input_curr = (image_feat.view(B,N,*image_feat.shape[1:]), depth_prob, rot, tran, intrin, post_rot, post_tran, bda) bev_feat = self.img_view_transformer(input_curr) if self.pre_process: bev_feat = self.pre_process_net(bev_feat)[0] bev_feat_list.append(bev_feat) key_frame = False bev_feat = torch.cat(bev_feat_list, dim=1) x = self.bev_encoder(bev_feat) return [x], depth_score_all_sweeps[0] 
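# --------------------------------------------------------------------------
# Illustrative sketch (assumption-laden, not part of the original file): the
# alignment in BEVDet4D.shift_feature above maps feature-grid indices to
# metric BEV coordinates with a "feat2bev" matrix, applies the relative ego
# transform between the adjacent and current frames, maps back to grid
# indices, and normalizes to [-1, 1] for F.grid_sample. The helper below
# condenses that idea; its name, the 2D homogeneous pose input, and the
# default dx/bx values are illustrative assumptions only.
import torch
import torch.nn.functional as F

def align_adjacent_bev_feature(feat, rel_pose_2d, dx=(0.8, 0.8), bx=(-51.2, -51.2)):
    # feat: [N, C, H, W] BEV feature of the adjacent frame
    # rel_pose_2d: [N, 3, 3] homogeneous 2D transform (adjacent -> current frame)
    n, _, h, w = feat.shape
    xs = torch.linspace(0, w - 1, w).view(1, w).expand(h, w)
    ys = torch.linspace(0, h - 1, h).view(h, 1).expand(h, w)
    grid = torch.stack((xs, ys, torch.ones_like(xs)), -1).view(1, h, w, 3, 1)
    # grid index -> metric BEV coordinate (cell size dx, grid origin bx)
    feat2bev = torch.tensor([[dx[0], 0.0, bx[0] - dx[0] / 2.0],
                             [0.0, dx[1], bx[1] - dx[1] / 2.0],
                             [0.0, 0.0, 1.0]])
    tf = torch.inverse(feat2bev) @ rel_pose_2d @ feat2bev      # [N, 3, 3]
    grid = tf.view(n, 1, 1, 3, 3) @ grid                       # [N, H, W, 3, 1]
    # normalize sampling locations to [-1, 1] as expected by grid_sample
    norm = torch.tensor([w - 1.0, h - 1.0])
    grid = grid[..., :2, 0] / norm * 2.0 - 1.0
    return F.grid_sample(feat, grid.to(feat.dtype), align_corners=True)
# --------------------------------------------------------------------------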
================================================ FILE: projects/occ_plugin/occupancy/detectors/ocfnet.py ================================================ # Developed by Junyi Ma based on the codebase of OpenOccupancy and PowerBEV # Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications # https://github.com/haomo-ai/Cam4DOcc from sys import api_version import torch import collections import torch.nn.functional as F import os from mmdet.models import DETECTORS from mmcv.runner import auto_fp16, force_fp32 from .bevdepth import BEVDepth from mmdet3d.models import builder import numpy as np import time import copy from typing import Tuple @DETECTORS.register_module() class OCFNet(BEVDepth): def __init__(self, loss_cfg=None, only_generate_dataset=False, disable_loss_depth=False, test_present=False, empty_idx=0, max_label=2, occ_encoder_backbone=None, occ_predictor=None, occ_encoder_neck=None, flow_encoder_backbone=None, flow_predictor=None, flow_encoder_neck=None, flow_head=None, loss_norm=False, point_cloud_range=None, time_receptive_field=None, n_future_frames=None, n_future_frames_plus=None, iou_thresh_for_vpq=None, record_time=False, save_pred=False, save_path=None, **kwargs): ''' OCFNet is our end-to-end baseline for 4D camera-only occupancy forecasting there are two streams for the forecasting task with aggregated voxel features as inputs: 1. occ_encoder_backbone -> occ_predictor -> occ_encoder_neck -> pts_bbox_head 2. flow_encoder_backbone -> flow_predictor -> flow_encoder_neck -> flow_head time_receptive_field: number of historical frames used for forecasting (including the present one), default: 3 n_future_frames: number of forecasted future frames, default: 4 n_future_frames_plus: number of estimated frames (> n_future_frames), default: 6 (if only forecasting occupancy states rather than instances, n_future_frames=n_future_frames_plus can be set) iou_thresh_for_vpq: iou threshold to associate instances in 3D instance prediction, default: 0.2 (adjusted by occupancy forecasting performance) ''' super().__init__(**kwargs) self.loss_cfg = loss_cfg self.disable_loss_depth = disable_loss_depth self.only_generate_dataset = only_generate_dataset self.loss_norm = loss_norm self.time_receptive_field = time_receptive_field self.n_future_frames = n_future_frames self.n_future_frames_plus = n_future_frames_plus self.eval_start_moment = self.n_future_frames_plus - self.n_future_frames - 1 self.iou_thresh_for_vpq = iou_thresh_for_vpq self.record_time = record_time self.time_stats = collections.defaultdict(list) self.empty_idx = empty_idx self.max_label = max_label self.occ_encoder_backbone = builder.build_backbone(occ_encoder_backbone) self.occ_predictor = builder.build_neck(occ_predictor) self.occ_encoder_neck = builder.build_neck(occ_encoder_neck) self.flow_encoder_backbone = builder.build_backbone(flow_encoder_backbone) self.flow_encoder_neck = builder.build_neck(flow_encoder_neck) self.flow_predictor = builder.build_neck(flow_predictor) self.flow_head = builder.build_head(flow_head) self.point_cloud_range = point_cloud_range self.spatial_extent3d = (self.point_cloud_range[3]-self.point_cloud_range[0], \ self.point_cloud_range[4]-self.point_cloud_range[1], \ self.point_cloud_range[5]-self.point_cloud_range[2]) self.ego_center_shift_proportion_x = abs(self.point_cloud_range[0])/(self.point_cloud_range[3]-self.point_cloud_range[0]) self.ego_center_shift_proportion_y = abs(self.point_cloud_range[1])/(self.point_cloud_range[4]-self.point_cloud_range[1]) 
self.ego_center_shift_proportion_z = abs(self.point_cloud_range[2])/(self.point_cloud_range[5]-self.point_cloud_range[2]) self.n_cam = 6 self.fine_grained = False self.vehicles_id = 1 self.test_present = test_present self.save_pred = save_pred self.save_path = save_path def image_encoder(self, img): imgs = img B, N, C, imH, imW = imgs.shape imgs = imgs.view(B * N, C, imH, imW) backbone_feats = self.img_backbone(imgs) if self.with_img_neck: x = self.img_neck(backbone_feats) if type(x) in [list, tuple]: x = x[0] else: x = backbone_feats _, output_dim, ouput_H, output_W = x.shape x = x.view(B, N, output_dim, ouput_H, output_W) return {'x': x, 'img_feats': [x.clone()]} @force_fp32() def occ_encoder(self, x): b, t, _, _, _, _ = x.shape x = x.reshape(b, -1, *x.shape[3:]) x = self.occ_encoder_backbone(x) x = self.occ_predictor(x) x = self.occ_encoder_neck(x) return x @force_fp32() def flow_encoder(self, x): b, t, _, _, _, _ = x.shape x = x.reshape(b, -1, *x.shape[3:]) x = self.flow_encoder_backbone(x) x = self.flow_predictor(x) x = self.flow_encoder_neck(x) return x def mat2pose_vec(self, matrix: torch.Tensor): """ Converts a 4x4 pose matrix into a 6-dof pose vector Args: matrix (ndarray): 4x4 pose matrix Returns: vector (ndarray): 6-dof pose vector comprising translation components (tx, ty, tz) and rotation components (rx, ry, rz) """ # M[1, 2] = -sinx*cosy, M[2, 2] = +cosx*cosy rotx = torch.atan2(-matrix[..., 1, 2], matrix[..., 2, 2]) # M[0, 2] = +siny, M[1, 2] = -sinx*cosy, M[2, 2] = +cosx*cosy cosy = torch.sqrt(matrix[..., 1, 2] ** 2 + matrix[..., 2, 2] ** 2) roty = torch.atan2(matrix[..., 0, 2], cosy) # M[0, 0] = +cosy*cosz, M[0, 1] = -cosy*sinz rotz = torch.atan2(-matrix[..., 0, 1], matrix[..., 0, 0]) rotation = torch.stack((rotx, roty, rotz), dim=-1) # Extract translation params translation = matrix[..., :3, 3] return torch.cat((translation, rotation), dim=-1) def pack_dbatch_and_dtime(self, x): b = x.shape[0] s = x.shape[1] x = x.view(b*s, *x.shape[2:]) return x def unpack_dbatch_and_dtime(self, x, b, s): assert (b*s) == x.shape[0] x = x.view(b, s, *x.shape[1:]) return x def extract_img_feat(self, img_inputs_seq, img_metas): ''' Extract features of sequential input images ''' if self.record_time: torch.cuda.synchronize() t0 = time.time() imgs_seq, rots_seq, trans_seq, intrins_seq, post_rots_seq, post_trans_seq, gt_depths_seq, sensor2sensors_seq = img_inputs_seq self.batch_size = imgs_seq.shape[0] self.sequence_length = imgs_seq.shape[1] imgs_seq = imgs_seq[:,0:self.time_receptive_field,...].contiguous() rots_seq = rots_seq[:,0:self.time_receptive_field,...].contiguous() trans_seq = trans_seq[:,0:self.time_receptive_field,...].contiguous() intrins_seq = intrins_seq[:,0:self.time_receptive_field,...].contiguous() post_rots_seq = post_rots_seq[:,0:self.time_receptive_field,...].contiguous() post_trans_seq = post_trans_seq[:,0:self.time_receptive_field,...].contiguous() gt_depths_seq = gt_depths_seq[:,0:self.time_receptive_field,...].contiguous() sensor2sensors_seq = sensor2sensors_seq[:,0:self.time_receptive_field,...].contiguous() imgs_seq = self.pack_dbatch_and_dtime(imgs_seq) rots_seq = self.pack_dbatch_and_dtime(rots_seq) trans_seq = self.pack_dbatch_and_dtime(trans_seq) intrins_seq = self.pack_dbatch_and_dtime(intrins_seq) post_rots_seq = self.pack_dbatch_and_dtime(post_rots_seq) post_trans_seq = self.pack_dbatch_and_dtime(post_trans_seq) gt_depths_seq = self.pack_dbatch_and_dtime(gt_depths_seq) sensor2sensors_seq = self.pack_dbatch_and_dtime(sensor2sensors_seq) self.n_cam = 
imgs_seq.shape[1] img_enc_feats = self.image_encoder(imgs_seq) x = img_enc_feats['x'] img_feats = img_enc_feats['img_feats'] if self.record_time: torch.cuda.synchronize() t1 = time.time() self.time_stats['img_encoder'].append(t1 - t0) mlp_input_seq = self.img_view_transformer.get_mlp_input(rots_seq, trans_seq, intrins_seq, post_rots_seq, post_trans_seq) geo_inputs = [rots_seq, trans_seq, intrins_seq, post_rots_seq, post_trans_seq, None, mlp_input_seq] x, depth = self.img_view_transformer([x] + geo_inputs) if self.record_time: torch.cuda.synchronize() t2 = time.time() self.time_stats['view_transformer'].append(t2 - t1) return x, depth, img_feats def warp_features(self, x, flow, tseq): ''' Warp features by motion flow ''' if flow is None: return x b, dc, dx, dy, dz = x.shape # normalize 3D motion flow flow[:,0,-1] =flow[:,0,-1]*dx/self.spatial_extent3d[0] flow[:,1,-1] =flow[:,1,-1]*dy/self.spatial_extent3d[1] flow[:,2,-1] =flow[:,2,-1]*dz/self.spatial_extent3d[2] nx, ny, nz = torch.meshgrid(torch.arange(dx, dtype=torch.float, device=x.device), \ torch.arange(dy, dtype=torch.float, device=x.device), \ torch.arange(dz, dtype=torch.float, device=x.device)) tmp = torch.ones((dx, dy, dz), device=x.device) grid = torch.stack((nx, ny, nz, tmp), dim=-1) # centralize shift shift_x = self.ego_center_shift_proportion_x * dx shift_y = self.ego_center_shift_proportion_y * dy shift_z = self.ego_center_shift_proportion_z * dz grid[:, :, :, 0] = grid[:, :, :, 0] - shift_x grid[:, :, :, 1] = grid[:, :, :, 1] - shift_y grid[:, :, :, 2] = grid[:, :, :, 2] - shift_z grid = grid.view(dx*dy*dz, grid.shape[-1]).unsqueeze(-1) #[N,4,1] transformation = flow.unsqueeze(1) # [bs, 1, 4, 4] transformed_grid = transformation @ grid # [bs, N, 4, 1] transformed_grid = transformed_grid.squeeze(-1) # [bs, N, 4] transformed_grid = transformed_grid.view(-1, 4) # de-centralize transformed_grid[:, 0] = (transformed_grid[:, 0] + shift_x) transformed_grid[:, 1] = (transformed_grid[:, 1] + shift_y) transformed_grid[:, 2] = (transformed_grid[:, 2] + shift_z) transformed_grid = transformed_grid.round().long() # de-normalize grid = grid.squeeze(-1) grid = grid.view(-1, 4) grid[:, 0] = (grid[:, 0] + shift_x) grid[:, 1] = (grid[:, 1] + shift_y) grid[:, 2] = (grid[:, 2] + shift_z) grid = grid.round().long() batch_ix = torch.cat([torch.full([transformed_grid.shape[0] // b, 1], ix, device=x.device, dtype=torch.long) for ix in range(b)]) kept = (transformed_grid[:,0] >= 0) & (transformed_grid[:,0] < dx) & (transformed_grid[:,1] >= 0) & (transformed_grid[:,1] < dy) & (transformed_grid[:,2] >= 0) & (transformed_grid[:,2] < dz) transformed_grid = transformed_grid[kept] batch_ix = batch_ix[kept] grid = grid[kept] warped_x = torch.zeros_like(x, device=x.device) # hard coding for reducing memory usage # erratum for new version split_num = 32 gap = transformed_grid.shape[0]//split_num for tt in range(split_num-1): start_idx_tt = int(tt*gap) end_idx_tt = int((tt+1)*gap) current_batch = batch_ix[start_idx_tt:end_idx_tt] ixx = transformed_grid[start_idx_tt:end_idx_tt, 0] ixy = transformed_grid[start_idx_tt:end_idx_tt, 1] ixz = transformed_grid[start_idx_tt:end_idx_tt, 2] ixx_ori = grid[start_idx_tt:end_idx_tt, 0] ixy_ori = grid[start_idx_tt:end_idx_tt, 1] ixz_ori = grid[start_idx_tt:end_idx_tt, 2] warped_x[current_batch, :, ixx, ixy, ixz] = x[current_batch, :, ixx_ori, ixy_ori, ixz_ori] # for i in range(transformed_grid.shape[0]): # current_batch = batch_ix[i] # ixx = transformed_grid[i, 0] # ixy = transformed_grid[i, 1] # ixz = transformed_grid[i, 2] # ixx_ori = grid[i, 0] # ixy_ori = grid[i, 1] # ixz_ori = 
grid[i, 2] # warped_x[current_batch, :, ixx, ixy, ixz] = x[current_batch, :, ixx_ori, ixy_ori, ixz_ori] return warped_x def cumulative_warp_occ(self, lifted_feature_seq, future_egomotion, mode='bilinear'): ''' Warp sequential voxel features to the present frame by ego pose updates ''' future_egomotion = future_egomotion[:, :self.time_receptive_field, ...].contiguous() out = [lifted_feature_seq[:, -1]] cum_future_egomotion = future_egomotion[:, -2] for t in reversed(range(self.time_receptive_field - 1)): out.append(self.warp_features(lifted_feature_seq[:, t], cum_future_egomotion, t)) cum_future_egomotion = cum_future_egomotion @ future_egomotion[:, t - 1] return torch.stack(out[::-1], 1) def extract_feat(self, img_inputs_seq, img_metas, future_egomotion): ''' Extract voxel features from input sequential images ''' voxel_feats = None depth, img_feats = None, None if img_inputs_seq is not None: voxel_feats, depth, img_feats = self.extract_img_feat(img_inputs_seq, img_metas) if self.record_time: torch.cuda.synchronize() t0 = time.time() voxel_feats = self.unpack_dbatch_and_dtime(voxel_feats, self.batch_size, self.time_receptive_field) voxel_feats = self.cumulative_warp_occ(voxel_feats.clone(), future_egomotion, mode='bilinear') if self.record_time: torch.cuda.synchronize() t1 = time.time() self.time_stats['feature warping'].append(t1 - t0) # egomotion-aware future_egomotion_vec = self.mat2pose_vec(future_egomotion) batch_size, sequence_length, nbr_pose_channels = future_egomotion_vec.shape dx, dy, dz = voxel_feats.shape[-3:] future_egomotions_spatial = future_egomotion_vec.view(batch_size, sequence_length, nbr_pose_channels, 1, 1, 1).expand(batch_size, sequence_length, nbr_pose_channels, dx, dy, dz) # at time 0, no egomotion so feed zero vector future_egomotions_spatial = torch.cat([torch.zeros_like(future_egomotions_spatial[:, :1]), future_egomotions_spatial[:, :(self.time_receptive_field-1)]], dim=1) voxel_feats = torch.cat([voxel_feats, future_egomotions_spatial], dim=-4) voxel_feats_enc = self.occ_encoder(voxel_feats) if type(voxel_feats_enc) is not list: voxel_feats_enc = [voxel_feats_enc] if self.record_time: torch.cuda.synchronize() t2 = time.time() self.time_stats['occ_encoder'].append(t2 - t1) flow_feats_enc = self.flow_encoder(voxel_feats) if type(flow_feats_enc) is not list: flow_feats_enc = [flow_feats_enc] if self.record_time: torch.cuda.synchronize() t3 = time.time() self.time_stats['flow_encoder'].append(t3 - t2) depth = depth.view(-1, self.n_cam, *depth.shape[-3:]) return (voxel_feats_enc, flow_feats_enc, img_feats, depth) @force_fp32(apply_to=('voxel_feats')) def forward_pts_train( self, voxel_feats, gt_occ=None, points_occ=None, img_metas=None, transform=None, img_feats=None, ): if self.record_time: torch.cuda.synchronize() t0 = time.time() outs = self.pts_bbox_head( voxel_feats=voxel_feats, points=points_occ, img_metas=img_metas, img_feats=img_feats, transform=transform, ) if self.record_time: torch.cuda.synchronize() t1 = time.time() self.time_stats['occ_head'].append(t1 - t0) losses = self.pts_bbox_head.loss( output_voxels=outs['output_voxels'], target_voxels=gt_occ, target_points=points_occ, img_metas=img_metas, ) if self.record_time: torch.cuda.synchronize() t2 = time.time() self.time_stats['loss_occ'].append(t2 - t1) return losses @force_fp32(apply_to=('voxel_feats')) def forward_flow_train( self, voxel_feats, gt_occ=None, points_occ=None, img_metas=None, transform=None, img_feats=None, ): if self.record_time: torch.cuda.synchronize() t0 = time.time() outs = 
self.flow_head( voxel_feats=voxel_feats, points=points_occ, img_metas=img_metas, img_feats=img_feats, transform=transform, ) if self.record_time: torch.cuda.synchronize() t1 = time.time() self.time_stats['flow_head'].append(t1 - t0) losses = self.flow_head.loss( output_voxels=outs['output_voxels'], target_voxels=gt_occ, target_points=points_occ, img_metas=img_metas, ) if self.record_time: torch.cuda.synchronize() t2 = time.time() self.time_stats['loss_flow'].append(t2 - t1) return losses def forward_train(self, img_inputs_seq=None, segmentation=None, instance=None, attribute_label=None, flow=None, future_egomotion=None, gt_occ=None, img_metas=None, points_occ=None, **kwargs, ): ''' Train OCFNet using bbox-wise occupancy labels if self.fine_grained=False, else using voxel-wise labels from nuScenes-Occupancy ''' # manually stop forward if self.only_generate_dataset: return {"pseudo_loss": torch.tensor(0.0, device=segmentation.device, requires_grad=True)} if not self.fine_grained: gt_occ = segmentation voxel_feats, flow_feats, img_feats, depth = self.extract_feat( img_inputs_seq=img_inputs_seq, img_metas=img_metas, future_egomotion=future_egomotion) # training losses losses = dict() if self.record_time: torch.cuda.synchronize() t0 = time.time() # TODO: we will release the version with depth fine-tuning in the future if not self.disable_loss_depth and depth is not None: depth_gt = img_inputs_seq[-2][:,0:self.time_receptive_field,...].contiguous() depth_gt = depth_gt.view(depth_gt.shape[0]*depth_gt.shape[1],*depth_gt.shape[2:]) depth = depth.view(-1, *depth.shape[2:]) losses['loss_depth'] = self.img_view_transformer.get_depth_loss(depth_gt, depth) if self.record_time: torch.cuda.synchronize() t1 = time.time() self.time_stats['loss_depth'].append(t1 - t0) transform = img_inputs_seq[1:8] if img_inputs_seq is not None else None voxel_feats_seq = [] for voxel_feats_stage in voxel_feats: bs, sfeatures = voxel_feats_stage.shape[:2] voxel_feats_stage_ = voxel_feats_stage.view(bs*self.n_future_frames_plus, sfeatures//self.n_future_frames_plus, *voxel_feats_stage.shape[2:]) voxel_feats_seq.append(voxel_feats_stage_) gt_occ = gt_occ[:, -self.n_future_frames_plus:, ...] flow = flow[:, -self.n_future_frames_plus:, ...] 
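# Descriptive note: only the last n_future_frames_plus frames are supervised here; the time dimension of the encoded voxel/flow features was folded into the batch dimension above, so the occupancy and flow heads below predict all future frames in parallel.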
losses_occupancy = self.forward_pts_train(voxel_feats_seq, gt_occ, points_occ, img_metas, img_feats=img_feats, transform=transform) losses.update(losses_occupancy) flow_feats_seq = [] for flow_feats_stage in flow_feats: bs, sfeatures = flow_feats_stage.shape[:2] flow_feats_stage_ = flow_feats_stage.view(bs*self.n_future_frames_plus, sfeatures//self.n_future_frames_plus, *flow_feats_stage.shape[2:]) flow_feats_seq.append(flow_feats_stage_) losses_flow = self.forward_flow_train(flow_feats_seq, flow, points_occ, img_metas, img_feats=img_feats, transform=transform) losses.update(losses_flow) if self.loss_norm: for loss_key in losses.keys(): if loss_key.startswith('loss'): losses[loss_key] = losses[loss_key] / (losses[loss_key].detach() + 1e-9) def logging_latencies(): # logging latencies avg_time = {key: sum(val) / len(val) for key, val in self.time_stats.items()} sum_time = sum(list(avg_time.values())) out_res = '' for key, val in avg_time.items(): out_res += '{}: {:.4f}, {:.1f}, '.format(key, val, val / sum_time) print(out_res) if self.record_time: logging_latencies() return losses def forward_test(self, img_inputs_seq=None, segmentation=None, instance=None, attribute_label=None, flow=None, future_egomotion=None, gt_occ=None, img_metas=None, points_occ=None, **kwargs, ): ''' Test OCFNet using IOU and VPQ metrics ''' # let batch size equals 1 while testing assert segmentation.shape[0] == 1 return self.simple_test(img_metas, img_inputs_seq, gt_occ=gt_occ, gt_flow=flow, segmentation=segmentation, instance=instance, future_egomotion=future_egomotion, **kwargs) def simple_test(self, img_metas, img_inputs_seq=None, rescale=False, points_occ=None, gt_occ=None, gt_flow=None, segmentation=None, instance=None, future_egomotion=None): # manually stop forward if self.only_generate_dataset: return {'hist_for_iou': 0, 'pred_c': 0, 'vpq':0} if not self.fine_grained: gt_occ = segmentation voxel_feats, flow_feats, img_feats, depth = self.extract_feat( img_inputs_seq=img_inputs_seq, img_metas=img_metas, future_egomotion=future_egomotion) transform = img_inputs_seq[1:8] if img_inputs_seq is not None else None voxel_feats_seq = [] for voxel_feats_stage in voxel_feats: bs, sfeatures = voxel_feats_stage.shape[:2] voxel_feats_stage_ = voxel_feats_stage.view(bs*self.n_future_frames_plus, sfeatures//self.n_future_frames_plus, *voxel_feats_stage.shape[2:]) voxel_feats_seq.append(voxel_feats_stage_) gt_occ = gt_occ[:, -self.n_future_frames_plus:, ...].contiguous() gt_occ = gt_occ.view(gt_occ.shape[0]*gt_occ.shape[1], *gt_occ.shape[2:]) instance = instance[:, -self.n_future_frames_plus:, ...].contiguous() instance = instance.view(instance.shape[0]*instance.shape[1], *instance.shape[2:]) output = self.pts_bbox_head( voxel_feats=voxel_feats_seq, points=points_occ, img_metas=img_metas, img_feats=img_feats, transform=transform, ) pred_c = output['output_voxels'][0] flow_feats_seq = [] for flow_feats_stage in flow_feats: bs, sfeatures = flow_feats_stage.shape[:2] flow_feats_stage_ = flow_feats_stage.view(bs*self.n_future_frames_plus, sfeatures//self.n_future_frames_plus, *flow_feats_stage.shape[2:]) flow_feats_seq.append(flow_feats_stage_) output_flow = self.flow_head( voxel_feats=flow_feats_seq, points=points_occ, img_metas=img_metas, img_feats=img_feats, transform=transform, ) gt_flow = gt_flow[:, -self.n_future_frames_plus:, ...].contiguous() gt_flow = gt_flow.view(gt_flow.shape[0]*gt_flow.shape[1], *gt_flow.shape[2:]) # pred_flow = output_flow['output_voxels'][0] # vpq = self.evaluate_instance_prediction(pred_c, 
pred_flow, gt_occ, instance) vpq = 0.1 if self.test_present: pred_c = pred_c[self.eval_start_moment:(self.eval_start_moment+1), ...] gt_occ = gt_occ[self.eval_start_moment:(self.eval_start_moment+1), ...] else: pred_c = pred_c[self.eval_start_moment+1:, ...] gt_occ = gt_occ[self.eval_start_moment+1:, ...] hist_for_iou = self.evaluate_occupancy_forecasting(pred_c, gt_occ, img_metas=img_metas, save_pred=self.save_pred, save_path=self.save_path) test_output = { 'hist_for_iou': hist_for_iou, 'pred_c': pred_c, 'vpq': vpq, } return test_output def evaluate_occupancy_forecasting(self, pred, gt, img_metas=None, save_pred=False, save_path=None): B, H, W, D = gt.shape pred = F.interpolate(pred, size=[H, W, D], mode='trilinear', align_corners=False).contiguous() hist_all = 0 iou_per_pred_list = [] pred_list = [] gt_list = [] for i in range(B): pred_cur = pred[i,...] pred_cur = torch.argmax(pred_cur, dim=0).cpu().numpy() gt_cur = gt[i, ...].cpu().numpy() gt_cur = gt_cur.astype(np.int) pred_list.append(pred_cur) gt_list.append(gt_cur) # ignore noise noise_mask = gt_cur != 255 # GMO and others for max_label=2 # multiple movable objects for max_label=9 hist_cur, iou_per_pred = fast_hist(pred_cur[noise_mask], gt_cur[noise_mask], max_label=self.max_label) hist_all = hist_all + hist_cur iou_per_pred_list.append(iou_per_pred) # whether save prediction results if save_pred: if not os.path.exists(save_path): os.mkdir(save_path) pred_for_save_list = [] for k in range(B): pred_for_save = torch.argmax(pred[k], dim=0).cpu() x_grid = torch.linspace(0, H-1, H, dtype=torch.long) x_grid = x_grid.view(H, 1, 1).expand(H, W, D) y_grid = torch.linspace(0, W-1, W, dtype=torch.long) y_grid = y_grid.view(1, W, 1).expand(H, W, D) z_grid = torch.linspace(0, D-1, D, dtype=torch.long) z_grid = z_grid.view(1, 1, D).expand(H, W, D) segmentation_for_save = torch.stack((x_grid, y_grid, z_grid), -1) segmentation_for_save = segmentation_for_save.view(-1, 3) segmentation_label = pred_for_save.squeeze(0).view(-1,1) segmentation_for_save = torch.cat((segmentation_for_save, segmentation_label), dim=-1) # N,4 kept = segmentation_for_save[:,-1]!=0 segmentation_for_save= segmentation_for_save[kept].cpu().numpy() pred_for_save_list.append(segmentation_for_save) np.savez(os.path.join(save_path, img_metas[0]["scene_token"]), pred_for_save_list) return hist_all def find_instance_centers(self, center_prediction: torch.Tensor, conf_threshold: float = 0.1, nms_kernel_size: float = 3): assert len(center_prediction.shape) == 4 center_prediction = F.threshold(center_prediction, threshold=conf_threshold, value=-1) nms_padding = (nms_kernel_size - 1) // 2 maxpooled_center_prediction = F.max_pool3d( center_prediction, kernel_size=nms_kernel_size, stride=1, padding=nms_padding ) # Filter all elements that are not the maximum (i.e. 
the center of the heatmap instance) center_prediction[center_prediction != maxpooled_center_prediction] = -1 return torch.nonzero(center_prediction > 0)[:, 1:] def group_pixels(self, centers: torch.Tensor, offset_predictions: torch.Tensor) -> torch.Tensor: dx, dy, dz = offset_predictions.shape[-3:] x_grid = ( torch.arange(dx, dtype=offset_predictions.dtype, device=offset_predictions.device) .view(1, dx, 1, 1) .repeat(1, 1, dy, dz) ) y_grid = ( torch.arange(dy, dtype=offset_predictions.dtype, device=offset_predictions.device) .view(1, 1, dy, 1) .repeat(1, dx, 1, dz) ) z_grid = ( torch.arange(dz, dtype=offset_predictions.dtype, device=offset_predictions.device) .view(1, 1, 1, dz) .repeat(1, dx, dy, 1) ) pixel_grid = torch.cat((x_grid, y_grid, z_grid), dim=0) center_locations = (pixel_grid + offset_predictions).view(3, dx*dy*dz, 1).permute(2, 1, 0) centers = centers.view(-1, 1, 3) distances = torch.norm(centers - center_locations, dim=-1) instance_id = torch.argmin(distances, dim=0).reshape(1, dx, dy, dz) + 1 return instance_id def update_instance_ids(self, instance_seg, old_ids, new_ids): indices = torch.arange(old_ids.max() + 1, device=instance_seg.device) for old_id, new_id in zip(old_ids, new_ids): indices[old_id] = new_id return indices[instance_seg].long() def make_instance_seg_consecutive(self, instance_seg): # Make the indices of instance_seg consecutive unique_ids = torch.unique(instance_seg) new_ids = torch.arange(len(unique_ids), device=instance_seg.device) instance_seg = self.update_instance_ids(instance_seg, unique_ids, new_ids) return instance_seg def get_instance_segmentation_and_centers(self, center_predictions: torch.Tensor, offset_predictions: torch.Tensor, foreground_mask: torch.Tensor, conf_threshold: float = 0.1, nms_kernel_size: float = 5, max_n_instance_centers: int = 100, ) -> Tuple[torch.Tensor, torch.Tensor]: dx, dy, dz = offset_predictions.shape[-3:] center_predictions = center_predictions.view(1, dx, dy, dz) offset_predictions = offset_predictions.view(3, dx, dy, dz) foreground_mask = foreground_mask.view(1, dx, dy, dz) centers = self.find_instance_centers(center_predictions, conf_threshold=conf_threshold, nms_kernel_size=nms_kernel_size) if not len(centers): return torch.zeros(center_predictions.shape, dtype=torch.int64, device=center_predictions.device) if len(centers) > max_n_instance_centers: centers = centers[:max_n_instance_centers].clone() instance_ids = self.group_pixels(centers, offset_predictions * foreground_mask.float()) instance_seg = (instance_ids * foreground_mask.float()).long() # Make the indices of instance_seg consecutive instance_seg = self.make_instance_seg_consecutive(instance_seg) return instance_seg.long() def flow_warp(self, occupancy, flow, mode='nearest', padding_mode='zeros'): ''' Warp ground-truth flow-origin occupancies according to predicted flows ''' _, num_waypoints, _, grid_dx_cells, grid_dy_cells, grid_dz_cells = occupancy.size() dx = torch.linspace(-1, 1, steps=grid_dx_cells) dy = torch.linspace(-1, 1, steps=grid_dy_cells) dz = torch.linspace(-1, 1, steps=grid_dz_cells) x_idx, y_idx, z_idx = torch.meshgrid(dx, dy, dz) identity_indices = torch.stack((x_idx, y_idx, z_idx), dim=0).to(device=occupancy.device) warped_occupancy = [] for k in range(num_waypoints): # 1 flow_origin_occupancy = occupancy[:, k] # B T 1 dx dy dz -> B 1 dx dy dz pred_flow = flow[:, k] # B T 3 dx dy dz -> B 3 dx dy dz # Normalize along the width and height direction normalize_pred_flow = torch.stack( (2.0 * pred_flow[:, 0] / (grid_dx_cells - 1), 2.0 * 
pred_flow[:, 1] / (grid_dy_cells - 1), 2.0 * pred_flow[:, 2] / (grid_dz_cells - 1),), dim=1, ) warped_indices = identity_indices + normalize_pred_flow warped_indices = warped_indices.permute(0, 2, 3, 4, 1) flow_origin_occupancy = flow_origin_occupancy.permute(0, 1, 4, 3, 2) sampled_occupancy = F.grid_sample( input=flow_origin_occupancy, grid=warped_indices, mode=mode, padding_mode='zeros', align_corners=True, ) warped_occupancy.append(sampled_occupancy) return warped_occupancy[0] def make_instance_id_temporally_consecutive(self, pred_inst, preds, backward_flow, ignore_index=255.0): assert pred_inst.shape[0] == 1, 'Assumes batch size = 1' # Initialise instance segmentations with prediction corresponding to the present consistent_instance_seg = [pred_inst.unsqueeze(0)] backward_flow = backward_flow.clone().detach() backward_flow[backward_flow == ignore_index] = 0.0 seq_len, _, dx, dy, dz = preds.shape for t in range(1, seq_len): init_warped_instance_seg = self.flow_warp(consistent_instance_seg[-1].unsqueeze(0).float(), backward_flow[t:t+1].unsqueeze(0)).int() warped_instance_seg = init_warped_instance_seg * preds[t:t+1, 0] consistent_instance_seg.append(warped_instance_seg) consistent_instance_seg = torch.cat(consistent_instance_seg, dim=1) return consistent_instance_seg def predict_instance_segmentation(self, pred_seg, pred_flow): pred_seg_sm = pred_seg.detach() pred_seg_sm = torch.argmax(pred_seg_sm, dim=1, keepdims=True) foreground_masks = pred_seg_sm.squeeze(1) == self.vehicles_id pred_inst_batch = self.get_instance_segmentation_and_centers( torch.softmax(pred_seg, dim=1)[0:1, self.vehicles_id].detach(), pred_flow[1:2].detach(), foreground_masks[1:2].detach(), nms_kernel_size=7, ) consistent_instance_seg = self.make_instance_id_temporally_consecutive( pred_inst_batch, pred_seg_sm[1:], pred_flow[1:].detach(), ) consistent_instance_seg = torch.cat([torch.zeros_like(pred_inst_batch.unsqueeze(0)), consistent_instance_seg], dim=1) return consistent_instance_seg.permute(1, 0, 2, 3, 4).long() # [1, 6, 512, 512, 40] def combine_mask(self, segmentation: torch.Tensor, instance: torch.Tensor, n_classes: int, n_all_things: int): ''' Shift all things ids by num_classes and combine things and stuff into a single mask ''' instance = instance.view(-1) instance_mask = instance > 0 instance = instance - 1 + n_classes segmentation = segmentation.clone().view(-1) segmentation_mask = segmentation < n_classes # Build an index from instance id to class id. 
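# The lookup built below pairs every shifted instance id (instance - 1 + n_classes) with the
# semantic class of the voxels it occupies; the first n_classes entries map stuff classes to
# themselves, and ids that never appear keep the sentinel value -1 so they are ignored when
# predictions are matched to targets.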
instance_id_to_class_tuples = torch.cat( ( instance[instance_mask & segmentation_mask].unsqueeze(1), segmentation[instance_mask & segmentation_mask].unsqueeze(1), ), dim=1, ) instance_id_to_class = -instance_id_to_class_tuples.new_ones((n_all_things,)) instance_id_to_class[instance_id_to_class_tuples[:, 0]] = instance_id_to_class_tuples[:, 1] instance_id_to_class[torch.arange(n_classes, device=segmentation.device)] = torch.arange( n_classes, device=segmentation.device ) segmentation[instance_mask] = instance[instance_mask] segmentation += 1 segmentation[~segmentation_mask] = 0 return segmentation, instance_id_to_class def panoptic_metrics(self, pred_segmentation, pred_instance, gt_segmentation, gt_instance, unique_id_mapping): # GMO and others n_classes = 2 self.keys = ['iou', 'true_positive', 'false_positive', 'false_negative'] # hard coding result = {key: torch.zeros(n_classes, dtype=torch.float32, device=gt_instance.device) for key in self.keys} assert pred_segmentation.dim() == 3 assert pred_segmentation.shape == pred_instance.shape == gt_segmentation.shape == gt_instance.shape n_instances = int(torch.cat([pred_instance, gt_instance]).max().item()) n_all_things = n_instances + n_classes # Classes + instances. n_things_and_void = n_all_things + 1 pred_segmentation = pred_segmentation.long().detach().cpu() pred_instance = pred_instance.long().detach().cpu() gt_segmentation = gt_segmentation.long().detach().cpu() gt_instance = gt_instance.long().detach().cpu() prediction, pred_to_cls = self.combine_mask(pred_segmentation, pred_instance, n_classes, n_all_things) target, target_to_cls = self.combine_mask(gt_segmentation, gt_instance, n_classes, n_all_things) # Compute ious between all stuff and things # hack for bincounting 2 arrays together x = prediction + n_things_and_void * target bincount_2d = torch.bincount(x.long(), minlength=n_things_and_void ** 2) if bincount_2d.shape[0] != n_things_and_void ** 2: raise ValueError('Incorrect bincount size.') conf = bincount_2d.reshape((n_things_and_void, n_things_and_void)) # Drop void class conf = conf[1:, 1:] # Confusion matrix contains intersections between all combinations of classes union = conf.sum(0).unsqueeze(0) + conf.sum(1).unsqueeze(1) - conf iou = torch.where(union > 0, (conf.float() + 1e-9) / (union.float() + 1e-9), torch.zeros_like(union).float()) mapping = (iou > self.iou_thresh_for_vpq).nonzero(as_tuple=False) # Check that classes match. is_matching = pred_to_cls[mapping[:, 1]] == target_to_cls[mapping[:, 0]] mapping = mapping[is_matching.detach().cpu().numpy()] tp_mask = torch.zeros_like(conf, dtype=torch.bool) tp_mask[mapping[:, 0], mapping[:, 1]] = True # First ids correspond to "stuff" i.e. semantic seg. # Instance ids are offset accordingly for target_id, pred_id in mapping: cls_id = pred_to_cls[pred_id] self.temporally_consistent = True # hard coding ! if self.temporally_consistent and cls_id == self.vehicles_id: if target_id.item() in unique_id_mapping and unique_id_mapping[target_id.item()] != pred_id.item(): # Not temporally consistent result['false_negative'][target_to_cls[target_id]] += 1 result['false_positive'][pred_to_cls[pred_id]] += 1 unique_id_mapping[target_id.item()] = pred_id.item() continue result['true_positive'][cls_id] += 1 result['iou'][cls_id] += iou[target_id][pred_id] unique_id_mapping[target_id.item()] = pred_id.item() for target_id in range(n_classes, n_all_things): # If this is a true positive do nothing. 
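# Standard panoptic-quality bookkeeping follows: ground-truth instances that matched no
# prediction are counted as false negatives, and predicted instances that matched nothing but
# still overlap some voxels are counted as false positives.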
if tp_mask[target_id, n_classes:].any(): continue # If this target instance didn't match with any predictions and was present set it as false negative. if target_to_cls[target_id] != -1: result['false_negative'][target_to_cls[target_id]] += 1 for pred_id in range(n_classes, n_all_things): # If this is a true positive do nothing. if tp_mask[n_classes:, pred_id].any(): continue # If this predicted instance didn't match with any prediction, set that predictions as false positive. if pred_to_cls[pred_id] != -1 and (conf[:, pred_id] > 0).any(): result['false_positive'][pred_to_cls[pred_id]] += 1 return result def evaluate_instance_prediction(self, pred_seg, pred_flow, gt_seg, gt_instance): B, H, W, D = gt_seg.shape pred_consistent_instance_seg = self.predict_instance_segmentation(pred_seg, pred_flow) # add one feature dimension for interpolate pred_consistent_instance_seg = F.interpolate(pred_consistent_instance_seg.float(), size=[H, W, D], mode='nearest').contiguous() pred_consistent_instance_seg = pred_consistent_instance_seg.squeeze(1) # [6,512,512,40] iou = 0 true_positive = 0 false_positive = 0 false_negative = 0 # starting from the present frame pred_instance = pred_consistent_instance_seg[self.eval_start_moment:] gt_instance = gt_instance[self.eval_start_moment:].long() assert gt_instance.min() == 0, 'ID 0 of gt_instance must be background' pred_segmentation = (pred_instance > 0).long() gt_segmentation = (gt_instance > 0).long() unique_id_mapping = {} for t in range(pred_segmentation.shape[0]): result = self.panoptic_metrics( pred_segmentation[t].detach(), pred_instance[t].detach(), gt_segmentation[t], gt_instance[t], unique_id_mapping, ) iou += result['iou'] true_positive += result['true_positive'] false_positive += result['false_positive'] false_negative += result['false_negative'] denominator = torch.maximum( (true_positive + false_positive / 2 + false_negative / 2), torch.ones_like(true_positive) ) pq = iou / denominator return pq.cpu().numpy() def forward_dummy(self, points=None, img_metas=None, img_inputs=None, points_occ=None, **kwargs, ): voxel_feats, flow_feats, img_feats, depth = self.extract_feat(img=img_inputs, img_metas=img_metas) transform = img_inputs[1:8] if img_inputs is not None else None output = self.pts_bbox_head( voxel_feats=voxel_feats, points=points_occ, img_metas=img_metas, img_feats=img_feats, transform=transform, ) return output def fast_hist(pred, label, max_label=18): pred = copy.deepcopy(pred.flatten()) label = copy.deepcopy(label.flatten()) bin_count = np.bincount(max_label * label.astype(int) + pred, minlength=max_label ** 2) iou_per_pred = (bin_count[-1]/(bin_count[-1]+bin_count[1]+bin_count[2])) return bin_count[:max_label ** 2].reshape(max_label, max_label),iou_per_pred ================================================ FILE: projects/occ_plugin/occupancy/fuser/__init__.py ================================================ from .addfuse import AddFuser from .visfuse import VisFuser from .convfuse import ConvFuser ================================================ FILE: projects/occ_plugin/occupancy/fuser/addfuse.py ================================================ import random from typing import List import torch from torch import nn from mmdet3d.models.builder import FUSION_LAYERS @FUSION_LAYERS.register_module() class AddFuser(nn.Module): def __init__(self, in_channels, out_channels, dropout, input_modality=None) -> None: super().__init__() self.in_channels = in_channels self.out_channels = out_channels self.dropout = dropout if input_modality == None: 
input_modality = dict( use_lidar=True, use_camera=True, use_radar=False, use_map=False, use_external=False) self.use_lidar = input_modality['use_lidar'] self.use_img = input_modality['use_camera'] if self.use_img: self.img_enc = nn.Sequential( nn.Conv3d(in_channels, out_channels, 3, padding=1, bias=False), nn.BatchNorm3d(out_channels), nn.ReLU(True), ) if self.use_lidar: self.pts_enc = nn.Sequential( nn.Conv3d(in_channels, out_channels, 3, padding=1, bias=False), nn.BatchNorm3d(out_channels), nn.ReLU(True), ) def forward(self, img_voxel_feats, pts_voxel_feats): features = [] if self.use_img: img_voxel_feats = self.img_enc(img_voxel_feats) features.append(img_voxel_feats) if self.use_lidar: pts_voxel_feats = self.pts_enc(pts_voxel_feats) features.append(pts_voxel_feats) weights = [1] * len(features) if self.training and random.random() < self.dropout: index = random.randint(0, len(features) - 1) weights[index] = 0 return sum(w * f for w, f in zip(weights, features)) / sum(weights) ================================================ FILE: projects/occ_plugin/occupancy/fuser/convfuse.py ================================================ import random from typing import List import torch from torch import nn from mmdet3d.models.builder import FUSION_LAYERS @FUSION_LAYERS.register_module() class ConvFuser(nn.Module): def __init__(self, in_channels, out_channels) -> None: super().__init__() self.in_channels = in_channels self.out_channels = out_channels self.occ_enc = nn.Sequential( nn.Conv3d(in_channels*2, out_channels, 3, padding=1, bias=False), nn.BatchNorm3d(out_channels), nn.ReLU(True), ) def forward(self, img_voxel_feats, pts_voxel_feats): return self.occ_enc(torch.cat([img_voxel_feats, pts_voxel_feats], dim=1)) ================================================ FILE: projects/occ_plugin/occupancy/fuser/visfuse.py ================================================ import random from typing import List import torch from torch import nn import torch.nn.functional as F from mmdet3d.models.builder import FUSION_LAYERS from mmcv.cnn import build_norm_layer @FUSION_LAYERS.register_module() class VisFuser(nn.Module): def __init__(self, in_channels, out_channels, norm_cfg=None) -> None: super().__init__() self.in_channels = in_channels self.out_channels = out_channels if norm_cfg is None: norm_cfg = dict(type='BN3d', eps=1e-3, momentum=0.01) self.img_enc = nn.Sequential( nn.Conv3d(in_channels, out_channels, 7, padding=3, bias=False), build_norm_layer(norm_cfg, out_channels)[1], # nn.BatchNorm3d(out_channels), nn.ReLU(True), ) self.pts_enc = nn.Sequential( nn.Conv3d(in_channels, out_channels, 7, padding=3, bias=False), build_norm_layer(norm_cfg, out_channels)[1], # nn.BatchNorm3d(out_channels), nn.ReLU(True), ) self.vis_enc = nn.Sequential( nn.Conv3d(2*out_channels, 16, 3, padding=1, bias=False), build_norm_layer(norm_cfg, 16)[1], # nn.BatchNorm3d(16), nn.ReLU(True), nn.Conv3d(16, 1, 1, padding=0, bias=False), nn.Sigmoid(), ) def forward(self, img_voxel_feats, pts_voxel_feats): img_voxel_feats = self.img_enc(img_voxel_feats) pts_voxel_feats = self.pts_enc(pts_voxel_feats) vis_weight = self.vis_enc(torch.cat([img_voxel_feats, pts_voxel_feats], dim=1)) voxel_feats = vis_weight * img_voxel_feats + (1 - vis_weight) * pts_voxel_feats return voxel_feats ================================================ FILE: projects/occ_plugin/occupancy/image2bev/ViewTransformerLSSBEVDepth.py ================================================ # Copyright (c) Phigent Robotics. All rights reserved. 
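# This file implements the Lift-Splat-Shoot style view transformation used by OCFNet:
# a per-pixel categorical depth distribution is predicted, multiplied with the image context
# features ("lift"), the resulting frustum points are mapped into the ego frame via
# get_geometry, and finally pooled into a regular voxel/BEV grid ("splat").
#
# Illustrative example (values follow the default grid_config below; not a guaranteed
# configuration): with xbound = ybound = [-51.2, 51.2, 0.8] and zbound = [-10.0, 10.0, 20.0],
# gen_dx_bx returns
#   dx = (0.8, 0.8, 20.0)    voxel size per axis
#   bx = (-50.8, -50.8, 0.0) centre of the first voxel
#   nx = (128, 128, 1)       number of voxels per axis
# and dbound = [1.0, 60.0, 1.0] yields D = 59 discrete depth bins in the frustum.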
import math import torch import torch.nn as nn from mmcv.runner import BaseModule from mmdet3d.models.builder import NECKS from projects.occ_plugin.ops.occ_pooling import occ_pool from mmcv.cnn import build_conv_layer, build_norm_layer from mmcv.runner import force_fp32 from torch.cuda.amp.autocast_mode import autocast from mmdet.models.backbones.resnet import BasicBlock import torch.nn.functional as F from torch.utils.checkpoint import checkpoint from scipy.special import erf from scipy.stats import norm import numpy as np import copy import pdb def gen_dx_bx(xbound, ybound, zbound): dx = torch.Tensor([row[2] for row in [xbound, ybound, zbound]]) bx = torch.Tensor([row[0] + row[2]/2.0 for row in [xbound, ybound, zbound]]) nx = torch.Tensor([(row[1] - row[0]) / row[2] for row in [xbound, ybound, zbound]]) return dx, bx, nx def cumsum_trick(x, geom_feats, ranks): x = x.cumsum(0) kept = torch.ones(x.shape[0], device=x.device, dtype=torch.bool) kept[:-1] = (ranks[1:] != ranks[:-1]) x, geom_feats = x[kept], geom_feats[kept] x = torch.cat((x[:1], x[1:] - x[:-1])) return x, geom_feats class QuickCumsum(torch.autograd.Function): @staticmethod def forward(ctx, x, geom_feats, ranks): x = x.cumsum(0) kept = torch.ones(x.shape[0], device=x.device, dtype=torch.bool) kept[:-1] = (ranks[1:] != ranks[:-1]) x, geom_feats = x[kept], geom_feats[kept] x = torch.cat((x[:1], x[1:] - x[:-1])) # save kept for backward ctx.save_for_backward(kept) # no gradient for geom_feats ctx.mark_non_differentiable(geom_feats) return x, geom_feats @staticmethod def backward(ctx, gradx, gradgeom): kept, = ctx.saved_tensors back = torch.cumsum(kept, 0) back[kept] -= 1 val = gradx[back] return val, None, None class ViewTransformerLiftSplatShoot(BaseModule): def __init__(self, grid_config=None, data_config=None, numC_input=512, numC_Trans=64, downsample=16, accelerate=False, use_bev_pool=True, vp_megvii=False, vp_stero=False, **kwargs): super(ViewTransformerLiftSplatShoot, self).__init__() if grid_config is None: grid_config = { 'xbound': [-51.2, 51.2, 0.8], 'ybound': [-51.2, 51.2, 0.8], 'zbound': [-10.0, 10.0, 20.0], 'dbound': [1.0, 60.0, 1.0],} self.grid_config = grid_config dx, bx, nx = gen_dx_bx(self.grid_config['xbound'], self.grid_config['ybound'], self.grid_config['zbound'], ) self.dx = nn.Parameter(dx, requires_grad=False) self.bx = nn.Parameter(bx, requires_grad=False) self.nx = nn.Parameter(nx, requires_grad=False) if data_config is None: data_config = {'input_size': (256, 704)} self.data_config = data_config self.downsample = downsample self.frustum = self.create_frustum() # D x H x W x 3 self.D, _, _, _ = self.frustum.shape self.numC_input = numC_input self.numC_Trans = numC_Trans self.depth_net = nn.Conv2d(self.numC_input, self.D + self.numC_Trans, kernel_size=1, padding=0) self.geom_feats = None self.accelerate = accelerate self.use_bev_pool = use_bev_pool self.vp_megvii = vp_megvii self.vp_stereo = vp_stero def get_depth_dist(self, x): return x.softmax(dim=1) def create_frustum(self): # make grid in image plane ogfH, ogfW = self.data_config['input_size'] fH, fW = ogfH // self.downsample, ogfW // self.downsample ds = torch.arange(*self.grid_config['dbound'], dtype=torch.float).view(-1, 1, 1).expand(-1, fH, fW) # dbound=[2.0, 58.0, 0.5] D, _, _ = ds.shape xs = torch.linspace(0, ogfW - 1, fW, dtype=torch.float).view(1, 1, fW).expand(D, fH, fW) ys = torch.linspace(0, ogfH - 1, fH, dtype=torch.float).view(1, fH, 1).expand(D, fH, fW) # D x H x W x 3 frustum = torch.stack((xs, ys, ds), -1) return nn.Parameter(frustum, 
requires_grad=False) def get_geometry(self, rots_seq, trans_seq, intrins_seq, post_rots_seq, post_trans_seq, bda): """Determine the (x,y,z) locations (in the ego frame) of the points in the point cloud. Returns B x N x D x H/downsample x W/downsample x 3 """ B, N, _ = trans_seq.shape # undo post-transformation # B x N x D x H x W x 3 points = self.frustum - post_trans_seq.view(B, N, 1, 1, 1, 3) points = torch.inverse(post_rots_seq).view(B, N, 1, 1, 1, 3, 3).matmul(points.unsqueeze(-1)) # cam_to_ego points = torch.cat((points[:, :, :, :, :, :2] * points[:, :, :, :, :, 2:3], points[:, :, :, :, :, 2:3] ), 5) if intrins_seq.shape[3] == 4: shift = intrins_seq[:, :, :3, 3] points = points - shift.view(B, N, 1, 1, 1, 3, 1) intrins_seq = intrins_seq[:, :, :3, :3] combine = rots_seq.matmul(torch.inverse(intrins_seq)) points = combine.view(B, N, 1, 1, 1, 3, 3).matmul(points).squeeze(-1) points += trans_seq.view(B, N, 1, 1, 1, 3) return points def voxel_pooling(self, geom_feats, x): B, N, D, H, W, C = x.shape Nprime = B * N * D * H * W nx = self.nx.to(torch.long) # flatten x x = x.reshape(Nprime, C) # flatten indices geom_feats = ((geom_feats - (self.bx - self.dx / 2.)) / self.dx).long() geom_feats = geom_feats.view(Nprime, 3) batch_ix = torch.cat([torch.full([Nprime // B, 1], ix, device=x.device, dtype=torch.long) for ix in range(B)]) geom_feats = torch.cat((geom_feats, batch_ix), 1) # filter out points that are outside box kept = (geom_feats[:, 0] >= 0) & (geom_feats[:, 0] < self.nx[0]) \ & (geom_feats[:, 1] >= 0) & (geom_feats[:, 1] < self.nx[1]) \ & (geom_feats[:, 2] >= 0) & (geom_feats[:, 2] < self.nx[2]) x = x[kept] geom_feats = geom_feats[kept] if self.use_bev_pool: final = occ_pool(x, geom_feats, B, self.nx[2], self.nx[0], self.nx[1]) final = final.transpose(dim0=-2, dim1=-1) else: # get tensors from the same voxel next to each other ranks = geom_feats[:, 0] * (self.nx[1] * self.nx[2] * B) \ + geom_feats[:, 1] * (self.nx[2] * B) \ + geom_feats[:, 2] * B \ + geom_feats[:, 3] sorts = ranks.argsort() x, geom_feats, ranks = x[sorts], geom_feats[sorts], ranks[sorts] # cumsum trick x, geom_feats = QuickCumsum.apply(x, geom_feats, ranks) # griddify (B x C x Z x X x Y) final = torch.zeros((B, C, nx[2], nx[1], nx[0]), device=x.device) final[geom_feats[:, 3], :, geom_feats[:, 2], geom_feats[:, 1], geom_feats[:, 0]] = x # collapse Z final = torch.cat(final.unbind(dim=2), 1) return final def voxel_pooling_accelerated(self, rots, trans, intrins, post_rots, post_trans, bda, x): B, N, D, H, W, C = x.shape Nprime = B * N * D * H * W nx = self.nx.to(torch.long) # flatten x x = x.reshape(Nprime, C) max = 300 # flatten indices if self.geom_feats is None: geom_feats = self.get_geometry(rots, trans, intrins, post_rots, post_trans, bda) geom_feats = ((geom_feats - (self.bx - self.dx / 2.)) / self.dx).long() geom_feats = geom_feats.view(Nprime, 3) batch_ix = torch.cat([torch.full([Nprime // B, 1], ix, device=x.device, dtype=torch.long) for ix in range(B)]) geom_feats = torch.cat((geom_feats, batch_ix), 1) # filter out points that are outside box kept1 = (geom_feats[:, 0] >= 0) & (geom_feats[:, 0] < self.nx[0]) \ & (geom_feats[:, 1] >= 0) & (geom_feats[:, 1] < self.nx[1]) \ & (geom_feats[:, 2] >= 0) & (geom_feats[:, 2] < self.nx[2]) idx = torch.range(0, x.shape[0] - 1, dtype=torch.long) x = x[kept1] idx = idx[kept1] geom_feats = geom_feats[kept1] # get tensors from the same voxel next to each other ranks = geom_feats[:, 0] * (self.nx[1] * self.nx[2] * B) \ + geom_feats[:, 1] * (self.nx[2] * B) \ + geom_feats[:, 2] * 
B \ + geom_feats[:, 3] sorts = ranks.argsort() x, geom_feats, ranks, idx = x[sorts], geom_feats[sorts], ranks[sorts], idx[sorts] repeat_id = torch.ones(geom_feats.shape[0], device=geom_feats.device, dtype=geom_feats.dtype) curr = 0 repeat_id[0] = 0 curr_rank = ranks[0] for i in range(1, ranks.shape[0]): if curr_rank == ranks[i]: curr += 1 repeat_id[i] = curr else: curr_rank = ranks[i] curr = 0 repeat_id[i] = curr kept2 = repeat_id < max repeat_id, geom_feats, x, idx = repeat_id[kept2], geom_feats[kept2], x[kept2], idx[kept2] geom_feats = torch.cat([geom_feats, repeat_id.unsqueeze(-1)], dim=-1) self.geom_feats = geom_feats self.idx = idx else: geom_feats = self.geom_feats idx = self.idx x = x[idx] # griddify (B x C x Z x X x Y) final = torch.zeros((B, C, nx[2], nx[1], nx[0], max), device=x.device) final[geom_feats[:, 3], :, geom_feats[:, 2], geom_feats[:, 1], geom_feats[:, 0], geom_feats[:, 4]] = x final = final.sum(-1) # collapse Z final = torch.cat(final.unbind(dim=2), 1) return final def voxel_pooling_bevdepth(self, geom_feats, x): nx = self.nx.to(torch.long) geom_feats = ((geom_feats - (self.bx - self.dx / 2.)) / self.dx).int() # FIXME # final = voxel_pooling(geom_feats, x.contiguous(), nx) final = self.voxel_pooling(geom_feats, x.contiguous(), nx) return final def forward(self, input): x, rots, trans, intrins, post_rots, post_trans, bda = input B, N, C, H, W = x.shape x = x.view(B * N, C, H, W) x = self.depth_net(x) depth = self.get_depth_dist(x[:, :self.D]) img_feat = x[:, self.D:(self.D + self.numC_Trans)] # Lift volume = depth.unsqueeze(1) * img_feat.unsqueeze(2) volume = volume.view(B, N, self.numC_Trans, self.D, H, W) volume = volume.permute(0, 1, 3, 4, 5, 2) # Splat if self.accelerate: bev_feat = self.voxel_pooling_accelerated(rots, trans, intrins, post_rots, post_trans, bda, volume) else: geom = self.get_geometry(rots, trans, intrins, post_rots, post_trans, bda) if self.vp_megvii: bev_feat = self.voxel_pooling_bevdepth(geom, volume) else: bev_feat = self.voxel_pooling(geom, volume) return bev_feat class _ASPPModule(nn.Module): def __init__(self, inplanes, planes, kernel_size, padding, dilation, BatchNorm): super(_ASPPModule, self).__init__() self.atrous_conv = nn.Conv2d(inplanes, planes, kernel_size=kernel_size, stride=1, padding=padding, dilation=dilation, bias=False) self.bn = BatchNorm self.relu = nn.ReLU() self._init_weight() def forward(self, x): x = self.atrous_conv(x) x = self.bn(x) return self.relu(x) def _init_weight(self): for m in self.modules(): if isinstance(m, nn.Conv2d): torch.nn.init.kaiming_normal_(m.weight) elif isinstance(m, nn.BatchNorm2d): m.weight.data.fill_(1) m.bias.data.zero_() class ASPP(nn.Module): def __init__(self, inplanes, mid_channels=256, norm_cfg=dict(type='BN2d')): super(ASPP, self).__init__() dilations = [1, 6, 12, 18] self.aspp1 = _ASPPModule(inplanes, mid_channels, 1, padding=0, dilation=dilations[0], BatchNorm=build_norm_layer(norm_cfg, mid_channels)[1]) self.aspp2 = _ASPPModule(inplanes, mid_channels, 3, padding=dilations[1], dilation=dilations[1], BatchNorm=build_norm_layer(norm_cfg, mid_channels)[1]) self.aspp3 = _ASPPModule(inplanes, mid_channels, 3, padding=dilations[2], dilation=dilations[2], BatchNorm=build_norm_layer(norm_cfg, mid_channels)[1]) self.aspp4 = _ASPPModule(inplanes, mid_channels, 3, padding=dilations[3], dilation=dilations[3], BatchNorm=build_norm_layer(norm_cfg, mid_channels)[1]) self.global_avg_pool = nn.Sequential( nn.AdaptiveAvgPool2d((1, 1)), nn.Conv2d(inplanes, mid_channels, 1, stride=1, bias=False), 
build_norm_layer(norm_cfg, mid_channels)[1], nn.ReLU(), ) self.conv1 = nn.Conv2d(int(mid_channels * 5), mid_channels, 1, bias=False) self.bn1 = build_norm_layer(norm_cfg, mid_channels)[1] self.relu = nn.ReLU() self.dropout = nn.Dropout(0.5) self._init_weight() def forward(self, x): x1 = self.aspp1(x) x2 = self.aspp2(x) x3 = self.aspp3(x) x4 = self.aspp4(x) x5 = self.global_avg_pool(x) x5 = F.interpolate(x5, size=x4.size()[2:], mode='bilinear', align_corners=True) x = torch.cat((x1, x2, x3, x4, x5), dim=1) x = self.conv1(x) x = self.bn1(x) x = self.relu(x) return self.dropout(x) def _init_weight(self): for m in self.modules(): if isinstance(m, nn.Conv2d): torch.nn.init.kaiming_normal_(m.weight) elif isinstance(m, nn.BatchNorm2d): m.weight.data.fill_(1) m.bias.data.zero_() class Mlp(nn.Module): def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.ReLU, drop=0.0): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features self.fc1 = nn.Linear(in_features, hidden_features) self.act = act_layer() self.drop1 = nn.Dropout(drop) self.fc2 = nn.Linear(hidden_features, out_features) self.drop2 = nn.Dropout(drop) def forward(self, x): x = self.fc1(x) x = self.act(x) x = self.drop1(x) x = self.fc2(x) x = self.drop2(x) return x class SELayer(nn.Module): def __init__(self, channels, act_layer=nn.ReLU, gate_layer=nn.Sigmoid): super().__init__() self.conv_reduce = nn.Conv2d(channels, channels, 1, bias=True) self.act1 = act_layer() self.conv_expand = nn.Conv2d(channels, channels, 1, bias=True) self.gate = gate_layer() def forward(self, x, x_se): x_se = self.conv_reduce(x_se) x_se = self.act1(x_se) x_se = self.conv_expand(x_se) return x * self.gate(x_se) class DepthNet(nn.Module): def __init__(self, in_channels, mid_channels, context_channels, depth_channels, cam_channels=27, norm_cfg=None): super(DepthNet, self).__init__() self.reduce_conv = nn.Sequential( nn.Conv2d(in_channels, mid_channels, kernel_size=3, stride=1, padding=1), build_norm_layer(norm_cfg, mid_channels)[1], nn.ReLU(inplace=True), ) self.context_conv = nn.Conv2d(mid_channels, context_channels, kernel_size=1, stride=1, padding=0) self.bn = build_norm_layer(dict(type='GN', num_groups=9, requires_grad=True), cam_channels)[1] self.depth_mlp = Mlp(cam_channels, mid_channels, mid_channels) self.depth_se = SELayer(mid_channels) # NOTE: add camera-aware self.context_mlp = Mlp(cam_channels, mid_channels, mid_channels) self.context_se = SELayer(mid_channels) # NOTE: add camera-aware self.depth_conv = nn.Sequential( BasicBlock(mid_channels, mid_channels, norm_cfg=norm_cfg), BasicBlock(mid_channels, mid_channels, norm_cfg=norm_cfg), BasicBlock(mid_channels, mid_channels, norm_cfg=norm_cfg), ASPP(mid_channels, mid_channels, norm_cfg=norm_cfg), build_conv_layer(cfg=dict( type='DCN', in_channels=mid_channels, out_channels=mid_channels, kernel_size=3, padding=1, groups=4, im2col_step=128, )), nn.Conv2d(mid_channels, depth_channels, kernel_size=1, stride=1, padding=0), ) def forward(self, x, mlp_input): mlp_input = self.bn(mlp_input.reshape(-1, mlp_input.shape[-1])) x = self.reduce_conv(x) context_se = self.context_mlp(mlp_input)[..., None, None] context = self.context_se(x, context_se) context = self.context_conv(context) depth_se = self.depth_mlp(mlp_input)[..., None, None] depth = self.depth_se(x, depth_se) depth = self.depth_conv(depth) return torch.cat([depth, context], dim=1) class DepthAggregation(nn.Module): """ pixel cloud feature extraction """ def __init__(self, 
in_channels, mid_channels, out_channels, norm_cfg): super(DepthAggregation, self).__init__() self.reduce_conv = nn.Sequential( nn.Conv2d(in_channels, mid_channels, kernel_size=3, stride=1, padding=1, bias=False), build_norm_layer(norm_cfg, mid_channels)[1], nn.ReLU(inplace=True), ) self.conv = nn.Sequential( nn.Conv2d(mid_channels, mid_channels, kernel_size=3, stride=1, padding=1, bias=False), build_norm_layer(norm_cfg, mid_channels)[1], nn.ReLU(inplace=True), nn.Conv2d(mid_channels, mid_channels, kernel_size=3, stride=1, padding=1, bias=False), build_norm_layer(norm_cfg, mid_channels)[1], nn.ReLU(inplace=True), ) self.out_conv = nn.Sequential( nn.Conv2d(mid_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=True), # nn.BatchNorm3d(out_channels), # nn.ReLU(inplace=True), ) @autocast(False) def forward(self, x): x = checkpoint(self.reduce_conv, x) short_cut = x x = checkpoint(self.conv, x) x = short_cut + x x = self.out_conv(x) return x @NECKS.register_module() class ViewTransformerLSSBEVDepth(ViewTransformerLiftSplatShoot): def __init__(self, loss_depth_weight, cam_channels=27, loss_depth_reg_weight=0.0, use_voxel_net=False, norm_cfg=dict(type='BN2d', eps=1e-3, momentum=0.01), **kwargs): super(ViewTransformerLSSBEVDepth, self).__init__(**kwargs) self.loss_depth_weight = loss_depth_weight self.loss_depth_reg_weight = loss_depth_reg_weight self.cam_channels = cam_channels self.depth_net = DepthNet(self.numC_input, self.numC_input, self.numC_Trans, self.D, cam_channels=self.cam_channels, norm_cfg=norm_cfg) self.depth_aggregation_net = DepthAggregation(self.numC_Trans, self.numC_Trans, self.numC_Trans, norm_cfg=norm_cfg) if use_voxel_net else None def _forward_voxel_net(self, img_feat_with_depth): # BEVConv2D [n, c, d, h, w] -> [n, h, c, w, d] if self.depth_aggregation_net is None: return img_feat_with_depth img_feat_with_depth = img_feat_with_depth.permute( 0, 3, 1, 4, 2).contiguous() # [n, c, d, h, w] -> [n, h, c, w, d] n, h, c, w, d = img_feat_with_depth.shape img_feat_with_depth = img_feat_with_depth.view(-1, c, w, d) img_feat_with_depth = ( self.depth_aggregation_net(img_feat_with_depth).view( n, h, c, w, d).permute(0, 2, 4, 1, 3).contiguous().float()) return img_feat_with_depth def get_mlp_input(self, rot, tran, intrin, post_rot, post_tran, bda=None): B,N,_,_ = rot.shape if bda is None: bda = torch.eye(3).to(rot).view(1,3,3).repeat(B,1,1) bda = bda.view(B,1,3,3).repeat(1,N,1,1) if intrin.shape[-1] == 4: # for KITTI, the intrin matrix is 3x4 mlp_input = torch.stack([ intrin[:, :, 0, 0], intrin[:, :, 1, 1], intrin[:, :, 0, 2], intrin[:, :, 1, 2], intrin[:, :, 0, 3], intrin[:, :, 1, 3], intrin[:, :, 2, 3], post_rot[:, :, 0, 0], post_rot[:, :, 0, 1], post_tran[:, :, 0], post_rot[:, :, 1, 0], post_rot[:, :, 1, 1], post_tran[:, :, 1], bda[:, :, 0, 0], bda[:, :, 0, 1], bda[:, :, 1, 0], bda[:, :, 1, 1], bda[:, :, 2, 2], ], dim=-1) else: mlp_input = torch.stack([ intrin[:, :, 0, 0], intrin[:, :, 1, 1], intrin[:, :, 0, 2], intrin[:, :, 1, 2], post_rot[:, :, 0, 0], post_rot[:, :, 0, 1], post_tran[:, :, 0], post_rot[:, :, 1, 0], post_rot[:, :, 1, 1], post_tran[:, :, 1], bda[:, :, 0, 0], bda[:, :, 0, 1], bda[:, :, 1, 0], bda[:, :, 1, 1], bda[:, :, 2, 2], ], dim=-1) sensor2ego = torch.cat([rot, tran.reshape(B, N, 3, 1)], dim=-1).reshape(B, N, -1) mlp_input = torch.cat([mlp_input, sensor2ego], dim=-1) return mlp_input def get_downsampled_gt_depth(self, gt_depths): """ Input: gt_depths: [B, N, H, W] Output: gt_depths: [B*N*h*w, d] """ B, N, H, W = gt_depths.shape gt_depths = 
gt_depths.view(B * N, H // self.downsample, self.downsample, W // self.downsample, self.downsample, 1) gt_depths = gt_depths.permute(0, 1, 3, 5, 2, 4).contiguous() gt_depths = gt_depths.view(-1, self.downsample * self.downsample) gt_depths_tmp = torch.where(gt_depths == 0.0, 1e5 * torch.ones_like(gt_depths), gt_depths) gt_depths = torch.min(gt_depths_tmp, dim=-1).values gt_depths = gt_depths.view(B * N, H // self.downsample, W // self.downsample) # [min - step / 2, min + step / 2] creates min depth gt_depths = (gt_depths - (self.grid_config['dbound'][0] - self.grid_config['dbound'][2] / 2)) / self.grid_config['dbound'][2] gt_depths = torch.where((gt_depths < self.D + 1) & (gt_depths >= 0.0), gt_depths, torch.zeros_like(gt_depths)) gt_depths = F.one_hot(gt_depths.long(), num_classes=self.D + 1).view(-1, self.D + 1)[:, 1:] return gt_depths.float() def _prepare_depth_gt(self, gt_depths): """ Input: gt_depths: [B, N, H, W] Output: gt_depths: [B*N*H*W, d] """ gt_depths = (gt_depths - (self.grid_config['dbound'][0] - self.grid_config['dbound'][2])) / \ self.grid_config['dbound'][2] gt_depths = torch.where((gt_depths < self.D + 1) & (gt_depths >= 0.0), gt_depths, torch.zeros_like(gt_depths)) gt_depths = F.one_hot(gt_depths.long(), num_classes=self.D + 1).view(-1, self.D + 1)[:, 1:] return gt_depths.float() @force_fp32() def get_depth_reg_loss(self, depth_labels, depth_preds): depth_labels = self.get_downsampled_gt_depth(depth_labels) # depth_labels = self._prepare_depth_gt(depth_labels) depth_preds = depth_preds.permute(0, 2, 3, 1).contiguous().view(-1, self.D) # foreground predictions & labels fg_mask = torch.max(depth_labels, dim=1).values > 0.0 depth_labels = depth_labels[fg_mask] depth_preds = depth_preds[fg_mask] # cls_targets ==> reg_targets ds = torch.arange(*self.grid_config['dbound'], dtype=torch.float).view(1, -1).type_as(depth_preds) depth_reg_labels = torch.sum(depth_labels * ds, dim=1) depth_reg_preds = torch.sum(depth_preds * ds, dim=1) with autocast(enabled=False): loss_depth = F.smooth_l1_loss(depth_reg_preds, depth_reg_labels, reduction='mean') return self.loss_depth_reg_weight * loss_depth @force_fp32() def get_depth_loss(self, depth_labels, depth_preds): depth_labels = self.get_downsampled_gt_depth(depth_labels) # depth_labels = self._prepare_depth_gt(depth_labels) depth_preds = depth_preds.permute(0, 2, 3, 1).contiguous().view( -1, self.D) fg_mask = torch.max(depth_labels, dim=1).values > 0.0 depth_labels = depth_labels[fg_mask] depth_preds = depth_preds[fg_mask] with autocast(enabled=False): depth_loss = F.binary_cross_entropy( depth_preds, depth_labels, reduction='none', ).sum() / max(1.0, fg_mask.sum()) return self.loss_depth_weight * depth_loss def forward(self, input): (x, rots, trans, intrins, post_rots, post_trans, bda, mlp_input) = input[:8] B, N, C, H, W = x.shape x = x.view(B * N, C, H, W) x = self.depth_net(x, mlp_input) depth_digit = x[:, :self.D, ...] img_feat = x[:, self.D:self.D+self.numC_Trans, ...] 
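# depth_digit holds the D depth-bin logits and img_feat the numC_Trans context channels
# produced by DepthNet for every camera view. The softmax below turns the logits into a
# per-pixel depth distribution, and the outer product
# depth_prob.unsqueeze(1) * img_feat.unsqueeze(2) lifts the features into a
# (B*N, C, D, H, W) frustum volume that voxel_pooling later splats onto the grid.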
depth_prob = self.get_depth_dist(depth_digit) # Lift volume = depth_prob.unsqueeze(1) * img_feat.unsqueeze(2) volume = self._forward_voxel_net(volume) volume = volume.view(B, N, self.numC_Trans, self.D, H, W) volume = volume.permute(0, 1, 3, 4, 5, 2) # Splat if self.accelerate: bev_feat = self.voxel_pooling_accelerated(rots, trans, intrins, post_rots, post_trans, bda, volume) else: geom = self.get_geometry(rots, trans, intrins, post_rots, post_trans, bda) if self.vp_megvii: bev_feat = self.voxel_pooling_bevdepth(geom, volume) else: bev_feat = self.voxel_pooling(geom, volume) return bev_feat, depth_prob class ConvBnReLU3D(nn.Module): """Implements of 3d convolution + batch normalization + ReLU.""" def __init__( self, in_channels: int, out_channels: int, kernel_size: int = 3, stride: int = 1, pad: int = 1, dilation: int = 1, ) -> None: """initialization method for convolution3D + batch normalization + relu module Args: in_channels: input channel number of convolution layer out_channels: output channel number of convolution layer kernel_size: kernel size of convolution layer stride: stride of convolution layer pad: pad of convolution layer dilation: dilation of convolution layer """ super(ConvBnReLU3D, self).__init__() self.conv = nn.Conv3d(in_channels, out_channels, kernel_size, stride=stride, padding=pad, dilation=dilation, bias=False) self.bn = nn.BatchNorm3d(out_channels) def forward(self, x: torch.Tensor) -> torch.Tensor: """forward method""" return F.relu(self.bn(self.conv(x)), inplace=True) class DepthNetStereo(nn.Module): def __init__(self, in_channels, mid_channels, context_channels, depth_channels, d_bound, num_ranges=4, norm_cfg=dict(type='BN', requires_grad=True)): super(DepthNetStereo, self).__init__() self.reduce_conv = nn.Sequential( nn.Conv2d(in_channels, mid_channels, kernel_size=3, stride=1, padding=1), nn.BatchNorm2d(mid_channels), nn.ReLU(inplace=True), ) self.context_conv = nn.Conv2d(mid_channels, context_channels, kernel_size=1, stride=1, padding=0) self.bn = nn.BatchNorm1d(27) self.depth_mlp = Mlp(27, mid_channels, mid_channels) self.depth_se = SELayer(mid_channels) # NOTE: add camera-aware self.context_mlp = Mlp(27, mid_channels, mid_channels) self.context_se = SELayer(mid_channels) # NOTE: add camera-aware self.depth_feat_conv = nn.Sequential( BasicBlock(mid_channels, mid_channels, norm_cfg=norm_cfg), BasicBlock(mid_channels, mid_channels, norm_cfg=norm_cfg), ASPP(mid_channels, mid_channels, norm_cfg=norm_cfg), build_conv_layer(cfg=dict( type='DCN', in_channels=mid_channels, out_channels=mid_channels, kernel_size=3, padding=1, groups=4, im2col_step=128, )), ) self.mu_sigma_range_net = nn.Sequential( BasicBlock(mid_channels, mid_channels), nn.ConvTranspose2d(mid_channels, mid_channels, 3, stride=2, padding=1, output_padding=1), nn.BatchNorm2d(mid_channels), nn.ReLU(inplace=True), nn.ConvTranspose2d(mid_channels, mid_channels, 3, stride=2, padding=1, output_padding=1), nn.BatchNorm2d(mid_channels), nn.ReLU(inplace=True), nn.Conv2d(mid_channels, num_ranges * 3, kernel_size=1, stride=1, padding=0), ) self.mono_depth_net = nn.Sequential( BasicBlock(mid_channels, mid_channels), nn.Conv2d(mid_channels, depth_channels, kernel_size=1, stride=1, padding=0), ) self.d_bound = d_bound self.num_ranges = num_ranges # @autocast(False) def forward(self, x, mlp_input): B, _, H, W = x.shape mlp_input = self.bn(mlp_input.reshape(-1, mlp_input.shape[-1])) x = self.reduce_conv(x) context_se = self.context_mlp(mlp_input)[..., None, None] context = self.context_se(x, context_se) context = 
self.context_conv(context) depth_se = self.depth_mlp(mlp_input)[..., None, None] depth_feat = self.depth_se(x, depth_se) depth_feat = checkpoint(self.depth_feat_conv, depth_feat) mono_depth = checkpoint(self.mono_depth_net, depth_feat) mu_sigma_score = checkpoint(self.mu_sigma_range_net, depth_feat) mu = mu_sigma_score[:, 0:self.num_ranges, ...] sigma = mu_sigma_score[:, self.num_ranges:2 * self.num_ranges, ...] range_score = mu_sigma_score[:, 2 * self.num_ranges:3 * self.num_ranges, ...] sigma = F.elu(sigma) + 1.0 + 1e-10 return x, context, mu, sigma, range_score, mono_depth @NECKS.register_module() class ViewTransformerLSSBEVStereo(ViewTransformerLSSBEVDepth): def __init__(self, num_ranges=4, use_mask=True, em_iteration=3, range_list=[[2, 8], [8, 16], [16, 28], [28, 58]], sampling_range=3, num_samples=3, k_list=None, min_sigma=1.0, num_groups=8, stereo_downsample_factor=4, norm_cfg=dict(type='BN2d'), **kwargs): super(ViewTransformerLSSBEVStereo, self).__init__(**kwargs) self.num_ranges = num_ranges self.depth_net = DepthNetStereo(self.numC_input, self.numC_input, self.numC_Trans, self.D, self.grid_config['dbound'], self.num_ranges, norm_cfg=norm_cfg) self.context_downsample_net = nn.Identity() self.use_mask = use_mask self.stereo_downsample_factor = stereo_downsample_factor self.num_ranges = num_ranges self.min_sigma = min_sigma self.sampling_range = sampling_range self.num_samples = num_samples self.num_groups=num_groups self.similarity_net = nn.Sequential( ConvBnReLU3D(in_channels=num_groups, out_channels=16, kernel_size=1, stride=1, pad=0), ConvBnReLU3D(in_channels=16, out_channels=8, kernel_size=1, stride=1, pad=0), nn.Conv3d(in_channels=8, out_channels=1, kernel_size=1, stride=1, padding=0), ) self.depth_downsample_net = nn.Sequential( nn.Conv2d(self.D, 256, 3, 2, 1), nn.BatchNorm2d(256), nn.ReLU(), nn.Conv2d(256, 256, 3, 2, 1), nn.BatchNorm2d(256), nn.ReLU(), nn.Conv2d(256, self.D, 1, 1, 0), ) if range_list is None: range_length = (self.grid_config['dbound'][1] - self.grid_config['dbound'][0]) / num_ranges self.range_list = [[ self.grid_config['dbound'][0] + range_length * i, self.grid_config['dbound'][0] + range_length * (i + 1) ] for i in range(num_ranges)] else: assert len(range_list) == num_ranges self.range_list = range_list self.em_iteration = em_iteration if k_list is None: self.register_buffer('k_list', torch.Tensor(self.depth_sampling())) else: self.register_buffer('k_list', torch.Tensor(k_list)) if self.use_mask: self.mask_net = nn.Sequential( nn.Conv2d(self.D*2, 64, 3, 1, 1), nn.BatchNorm2d(64), nn.ReLU(inplace=True), BasicBlock(64, 64), BasicBlock(64, 64), nn.Conv2d(64, 1, 1, 1, 0), nn.Sigmoid(), ) def depth_sampling(self): """Generate sampling range of candidates. Returns: list[float]: List of all candidates. 
""" P_total = erf(self.sampling_range / np.sqrt(2)) # Probability covered by the sampling range idx_list = np.arange(0, self.num_samples + 1) p_list = (1 - P_total) / 2 + ((idx_list / self.num_samples) * P_total) k_list = norm.ppf(p_list) k_list = (k_list[1:] + k_list[:-1]) / 2 return list(k_list) def create_depth_sample_frustum(self, depth_sample, downsample_factor=16): """Generate frustum""" # make grid in image plane ogfH, ogfW = self.data_config['input_size'] fH, fW = ogfH // downsample_factor, ogfW // downsample_factor batch_size, num_depth, _, _ = depth_sample.shape x_coords = (torch.linspace(0, ogfW - 1, fW, dtype=torch.float, device=depth_sample.device).view( 1, 1, 1, fW).expand(batch_size, num_depth, fH, fW)) y_coords = (torch.linspace(0, ogfH - 1, fH, dtype=torch.float, device=depth_sample.device).view( 1, 1, fH, 1).expand(batch_size, num_depth, fH, fW)) paddings = torch.ones_like(depth_sample) # D x H x W x 3 frustum = torch.stack((x_coords, y_coords, depth_sample, paddings), -1) return frustum def homo_warping( self, stereo_feat, key_intrin_mats, sweep_intrin_mats, sensor2sensor_mats, key_ida_mats, sweep_ida_mats, depth_sample, frustum, ): """Used for mvs method to transfer sweep image feature to key image feature. Args: src_fea(Tensor): image features. key_intrin_mats(Tensor): Intrin matrix for key sensor. sweep_intrin_mats(Tensor): Intrin matrix for sweep sensor. sensor2sensor_mats(Tensor): Transformation matrix from key sensor to sweep sensor. key_ida_mats(Tensor): Ida matrix for key frame. sweep_ida_mats(Tensor): Ida matrix for sweep frame. depth_sample (Tensor): Depth map of all candidates. depth_sample_frustum (Tensor): Pre-generated frustum. """ batch_size_with_num_cams, channels = stereo_feat.shape[ 0], stereo_feat.shape[1] height, width = stereo_feat.shape[2], stereo_feat.shape[3] with torch.no_grad(): points = frustum points = points.reshape(points.shape[0], -1, points.shape[-1]) points[..., 2] = 1 # Undo ida for key frame. points = key_ida_mats.reshape(batch_size_with_num_cams, *key_ida_mats.shape[2:]).inverse( ).unsqueeze(1) @ points.unsqueeze(-1) # Convert points from pixel coord to key camera coord. points[..., :3, :] *= depth_sample.reshape( batch_size_with_num_cams, -1, 1, 1) num_depth = frustum.shape[1] points = (key_intrin_mats.reshape( batch_size_with_num_cams, *key_intrin_mats.shape[2:]).inverse().unsqueeze(1) @ points) points = (sensor2sensor_mats.reshape( batch_size_with_num_cams, *sensor2sensor_mats.shape[2:]).unsqueeze(1) @ points) # points in sweep sensor coord. points = (sweep_intrin_mats.reshape( batch_size_with_num_cams, *sweep_intrin_mats.shape[2:]).unsqueeze(1) @ points) # points in sweep pixel coord. 
points[..., :2, :] = points[..., :2, :] / points[ ..., 2:3, :] # [B, 2, Ndepth, H*W] points = (sweep_ida_mats.reshape( batch_size_with_num_cams, *sweep_ida_mats.shape[2:]).unsqueeze(1) @ points).squeeze(-1) neg_mask = points[..., 2] < 1e-3 points[..., 0][neg_mask] = width * self.stereo_downsample_factor points[..., 1][neg_mask] = height * self.stereo_downsample_factor points[..., 2][neg_mask] = 1 proj_x_normalized = points[..., 0] / ( (width * self.stereo_downsample_factor - 1) / 2) - 1 proj_y_normalized = points[..., 1] / ( (height * self.stereo_downsample_factor - 1) / 2) - 1 grid = torch.stack([proj_x_normalized, proj_y_normalized], dim=2) # [B, Ndepth, H*W, 2] warped_stereo_fea = F.grid_sample( stereo_feat, grid.view(batch_size_with_num_cams, num_depth * height, width, 2), mode='bilinear', padding_mode='zeros', ) warped_stereo_fea = warped_stereo_fea.view(batch_size_with_num_cams, channels, num_depth, height, width) return warped_stereo_fea def _forward_mask( self, sweep_index, mono_depth_all_sweeps, mats_dict, depth_sample, depth_sample_frustum, sensor2sensor_mats, ): """Forward function to generate mask. Args: sweep_index (int): Index of sweep. mono_depth_all_sweeps (list[Tensor]): List of mono_depth for all sweeps. mats_dict (dict): sensor2ego_mats (Tensor): Transformation matrix from camera to ego with shape of (B, num_sweeps, num_cameras, 4, 4). intrin_mats (Tensor): Intrinsic matrix with shape of (B, num_sweeps, num_cameras, 4, 4). ida_mats (Tensor): Transformation matrix for ida with shape of (B, num_sweeps, num_cameras, 4, 4). sensor2sensor_mats (Tensor): Transformation matrix from key frame camera to sweep frame camera with shape of (B, num_sweeps, num_cameras, 4, 4). bda_mat (Tensor): Rotation matrix for bda with shape of (B, 4, 4). depth_sample (Tensor): Depth map of all candidates. depth_sample_frustum (Tensor): Pre-generated frustum. sensor2sensor_mats (Tensor): Transformation matrix from reference sensor to source sensor. Returns: Tensor: Generated mask. """ num_sweeps = len(mono_depth_all_sweeps) mask_all_sweeps = list() for idx in range(num_sweeps): if idx == sweep_index: continue warped_mono_depth = self.homo_warping( mono_depth_all_sweeps[idx], mats_dict['intrin_mats'][:, sweep_index, ...], mats_dict['intrin_mats'][:, idx, ...], sensor2sensor_mats[idx], mats_dict['ida_mats'][:, sweep_index, ...], mats_dict['ida_mats'][:, idx, ...], depth_sample, depth_sample_frustum.type_as(mono_depth_all_sweeps[idx]), ) mask = self.mask_net( torch.cat([ mono_depth_all_sweeps[sweep_index].detach(), warped_mono_depth.mean(2).detach() ], 1)) mask_all_sweeps.append(mask) return torch.stack(mask_all_sweeps).mean(0) def _generate_cost_volume( self, sweep_index, stereo_feats_all_sweeps, mats_dict, depth_sample, depth_sample_frustum, sensor2sensor_mats, ): """Generate cost volume based on depth sample. Args: sweep_index (int): Index of sweep. stereo_feats_all_sweeps (list[Tensor]): Stereo feature of all sweeps. mats_dict (dict): sensor2ego_mats (Tensor): Transformation matrix from camera to ego with shape of (B, num_sweeps, num_cameras, 4, 4). intrin_mats (Tensor): Intrinsic matrix with shape of (B, num_sweeps, num_cameras, 4, 4). ida_mats (Tensor): Transformation matrix for ida with shape of (B, num_sweeps, num_cameras, 4, 4). sensor2sensor_mats (Tensor): Transformation matrix from key frame camera to sweep frame camera with shape of (B, num_sweeps, num_cameras, 4, 4). bda_mat (Tensor): Rotation matrix for bda with shape of (B, 4, 4). depth_sample (Tensor): Depth map of all candidates. 
depth_sample_frustum (Tensor): Pre-generated frustum. sensor2sensor_mats (Tensor): Transformation matrix from reference sensor to source sensor. Returns: Tensor: Depth score for all sweeps. """ batch_size, num_channels, height, width = stereo_feats_all_sweeps[ 0].shape # thres = int(self.mvs_weighting.split("CW")[1]) num_sweeps = len(stereo_feats_all_sweeps) depth_score_all_sweeps = list() for idx in range(num_sweeps): if idx == sweep_index: continue warped_stereo_fea = self.homo_warping( stereo_feats_all_sweeps[idx], mats_dict['intrin_mats'][:, sweep_index, ...], mats_dict['intrin_mats'][:, idx, ...], sensor2sensor_mats[idx], mats_dict['ida_mats'][:, sweep_index, ...], mats_dict['ida_mats'][:, idx, ...], depth_sample, depth_sample_frustum.type_as(stereo_feats_all_sweeps[idx]), ) warped_stereo_fea = warped_stereo_fea.reshape( batch_size, self.num_groups, num_channels // self.num_groups, self.num_samples, height, width) ref_stereo_feat = stereo_feats_all_sweeps[sweep_index].reshape( batch_size, self.num_groups, num_channels // self.num_groups, height, width) feat_cost = torch.mean( (ref_stereo_feat.unsqueeze(3) * warped_stereo_fea), axis=2) depth_score = self.similarity_net(feat_cost).squeeze(1) depth_score_all_sweeps.append(depth_score) return torch.stack(depth_score_all_sweeps).mean(0) def _forward_stereo( self, sweep_index, stereo_feats_all_sweeps, mono_depth_all_sweeps, mats_dict, sensor2sensor_mats, mu_all_sweeps, sigma_all_sweeps, range_score_all_sweeps, depth_feat_all_sweeps, ): """Forward function to generate stereo depth. Args: sweep_index (int): Index of sweep. stereo_feats_all_sweeps (list[Tensor]): Stereo feature of all sweeps. mono_depth_all_sweeps (list[Tensor]): mats_dict (dict): sensor2ego_mats (Tensor): Transformation matrix from camera to ego with shape of (B, num_sweeps, num_cameras, 4, 4). intrin_mats (Tensor): Intrinsic matrix with shape of (B, num_sweeps, num_cameras, 4, 4). ida_mats (Tensor): Transformation matrix for ida with shape of (B, num_sweeps, num_cameras, 4, 4). sensor2sensor_mats (Tensor): Transformation matrix from key frame camera to sweep frame camera with shape of (B, num_sweeps, num_cameras, 4, 4). bda_mat (Tensor): Rotation matrix for bda with shape of (B, 4, 4). sensor2sensor_mats(Tensor): Transformation matrix from key sensor to sweep sensor. mu_all_sweeps (list[Tensor]): List of mu for all sweeps. sigma_all_sweeps (list[Tensor]): List of sigma for all sweeps. range_score_all_sweeps (list[Tensor]): List of all range score for all sweeps. depth_feat_all_sweeps (list[Tensor]): List of all depth feat for all sweeps. Returns: Tensor: stereo_depth """ batch_size_with_cams, _, feat_height, feat_width = \ stereo_feats_all_sweeps[0].shape device = stereo_feats_all_sweeps[0].device d_coords = torch.arange(*self.grid_config['dbound'], dtype=torch.float, device=device).reshape(1, -1, 1, 1) d_coords = d_coords.repeat(batch_size_with_cams, 1, feat_height, feat_width) stereo_depth = stereo_feats_all_sweeps[0].new_zeros( batch_size_with_cams, self.D, feat_height, feat_width) mask_score = stereo_feats_all_sweeps[0].new_zeros( batch_size_with_cams, self.D, feat_height * self.stereo_downsample_factor // self.downsample, feat_width * self.stereo_downsample_factor // self.downsample, ) score_all_ranges = list() range_score = range_score_all_sweeps[sweep_index].softmax(1) for range_idx in range(self.num_ranges): # Map mu to the corresponding interval. 
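# For each depth range the network's raw mu is squashed with a sigmoid and rescaled into
# [range_start, range_end]. A few EM-style iterations then sample depth candidates
# mu + k * sigma (k taken from self.k_list), score them with a group-wise stereo cost volume,
# and re-estimate mu as the score-weighted mean, while sigma is rescaled from the confidence
# of the central sample.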
range_start = self.range_list[range_idx][0] mu_all_sweeps_single_range = [ mu[:, range_idx:range_idx + 1, ...].sigmoid() * (self.range_list[range_idx][1] - self.range_list[range_idx][0]) + range_start for mu in mu_all_sweeps ] sigma_all_sweeps_single_range = [ sigma[:, range_idx:range_idx + 1, ...] for sigma in sigma_all_sweeps ] batch_size_with_cams, _, feat_height, feat_width =\ stereo_feats_all_sweeps[0].shape mu = mu_all_sweeps_single_range[sweep_index] sigma = sigma_all_sweeps_single_range[sweep_index] for _ in range(self.em_iteration): depth_sample = torch.cat([mu + sigma * k for k in self.k_list], 1) depth_sample_frustum = self.create_depth_sample_frustum( depth_sample, self.stereo_downsample_factor) mu_score = self._generate_cost_volume( sweep_index, stereo_feats_all_sweeps, mats_dict, depth_sample, depth_sample_frustum, sensor2sensor_mats, ) mu_score = mu_score.softmax(1) scale_factor = torch.clamp( 0.5 / (1e-4 + mu_score[:, self.num_samples // 2:self.num_samples // 2 + 1, ...]), min=0.1, max=10) sigma = torch.clamp(sigma * scale_factor, min=0.1, max=10) mu = (depth_sample * mu_score).sum(1, keepdim=True) del depth_sample del depth_sample_frustum mu = torch.clamp(mu, max=self.range_list[range_idx][1], min=self.range_list[range_idx][0]) range_length = int( (self.range_list[range_idx][1] - self.range_list[range_idx][0]) // self.grid_config['dbound'][2]) if self.use_mask: depth_sample = F.avg_pool2d( mu, self.downsample // self.stereo_downsample_factor, self.downsample // self.stereo_downsample_factor, ) depth_sample_frustum = self.create_depth_sample_frustum( depth_sample, self.downsample) mask = self._forward_mask( sweep_index, mono_depth_all_sweeps, mats_dict, depth_sample, depth_sample_frustum, sensor2sensor_mats, ) mask_score[:, int((range_start - self.grid_config['dbound'][0]) // self.grid_config['dbound'][2]):range_length + int((range_start - self.grid_config['dbound'][0]) // self.grid_config['dbound'][2]), ..., ] += mask del depth_sample del depth_sample_frustum sigma = torch.clamp(sigma, self.min_sigma) mu_repeated = mu.repeat(1, range_length, 1, 1) eps = 1e-6 depth_score_single_range = (-1 / 2 * ( (d_coords[:, int((range_start - self.grid_config['dbound'][0]) // self.grid_config['dbound'][2]):range_length + int( (range_start - self.grid_config['dbound'][0]) // self.grid_config['dbound'][2]), ..., ] - mu_repeated) / torch.sqrt(sigma))**2) depth_score_single_range = depth_score_single_range.exp() score_all_ranges.append(mu_score.sum(1).unsqueeze(1)) depth_score_single_range = depth_score_single_range / ( sigma * math.sqrt(2 * math.pi) + eps) stereo_depth[:, int((range_start - self.grid_config['dbound'][0]) // self.grid_config['dbound'][2]):range_length + int((range_start - self.grid_config['dbound'][0]) // self.grid_config['dbound'][2]), ..., ] = ( depth_score_single_range * range_score[:, range_idx:range_idx + 1, ...]) # del range_score del depth_score_single_range del mu_repeated if self.use_mask: return stereo_depth, mask_score else: return stereo_depth def forward(self, input): img_feat, depth_prob, rots, trans, intrins, post_rots, post_trans, bda = input B, N, C, H, W = img_feat.shape img_feat = img_feat.view(B*N,C,H,W) # Lift volume = depth_prob.unsqueeze(1) * img_feat.unsqueeze(2) volume = self._forward_voxel_net(volume) volume = volume.view(B, N, self.numC_Trans, self.D, H, W) volume = volume.permute(0, 1, 3, 4, 5, 2) # Splat if self.accelerate: bev_feat = self.voxel_pooling_accelerated(rots, trans, intrins, post_rots, post_trans, bda, volume) else: geom = 
self.get_geometry(rots, trans, intrins, post_rots, post_trans, bda) if self.vp_megvii: bev_feat = self.voxel_pooling_bevdepth(geom, volume) else: bev_feat = self.voxel_pooling(geom, volume) return bev_feat ================================================ FILE: projects/occ_plugin/occupancy/image2bev/ViewTransformerLSSVoxel.py ================================================ # Copyright (c) Phigent Robotics. All rights reserved. import math import torch import torch.nn as nn from mmcv.runner import BaseModule from mmdet3d.models.builder import NECKS from projects.occ_plugin.ops.occ_pooling import occ_pool from mmcv.cnn import build_conv_layer from mmcv.runner import force_fp32 from torch.cuda.amp.autocast_mode import autocast from projects.occ_plugin.utils.gaussian import generate_guassian_depth_target import torch.nn.functional as F import numpy as np import pdb from .ViewTransformerLSSBEVDepth import * import torch.cuda as cuda def get_gpu_memory_usage(): allocated = cuda.memory_allocated() reserved = cuda.memory_reserved() return allocated, reserved @NECKS.register_module() class ViewTransformerLiftSplatShootVoxel(ViewTransformerLSSBEVDepth): def __init__(self, loss_depth_weight, loss_depth_type='bce', **kwargs): super(ViewTransformerLiftSplatShootVoxel, self).__init__(loss_depth_weight=loss_depth_weight, **kwargs) self.loss_depth_type = loss_depth_type self.cam_depth_range = self.grid_config['dbound'] self.constant_std = 0.5 def get_downsampled_gt_depth(self, gt_depths): """ Input: gt_depths: [B, N, H, W] Output: gt_depths: [B*N*h*w, d] """ B, N, H, W = gt_depths.shape gt_depths = gt_depths.view(B * N, H // self.downsample, self.downsample, W // self.downsample, self.downsample, 1) gt_depths = gt_depths.permute(0, 1, 3, 5, 2, 4).contiguous() gt_depths = gt_depths.view(-1, self.downsample * self.downsample) gt_depths_tmp = torch.where(gt_depths == 0.0, 1e5 * torch.ones_like(gt_depths), gt_depths) gt_depths = torch.min(gt_depths_tmp, dim=-1).values gt_depths = gt_depths.view(B * N, H // self.downsample, W // self.downsample) # [min - step / 2, min + step / 2] creates min depth gt_depths = (gt_depths - (self.grid_config['dbound'][0] - self.grid_config['dbound'][2] / 2)) / self.grid_config['dbound'][2] gt_depths_vals = gt_depths.clone() gt_depths = torch.where((gt_depths < self.D + 1) & (gt_depths >= 0.0), gt_depths, torch.zeros_like(gt_depths)) gt_depths = F.one_hot(gt_depths.long(), num_classes=self.D + 1).view(-1, self.D + 1)[:, 1:] return gt_depths_vals, gt_depths.float() @force_fp32() def get_bce_depth_loss(self, depth_labels, depth_preds): _, depth_labels = self.get_downsampled_gt_depth(depth_labels) depth_preds = depth_preds.permute(0, 2, 3, 1).contiguous().view(-1, self.D) fg_mask = torch.max(depth_labels, dim=1).values > 0.0 depth_labels = depth_labels[fg_mask] depth_preds = depth_preds[fg_mask] with autocast(enabled=False): depth_loss = F.binary_cross_entropy(depth_preds, depth_labels, reduction='none').sum() / max(1.0, fg_mask.sum()) return depth_loss @force_fp32() def get_klv_depth_loss(self, depth_labels, depth_preds): depth_gaussian_labels, depth_values = generate_guassian_depth_target(depth_labels, self.downsample, self.cam_depth_range, constant_std=self.constant_std) depth_values = depth_values.view(-1) fg_mask = (depth_values >= self.cam_depth_range[0]) & (depth_values <= (self.cam_depth_range[1] - self.cam_depth_range[2])) depth_gaussian_labels = depth_gaussian_labels.view(-1, self.D)[fg_mask] depth_preds = depth_preds.permute(0, 2, 3, 1).contiguous().view(-1, 
self.D)[fg_mask] depth_loss = F.kl_div(torch.log(depth_preds + 1e-4), depth_gaussian_labels, reduction='batchmean', log_target=False) return depth_loss @force_fp32() def get_depth_loss(self, depth_labels, depth_preds): if self.loss_depth_type == 'bce': depth_loss = self.get_bce_depth_loss(depth_labels, depth_preds) elif self.loss_depth_type == 'kld': depth_loss = self.get_klv_depth_loss(depth_labels, depth_preds) else: pdb.set_trace() return self.loss_depth_weight * depth_loss def voxel_pooling(self, geom_feats, x): B, N, D, H, W, C = x.shape Nprime = B * N * D * H * W x = x.contiguous().view(Nprime, C) # flatten indices geom_feats = ((geom_feats - (self.bx - self.dx / 2.)) / self.dx).long() geom_feats = geom_feats.view(Nprime, 3) batch_ix = torch.cat([torch.full([Nprime // B, 1], ix, device=x.device, dtype=torch.long) for ix in range(B)]) geom_feats = torch.cat((geom_feats, batch_ix), 1) # filter out points that are outside box kept = (geom_feats[:, 0] >= 0) & (geom_feats[:, 0] < self.nx[0]) \ & (geom_feats[:, 1] >= 0) & (geom_feats[:, 1] < self.nx[1]) \ & (geom_feats[:, 2] >= 0) & (geom_feats[:, 2] < self.nx[2]) x = x[kept] geom_feats = geom_feats[kept] final = occ_pool(x, geom_feats, B, self.nx[2], self.nx[0], self.nx[1]) final = final.permute(0, 1, 3, 4, 2) return final def forward(self, input): (x, rots_seq, trans_seq, intrins_seq, post_rots_seq, post_trans_seq, bda, mlp_input_seq) = input[:8] B, N, C, H, W = x.shape x = x.view(B * N, C, H, W) x = self.depth_net(x, mlp_input_seq) depth_digit = x[:, :self.D, ...] img_feat = x[:, self.D:self.D + self.numC_Trans, ...] depth_prob = self.get_depth_dist(depth_digit) volume = depth_prob.unsqueeze(1) * img_feat.unsqueeze(2) volume = volume.view(B, N, self.numC_Trans, self.D, H, W) volume = volume.permute(0, 1, 3, 4, 5, 2) geom = self.get_geometry(rots_seq, trans_seq, intrins_seq, post_rots_seq, post_trans_seq, bda) bev_feat = self.voxel_pooling(geom, volume) return bev_feat, depth_prob ================================================ FILE: projects/occ_plugin/occupancy/image2bev/__init__.py ================================================ from .ViewTransformerLSSBEVDepth import ViewTransformerLSSBEVDepth from .ViewTransformerLSSVoxel import ViewTransformerLiftSplatShootVoxel ================================================ FILE: projects/occ_plugin/occupancy/necks/__init__.py ================================================ from .second_fpn_3d import SECONDFPN3D from .fpn3d import FPN3D ================================================ FILE: projects/occ_plugin/occupancy/necks/fpn3d.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import numpy as np import torch from mmcv.cnn import build_conv_layer, build_norm_layer, build_upsample_layer from mmcv.runner import BaseModule, auto_fp16 from torch import nn as nn from mmcv.cnn import ConvModule from mmdet.models import NECKS import torch.nn.functional as F import pdb @NECKS.register_module() class FPN3D(BaseModule): """FPN used in SECOND/PointPillars/PartA2/MVXNet. Args: in_channels (list[int]): Input channels of multi-scale feature maps. out_channels (list[int]): Output channels of feature maps. upsample_strides (list[int]): Strides used to upsample the feature maps. norm_cfg (dict): Config dict of normalization layers. upsample_cfg (dict): Config dict of upsample layers. conv_cfg (dict): Config dict of conv layers. use_conv_for_no_stride (bool): Whether to use conv when stride is 1. 
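    Note:
        In ``forward``, deeper-level features are upsampled (trilinear by default,
        via ``upsample_cfg``) and added to the next shallower level in a top-down
        pass before each 3x3 fpn conv is applied.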
""" def __init__(self, in_channels=[80, 160, 320, 640], out_channels=256, norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), conv_cfg=dict(type='Conv3d'), act_cfg=dict(type='ReLU'), with_cp=False, upsample_cfg=dict(mode='trilinear'), init_cfg=None): super(FPN3D, self).__init__(init_cfg=init_cfg) self.in_channels = in_channels self.out_channels = out_channels self.fp16_enabled = False self.upsample_cfg = upsample_cfg self.with_cp = with_cp self.num_out = len(self.in_channels) self.lateral_convs = nn.ModuleList() self.fpn_convs = nn.ModuleList() for i in range(self.num_out): l_conv = nn.Sequential( ConvModule(in_channels[i], out_channels, kernel_size=1, padding=0, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, bias=False, inplace=True), ) fpn_conv = nn.Sequential( ConvModule(out_channels, out_channels, kernel_size=3, padding=1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, bias=False, inplace=True), ) self.lateral_convs.append(l_conv) self.fpn_convs.append(fpn_conv) @auto_fp16() def forward(self, inputs): """Forward function. Args: x (torch.Tensor): 4D Tensor in (N, C, H, W) shape. Returns: list[torch.Tensor]: Multi-level feature maps. """ assert len(inputs) == len(self.in_channels) # build laterals laterals = [] for i, lateral_conv in enumerate(self.lateral_convs): if self.with_cp: lateral_i = torch.utils.checkpoint.checkpoint(lateral_conv, inputs[i]) else: lateral_i = lateral_conv(inputs[i]) laterals.append(lateral_i) # build down-top path for i in range(self.num_out - 1, 0, -1): prev_shape = laterals[i - 1].shape[2:] laterals[i - 1] = laterals[i - 1] + F.interpolate(laterals[i], size=prev_shape, align_corners=False, **self.upsample_cfg) # outs = [ # self.fpn_convs[i](laterals[i]) for i in range(self.num_out) # ] outs = [] for i, fpn_conv in enumerate(self.fpn_convs): if self.with_cp: out_i = torch.utils.checkpoint.checkpoint(fpn_conv, laterals[i]) else: out_i = fpn_conv(laterals[i]) outs.append(out_i) return outs ================================================ FILE: projects/occ_plugin/occupancy/necks/second_fpn_3d.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import numpy as np import torch from mmcv.cnn import build_conv_layer, build_norm_layer, build_upsample_layer from mmcv.runner import BaseModule, auto_fp16 from torch import nn as nn from mmdet.models import NECKS import pdb @NECKS.register_module() class SECONDFPN3D(BaseModule): """FPN used in SECOND/PointPillars/PartA2/MVXNet. Args: in_channels (list[int]): Input channels of multi-scale feature maps. out_channels (list[int]): Output channels of feature maps. upsample_strides (list[int]): Strides used to upsample the feature maps. norm_cfg (dict): Config dict of normalization layers. upsample_cfg (dict): Config dict of upsample layers. conv_cfg (dict): Config dict of conv layers. use_conv_for_no_stride (bool): Whether to use conv when stride is 1. """ def __init__(self, in_channels=[128, 128, 256], out_channels=[256, 256, 256], upsample_strides=[1, 2, 4], norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), upsample_cfg=dict(type='deconv3d', bias=False), conv_cfg=dict(type='Conv3d', bias=False), use_conv_for_no_stride=False, init_cfg=None): # replacing GN with BN3D, performance drops from 42.5 to 40.9. 
# the difference may be exaggerated because the performance can fluncate a lot super(SECONDFPN3D, self).__init__(init_cfg=init_cfg) assert len(out_channels) == len(upsample_strides) == len(in_channels) self.in_channels = in_channels self.out_channels = out_channels self.fp16_enabled = False deblocks = [] for i, out_channel in enumerate(out_channels): stride = upsample_strides[i] if stride > 1 or (stride == 1 and not use_conv_for_no_stride): upsample_layer = build_upsample_layer( upsample_cfg, in_channels=in_channels[i], out_channels=out_channel, kernel_size=upsample_strides[i], stride=upsample_strides[i]) else: stride = np.round(1 / stride).astype(np.int64) upsample_layer = build_conv_layer( conv_cfg, in_channels=in_channels[i], out_channels=out_channel, kernel_size=stride, stride=stride) deblock = nn.Sequential(upsample_layer, build_norm_layer(norm_cfg, out_channel)[1], nn.ReLU(inplace=True)) deblocks.append(deblock) self.deblocks = nn.ModuleList(deblocks) if init_cfg is None: self.init_cfg = [ dict(type='Kaiming', layer='ConvTranspose2d'), dict(type='Constant', layer='NaiveSyncBatchNorm2d', val=1.0) ] @auto_fp16() def forward(self, x): """Forward function. Args: x (torch.Tensor): 4D Tensor in (N, C, H, W) shape. Returns: list[torch.Tensor]: Multi-level feature maps. """ assert len(x) == len(self.in_channels) ups = [deblock(x[i]) for i, deblock in enumerate(self.deblocks)] if len(ups) > 1: out = torch.cat(ups, dim=1) else: out = ups[0] return [out] ================================================ FILE: projects/occ_plugin/occupancy/voxel_encoder/__init__.py ================================================ from .sparse_lidar_enc import SparseLiDAREnc4x, SparseLiDAREnc8x ================================================ FILE: projects/occ_plugin/occupancy/voxel_encoder/sparse_lidar_enc.py ================================================ import math from functools import partial from mmcv.cnn import build_conv_layer, build_norm_layer, build_upsample_layer from mmcv.runner import BaseModule import torch import torch.nn as nn import torch.nn.functional as F import spconv.pytorch as spconv from spconv.pytorch import functional as Fsp from mmdet3d.models.builder import MIDDLE_ENCODERS import copy def post_act_block(in_channels, out_channels, kernel_size, indice_key=None, stride=1, padding=0, conv_type='subm', norm_cfg=None): if conv_type == 'subm': conv = spconv.SubMConv3d(in_channels, out_channels, kernel_size, bias=False, indice_key=indice_key) elif conv_type == 'spconv': conv = spconv.SparseConv3d(in_channels, out_channels, kernel_size, stride=stride, padding=padding, bias=False, indice_key=indice_key) elif conv_type == 'inverseconv': conv = spconv.SparseInverseConv3d(in_channels, out_channels, kernel_size, indice_key=indice_key, bias=False) else: raise NotImplementedError m = spconv.SparseSequential( conv, build_norm_layer(norm_cfg, out_channels)[1], nn.ReLU(inplace=True), ) return m class SparseBasicBlock(spconv.SparseModule): def __init__(self, inplanes, planes, stride=1, norm_cfg=None, indice_key=None): super(SparseBasicBlock, self).__init__() self.net = spconv.SparseSequential( spconv.SubMConv3d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False, indice_key=indice_key), build_norm_layer(norm_cfg, planes)[1], nn.ReLU(inplace=True), spconv.SubMConv3d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False, indice_key=indice_key), build_norm_layer(norm_cfg, planes)[1], ) self.relu = nn.ReLU(inplace=True) def forward(self, x): identity = x out = self.net(x) out 
= out.replace_feature(out.features + identity.features) out = out.replace_feature(self.relu(out.features)) return out @MIDDLE_ENCODERS.register_module() class SparseLiDAREnc4x(nn.Module): def __init__(self, input_channel, norm_cfg, base_channel, out_channel, sparse_shape_xyz, **kwargs): super().__init__() block = post_act_block self.sparse_shape_xyz = sparse_shape_xyz self.conv_input = spconv.SparseSequential( spconv.SubMConv3d(input_channel, base_channel, 3), nn.GroupNorm(16, base_channel), nn.ReLU(inplace=True)) self.conv1 = spconv.SparseSequential( SparseBasicBlock(base_channel, base_channel, norm_cfg=norm_cfg, indice_key='res1'), SparseBasicBlock(base_channel, base_channel, norm_cfg=norm_cfg, indice_key='res1'), ) self.conv2 = spconv.SparseSequential( block(base_channel, base_channel*2, 3, norm_cfg=norm_cfg, stride=2, padding=1, indice_key='spconv2', conv_type='spconv'), SparseBasicBlock(base_channel*2, base_channel*2, norm_cfg=norm_cfg, indice_key='res2'), SparseBasicBlock(base_channel*2, base_channel*2, norm_cfg=norm_cfg, indice_key='res2'), ) self.conv3 = spconv.SparseSequential( block(base_channel*2, base_channel*4, 3, norm_cfg=norm_cfg, stride=2, padding=1, indice_key='spconv3', conv_type='spconv'), SparseBasicBlock(base_channel*4, base_channel*4, norm_cfg=norm_cfg, indice_key='res3'), SparseBasicBlock(base_channel*4, base_channel*4, norm_cfg=norm_cfg, indice_key='res3'), ) self.conv_out = spconv.SparseSequential( spconv.SubMConv3d(base_channel*4, out_channel, 3), nn.GroupNorm(16, out_channel), nn.ReLU(inplace=True)) def forward(self, voxel_features, coors, batch_size): # spconv encoding coors = coors.int() # FIXME bs=1 hardcode input_sp_tensor = spconv.SparseConvTensor(voxel_features, coors, self.sparse_shape_xyz[::-1], batch_size) x = self.conv_input(input_sp_tensor) x_conv1 = self.conv1(x) x_conv2 = self.conv2(x_conv1) x_conv3 = self.conv3(x_conv2) x = self.conv_out(x_conv3) return {'x': x.dense().permute(0,1,4,3,2), # B, C, W, H, D 'pts_feats': [x]} @MIDDLE_ENCODERS.register_module() class SparseLiDAREnc8x(nn.Module): def __init__(self, input_channel, norm_cfg, base_channel, out_channel, sparse_shape_xyz, **kwargs): super().__init__() block = post_act_block self.sparse_shape_xyz = sparse_shape_xyz self.conv_input = spconv.SparseSequential( spconv.SubMConv3d(input_channel, base_channel, 3), nn.GroupNorm(16, base_channel), nn.ReLU(inplace=True)) self.conv1 = spconv.SparseSequential( block(base_channel, base_channel*2, 3, norm_cfg=norm_cfg, stride=2, padding=1, indice_key='spconv1', conv_type='spconv'), SparseBasicBlock(base_channel*2, base_channel*2, norm_cfg=norm_cfg, indice_key='res1'), SparseBasicBlock(base_channel*2, base_channel*2, norm_cfg=norm_cfg, indice_key='res1'), ) self.conv2 = spconv.SparseSequential( block(base_channel*2, base_channel*4, 3, norm_cfg=norm_cfg, stride=2, padding=1, indice_key='spconv2', conv_type='spconv'), SparseBasicBlock(base_channel*4, base_channel*4, norm_cfg=norm_cfg, indice_key='res2'), SparseBasicBlock(base_channel*4, base_channel*4, norm_cfg=norm_cfg, indice_key='res2'), ) self.conv3 = spconv.SparseSequential( block(base_channel*4, base_channel*8, 3, norm_cfg=norm_cfg, stride=2, padding=1, indice_key='spconv3', conv_type='spconv'), SparseBasicBlock(base_channel*8, base_channel*8, norm_cfg=norm_cfg, indice_key='res3'), SparseBasicBlock(base_channel*8, base_channel*8, norm_cfg=norm_cfg, indice_key='res3'), ) self.conv_out = spconv.SparseSequential( spconv.SubMConv3d(base_channel*8, out_channel, 3), nn.GroupNorm(16, out_channel), 
nn.ReLU(inplace=True)) def forward(self, voxel_features, coors, batch_size): # spconv encoding coors = coors.int() # FIXME bs=1 hardcode input_sp_tensor = spconv.SparseConvTensor(voxel_features, coors, self.sparse_shape_xyz[::-1], batch_size) x = self.conv_input(input_sp_tensor) x_conv1 = self.conv1(x) x_conv2 = self.conv2(x_conv1) x_conv3 = self.conv3(x_conv2) x = self.conv_out(x_conv3) return {'x': x.dense().permute(0,1,4,3,2), # B, C, W, H, D 'pts_feats': [x]} ================================================ FILE: projects/occ_plugin/ops/__init__.py ================================================ from .occ_pooling import * ================================================ FILE: projects/occ_plugin/ops/occ_pooling/OCC_Pool.py ================================================ import torch from projects.occ_plugin.ops.occ_pooling import occ_pool_ext __all__ = ["occ_pool"] class QuickCumsum(torch.autograd.Function): @staticmethod def forward(ctx, x, geom_feats, ranks): x = x.cumsum(0) kept = torch.ones(x.shape[0], device=x.device, dtype=torch.bool) kept[:-1] = ranks[1:] != ranks[:-1] x, geom_feats = x[kept], geom_feats[kept] x = torch.cat((x[:1], x[1:] - x[:-1])) # save kept for backward ctx.save_for_backward(kept) # no gradient for geom_feats ctx.mark_non_differentiable(geom_feats) return x, geom_feats @staticmethod def backward(ctx, gradx, gradgeom): (kept,) = ctx.saved_tensors back = torch.cumsum(kept, 0) back[kept] -= 1 val = gradx[back] return val, None, None class QuickCumsumCuda(torch.autograd.Function): @staticmethod def forward(ctx, x, geom_feats, ranks, B, D, H, W): kept = torch.ones(x.shape[0], device=x.device, dtype=torch.bool) kept[1:] = ranks[1:] != ranks[:-1] interval_starts = torch.where(kept)[0].int() interval_lengths = torch.zeros_like(interval_starts) interval_lengths[:-1] = interval_starts[1:] - interval_starts[:-1] interval_lengths[-1] = x.shape[0] - interval_starts[-1] geom_feats = geom_feats.int() out = occ_pool_ext.occ_pool_forward( x, geom_feats, interval_lengths, interval_starts, B, D, H, W, ) ctx.save_for_backward(interval_starts, interval_lengths, geom_feats) ctx.saved_shapes = B, D, H, W return out @staticmethod def backward(ctx, out_grad): interval_starts, interval_lengths, geom_feats = ctx.saved_tensors B, D, H, W = ctx.saved_shapes out_grad = out_grad.contiguous() x_grad = occ_pool_ext.occ_pool_backward( out_grad, geom_feats, interval_lengths, interval_starts, B, D, H, W, ) return x_grad, None, None, None, None, None, None def occ_pool(feats, coords, B, D, H, W): assert feats.shape[0] == coords.shape[0] ranks = ( coords[:, 0] * (W * D * B) + coords[:, 1] * (D * B) + coords[:, 2] * B + coords[:, 3] ) indices = ranks.argsort() feats, coords, ranks = feats[indices], coords[indices], ranks[indices] x = QuickCumsumCuda.apply(feats, coords, ranks, B, D, H, W) x = x.permute(0, 4, 1, 2, 3).contiguous() return x ================================================ FILE: projects/occ_plugin/ops/occ_pooling/__init__.py ================================================ from .OCC_Pool import occ_pool ================================================ FILE: projects/occ_plugin/ops/occ_pooling/src/occ_pool.cpp ================================================ #include #include // CUDA function declarations void occ_pool(int b, int d, int h, int w, int n, int c, int n_intervals, const float* x, const int* geom_feats, const int* interval_starts, const int* interval_lengths, float* out); void occ_pool_grad(int b, int d, int h, int w, int n, int c, int n_intervals, const float* 
out_grad, const int* geom_feats, const int* interval_starts, const int* interval_lengths, float* x_grad); /* Function: pillar pooling (forward, cuda) Args: x : input features, FloatTensor[n, c] geom_feats : input coordinates, IntTensor[n, 4] interval_lengths : starting position for pooled point, IntTensor[n_intervals] interval_starts : how many points in each pooled point, IntTensor[n_intervals] Return: out : output features, FloatTensor[b, d, h, w, c] */ at::Tensor occ_pool_forward( const at::Tensor _x, const at::Tensor _geom_feats, const at::Tensor _interval_lengths, const at::Tensor _interval_starts, int b, int d, int h, int w ) { int n = _x.size(0); int c = _x.size(1); int n_intervals = _interval_lengths.size(0); const at::cuda::OptionalCUDAGuard device_guard(device_of(_x)); const float* x = _x.data_ptr(); const int* geom_feats = _geom_feats.data_ptr(); const int* interval_lengths = _interval_lengths.data_ptr(); const int* interval_starts = _interval_starts.data_ptr(); auto options = torch::TensorOptions().dtype(_x.dtype()).device(_x.device()); at::Tensor _out = torch::zeros({b, d, h, w, c}, options); float* out = _out.data_ptr(); occ_pool( b, d, h, w, n, c, n_intervals, x, geom_feats, interval_starts, interval_lengths, out ); return _out; } /* Function: pillar pooling (backward, cuda) Args: out_grad : input features, FloatTensor[b, d, h, w, c] geom_feats : input coordinates, IntTensor[n, 4] interval_lengths : starting position for pooled point, IntTensor[n_intervals] interval_starts : how many points in each pooled point, IntTensor[n_intervals] Return: x_grad : output features, FloatTensor[n, 4] */ at::Tensor occ_pool_backward( const at::Tensor _out_grad, const at::Tensor _geom_feats, const at::Tensor _interval_lengths, const at::Tensor _interval_starts, int b, int d, int h, int w ) { int n = _geom_feats.size(0); int c = _out_grad.size(4); int n_intervals = _interval_lengths.size(0); const at::cuda::OptionalCUDAGuard device_guard(device_of(_out_grad)); const float* out_grad = _out_grad.data_ptr(); const int* geom_feats = _geom_feats.data_ptr(); const int* interval_lengths = _interval_lengths.data_ptr(); const int* interval_starts = _interval_starts.data_ptr(); auto options = torch::TensorOptions().dtype(_out_grad.dtype()).device(_out_grad.device()); at::Tensor _x_grad = torch::zeros({n, c}, options); float* x_grad = _x_grad.data_ptr(); occ_pool_grad( b, d, h, w, n, c, n_intervals, out_grad, geom_feats, interval_starts, interval_lengths, x_grad ); return _x_grad; } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("occ_pool_forward", &occ_pool_forward, "occ_pool_forward"); m.def("occ_pool_backward", &occ_pool_backward, "occ_pool_backward"); } ================================================ FILE: projects/occ_plugin/ops/occ_pooling/src/occ_pool_cuda.cu ================================================ #include #include /* Function: pillar pooling Args: b : batch size d : depth of the feature map h : height of pooled feature map w : width of pooled feature map n : number of input points c : number of channels n_intervals : number of unique points x : input features, FloatTensor[n, c] geom_feats : input coordinates, IntTensor[n, 4] interval_lengths : starting position for pooled point, IntTensor[n_intervals] interval_starts : how many points in each pooled point, IntTensor[n_intervals] out : output features, FloatTensor[b, d, h, w, c] */ __global__ void occ_pool_kernel(int b, int d, int h, int w, int n, int c, int n_intervals, const float *__restrict__ x, const int *__restrict__ 
geom_feats, const int *__restrict__ interval_starts, const int *__restrict__ interval_lengths, float* __restrict__ out) { int idx = blockIdx.x * blockDim.x + threadIdx.x; int index = idx / c; int cur_c = idx % c; if (index >= n_intervals) return; int interval_start = interval_starts[index]; int interval_length = interval_lengths[index]; const int* cur_geom_feats = geom_feats + interval_start * 4; const float* cur_x = x + interval_start * c + cur_c; float* cur_out = out + cur_geom_feats[3] * d * h * w * c + cur_geom_feats[2] * h * w * c + cur_geom_feats[0] * w * c + cur_geom_feats[1] * c + cur_c; float psum = 0; for(int i = 0; i < interval_length; i++){ psum += cur_x[i * c]; } *cur_out = psum; } /* Function: pillar pooling backward Args: b : batch size d : depth of the feature map h : height of pooled feature map w : width of pooled feature map n : number of input points c : number of channels n_intervals : number of unique points out_grad : gradient of the BEV fmap from top, FloatTensor[b, d, h, w, c] geom_feats : input coordinates, IntTensor[n, 4] interval_lengths : starting position for pooled point, IntTensor[n_intervals] interval_starts : how many points in each pooled point, IntTensor[n_intervals] x_grad : gradient of the image fmap, FloatTensor */ __global__ void occ_pool_grad_kernel(int b, int d, int h, int w, int n, int c, int n_intervals, const float *__restrict__ out_grad, const int *__restrict__ geom_feats, const int *__restrict__ interval_starts, const int *__restrict__ interval_lengths, float* __restrict__ x_grad) { int idx = blockIdx.x * blockDim.x + threadIdx.x; int index = idx / c; int cur_c = idx % c; if (index >= n_intervals) return; int interval_start = interval_starts[index]; int interval_length = interval_lengths[index]; const int* cur_geom_feats = geom_feats + interval_start * 4; float* cur_x_grad = x_grad + interval_start * c + cur_c; const float* cur_out_grad = out_grad + cur_geom_feats[3] * d * h * w * c + cur_geom_feats[2] * h * w * c + cur_geom_feats[0] * w * c + cur_geom_feats[1] * c + cur_c; for(int i = 0; i < interval_length; i++){ cur_x_grad[i * c] = *cur_out_grad; } } void occ_pool(int b, int d, int h, int w, int n, int c, int n_intervals, const float* x, const int* geom_feats, const int* interval_starts, const int* interval_lengths, float* out) { occ_pool_kernel<<<(int)ceil(((double)n_intervals * c / 256)), 256>>>( b, d, h, w, n, c, n_intervals, x, geom_feats, interval_starts, interval_lengths, out ); } void occ_pool_grad(int b, int d, int h, int w, int n, int c, int n_intervals, const float* out_grad, const int* geom_feats, const int* interval_starts, const int* interval_lengths, float* x_grad) { occ_pool_grad_kernel<<<(int)ceil(((double)n_intervals * c / 256)), 256>>>( b, d, h, w, n, c, n_intervals, out_grad, geom_feats, interval_starts, interval_lengths, x_grad ); } ================================================ FILE: projects/occ_plugin/utils/__init__.py ================================================ from .formating import cm_to_ious, format_results from .metric_util import per_class_iu, fast_hist_crop from .coordinate_transform import coarse_to_fine_coordinates, project_points_on_img from .geometry import convert_egopose_to_matrix_numpy, invert_matrix_egopose_numpy ================================================ FILE: projects/occ_plugin/utils/coordinate_transform.py ================================================ import torch def coarse_to_fine_coordinates(coarse_cor, ratio, topk=30000): """ Args: coarse_cor (torch.Tensor): [3, N]""" fine_cor = 
coarse_cor * ratio fine_cor = fine_cor[None].repeat(ratio**3, 1, 1) # [8, 3, N] device = fine_cor.device value = torch.meshgrid([torch.arange(ratio).to(device), torch.arange(ratio).to(device), torch.arange(ratio).to(device)]) value = torch.stack(value, dim=3).reshape(-1, 3) fine_cor = fine_cor + value[:,:,None] if fine_cor.shape[-1] < topk: return fine_cor.permute(1,0,2).reshape(3,-1) else: fine_cor = fine_cor[:,:,torch.randperm(fine_cor.shape[-1])[:topk]] return fine_cor.permute(1,0,2).reshape(3,-1) def project_points_on_img(points, rots, trans, intrins, post_rots, post_trans, bda_mat, pts_range, W_img, H_img, W_occ, H_occ, D_occ): with torch.no_grad(): voxel_size = ((pts_range[3:] - pts_range[:3]) / torch.tensor([W_occ-1, H_occ-1, D_occ-1])).to(points.device) points = points * voxel_size[None, None] + pts_range[:3][None, None].to(points.device) # project 3D point cloud (after bev-aug) onto multi-view images for corresponding 2D coordinates inv_bda = bda_mat.inverse() points = (inv_bda @ points.unsqueeze(-1)).squeeze(-1) # from lidar to camera points = points.view(-1, 1, 3) points = points - trans.view(1, -1, 3) inv_rots = rots.inverse().unsqueeze(0) points = (inv_rots @ points.unsqueeze(-1)) # from camera to raw pixel points = (intrins.unsqueeze(0) @ points).squeeze(-1) points_d = points[..., 2:3] points_uv = points[..., :2] / (points_d + 1e-5) # from raw pixel to transformed pixel points_uv = post_rots[..., :2, :2].unsqueeze(0) @ points_uv.unsqueeze(-1) points_uv = points_uv.squeeze(-1) + post_trans[..., :2].unsqueeze(0) points_uv[..., 0] = (points_uv[..., 0] / (W_img-1) - 0.5) * 2 points_uv[..., 1] = (points_uv[..., 1] / (H_img-1) - 0.5) * 2 mask = (points_d[..., 0] > 1e-5) \ & (points_uv[..., 0] > -1) & (points_uv[..., 0] < 1) \ & (points_uv[..., 1] > -1) & (points_uv[..., 1] < 1) return points_uv.permute(2,1,0,3), mask ================================================ FILE: projects/occ_plugin/utils/formating.py ================================================ from prettytable import PrettyTable import numpy as np def cm_to_ious(cm): # SC:[TN FP \n FN TP] mean_ious = [] cls_num = len(cm) for i in range(cls_num): tp = cm[i, i] p = cm[:, i].sum() g = cm[i, :].sum() union = p + g - tp mean_ious.append(tp / union) return mean_ious def format_results(mean_ious, return_dic=False): class_map = { 1: 'barrier', 2: 'bicycle', 3: 'bus', 4: 'car', 5: 'construction_vehicle', 6: 'motorcycle', 7: 'pedestrian', 8: 'traffic_cone', 9: 'trailer', 10: 'truck', 11: 'driveable_surface', 12: 'other_flat', 13: 'sidewalk', 14: 'terrain', 15: 'manmade', 16: 'vegetation', } x = PrettyTable() x.field_names = ['class', 'IoU'] class_names = list(class_map.values()) + ['mean'] class_ious = mean_ious + [sum(mean_ious) / len(mean_ious)] dic = {} for cls_name, cls_iou in zip(class_names, class_ious): dic[cls_name] = round(cls_iou, 3) x.add_row([cls_name, round(cls_iou, 3)]) if return_dic: return x, dic else: return x def format_iou_results(mean_ious, return_dic=False): if len(mean_ious) == 2: class_map = { 0: 'free', 1: 'movable objects', } else: class_map = { 0: 'free', 1: 'bicycle', 2: 'bus', 3: 'car', 4: 'construction', 5: 'motorcycle', 6: 'trailer', 7: 'truck', 8: 'pedestrian', } x = PrettyTable() x.field_names = ['class', 'IoU'] class_names = list(class_map.values()) class_ious = mean_ious dic = {} for cls_name, cls_iou in zip(class_names, class_ious): dic[cls_name] = np.round(cls_iou, 3) x.add_row([cls_name, np.round(cls_iou, 3)]) mean_ious = sum(mean_ious[1:]) / len(mean_ious[1:]) dic['mean'] = 
np.round(mean_ious, 3) x.add_row(['mean', np.round(mean_ious, 3)]) if return_dic: return x, dic else: return x def format_vel_results(mean_epe, return_dic=False): class_map = { 0: 'barrier', 1: 'bicycle', 2: 'bus', 3: 'car', 4: 'construction_vehicle', 5: 'motorcycle', 6: 'pedestrian', 7: 'traffic_cone', 8: 'trailer', 9: 'truck', } x = PrettyTable() x.field_names = ['class', 'EPE'] class_names = list(class_map.values()) class_epes = mean_epe dic = {} for cls_name, cls_iou in zip(class_names, class_epes): dic[cls_name] = np.round(cls_iou, 3) x.add_row([cls_name, np.round(cls_iou, 3)]) mean_all_epe = mean_epe.mean() dic['mean'] = np.round(mean_all_epe, 3) x.add_row(['mean', np.round(mean_all_epe, 3)]) if return_dic: return x, dic else: return x ================================================ FILE: projects/occ_plugin/utils/gaussian.py ================================================ import numpy as np import torch import torch.nn.functional as F from torch.distributions import Normal import pdb def gaussian_2d(shape, sigma=1): """Generate gaussian map. Args: shape (list[int]): Shape of the map. sigma (float): Sigma to generate gaussian map. Defaults to 1. Returns: np.ndarray: Generated gaussian map. """ m, n = [(ss - 1.) / 2. for ss in shape] y, x = np.ogrid[-m:m + 1, -n:n + 1] h = np.exp(-(x * x + y * y) / (2 * sigma * sigma)) h[h < np.finfo(h.dtype).eps * h.max()] = 0 return h def draw_heatmap_gaussian(heatmap, center, radius, k=1): """Get gaussian masked heatmap. Args: heatmap (torch.Tensor): Heatmap to be masked. center (torch.Tensor): Center coord of the heatmap. radius (int): Radius of gausian. K (int): Multiple of masked_gaussian. Defaults to 1. Returns: torch.Tensor: Masked heatmap. """ diameter = 2 * radius + 1 gaussian = gaussian_2d((diameter, diameter), sigma=diameter / 6) x, y = int(center[0]), int(center[1]) height, width = heatmap.shape[0:2] left, right = min(x, radius), min(width - x, radius + 1) top, bottom = min(y, radius), min(height - y, radius + 1) masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] masked_gaussian = torch.from_numpy( gaussian[radius - top:radius + bottom, radius - left:radius + right]).to(heatmap.device, torch.float32) if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0: torch.max(masked_heatmap, masked_gaussian * k, out=masked_heatmap) return heatmap def gaussian_radius(det_size, min_overlap=0.5): """Get radius of gaussian. Args: det_size (tuple[torch.Tensor]): Size of the detection result. min_overlap (float): Gaussian_overlap. Defaults to 0.5. Returns: torch.Tensor: Computed radius. 
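    Note:
        CenterNet-style heuristic: three quadratic cases bound the largest center
        offset for which the implied box still overlaps the ground truth by roughly
        ``min_overlap``; the smallest of the three radii is returned.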
""" height, width = det_size a1 = 1 b1 = (height + width) c1 = width * height * (1 - min_overlap) / (1 + min_overlap) sq1 = torch.sqrt(b1**2 - 4 * a1 * c1) r1 = (b1 + sq1) / 2 a2 = 4 b2 = 2 * (height + width) c2 = (1 - min_overlap) * width * height sq2 = torch.sqrt(b2**2 - 4 * a2 * c2) r2 = (b2 + sq2) / 2 a3 = 4 * min_overlap b3 = -2 * min_overlap * (height + width) c3 = (min_overlap - 1) * width * height sq3 = torch.sqrt(b3**2 - 4 * a3 * c3) r3 = (b3 + sq3) / 2 return min(r1, r2, r3) def generate_guassian_depth_target(depth, stride, cam_depth_range, constant_std=None): depth = depth.flatten(0, 1) # [bs*s, 6, 896, 1600] -> [bs*s*6, 896, 1600] B, tH, tW = depth.shape kernel_size = stride # [4,4,4] center_idx = kernel_size * kernel_size // 2 H = tH // stride # 896//4 = 248 W = tW // stride # 1600//4 = 400 unfold_depth = F.unfold(depth.unsqueeze(1), kernel_size, dilation=1, padding=0, stride=stride) #B, Cxkxk, HxW unfold_depth = unfold_depth.view(B, -1, H, W).permute(0, 2, 3, 1).contiguous() # B, H, W, kxk valid_mask = (unfold_depth != 0) # BN, H, W, kxk if constant_std is None: valid_mask_f = valid_mask.float() # BN, H, W, kxk valid_num = torch.sum(valid_mask_f, dim=-1) # BN, H, W valid_num[valid_num == 0] = 1e10 mean = torch.sum(unfold_depth, dim=-1) / valid_num var_sum = torch.sum(((unfold_depth - mean.unsqueeze(-1))**2) * valid_mask_f, dim=-1) # BN, H, W std_var = torch.sqrt(var_sum / valid_num) std_var[valid_num == 1] = 1 # set std_var to 1 when only one point in patch else: std_var = torch.ones((B, H, W)).type_as(depth).float() * constant_std unfold_depth[~valid_mask] = 1e10 min_depth = torch.min(unfold_depth, dim=-1)[0] #BN, H, W min_depth[min_depth == 1e10] = 0 # x in raw depth x = torch.arange(cam_depth_range[0] - cam_depth_range[2] / 2, cam_depth_range[1], cam_depth_range[2]) # normalized by intervals dist = Normal(min_depth / cam_depth_range[2], std_var / cam_depth_range[2]) # BN, H, W, D cdfs = [] for i in x: cdf = dist.cdf(i) cdfs.append(cdf) cdfs = torch.stack(cdfs, dim=-1) depth_dist = cdfs[..., 1:] - cdfs[...,:-1] return depth_dist, min_depth ================================================ FILE: projects/occ_plugin/utils/geometry.py ================================================ import numpy as np import PIL import torch import torch.nn.functional as F from pyquaternion import Quaternion def convert_egopose_to_matrix_numpy(trans, rot): transformation_matrix = np.zeros((4, 4), dtype=np.float32) rotation = Quaternion(rot).rotation_matrix translation = np.array(trans) transformation_matrix[:3, :3] = rotation transformation_matrix[:3, 3] = translation transformation_matrix[3, 3] = 1.0 return transformation_matrix def invert_matrix_egopose_numpy(egopose): """ Compute the inverse transformation of a 4x4 egopose numpy matrix.""" inverse_matrix = np.zeros((4, 4), dtype=np.float32) rotation = egopose[:3, :3] translation = egopose[:3, 3] inverse_matrix[:3, :3] = rotation.T inverse_matrix[:3, 3] = -np.dot(rotation.T, translation) inverse_matrix[3, 3] = 1.0 return inverse_matrix ================================================ FILE: projects/occ_plugin/utils/metric_util.py ================================================ # -*- coding:utf-8 -*- # author: Xinge # @file: metric_util.py import numpy as np def fast_hist(pred, label, n): k = (label >= 0) & (label < n) bin_count = np.bincount( n * label[k].astype(int) + pred[k], minlength=n ** 2) return bin_count[:n ** 2].reshape(n, n) def per_class_iu(hist): return np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist)) def 
fast_hist_crop(output, target, unique_label): hist = fast_hist(output.flatten(), target.flatten(), np.max(unique_label) + 2) hist = hist[unique_label + 1, :] hist = hist[:, unique_label + 1] return hist class SSCMetrics: def __init__(self, class_names, ignore_idx=255, empty_idx=None): self.class_names = class_names self.n_classes = len(class_names) self.ignore_idx = ignore_idx self.empty_idx = empty_idx self.reset() def hist_info(self, n_cl, pred, gt): assert pred.shape == gt.shape k = (gt >= 0) & (gt < n_cl) # exclude 255 labeled = np.sum(k) correct = np.sum((pred[k] == gt[k])) return ( np.bincount( n_cl * gt[k].astype(int) + pred[k].astype(int), minlength=n_cl ** 2 ).reshape(n_cl, n_cl), correct, labeled, ) @staticmethod def compute_score(hist, correct, labeled): iu = np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist)) mean_IU = np.nanmean(iu) mean_IU_no_back = np.nanmean(iu[1:]) freq = hist.sum(1) / hist.sum() freq_IU = (iu[freq > 0] * freq[freq > 0]).sum() mean_pixel_acc = correct / labeled if labeled != 0 else 0 return iu, mean_IU, mean_IU_no_back, mean_pixel_acc def add_batch(self, y_pred, y_true, nonsurface=None): self.count += 1 mask = y_true != self.ignore_idx if self.empty_idx is not None: mask = mask & (y_true != self.empty_idx) if nonsurface is not None: mask = mask & nonsurface tp, fp, fn = self.get_score_completion(y_pred, y_true, mask) self.completion_tp += tp self.completion_fp += fp self.completion_fn += fn mask = y_true != self.ignore_idx if self.empty_idx is not None: mask = mask & (y_true != self.empty_idx) tp_sum, fp_sum, fn_sum = self.get_score_semantic_and_completion( y_pred, y_true, mask ) self.tps += tp_sum self.fps += fp_sum self.fns += fn_sum def get_stats(self): if self.completion_tp != 0: precision = self.completion_tp / (self.completion_tp + self.completion_fp) recall = self.completion_tp / (self.completion_tp + self.completion_fn) iou = self.completion_tp / ( self.completion_tp + self.completion_fp + self.completion_fn ) else: precision, recall, iou = 0, 0, 0 iou_ssc = self.tps / (self.tps + self.fps + self.fns + 1e-5) return { "precision": precision, "recall": recall, "iou": iou, "iou_ssc": iou_ssc, "iou_ssc_mean": np.mean(iou_ssc[1:]), } def reset(self): self.completion_tp = 0 self.completion_fp = 0 self.completion_fn = 0 self.tps = np.zeros(self.n_classes) self.fps = np.zeros(self.n_classes) self.fns = np.zeros(self.n_classes) self.hist_ssc = np.zeros((self.n_classes, self.n_classes)) self.labeled_ssc = 0 self.correct_ssc = 0 self.precision = 0 self.recall = 0 self.iou = 0 self.count = 1e-8 self.iou_ssc = np.zeros(self.n_classes, dtype=np.float32) self.cnt_class = np.zeros(self.n_classes, dtype=np.float32) def get_score_completion(self, predict, target, nonempty=None): predict = np.copy(predict) target = np.copy(target) """for scene completion, treat the task as two-classes problem, just empty or occupancy""" _bs = predict.shape[0] # batch size # ---- ignore predict[target == self.ignore_idx] = 0 target[target == self.ignore_idx] = 0 # ---- flatten target = target.reshape(_bs, -1) # (_bs, 129600) predict = predict.reshape(_bs, -1) # (_bs, _C, 129600), 60*36*60=129600 # ---- treat all non-empty object class as one category, set them to label 1 b_pred = np.zeros(predict.shape) b_true = np.zeros(target.shape) b_pred[predict != self.empty_idx] = 1 b_true[target != self.empty_idx] = 1 p, r, iou = 0.0, 0.0, 0.0 tp_sum, fp_sum, fn_sum = 0, 0, 0 for idx in range(_bs): y_true = b_true[idx, :] # GT y_pred = b_pred[idx, :] if nonempty is not None: 
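# Restrict the completion metric to voxels flagged as valid (nonempty mask == 1);
# everything else is excluded from the TP/FP/FN counts accumulated below.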
nonempty_idx = nonempty[idx, :].reshape(-1) y_true = y_true[nonempty_idx == 1] y_pred = y_pred[nonempty_idx == 1] tp = np.array(np.where(np.logical_and(y_true == 1, y_pred == 1))).size fp = np.array(np.where(np.logical_and(y_true != 1, y_pred == 1))).size fn = np.array(np.where(np.logical_and(y_true == 1, y_pred != 1))).size tp_sum += tp fp_sum += fp fn_sum += fn return tp_sum, fp_sum, fn_sum def get_score_semantic_and_completion(self, predict, target, nonempty=None): target = np.copy(target) predict = np.copy(predict) _bs = predict.shape[0] # batch size _C = self.n_classes # _C = 12 # ---- ignore predict[target == self.ignore_idx] = 0 target[target == self.ignore_idx] = 0 # ---- flatten target = target.reshape(_bs, -1) # (_bs, 129600) predict = predict.reshape(_bs, -1) # (_bs, 129600), 60*36*60=129600 cnt_class = np.zeros(_C, dtype=np.int32) # count for each class iou_sum = np.zeros(_C, dtype=np.float32) # sum of iou for each class tp_sum = np.zeros(_C, dtype=np.int32) # tp fp_sum = np.zeros(_C, dtype=np.int32) # fp fn_sum = np.zeros(_C, dtype=np.int32) # fn for idx in range(_bs): y_true = target[idx, :] # GT y_pred = predict[idx, :] if nonempty is not None: nonempty_idx = nonempty[idx, :].reshape(-1) y_pred = y_pred[ np.where(np.logical_and(nonempty_idx == 1, y_true != self.ignore_idx)) ] y_true = y_true[ np.where(np.logical_and(nonempty_idx == 1, y_true != self.ignore_idx)) ] for j in range(_C): # for each class tp = np.array(np.where(np.logical_and(y_true == j, y_pred == j))).size fp = np.array(np.where(np.logical_and(y_true != j, y_pred == j))).size fn = np.array(np.where(np.logical_and(y_true == j, y_pred != j))).size tp_sum[j] += tp fp_sum[j] += fp fn_sum[j] += fn return tp_sum, fp_sum, fn_sum ================================================ FILE: projects/occ_plugin/utils/nusc_param.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F import numpy as np nusc_class_frequencies = np.array([2242961742295, 25985376, 1561108, 28862014, 196106643, 15920504, 2158753, 26539491, 4004729, 34838681, 75173306, 2255027978, 50959399, 646022466, 869055679, 1446141335, 1724391378]) nusc_class_names = [ "empty", "barrier", "bicycle", "bus", "car", "construction", "motorcycle", "pedestrian", "trafficcone", "trailer", "truck", "driveable_surface", "other", "sidewalk", "terrain", "mannade", "vegetation", ] classname_to_color = { # RGB. # 0: (0, 0, 0), # Black. 
noise 1: (112, 128, 144), # Slategrey barrier 2: (220, 20, 60), # Crimson bicycle 3: (255, 127, 80), # Orangered bus 4: (255, 158, 0), # Orange car 5: (233, 150, 70), # Darksalmon construction 6: (255, 61, 99), # Red motorcycle 7: (0, 0, 230), # Blue pedestrian 8: (47, 79, 79), # Darkslategrey trafficcone 9: (255, 140, 0), # Darkorange trailer 10: (255, 99, 71), # Tomato truck 11: (0, 207, 191), # nuTonomy green driveable_surface 12: (175, 0, 75), # flat other 13: (75, 0, 75), # sidewalk 14: (112, 180, 60), # terrain 15: (222, 184, 135), # Burlywood mannade 16: (0, 175, 0), # Green vegetation } def KL_sep(p, target): """ KL divergence on nonzeros classes """ nonzeros = target != 0 nonzero_p = p[nonzeros] kl_term = F.kl_div(torch.log(nonzero_p), target[nonzeros], reduction="sum") return kl_term def geo_scal_loss(pred, ssc_target): # Get softmax probabilities pred = F.softmax(pred, dim=1) # Compute empty and nonempty probabilities empty_probs = pred[:, 0, :, :, :] nonempty_probs = 1 - empty_probs # Remove unknown voxels mask = ssc_target != 255 nonempty_target = ssc_target != 0 nonempty_target = nonempty_target[mask].float() nonempty_probs = nonempty_probs[mask] empty_probs = empty_probs[mask] intersection = (nonempty_target * nonempty_probs).sum() precision = intersection / nonempty_probs.sum() recall = intersection / nonempty_target.sum() spec = ((1 - nonempty_target) * (empty_probs)).sum() / (1 - nonempty_target).sum() return ( F.binary_cross_entropy(precision, torch.ones_like(precision)) + F.binary_cross_entropy(recall, torch.ones_like(recall)) + F.binary_cross_entropy(spec, torch.ones_like(spec)) ) def sem_scal_loss(pred, ssc_target): # Get softmax probabilities pred = F.softmax(pred, dim=1) loss = 0 count = 0 mask = ssc_target != 255 n_classes = pred.shape[1] for i in range(0, n_classes): # Get probability of class i p = pred[:, i, :, :, :] # Remove unknown voxels target_ori = ssc_target p = p[mask] target = ssc_target[mask] completion_target = torch.ones_like(target) completion_target[target != i] = 0 completion_target_ori = torch.ones_like(target_ori).float() completion_target_ori[target_ori != i] = 0 if torch.sum(completion_target) > 0: count += 1.0 nominator = torch.sum(p * completion_target) loss_class = 0 if torch.sum(p) > 0: precision = nominator / (torch.sum(p)) loss_precision = F.binary_cross_entropy( precision, torch.ones_like(precision) ) loss_class += loss_precision if torch.sum(completion_target) > 0: recall = nominator / (torch.sum(completion_target)) loss_recall = F.binary_cross_entropy(recall, torch.ones_like(recall)) loss_class += loss_recall if torch.sum(1 - completion_target) > 0: specificity = torch.sum((1 - p) * (1 - completion_target)) / ( torch.sum(1 - completion_target) ) loss_specificity = F.binary_cross_entropy( specificity, torch.ones_like(specificity) ) loss_class += loss_specificity loss += loss_class return loss / count def CE_ssc_loss(pred, target, class_weights): """ :param: prediction: the predicted tensor, must be [BS, C, H, W, D] """ criterion = nn.CrossEntropyLoss( weight=class_weights, ignore_index=255, reduction="mean" ) loss = criterion(pred, target.long()) return loss ================================================ FILE: projects/occ_plugin/utils/semkitti.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F import numpy as np semantic_kitti_class_frequencies = np.array( [ 5.41773033e09, 1.57835390e07, 1.25136000e05, 1.18809000e05, 6.46799000e05, 8.21951000e05, 2.62978000e05, 
2.83696000e05, 2.04750000e05, 6.16887030e07, 4.50296100e06, 4.48836500e07, 2.26992300e06, 5.68402180e07, 1.57196520e07, 1.58442623e08, 2.06162300e06, 3.69705220e07, 1.15198800e06, 3.34146000e05, ] ) kitti_class_names = [ "empty", "car", "bicycle", "motorcycle", "truck", "other-vehicle", "person", "bicyclist", "motorcyclist", "road", "parking", "sidewalk", "other-ground", "building", "fence", "vegetation", "trunk", "terrain", "pole", "traffic-sign", ] def KL_sep(p, target): """ KL divergence on nonzeros classes """ nonzeros = target != 0 nonzero_p = p[nonzeros] kl_term = F.kl_div(torch.log(nonzero_p), target[nonzeros], reduction="sum") return kl_term def geo_scal_loss(pred, ssc_target, ignore_index=255, non_empty_idx=0): # Get softmax probabilities pred = F.softmax(pred, dim=1) # Compute empty and nonempty probabilities empty_probs = pred[:, non_empty_idx] nonempty_probs = 1 - empty_probs # Remove unknown voxels mask = ssc_target != ignore_index nonempty_target = ssc_target != non_empty_idx nonempty_target = nonempty_target[mask].float() nonempty_probs = nonempty_probs[mask] empty_probs = empty_probs[mask] eps = 1e-5 intersection = (nonempty_target * nonempty_probs).sum() precision = intersection / (nonempty_probs.sum()+eps) recall = intersection / (nonempty_target.sum()+eps) spec = ((1 - nonempty_target) * (empty_probs)).sum() / ((1 - nonempty_target).sum()+eps) return ( F.binary_cross_entropy(precision, torch.ones_like(precision)) + F.binary_cross_entropy(recall, torch.ones_like(recall)) + F.binary_cross_entropy(spec, torch.ones_like(spec)) ) def sem_scal_loss(pred, ssc_target, ignore_index=255): # Get softmax probabilities pred = F.softmax(pred, dim=1) loss = 0 count = 0 mask = ssc_target != ignore_index n_classes = pred.shape[1] for i in range(0, n_classes): # Get probability of class i p = pred[:, i] # Remove unknown voxels target_ori = ssc_target p = p[mask] target = ssc_target[mask] completion_target = torch.ones_like(target) completion_target[target != i] = 0 completion_target_ori = torch.ones_like(target_ori).float() completion_target_ori[target_ori != i] = 0 if torch.sum(completion_target) > 0: count += 1.0 nominator = torch.sum(p * completion_target) loss_class = 0 if torch.sum(p) > 0: precision = nominator / (torch.sum(p)) loss_precision = F.binary_cross_entropy( precision, torch.ones_like(precision) ) loss_class += loss_precision if torch.sum(completion_target) > 0: recall = nominator / (torch.sum(completion_target)) loss_recall = F.binary_cross_entropy(recall, torch.ones_like(recall)) loss_class += loss_recall if torch.sum(1 - completion_target) > 0: specificity = torch.sum((1 - p) * (1 - completion_target)) / ( torch.sum(1 - completion_target) ) loss_specificity = F.binary_cross_entropy( specificity, torch.ones_like(specificity) ) loss_class += loss_specificity loss += loss_class return loss / count def CE_ssc_loss(pred, target, class_weights=None, ignore_index=255): """ :param: prediction: the predicted tensor, must be [BS, C, ...] 
""" criterion = nn.CrossEntropyLoss( weight=class_weights, ignore_index=ignore_index, reduction="mean" ) loss = criterion(pred, target.long()) return loss def Smooth_L1_loss(pred, target, ignore_index=255): # pred/target B, H, W, D, 3 kept = (target[:, :, :, :, 0] != ignore_index) & (target[:, :, :, :, 1] != ignore_index) & (target[:, :, :, :, 2] != ignore_index) criterion = nn.SmoothL1Loss( reduction="mean" ) loss = criterion(pred[kept], target[kept]) if torch.isnan(loss): pred = pred * 0 target = target * 0 loss = criterion(pred, target) return loss return loss def vel_loss(pred, gt): return F.l1_loss(pred, gt) ================================================ FILE: projects/occ_plugin/utils/voxel_to_points.py ================================================ import open3d as o3d import numpy as np def query_points_from_voxels(pred, gt, img_metas): # pred, [tensor of shape (num_class, x, y, z)]: predicted classes # gt, [tensor of shape (batch, num_points)]: target points with semantic labels # logits to pred cls_id pred = np.argmax(pred.detach().cpu().numpy(), axis=0) gt_ = gt.detach().cpu().numpy() pred_fore_mask = pred > 0 if pred_fore_mask.sum() == 0: return None # select foreground 3d voxel vertex x = np.linspace(0, pred.shape[0] - 1, pred.shape[0]) y = np.linspace(0, pred.shape[1] - 1, pred.shape[1]) z = np.linspace(0, pred.shape[2] - 1, pred.shape[2]) X, Y, Z = np.meshgrid(x, y, z, indexing='ij') vv = np.stack([X, Y, Z], axis=-1) # foreground predictions & coordinates pred = pred[pred_fore_mask] vv = vv[pred_fore_mask] vv[:, 0] = (vv[:, 0] + 0.5) * (img_metas['pc_range'][3] - img_metas['pc_range'][0]) / img_metas['occ_size'][0] + img_metas['pc_range'][0] vv[:, 1] = (vv[:, 1] + 0.5) * (img_metas['pc_range'][4] - img_metas['pc_range'][1]) / img_metas['occ_size'][1] + img_metas['pc_range'][1] vv[:, 2] = (vv[:, 2] + 0.5) * (img_metas['pc_range'][5] - img_metas['pc_range'][2]) / img_metas['occ_size'][2] + img_metas['pc_range'][2] pcd = o3d.geometry.PointCloud() pcd.points = o3d.utility.Vector3dVector(vv) # for every lidar point, search its nearest *foreground* voxel vertex as the semantic prediction kdtree = o3d.geometry.KDTreeFlann(pcd) indices = [] for vert in gt_[:, :3]: _, inds, _ = kdtree.search_knn_vector_3d(vert, 1) indices.append(inds[0]) pred_valid = pred[np.array(indices)] return pred_valid ================================================ FILE: run.sh ================================================ echo "-------------" echo "load config from local path:" $1 if [ -f $1 ]; then config=$1 else echo "need a config file" exit fi bash tools/dist_train.sh $config $2 ${@:3} ================================================ FILE: run_eval.sh ================================================ echo "-------------" echo "load config from local path:" $1 if [ -f $1 ]; then config=$1 else echo "need a config file" exit fi export PYTHONPATH="." 
ckpt=$2 gpu=$3 bash tools/dist_test.sh $config $ckpt $gpu ${@:4} ================================================ FILE: setup.py ================================================ from setuptools import find_packages, setup import os import torch from os import path as osp from torch.utils.cpp_extension import (BuildExtension, CppExtension, CUDAExtension) def make_cuda_ext(name, module, sources, sources_cuda=[], extra_args=[], extra_include_path=[]): define_macros = [] extra_compile_args = {'cxx': [] + extra_args} if torch.cuda.is_available() or os.getenv('FORCE_CUDA', '0') == '1': define_macros += [('WITH_CUDA', None)] extension = CUDAExtension extra_compile_args['nvcc'] = extra_args + [ '-D__CUDA_NO_HALF_OPERATORS__', '-D__CUDA_NO_HALF_CONVERSIONS__', '-D__CUDA_NO_HALF2_OPERATORS__', ] sources += sources_cuda else: print('Compiling {} without CUDA'.format(name)) extension = CppExtension # raise EnvironmentError('CUDA is required to compile MMDetection!') return extension( name='{}.{}'.format(module, name), sources=[os.path.join(*module.split('.'), p) for p in sources], include_dirs=extra_include_path, define_macros=define_macros, extra_compile_args=extra_compile_args) if __name__ == '__main__': # add_mim_extention() setup( name='OpenOccupancy', version='0.0', description=("OpenOccupancy: A Large Scale Benchmark for Surrounding Semantic Occupancy Perception"), author='OpenOccupancy Contributors', author_email='wangxiaofeng2020@ia.ac.cn', keywords='Occupancy Perception', packages=find_packages(), include_package_data=True, package_data={'projects.occ_plugin.ops': ['*/*.so']}, classifiers=[ "Development Status :: 4 - Beta", "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", ], license="Apache License 2.0", ext_modules=[ make_cuda_ext( name="occ_pool_ext", module="projects.occ_plugin.ops.occ_pooling", sources=[ "src/occ_pool.cpp", "src/occ_pool_cuda.cu", ]), ], cmdclass={'build_ext': BuildExtension}) ================================================ FILE: tools/dist_test.sh ================================================ #!/usr/bin/env bash CONFIG=$1 CHECKPOINT=$2 GPUS=$3 PORT=${PORT:-29504} PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ python -m torch.distributed.run --nproc_per_node=$GPUS --master_port=$PORT \ $(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4} --deterministic --eval bbox ================================================ FILE: tools/dist_train.sh ================================================ #!/usr/bin/env bash CONFIG=$1 GPUS=$2 NNODES=${NNODES:-1} NODE_RANK=${NODE_RANK:-0} PORT=${PORT:-29501} MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ python -m torch.distributed.run \ --nnodes=$NNODES \ --node_rank=$NODE_RANK \ --master_addr=$MASTER_ADDR \ --nproc_per_node=$GPUS \ --master_port=$PORT \ $(dirname "$0")/train.py \ $CONFIG \ --seed 2 \ --resume ./work_dirs/OCFNet_in_Cam4DOcc_V1.2/epoch_15.pth --launcher pytorch ${@:3} ================================================ FILE: tools/gen_data/gen_depth_gt.py ================================================ import os from multiprocessing import Pool import mmcv import numpy as np from nuscenes.utils.data_classes import LidarPointCloud from nuscenes.utils.geometry_utils import view_points from pyquaternion import Quaternion import copy # 
https://github.com/nutonomy/nuscenes-devkit/blob/master/python-sdk/nuscenes/nuscenes.py#L834 def map_pointcloud_to_image( pc, im, lidar2ego_translation, lidar2ego_rotation, ego2global_translation, ego2global_rotation, sensor2ego_translation, sensor2ego_rotation, cam_ego2global_translation, cam_ego2global_rotation, cam_intrinsic, min_dist: float = 0.0, ): # Points live in the point sensor frame. So they need to be # transformed via global to the image plane. # First step: transform the pointcloud to the ego vehicle # frame for the timestamp of the sweep. pc = LidarPointCloud(pc.T) pc.rotate(Quaternion(lidar2ego_rotation).rotation_matrix) pc.translate(np.array(lidar2ego_translation)) # Second step: transform from ego to the global frame. pc.rotate(Quaternion(ego2global_rotation).rotation_matrix) pc.translate(np.array(ego2global_translation)) # Third step: transform from global into the ego vehicle # frame for the timestamp of the image. pc.translate(-np.array(cam_ego2global_translation)) pc.rotate(Quaternion(cam_ego2global_rotation).rotation_matrix.T) # Fourth step: transform from ego into the camera. pc.translate(-np.array(sensor2ego_translation)) pc.rotate(Quaternion(sensor2ego_rotation).rotation_matrix.T) # Fifth step: actually take a "picture" of the point cloud. # Grab the depths (camera frame z axis points away from the camera). depths = pc.points[2, :] coloring = depths # Take the actual picture (matrix multiplication with camera-matrix # + renormalization). points = view_points(pc.points[:3, :], cam_intrinsic, normalize=True) # Remove points that are either outside or behind the camera. # Leave a margin of 1 pixel for aesthetic reasons. Also make # sure points are at least 1m in front of the camera to avoid # seeing the lidar points on the camera casing for non-keyframes # which are slightly out of sync. 
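    # After view_points, points[0, :] / points[1, :] are the projected pixel u / v
    # coordinates (checked against image width / height below), and `coloring`
    # carries the camera-frame depth z kept for each surviving pixel.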
mask = np.ones(depths.shape[0], dtype=bool) mask = np.logical_and(mask, depths > min_dist) mask = np.logical_and(mask, points[0, :] > 1) mask = np.logical_and(mask, points[0, :] < im.shape[1] - 1) mask = np.logical_and(mask, points[1, :] > 1) mask = np.logical_and(mask, points[1, :] < im.shape[0] - 1) points = points[:, mask] coloring = coloring[mask] return points, coloring data_root = './data/nuscenes' info_path_train = './data/nuscenes/nuscenes_occ_infos_train.pkl' info_path_val = './data/nuscenes/nuscenes_occ_infos_val.pkl' # data3d_nusc = NuscMVDetData() lidar_key = 'LIDAR_TOP' cam_keys = [ 'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_RIGHT', 'CAM_BACK', 'CAM_BACK_LEFT' ] def worker(info): lidar_path = info['lidar_path'] points = np.fromfile(lidar_path, dtype=np.float32, count=-1).reshape(-1, 5)[..., :4] lidar2ego_translation = info['lidar2ego_translation'] lidar2ego_rotation = info['lidar2ego_rotation'] ego2global_translation = info['ego2global_translation'] ego2global_rotation = info['ego2global_rotation'] for i, cam_key in enumerate(cam_keys): sensor2ego_translation = info['cams'][cam_key]['sensor2ego_translation'] sensor2ego_rotation = info['cams'][cam_key]['sensor2ego_rotation'] cam_ego2global_translation = info['cams'][cam_key]['ego2global_translation'] cam_ego2global_rotation = info['cams'][cam_key]['ego2global_rotation'] cam_intrinsic = info['cams'][cam_key]['cam_intrinsic'] img = mmcv.imread( os.path.join(info['cams'][cam_key]['data_path'])) pts_img, depth = map_pointcloud_to_image( points.copy(), img, copy.deepcopy(lidar2ego_translation), copy.deepcopy(lidar2ego_rotation), copy.deepcopy(ego2global_translation), copy.deepcopy(ego2global_rotation), copy.deepcopy(sensor2ego_translation), copy.deepcopy(sensor2ego_rotation), copy.deepcopy(cam_ego2global_translation), copy.deepcopy(cam_ego2global_rotation), copy.deepcopy(cam_intrinsic)) file_name = os.path.split(info['cams'][cam_key]['data_path'])[-1] np.concatenate([pts_img[:2, :].T, depth[:, None]], axis=1).astype(np.float32).flatten().tofile( os.path.join('./data', 'depth_gt', f'{file_name}.bin')) if __name__ == '__main__': po = Pool(12) mmcv.mkdir_or_exist(os.path.join('./data', 'depth_gt')) infos = mmcv.load(info_path_train)['infos'] for info in infos: po.apply_async(func=worker, args=(info, )) po.close() po.join() po2 = Pool(12) infos = mmcv.load(info_path_val)['infos'] for info in infos: po2.apply_async(func=worker, args=(info, )) po2.close() po2.join() ================================================ FILE: tools/misc/browse_dataset.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
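# Illustrative invocation (the config path is only an example; any config that defines
# data.train and eval_pipeline should work):
#   python tools/misc/browse_dataset.py projects/configs/_base_/datasets/custom_nus-3d.py \
#       --task det --output-dir ./browse_vis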
import argparse import numpy as np import warnings from mmcv import Config, DictAction, mkdir_or_exist, track_iter_progress from os import path as osp from mmdet3d.core.bbox import (Box3DMode, CameraInstance3DBoxes, Coord3DMode, DepthInstance3DBoxes, LiDARInstance3DBoxes) from mmdet3d.core.visualizer import (show_multi_modality_result, show_result, show_seg_result) from mmdet3d.datasets import build_dataset def parse_args(): parser = argparse.ArgumentParser(description='Browse a dataset') parser.add_argument('config', help='train config file path') parser.add_argument( '--skip-type', type=str, nargs='+', default=['Normalize'], help='skip some useless pipeline') parser.add_argument( '--output-dir', default=None, type=str, help='If there is no display interface, you can save it') parser.add_argument( '--task', type=str, choices=['det', 'seg', 'multi_modality-det', 'mono-det'], help='Determine the visualization method depending on the task.') parser.add_argument( '--online', action='store_true', help='Whether to perform online visualization. Note that you often ' 'need a monitor to do so.') parser.add_argument( '--cfg-options', nargs='+', action=DictAction, help='override some settings in the used config, the key-value pair ' 'in xxx=yyy format will be merged into config file. If the value to ' 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' 'Note that the quotation marks are necessary and that no white space ' 'is allowed.') args = parser.parse_args() return args def build_data_cfg(config_path, skip_type, cfg_options): """Build data config for loading visualization data.""" cfg = Config.fromfile(config_path) if cfg_options is not None: cfg.merge_from_dict(cfg_options) # import modules from string list. 
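# (custom_imports is the standard mmcv hook: if the config defines a custom_imports dict,
# the listed module paths are imported here so that any plugin datasets/pipelines they
# register become available before build_dataset is called.)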
if cfg.get('custom_imports', None): from mmcv.utils import import_modules_from_strings import_modules_from_strings(**cfg['custom_imports']) # extract inner dataset of `RepeatDataset` as `cfg.data.train` # so we don't need to worry about it later if cfg.data.train['type'] == 'RepeatDataset': cfg.data.train = cfg.data.train.dataset # use only first dataset for `ConcatDataset` if cfg.data.train['type'] == 'ConcatDataset': cfg.data.train = cfg.data.train.datasets[0] train_data_cfg = cfg.data.train # eval_pipeline purely consists of loading functions # use eval_pipeline for data loading train_data_cfg['pipeline'] = [ x for x in cfg.eval_pipeline if x['type'] not in skip_type ] return cfg def to_depth_mode(points, bboxes): """Convert points and bboxes to Depth Coord and Depth Box mode.""" if points is not None: points = Coord3DMode.convert_point(points.copy(), Coord3DMode.LIDAR, Coord3DMode.DEPTH) if bboxes is not None: bboxes = Box3DMode.convert(bboxes.clone(), Box3DMode.LIDAR, Box3DMode.DEPTH) return points, bboxes def show_det_data(idx, dataset, out_dir, filename, show=False): """Visualize 3D point cloud and 3D bboxes.""" example = dataset.prepare_train_data(idx) points = example['points']._data.numpy() gt_bboxes = dataset.get_ann_info(idx)['gt_bboxes_3d'].tensor if dataset.box_mode_3d != Box3DMode.DEPTH: points, gt_bboxes = to_depth_mode(points, gt_bboxes) show_result( points, gt_bboxes.clone(), None, out_dir, filename, show=show, snapshot=True) def show_seg_data(idx, dataset, out_dir, filename, show=False): """Visualize 3D point cloud and segmentation mask.""" example = dataset.prepare_train_data(idx) points = example['points']._data.numpy() gt_seg = example['pts_semantic_mask']._data.numpy() show_seg_result( points, gt_seg.copy(), None, out_dir, filename, np.array(dataset.PALETTE), dataset.ignore_index, show=show, snapshot=True) def show_proj_bbox_img(idx, dataset, out_dir, filename, show=False, is_nus_mono=False): """Visualize 3D bboxes on 2D image by projection.""" try: example = dataset.prepare_train_data(idx) except AttributeError: # for Mono-3D datasets example = dataset.prepare_train_img(idx) gt_bboxes = dataset.get_ann_info(idx)['gt_bboxes_3d'] img_metas = example['img_metas']._data img = example['img']._data.numpy() # need to transpose channel to first dim img = img.transpose(1, 2, 0) # no 3D gt bboxes, just show img if gt_bboxes.tensor.shape[0] == 0: gt_bboxes = None if isinstance(gt_bboxes, DepthInstance3DBoxes): show_multi_modality_result( img, gt_bboxes, None, None, out_dir, filename, box_mode='depth', img_metas=img_metas, show=show) elif isinstance(gt_bboxes, LiDARInstance3DBoxes): show_multi_modality_result( img, gt_bboxes, None, img_metas['lidar2img'], out_dir, filename, box_mode='lidar', img_metas=img_metas, show=show) elif isinstance(gt_bboxes, CameraInstance3DBoxes): show_multi_modality_result( img, gt_bboxes, None, img_metas['cam2img'], out_dir, filename, box_mode='camera', img_metas=img_metas, show=show) else: # can't project, just show img warnings.warn( f'unrecognized gt box type {type(gt_bboxes)}, only show image') show_multi_modality_result( img, None, None, None, out_dir, filename, show=show) def main(): args = parse_args() if args.output_dir is not None: mkdir_or_exist(args.output_dir) cfg = build_data_cfg(args.config, args.skip_type, args.cfg_options) try: dataset = build_dataset( cfg.data.train, default_args=dict(filter_empty_gt=False)) except TypeError: # seg dataset doesn't have `filter_empty_gt` key dataset = build_dataset(cfg.data.train) data_infos = 
dataset.data_infos dataset_type = cfg.dataset_type # configure visualization mode vis_task = args.task # 'det', 'seg', 'multi_modality-det', 'mono-det' for idx, data_info in enumerate(track_iter_progress(data_infos)): if dataset_type in ['KittiDataset', 'WaymoDataset']: data_path = data_info['point_cloud']['velodyne_path'] elif dataset_type in [ 'ScanNetDataset', 'SUNRGBDDataset', 'ScanNetSegDataset', 'S3DISSegDataset', 'S3DISDataset' ]: data_path = data_info['pts_path'] elif dataset_type in ['NuScenesDataset', 'LyftDataset']: data_path = data_info['lidar_path'] elif dataset_type in ['NuScenesMonoDataset']: data_path = data_info['file_name'] else: raise NotImplementedError( f'unsupported dataset type {dataset_type}') file_name = osp.splitext(osp.basename(data_path))[0] if vis_task in ['det', 'multi_modality-det']: # show 3D bboxes on 3D point clouds show_det_data( idx, dataset, args.output_dir, file_name, show=args.online) if vis_task in ['multi_modality-det', 'mono-det']: # project 3D bboxes to 2D image show_proj_bbox_img( idx, dataset, args.output_dir, file_name, show=args.online, is_nus_mono=(dataset_type == 'NuScenesMonoDataset')) elif vis_task in ['seg']: # show 3D segmentation mask on 3D point clouds show_seg_data( idx, dataset, args.output_dir, file_name, show=args.online) if __name__ == '__main__': main() ================================================ FILE: tools/misc/fuse_conv_bn.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import argparse import torch from mmcv.runner import save_checkpoint from torch import nn as nn from mmdet.apis import init_model def fuse_conv_bn(conv, bn): """During inference, the functionary of batch norm layers is turned off but only the mean and var alone channels are used, which exposes the chance to fuse it with the preceding conv layers to save computations and simplify network structures.""" conv_w = conv.weight conv_b = conv.bias if conv.bias is not None else torch.zeros_like( bn.running_mean) factor = bn.weight / torch.sqrt(bn.running_var + bn.eps) conv.weight = nn.Parameter(conv_w * factor.reshape([conv.out_channels, 1, 1, 1])) conv.bias = nn.Parameter((conv_b - bn.running_mean) * factor + bn.bias) return conv def fuse_module(m): last_conv = None last_conv_name = None for name, child in m.named_children(): if isinstance(child, (nn.BatchNorm2d, nn.SyncBatchNorm)): if last_conv is None: # only fuse BN that is after Conv continue fused_conv = fuse_conv_bn(last_conv, child) m._modules[last_conv_name] = fused_conv # To reduce changes, set BN as Identity instead of deleting it. m._modules[name] = nn.Identity() last_conv = None elif isinstance(child, nn.Conv2d): last_conv = child last_conv_name = name else: fuse_module(child) return m def parse_args(): parser = argparse.ArgumentParser( description='fuse Conv and BN layers in a model') parser.add_argument('config', help='config file path') parser.add_argument('checkpoint', help='checkpoint file path') parser.add_argument('out', help='output path of the converted model') args = parser.parse_args() return args def main(): args = parse_args() # build the model from a config file and a checkpoint file model = init_model(args.config, args.checkpoint) # fuse conv and bn layers of the model fused_model = fuse_module(model) save_checkpoint(fused_model, args.out) if __name__ == '__main__': main() ================================================ FILE: tools/misc/print_config.py ================================================ # Copyright (c) OpenMMLab. 
All rights reserved. import argparse from mmcv import Config, DictAction def parse_args(): parser = argparse.ArgumentParser(description='Print the whole config') parser.add_argument('config', help='config file path') parser.add_argument( '--options', nargs='+', action=DictAction, help='arguments in dict') args = parser.parse_args() return args def main(): args = parse_args() cfg = Config.fromfile(args.config) if args.options is not None: cfg.merge_from_dict(args.options) print(f'Config:\n{cfg.pretty_text}') if __name__ == '__main__': main() ================================================ FILE: tools/misc/visualize_results.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import argparse import mmcv from mmcv import Config from mmdet3d.datasets import build_dataset def parse_args(): parser = argparse.ArgumentParser( description='MMDet3D visualize the results') parser.add_argument('config', help='test config file path') parser.add_argument('--result', help='results file in pickle format') parser.add_argument( '--show-dir', help='directory where visualize results will be saved') args = parser.parse_args() return args def main(): args = parse_args() if args.result is not None and \ not args.result.endswith(('.pkl', '.pickle')): raise ValueError('The results file must be a pkl file.') cfg = Config.fromfile(args.config) cfg.data.test.test_mode = True # build the dataset dataset = build_dataset(cfg.data.test) results = mmcv.load(args.result) if getattr(dataset, 'show', None) is not None: # data loading pipeline for showing eval_pipeline = cfg.get('eval_pipeline', {}) if eval_pipeline: dataset.show(results, args.show_dir, pipeline=eval_pipeline) else: dataset.show(results, args.show_dir) # use default pipeline else: raise NotImplementedError( 'Show is not implemented for dataset {}!'.format( type(dataset).__name__)) if __name__ == '__main__': main() ================================================ FILE: tools/test.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. # Modified by Junyi Ma, following OpenOccupancy of Zhiqi Li import argparse import mmcv import os import torch from mmcv import Config, DictAction from mmcv.cnn import fuse_conv_bn from mmcv.parallel import MMDataParallel, MMDistributedDataParallel from mmcv.runner import (get_dist_info, init_dist, load_checkpoint, wrap_fp16_model) from mmdet3d.apis import single_gpu_test from mmdet3d.datasets import build_dataset from projects.occ_plugin.datasets.builder import build_dataloader from mmdet3d.models import build_model from mmdet.apis import set_random_seed from projects.occ_plugin.occupancy.apis.test import custom_single_gpu_test, custom_multi_gpu_test from mmdet.datasets import replace_ImageToTensor import time import os.path as osp import warnings warnings.filterwarnings("ignore") warnings.simplefilter(action="ignore",category=FutureWarning) def parse_args(): parser = argparse.ArgumentParser( description='MMDet test (and eval) a model') parser.add_argument('config', help='test config file path') parser.add_argument('checkpoint', help='checkpoint file') parser.add_argument('--out', help='output result file in pickle format') parser.add_argument( '--fuse-conv-bn', action='store_true', help='Whether to fuse conv and bn, this will slightly increase' 'the inference speed') parser.add_argument( '--format-only', action='store_true', help='Format the output results without perform evaluation. 
It is' 'useful when you want to format the result to a specific format and ' 'submit it to the test server') parser.add_argument( '--eval', type=str, nargs='+', help='evaluation metrics, which depends on the dataset, e.g., "bbox",' ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC') parser.add_argument('--show', action='store_true', help='show results') parser.add_argument( '--show-dir', help='directory where results will be saved') parser.add_argument( '--gpu-collect', action='store_true', help='whether to use gpu to collect results.') parser.add_argument( '--tmpdir', help='tmp directory used for collecting results from multiple ' 'workers, available when gpu-collect is not specified') parser.add_argument('--seed', type=int, default=0, help='random seed') parser.add_argument( '--deterministic', action='store_true', help='whether to set deterministic options for CUDNN backend.') parser.add_argument( '--cfg-options', nargs='+', action=DictAction, help='override some settings in the used config, the key-value pair ' 'in xxx=yyy format will be merged into config file. If the value to ' 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' 'Note that the quotation marks are necessary and that no white space ' 'is allowed.') parser.add_argument( '--options', nargs='+', action=DictAction, help='custom options for evaluation, the key-value pair in xxx=yyy ' 'format will be kwargs for dataset.evaluate() function (deprecate), ' 'change to --eval-options instead.') parser.add_argument( '--eval-options', nargs='+', action=DictAction, help='custom options for evaluation, the key-value pair in xxx=yyy ' 'format will be kwargs for dataset.evaluate() function') parser.add_argument( '--launcher', choices=['none', 'pytorch', 'slurm', 'mpi'], default='none', help='job launcher') parser.add_argument('--local_rank', type=int, default=0) args = parser.parse_args() if 'LOCAL_RANK' not in os.environ: os.environ['LOCAL_RANK'] = str(args.local_rank) if args.options and args.eval_options: raise ValueError( '--options and --eval-options cannot be both specified, ' '--options is deprecated in favor of --eval-options') if args.options: warnings.warn('--options is deprecated in favor of --eval-options') args.eval_options = args.options return args def main(): args = parse_args() assert args.out or args.eval or args.format_only or args.show \ or args.show_dir, \ ('Please specify at least one operation (save/eval/format/show the ' 'results / save the results) with the argument "--out", "--eval"' ', "--format-only", "--show" or "--show-dir"') if args.eval and args.format_only: raise ValueError('--eval and --format_only cannot be both specified') if args.out is not None and not args.out.endswith(('.pkl', '.pickle')): raise ValueError('The output file must be a pkl file.') cfg = Config.fromfile(args.config) if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) # import modules from string list. if cfg.get('custom_imports', None): from mmcv.utils import import_modules_from_strings import_modules_from_strings(**cfg['custom_imports']) # import modules from plguin/xx, registry will be updated if hasattr(cfg, 'plugin'): if cfg.plugin: import importlib if hasattr(cfg, 'plugin_dir'): plugin_dir = cfg.plugin_dir _module_dir = os.path.dirname(plugin_dir) _module_dir = _module_dir.split('/') _module_path = _module_dir[0] for m in _module_dir[1:]: _module_path = _module_path + '.' 
+ m # print(_module_path) plg_lib = importlib.import_module(_module_path) else: # import dir is the dirpath for the config file _module_dir = os.path.dirname(args.config) _module_dir = _module_dir.split('/') _module_path = _module_dir[0] for m in _module_dir[1:]: _module_path = _module_path + '.' + m # print(_module_path) plg_lib = importlib.import_module(_module_path) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True cfg.model.pretrained = None # in case the test dataset is concatenated samples_per_gpu = 1 if isinstance(cfg.data.test, dict): cfg.data.test.test_mode = True samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1) if samples_per_gpu > 1: # Replace 'ImageToTensor' to 'DefaultFormatBundle' cfg.data.test.pipeline = replace_ImageToTensor( cfg.data.test.pipeline) elif isinstance(cfg.data.test, list): for ds_cfg in cfg.data.test: ds_cfg.test_mode = True samples_per_gpu = max( [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test]) if samples_per_gpu > 1: for ds_cfg in cfg.data.test: ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline) # init distributed env first, since logger depends on the dist info. # print("args.launcher", args.launcher) if args.launcher == 'none': distributed = False else: distributed = True init_dist(args.launcher, **cfg.dist_params) # set random seeds if args.seed is not None: set_random_seed(args.seed, deterministic=args.deterministic) # build the dataloader dataset = build_dataset(cfg.data.test) data_loader = build_dataloader( dataset, samples_per_gpu=samples_per_gpu, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=True, num_gpus=2, ) # build the model and load checkpoint cfg.model.train_cfg = None model = build_model(cfg.model, test_cfg=cfg.get('test_cfg')) fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: wrap_fp16_model(model) checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu') if args.fuse_conv_bn: model = fuse_conv_bn(model) # old versions did not save class info in checkpoints, this walkaround is # for backward compatibility if 'CLASSES' in checkpoint.get('meta', {}): model.CLASSES = checkpoint['meta']['CLASSES'] else: model.CLASSES = dataset.CLASSES # palette for visualization in segmentation tasks if 'PALETTE' in checkpoint.get('meta', {}): model.PALETTE = checkpoint['meta']['PALETTE'] elif hasattr(dataset, 'PALETTE'): # segmentation dataset has `PALETTE` attribute model.PALETTE = dataset.PALETTE if args.show: if args.show_dir is None: args.show_dir = osp.join('./work_dirs', osp.splitext(osp.basename(args.config))[0], 'visualization') print('save dir: ', args.show_dir) os.makedirs(args.show_dir, exist_ok=True) if not distributed: model = MMDataParallel(model, device_ids=[0]) outputs = custom_single_gpu_test(model, data_loader, args.show, args.show_dir) else: model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False) outputs = custom_multi_gpu_test(model, data_loader, args.tmpdir, args.gpu_collect, args.show, args.show_dir) rank, _ = get_dist_info() if rank == 0 and distributed: kwargs = {} if args.eval_options is None else args.eval_options kwargs['jsonfile_prefix'] = osp.join('test', args.config.split( '/')[-1].split('.')[-2], time.ctime().replace(' ', '_').replace(':', '_')) if args.format_only: dataset.format_results(outputs, **kwargs) if args.eval: eval_kwargs = cfg.get('evaluation', {}).copy() # hard-code way to remove EvalHook args for key in [ 'interval', 'tmpdir', 'start', 
'gpu_collect', 'save_best', 'rule' ]: eval_kwargs.pop(key, None) eval_kwargs.update(dict(metric=args.eval, **kwargs)) print(dataset.evaluate(outputs, **eval_kwargs)) if __name__ == '__main__': main() ================================================ FILE: tools/train.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. # Cam4DOcc refers to OpenOccupancy of Zhiqi Li from __future__ import division import argparse import copy import mmcv import os import time import torch import warnings from mmcv import Config, DictAction from mmcv.runner import get_dist_info, init_dist from os import path as osp from mmdet import __version__ as mmdet_version from mmdet3d import __version__ as mmdet3d_version from mmseg import __version__ as mmseg_version from mmdet3d.datasets import build_dataset from mmdet3d.models import build_model from mmdet3d.utils import collect_env, get_root_logger from mmdet.apis import set_random_seed from mmcv.utils import TORCH_VERSION, digit_version from projects.occ_plugin.occupancy.apis.train import custom_train_model import warnings warnings.filterwarnings("ignore") warnings.simplefilter(action="ignore",category=FutureWarning) def parse_args(): parser = argparse.ArgumentParser(description='Train a detector') parser.add_argument('config', help='train config file path') parser.add_argument('--work-dir', help='the dir to save logs and models') parser.add_argument( '--resume', help='the checkpoint file to resume from') parser.add_argument( '--no-validate', action='store_true', help='whether not to evaluate the checkpoint during training') group_gpus = parser.add_mutually_exclusive_group() group_gpus.add_argument( '--gpus', type=int, help='number of gpus to use ' '(only applicable to non-distributed training)') group_gpus.add_argument( '--gpu-ids', type=int, nargs='+', help='ids of gpus to use ' '(only applicable to non-distributed training)') parser.add_argument('--seed', type=int, default=0, help='random seed') parser.add_argument( '--deterministic', action='store_true', help='whether to set deterministic options for CUDNN backend.') parser.add_argument( '--options', nargs='+', action=DictAction, help='override some settings in the used config, the key-value pair ' 'in xxx=yyy format will be merged into config file (deprecate), ' 'change to --cfg-options instead.') parser.add_argument( '--cfg-options', nargs='+', action=DictAction, help='override some settings in the used config, the key-value pair ' 'in xxx=yyy format will be merged into config file. If the value to ' 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' 'Note that the quotation marks are necessary and that no white space ' 'is allowed.') parser.add_argument( '--launcher', choices=['none', 'pytorch', 'slurm', 'mpi'], default='pytorch', help='job launcher') parser.add_argument('--local_rank', type=int, default=0) parser.add_argument( '--autoscale-lr', action='store_true', help='automatically scale lr with the number of gpus') args = parser.parse_args() if 'LOCAL_RANK' not in os.environ: os.environ['LOCAL_RANK'] = str(args.local_rank) return args def main(): args = parse_args() cfg = Config.fromfile(args.config) if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) # import modules from plguin/xx, registry will be updated if hasattr(cfg, 'plugin') and cfg.plugin: assert cfg.plugin_dir is not None import importlib plugin_dir = cfg.plugin_dir _module_dir = os.path.dirname(plugin_dir) _module_dir = _module_dir.split('/') _module_path = _module_dir[0] for m in _module_dir[1:]: _module_path = _module_path + '.' + m # print(_module_path) plg_lib = importlib.import_module(_module_path) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True # work_dir is determined in this priority: CLI > segment in file > filename if args.work_dir is not None: cfg.work_dir = args.work_dir elif cfg.get('work_dir', None) is None: cfg.work_dir = osp.join('./work_dirs', osp.splitext(osp.basename(args.config))[0]) if args.resume is not None and osp.isfile(args.resume): cfg.resume_from = args.resume if args.gpu_ids is not None: cfg.gpu_ids = args.gpu_ids else: cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus) if args.autoscale_lr: cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8 # init distributed env first, since logger depends on the dist info. 
if args.launcher == 'none': distributed = False else: distributed = True init_dist(args.launcher, **cfg.dist_params) # re-set gpu_ids with distributed training mode _, world_size = get_dist_info() cfg.gpu_ids = range(world_size) # create work_dir mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) # dump config cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config))) # init the logger before other steps timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) log_file = osp.join(cfg.work_dir, f'{timestamp}.log') logger = get_root_logger( log_file=log_file, log_level=cfg.log_level, name='mmdet') # init the meta dict to record some important information such as # environment info and seed, which will be logged meta = dict() # log env info env_info_dict = collect_env() env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()]) dash_line = '-' * 60 + '\n' logger.info('Environment info:\n' + dash_line + env_info + '\n' + dash_line) meta['env_info'] = env_info meta['config'] = cfg.pretty_text # log some basic info logger.info(f'Distributed training: {distributed}') # set random seeds if args.seed is not None: logger.info(f'Set random seed to {args.seed}, ' f'deterministic: {args.deterministic}') set_random_seed(args.seed, deterministic=args.deterministic) cfg.seed = args.seed meta['seed'] = args.seed meta['exp_name'] = osp.basename(args.config) model = build_model( cfg.model, train_cfg=cfg.get('train_cfg'), test_cfg=cfg.get('test_cfg')) n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) logger.info(f'Number of params: {n_parameters}') model.init_weights() datasets = [build_dataset(cfg.data.train)] # add an attribute for visualization convenience model.CLASSES = datasets[0].CLASSES custom_train_model( model, datasets, cfg, distributed=distributed, validate=(not args.no_validate), timestamp=timestamp, meta=meta) if __name__ == '__main__': main() ================================================ FILE: viz/viz_gt.py ================================================ # Developed by Junyi Ma # Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications # https://github.com/haomo-ai/Cam4DOcc from tqdm import tqdm import pickle import numpy as np from mayavi import mlab from tqdm import trange import os from xvfbwrapper import Xvfb # export QT_QPA_PLATFORM='offscreen' mlab.options.offscreen = True def viz_occ(occ, occ_mo, file_name, voxel_size, show_occ, show_time_change): vdisplay = Xvfb(width=1, height=1) vdisplay.start() mlab.figure(size=(800,800), bgcolor=(1,1,1)) plt_plot_occ = mlab.points3d( occ[:, 0] * voxel_size, occ[:, 1] * voxel_size, occ[:, 2] * voxel_size, occ[:, 3], colormap="viridis", scale_factor=voxel_size - 0.05 * voxel_size, mode="cube", opacity=0.9, vmin=1, ) colors_occ = np.array( [ [152, 251, 152, 255], [152, 251, 152, 255], [152, 251, 152, 255], [152, 251, 152, 255], [152, 251, 152, 255], ] ).astype(np.uint8) plt_plot_occ.glyph.scale_mode = "scale_by_vector" plt_plot_occ.module_manager.scalar_lut_manager.lut.table = colors_occ plt_plot_mov = mlab.points3d( occ_mo[:, 0] * voxel_size, occ_mo[:, 1] * voxel_size, occ_mo[:, 2] * voxel_size, occ_mo[:, 3], colormap="viridis", scale_factor=voxel_size - 0.05 * voxel_size, mode="cube", opacity=0.9, vmin=1, ) if show_time_change: colors_occ_mo = np.array( [ [255, 70, 255, 255], [255, 110, 255, 255], [255, 150, 255, 255], [255, 190, 255, 255], [255, 250, 250, 255], ] ).astype(np.uint8) else: colors_occ_mo = np.array( [ [220, 20, 60, 255], [255, 127, 80, 255], [0, 0, 230, 
255], [255, 158, 0, 255], [233, 150, 70, 255], [47, 79, 79, 255], [255, 99, 71, 255], [175, 0, 75, 255], [255, 61, 99, 255], ] ).astype(np.uint8) plt_plot_mov.glyph.scale_mode = "scale_by_vector" plt_plot_mov.module_manager.scalar_lut_manager.lut.table = colors_occ_mo fig_dir = "./figs" if not os.path.exists(fig_dir): os.mkdir(fig_dir) mlab.savefig(os.path.join(fig_dir, file_name[:-4]+".png")) vdisplay.stop() def main(): show_time_change = True nuscocc_path = "../data/nuScenes-Occupancy/" cam4docc_path = "../data/cam4docc/GMO/segmentation/" segmentation_files = os.listdir(cam4docc_path) segmentation_files.sort(key=lambda x: (x.split("_")[1])) index = 0 for file_ in tqdm(segmentation_files): scene_token = file_.split("_")[0] lidar_token = file_.split("_")[1] gt_file = nuscocc_path+"scene_"+scene_token+"/occupancy/"+lidar_token[:-4]+".npy" gt_occ_semantic = np.load(gt_file,allow_pickle=True) gt_occ_semantic = gt_occ_semantic[gt_occ_semantic[:, -1]!=0] gt_occ_semantic = gt_occ_semantic[::2] gt_occ_semantic_refine = np.zeros_like(gt_occ_semantic) gt_occ_semantic_refine[:, 0] = gt_occ_semantic[:, 2] gt_occ_semantic_refine[:, 1] = gt_occ_semantic[:, 1] gt_occ_semantic_refine[:, 2] = gt_occ_semantic[:, 0] gt_occ_semantic_refine[:, 3] = 1 gt_mo_semantic = np.load(cam4docc_path+file_,allow_pickle=True)['arr_0'] gt_mo_semantic_to_draw=np.zeros((0,4)) for t in range(0,4): gt_mo_cur = gt_mo_semantic[t] gt_mo_cur = np.array(gt_mo_cur) gt_mo_cur = gt_mo_cur[::2] if show_time_change: gt_mo_cur[:, -1] = int(t+1) gt_mo_semantic_to_draw = np.concatenate((gt_mo_semantic_to_draw, gt_mo_cur)) viz_occ(gt_occ_semantic_refine, gt_mo_semantic_to_draw, file_, voxel_size=0.2, show_occ=True, show_time_change=show_time_change) index += 1 if __name__ == "__main__": main() ================================================ FILE: viz/viz_pred.py ================================================ from tqdm import tqdm import pickle import numpy as np from mayavi import mlab from tqdm import trange import os from xvfbwrapper import Xvfb mlab.options.offscreen = True def viz_occ(occ, occ_mo, file_name, voxel_size, show_occ, show_time_change): vdisplay = Xvfb(width=1, height=1) vdisplay.start() mlab.figure(size=(800,800), bgcolor=(1,1,1)) plt_plot_occ = mlab.points3d( occ[:, 0] * voxel_size, occ[:, 1] * voxel_size, occ[:, 2] * voxel_size, occ[:, 3], colormap="viridis", scale_factor=voxel_size - 0.05 * voxel_size, mode="cube", opacity=0.9, vmin=1, ) colors_occ = np.array( [ [152, 251, 152, 255], [152, 251, 152, 255], [152, 251, 152, 255], [152, 251, 152, 255], [152, 251, 152, 255], ] ).astype(np.uint8) plt_plot_occ.glyph.scale_mode = "scale_by_vector" plt_plot_occ.module_manager.scalar_lut_manager.lut.table = colors_occ plt_plot_mov = mlab.points3d( occ_mo[:, 0] * voxel_size, occ_mo[:, 1] * voxel_size, occ_mo[:, 2] * voxel_size, occ_mo[:, 3], colormap="viridis", scale_factor=voxel_size - 0.05 * voxel_size, mode="cube", opacity=0.9, vmin=1, ) if show_time_change: colors_occ_mo = np.array( [ [255, 70, 255, 255], [255, 110, 255, 255], [255, 150, 255, 255], [255, 190, 255, 255], [255, 250, 250, 255], ] ).astype(np.uint8) else: colors_occ_mo = np.array( [ [220, 20, 60, 255], [255, 127, 80, 255], [0, 0, 230, 255], [255, 158, 0, 255], [233, 150, 70, 255], [47, 79, 79, 255], [255, 99, 71, 255], [175, 0, 75, 255], [255, 61, 99, 255], ] ).astype(np.uint8) plt_plot_mov.glyph.scale_mode = "scale_by_vector" plt_plot_mov.module_manager.scalar_lut_manager.lut.table = colors_occ_mo fig_dir = "./figs" if not os.path.exists(fig_dir): 
os.mkdir(fig_dir) mlab.savefig(os.path.join(fig_dir, file_name[:-4]+".png")) vdisplay.stop() def main(): show_time_change = True nuscocc_path = "../data/nuScenes-Occupancy/" pred_path = "../data/cam4docc/results/" segmentation_files = os.listdir(pred_path) segmentation_files.sort(key=lambda x: (x.split("_")[1])) index = 0 segmentation_files = segmentation_files[::10] for file_ in tqdm(segmentation_files): scene_token = file_.split("_")[0] lidar_token = file_.split("_")[1] gt_file = nuscocc_path+"scene_"+scene_token+"/occupancy/"+lidar_token[:-4]+".npy" gt_occ_semantic = np.load(gt_file,allow_pickle=True) gt_occ_semantic = gt_occ_semantic[gt_occ_semantic[:, -1]!=0] gt_occ_semantic = gt_occ_semantic[::2] gt_occ_semantic_refine = np.zeros_like(gt_occ_semantic) gt_occ_semantic_refine[:, 0] = gt_occ_semantic[:, 2] gt_occ_semantic_refine[:, 1] = gt_occ_semantic[:, 1] gt_occ_semantic_refine[:, 2] = gt_occ_semantic[:, 0] gt_occ_semantic_refine[:, 3] = 1 pred_mo_semantic = np.load(pred_path+file_,allow_pickle=True)['arr_0'] pred_mo_semantic_to_draw=np.zeros((0,4)) for t in range(0,4): pred_mo_cur = pred_mo_semantic[t] pred_mo_cur = np.array(pred_mo_cur) pred_mo_cur = pred_mo_cur[::2] if show_time_change: pred_mo_cur[:, -1] = int(t+1) pred_mo_semantic_to_draw = np.concatenate((pred_mo_semantic_to_draw, pred_mo_cur)) viz_occ(gt_occ_semantic_refine, pred_mo_semantic_to_draw, file_, voxel_size=0.2, show_occ=True, show_time_change=show_time_change) index += 1 if __name__ == "__main__": main() # export QT_QPA_PLATFORM='offscreen'
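# Both viz scripts render headlessly: mlab.options.offscreen is enabled and each figure is
# wrapped in an Xvfb virtual display, with frames saved to ./figs as PNGs. An illustrative
# way to run them (assuming the GMO ground truth and saved results exist under ../data) is
# from the viz/ directory so the relative paths resolve:
#   cd viz && QT_QPA_PLATFORM=offscreen python viz_gt.py
#   cd viz && QT_QPA_PLATFORM=offscreen python viz_pred.py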