Repository: HorizonRobotics/BIP3D
Branch: main
Commit: b1ed6a84faaf
Files: 87
Total size: 582.4 KB

Directory structure:
gitextract_nz633hq0/

├── .gitignore
├── LICENSE
├── README.md
├── bip3d/
│   ├── __init__.py
│   ├── converter/
│   │   ├── extract_occupancy_ann.py
│   │   ├── generate_image_3rscan.py
│   │   ├── generate_image_matterport3d.py
│   │   └── generate_image_scannet.py
│   ├── datasets/
│   │   ├── __init__.py
│   │   ├── embodiedscan_det_grounding_dataset.py
│   │   ├── transforms/
│   │   │   ├── __init__.py
│   │   │   ├── augmentation.py
│   │   │   ├── formatting.py
│   │   │   ├── loading.py
│   │   │   ├── multiview.py
│   │   │   └── transform.py
│   │   └── utils.py
│   ├── eval/
│   │   ├── __init__.py
│   │   ├── indoor_eval.py
│   │   └── metrics/
│   │       ├── __init__.py
│   │       ├── det_metric.py
│   │       └── grounding_metric.py
│   ├── grid_mask.py
│   ├── models/
│   │   ├── __init__.py
│   │   ├── base_target.py
│   │   ├── bbox3d_decoder.py
│   │   ├── bert.py
│   │   ├── data_preprocessors/
│   │   │   ├── __init__.py
│   │   │   ├── custom_data_preprocessor.py
│   │   │   └── utils.py
│   │   ├── deformable_aggregation.py
│   │   ├── feature_enhancer.py
│   │   ├── instance_bank.py
│   │   ├── spatial_enhancer.py
│   │   ├── structure.py
│   │   ├── target.py
│   │   └── utils.py
│   ├── ops/
│   │   ├── __init__.py
│   │   ├── deformable_aggregation.py
│   │   ├── gcc.sh
│   │   ├── setup.py
│   │   └── src/
│   │       ├── deformable_aggregation.cpp
│   │       ├── deformable_aggregation_cuda.cu
│   │       ├── deformable_aggregation_with_depth.cpp
│   │       └── deformable_aggregation_with_depth_cuda.cu
│   ├── registry.py
│   ├── structures/
│   │   ├── __init__.py
│   │   ├── bbox_3d/
│   │   │   ├── __init__.py
│   │   │   ├── base_box3d.py
│   │   │   ├── box_3d_mode.py
│   │   │   ├── coord_3d_mode.py
│   │   │   ├── euler_box3d.py
│   │   │   ├── euler_depth_box3d.py
│   │   │   └── utils.py
│   │   ├── ops/
│   │   │   ├── __init__.py
│   │   │   ├── box_np_ops.py
│   │   │   ├── iou3d_calculator.py
│   │   │   └── transforms.py
│   │   └── points/
│   │       ├── __init__.py
│   │       ├── base_points.py
│   │       ├── cam_points.py
│   │       ├── depth_points.py
│   │       └── lidar_points.py
│   └── utils/
│       ├── __init__.py
│       ├── array_converter.py
│       ├── default_color_map.py
│       ├── dist_utils.py
│       ├── line_mesh.py
│       └── typing_config.py
├── configs/
│   ├── __init__.py
│   ├── bip3d_det.py
│   ├── bip3d_det_grounding.py
│   ├── bip3d_det_rgb.py
│   ├── bip3d_grounding.py
│   ├── bip3d_grounding_rgb.py
│   └── default_runtime.py
├── docs/
│   └── quick_start.md
├── engine.sh
├── requirements.txt
├── test.sh
└── tools/
    ├── anchor_bbox3d_kmeans.py
    ├── benchmark.py
    ├── ckpt_rename.py
    ├── dist_test.sh
    ├── dist_train.sh
    ├── test.py
    └── train.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
*.so

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don’t work, or not
#   install at all, so you might want to exclude Pipfile.lock from version control.
Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# VS Code
.vscode/


================================================
FILE: LICENSE
================================================
MIT License

Copyright (c) 2024 Horizon Robotics

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: README.md
================================================
# BIP3D: Bridging 2D Images and 3D Perception for Embodied Intelligence

<div align="center" class="authors">
    <a href="https://scholar.google.com/citations?user=pfXQwcQAAAAJ&hl=en" target="_blank">Xuewu Lin</a>,
    <a href="https://wzmsltw.github.io/" target="_blank">Tianwei Lin</a>,
    <a href="https://scholar.google.com/citations?user=F2e_jZMAAAAJ&hl=en" target="_blank">Lichao Huang</a>,
    <a href="https://openreview.net/profile?id=~HONGYU_XIE2" target="_blank">Hongyu Xie</a>,
    <a href="https://scholar.google.com/citations?user=HQfc8TEAAAAJ&hl=en" target="_blank">Zhizhong Su</a>
</div>


<div align="center" style="line-height: 3;">
  <a href="https://github.com/HorizonRobotics/BIP3D" target="_blank" style="margin: 2px;">
    <img alt="Code" src="https://img.shields.io/badge/Code-Github-bule" style="display: inline-block; vertical-align: middle;"/>
  </a>
  <a href="https://linxuewu.github.io/BIP3D-page/" target="_blank" style="margin: 2px;">
    <img alt="Homepage" src="https://img.shields.io/badge/Homepage-BIP3D-green" style="display: inline-block; vertical-align: middle;"/>
  </a>
  <a href="https://huggingface.co/HorizonRobotics/BIP3D" target="_blank" style="margin: 2px;">
    <img alt="Hugging Face" src="https://img.shields.io/badge/Models-Hugging%20Face-yellow" style="display: inline-block; vertical-align: middle;"/>
  </a>
  <a href="https://arxiv.org/abs/2411.14869" target="_blank" style="margin: 2px;">
    <img alt="Paper" src="https://img.shields.io/badge/Paper-Arxiv-red" style="display: inline-block; vertical-align: middle;"/>
  </a>
</div>


## :rocket: News
**01/Jun/2025**: We have refactored and integrated the BIP3D code into [robo_orchard_lab](https://github.com/HorizonRobotics/robo_orchard_lab/tree/master/projects/bip3d_grounding), removing the dependency on MM​​ series. The environment is now easier to set up, and the performance are improved. Welcome to try it out!

**14/Mar/2025**: Our code has been released.

**27/Feb/2025**: Our paper has been accepted by CVPR 2025.

**22/Nov/2024**: We release our paper to [Arxiv](https://arxiv.org/abs/2411.14869).

## :open_book: Quick Start
[Quick Start](docs/quick_start.md)

## :link: Framework
<div align="center">
  <img src="https://github.com/HorizonRobotics/BIP3D/raw/main/resources/bip3d_structure.png" width="90%" alt="BIP3D" />
  <p style="font-size:0.8em; color:#555;">The Architecture Diagram of BIP3D, where the red stars indicate the parts that have been modified or added compared to the base model, GroundingDINO, and dashed lines indicate optional elements.</p>
</div>

## :trophy: Results on EmbodiedScan Benchmark
We made several improvements based on the original paper, achieving better 3D perception results. The main improvements include the following two points:
1. **New Fusion Operation**: We enhanced the decoder by replacing the deformable aggregation (DAG) with a 3D deformable attention mechanism (DAT). Specifically, we improved the feature sampling process by transitioning from bilinear interpolation to trilinear interpolation, which leverages depth distribution for more accurate feature extraction.
2. **Mixed Data Training**: To optimize the grounding model's performance, we adopted a mixed-data training strategy by integrating detection data with grounding data during the grounding finetuning process.

### 1. Results on Multi-view 3D Detection Validation Dataset
Op DAG denotes deformable aggregation, and DAT denotes 3D deformable attention. Set [`with_depth=True`](configs/bip3d_det.py#L175) to activate the DAT.

The metric in the table is `AP@0.25`. For more metrics, please refer to the logs.
|Model | Inputs | Op | Overall | Head | Common | Tail | Small | Medium | Large | ScanNet | 3RScan | MP3D | ckpt | log |
|  :----  | :---: |  :---: | :---: |:---: | :---: | :---: | :---:| :---:|:---:|:---: | :---: | :----: | :----: | :---: |
|[BIP3D](configs/bip3d_det_rgb.py) | RGB | DAG | 16.57|23.29|13.84|12.29|2.67|17.85|12.89|19.71|26.76|8.50   | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/det_rgb_dag/model_checkpoint.pth) | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/det_rgb_dag/job.log) |
|[BIP3D](configs/bip3d_det_rgb.py) | RGB | DAT | 16.67|22.41|14.19|13.18|3.32|17.25|14.89|20.80|24.18|9.91  | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/det_rgb_dat/model_checkpoint.pth) | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/det_rgb_dat/job.log) |
|[BIP3D](configs/bip3d_det.py) |RGB-D | DAG | 22.53|28.89|20.51|17.83|6.95|24.21|15.46|24.77|35.29|10.34  | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/det_rgbd_dag/model_checkpoint.pth) | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/det_rgbd_dag/job.log) |
|[BIP3D](configs/bip3d_det.py) |RGB-D | DAT | 23.24|31.51|20.20|17.62|7.31|24.09|15.82|26.35|36.29|11.44   | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/det_rgbd_dat/model_checkpoint.pth) | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/det_rgbd_dat/job.log) |

### 2. Results on Multi-view 3D Grounding Mini Dataset
To train and validate with mini dataset, set [`data_version="v1-mini"`](configs/bip3d_grounding.py#L333).
|Model | Inputs | Op | Overall | Easy | Hard | View-dep | View-indep | ScanNet | 3RScan | MP3D | ckpt | log |
|  :----  | :---: | :---: | :---: | :---: | :---:| :---:|:---:|:---: | :---: | :----: |:---: | :----: |
|[BIP3D](configs/bip3d_grounding_rgb.py) | RGB | DAG | 44.00|44.39|39.56|46.05|42.92|48.62|42.47|36.40  | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_mini_rgb_dag/model_checkpoint.pth) | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_mini_rgb_dag/job.log) |
|[BIP3D](configs/bip3d_grounding_rgb.py) | RGB | DAT | 44.43|44.74|41.02|45.17|44.04|49.70|41.81|37.28  | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_mini_rgb_dat/model_checkpoint.pth) | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_mini_rgb_dat/job.log) |
|[BIP3D](configs/bip3d_grounding.py) | RGB-D | DAG | 45.79|46.22|40.91|45.93|45.71|48.94|46.61|37.36  | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_mini_rgbd_dag/model_checkpoint.pth) | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_mini_rgbd_dag/job.log) |
|[BIP3D](configs/bip3d_grounding.py) | RGB-D | DAT | 58.47|59.02|52.23|60.20|57.56|66.63|54.79|46.72  | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_mini_rgbd_dat/model_checkpoint.pth) | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_mini_rgbd_dat/job.log) |


### 3. Results on Multi-view 3D Grounding Validation Dataset
|Model | Inputs | Op | Mixed Data | Overall | Easy | Hard | View-dep | View-indep | ScanNet | 3RScan | MP3D | ckpt | log |
|  :----  | :---: | :---: | :---: |:---: | :---: | :---:| :---:|:---:|:---: | :---: | :----: |:---: | :----: |
|[BIP3D](configs/bip3d_grounding_rgb.py) | RGB | DAG |No| 45.81|46.21|41.34|47.07|45.09|50.40|47.53|32.97   | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_rgb_dag/model_checkpoint.pth) | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_rgb_dag/job.log) |
|[BIP3D](configs/bip3d_grounding_rgb.py) | RGB | DAT |No| 47.29|47.82|41.42|48.58|46.56|52.74|47.85|34.60   | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_rgb_dat/model_checkpoint.pth) | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_rgb_dat/job.log) |
|[BIP3D](configs/bip3d_grounding.py) | RGB-D | DAG |No| 53.75|53.87|52.43|55.21|52.93|60.05|54.92|38.20   | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_rgbd_dag/model_checkpoint.pth) | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_rgbd_dag/job.log) |
|[BIP3D](configs/bip3d_grounding.py) | RGB-D | DAT |No|61.36|61.88|55.58|62.43|60.76|66.96|62.75|46.92   | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_rgbd_dat/model_checkpoint.pth) | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_rgbd_dat/job.log) |
|[BIP3D](configs/bip3d_det_grounding.py) | RGB-D | DAT |Yes|66.58|66.99|62.07|67.95|65.81|72.43|68.26|51.14   | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_rgbd_dat_mixdata/model_checkpoint.pth) | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_rgbd_dat_mixdata/job.log) |


### 4. [Results on Multi-view 3D Grounding Test Dataset](https://huggingface.co/spaces/AGC2024/visual-grounding-2024)

|Model | Overall | Easy | Hard | View-dep | View-indep | ckpt | log |
|  :----  | :---: | :---: | :---: | :---: | :---:| :---:|:---:|
|[EmbodiedScan](https://github.com/OpenRobotLab/EmbodiedScan) | 39.67 | 40.52 | 30.24 | 39.05 | 39.94 | - | - |
|[SAG3D*](https://opendrivelab.github.io/Challenge%202024/multiview_Mi-Robot.pdf) | 46.92 | 47.72 | 38.03 | 46.31 | 47.18 | - | - |
|[DenseG*](https://opendrivelab.github.io/Challenge%202024/multiview_THU-LenovoAI.pdf) | 59.59 | 60.39 | 50.81 | 60.50 | 59.20 |  - | - |
|[BIP3D](configs/bip3d_det_grounding.py) | 67.38 | 68.12 | 59.08 | 67.88 | 67.16 |  - | - |
|BIP3D-B | 70.53 | 71.22 | 62.91 | 70.69 | 70.47 | - | - |

`*` denotes model ensemble, and note that our BIP3D does not use the ensemble trick. This differs from what is mentioned in the paper and shows significant improvements.

Our best model, BIP3D-B, is based on GroundingDINO-base and is trained with the addition ARKitScenes dataset.


## :page_facing_up: Citation
```
@article{lin2024bip3d,
  title={BIP3D: Bridging 2D Images and 3D Perception for Embodied Intelligence},
  author={Lin, Xuewu and Lin, Tianwei and Huang, Lichao and Xie, Hongyu and Su, Zhizhong},
  journal={arXiv preprint arXiv:2411.14869},
  year={2024}
}
```


## :handshake: Acknowledgement
[EmbodiedScan](https://github.com/OpenRobotLab/EmbodiedScan)

[Sparse4D](https://github.com/HorizonRobotics/Sparse4D)

[3D-deformable-attention](https://github.com/IDEA-Research/3D-deformable-attention)

[mmdet-GroundingDINO](https://github.com/open-mmlab/mmdetection/tree/main/configs/grounding_dino)


================================================
FILE: bip3d/__init__.py
================================================
from .registry import *


================================================
FILE: bip3d/converter/extract_occupancy_ann.py
================================================
import os
import shutil
from argparse import ArgumentParser

from tqdm import tqdm


def extract_occupancy(dataset, src, dst):
    """Extract occupancy annotations of a single dataset to dataset root."""
    print('Processing dataset', dataset)
    scenes = os.listdir(os.path.join(src, dataset))
    dst_dataset = os.path.join(dst, dataset)
    if not os.path.exists(dst_dataset):
        print('Missing dataset:', dataset)
        return
    for scene in tqdm(scenes):
        if dataset == 'scannet':
            dst_scene = os.path.join(dst_dataset, 'scans', scene)
        else:
            dst_scene = os.path.join(dst_dataset, scene)

        if not os.path.exists(dst_scene):
            print(f'Missing scene {scene} in dataset {dataset}')
            continue
        dst_occ = os.path.join(dst_scene, 'occupancy')
        if not os.path.exists(dst_occ):
            shutil.copytree(os.path.join(src, dataset, scene), dst_occ)
        else:
            files = os.listdir(os.path.join(src, dataset, scene))
            for file in files:
                if not os.path.exists(os.path.join(dst_occ, file)):
                    shutil.copyfile(os.path.join(src, dataset, scene, file),
                                    os.path.join(dst_occ, file))


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--src',
                        required=True,
                        help='folder of the occupancy annotations')
    parser.add_argument('--dst',
                        required=True,
                        help='folder of the raw datasets')
    args = parser.parse_args()
    datasets = os.listdir(args.src)
    for dataset in datasets:
        extract_occupancy(dataset, args.src, args.dst)


================================================
FILE: bip3d/converter/generate_image_3rscan.py
================================================
import os
import zipfile
from argparse import ArgumentParser
from functools import partial

import mmengine


def process_scene(path, scene_name):
    """Process single 3Rscan scene."""
    with zipfile.ZipFile(os.path.join(path, scene_name, 'sequence.zip'),
                         'r') as zip_ref:
        zip_ref.extractall(os.path.join(path, scene_name, 'sequence'))


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--dataset_folder',
                        required=True,
                        help='folder of the dataset.')
    parser.add_argument('--nproc', type=int, default=8)
    args = parser.parse_args()

    mmengine.track_parallel_progress(func=partial(process_scene,
                                                  args.dataset_folder),
                                     tasks=os.listdir(args.dataset_folder),
                                     nproc=args.nproc)


================================================
FILE: bip3d/converter/generate_image_matterport3d.py
================================================
import os
import zipfile
from argparse import ArgumentParser
from functools import partial

import mmengine


def process_scene(path, output_folder, scene_name):
    """Process single 3Rscan scene."""
    files = list(os.listdir(os.path.join(path, scene_name)))
    for file in files:
        if not file.endswith(".zip"):
            continue
        if file != "sens.zip":
            continue
        with zipfile.ZipFile(os.path.join(path, scene_name, file),
                             'r') as zip_ref:
            if file == "sens.zip":
                zip_ref.extractall(os.path.join(output_folder, scene_name, file[:-4]))
            else:
                zip_ref.extractall(output_folder)


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--dataset_folder',
                        required=True,
                        help='folder of the dataset.')
    parser.add_argument('--output_folder',
                        required=True,
                        help='output folder of the dataset.')
    parser.add_argument('--nproc', type=int, default=8)
    args = parser.parse_args()

    if not os.path.exists(args.output_folder):
        os.makedirs(args.output_folder, exist_ok=True)

    mmengine.track_parallel_progress(func=partial(process_scene,
                                                  args.dataset_folder, args.output_folder),
                                              tasks=os.listdir(args.dataset_folder),
                                     nproc=args.nproc)


================================================
FILE: bip3d/converter/generate_image_scannet.py
================================================
# Modified from https://github.com/ScanNet/ScanNet/blob/master/SensReader/python/SensorData.py # noqa
import os
import struct
import zlib
from argparse import ArgumentParser
from functools import partial

import imageio
import mmengine
import numpy as np

COMPRESSION_TYPE_COLOR = {-1: 'unknown', 0: 'raw', 1: 'png', 2: 'jpeg'}

COMPRESSION_TYPE_DEPTH = {
    -1: 'unknown',
    0: 'raw_ushort',
    1: 'zlib_ushort',
    2: 'occi_ushort'
}


class RGBDFrame:
    """Class for single ScanNet RGB-D image processing."""

    def load(self, file_handle):
        """Load basic information of a given RGBD frame."""
        self.camera_to_world = np.asarray(struct.unpack(
            'f' * 16, file_handle.read(16 * 4)),
                                          dtype=np.float32).reshape(4, 4)
        self.timestamp_color = struct.unpack('Q', file_handle.read(8))[0]
        self.timestamp_depth = struct.unpack('Q', file_handle.read(8))[0]
        self.color_size_bytes = struct.unpack('Q', file_handle.read(8))[0]
        self.depth_size_bytes = struct.unpack('Q', file_handle.read(8))[0]
        self.color_data = b''.join(
            struct.unpack('c' * self.color_size_bytes,
                          file_handle.read(self.color_size_bytes)))
        self.depth_data = b''.join(
            struct.unpack('c' * self.depth_size_bytes,
                          file_handle.read(self.depth_size_bytes)))

    def decompress_depth(self, compression_type):
        """Decompress the depth data."""
        assert compression_type == 'zlib_ushort'
        return zlib.decompress(self.depth_data)

    def decompress_color(self, compression_type):
        """Decompress the RGB image data."""
        assert compression_type == 'jpeg'
        return imageio.imread(self.color_data)


class SensorData:
    """Class for single ScanNet scene processing.

    Single scene file contains multiple RGB-D images.
    """

    def __init__(self, filename, fast=False):
        self.version = 4
        self.load(filename, fast)

    def load(self, filename, fast):
        """Load a single scene data with multiple RGBD frames."""
        with open(filename, 'rb') as f:
            version = struct.unpack('I', f.read(4))[0]
            assert self.version == version
            strlen = struct.unpack('Q', f.read(8))[0]
            self.sensor_name = b''.join(
                struct.unpack('c' * strlen, f.read(strlen)))
            self.intrinsic_color = np.asarray(struct.unpack(
                'f' * 16, f.read(16 * 4)),
                                              dtype=np.float32).reshape(4, 4)
            self.extrinsic_color = np.asarray(struct.unpack(
                'f' * 16, f.read(16 * 4)),
                                              dtype=np.float32).reshape(4, 4)
            self.intrinsic_depth = np.asarray(struct.unpack(
                'f' * 16, f.read(16 * 4)),
                                              dtype=np.float32).reshape(4, 4)
            self.extrinsic_depth = np.asarray(struct.unpack(
                'f' * 16, f.read(16 * 4)),
                                              dtype=np.float32).reshape(4, 4)
            self.color_compression_type = COMPRESSION_TYPE_COLOR[struct.unpack(
                'i', f.read(4))[0]]
            self.depth_compression_type = COMPRESSION_TYPE_DEPTH[struct.unpack(
                'i', f.read(4))[0]]
            self.color_width = struct.unpack('I', f.read(4))[0]
            self.color_height = struct.unpack('I', f.read(4))[0]
            self.depth_width = struct.unpack('I', f.read(4))[0]
            self.depth_height = struct.unpack('I', f.read(4))[0]
            self.depth_shift = struct.unpack('f', f.read(4))[0]
            num_frames = struct.unpack('Q', f.read(8))[0]
            self.num_frames = num_frames
            self.frames = []
            if fast:
                index = list(range(num_frames))[::10]
            else:
                index = list(range(num_frames))
            self.index = index
            for i in range(num_frames):
                frame = RGBDFrame()
                frame.load(f)
                if i in index:
                    self.frames.append(frame)

    def export_depth_images(self, output_path):
        """Export depth images to the output path."""
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        for f in range(len(self.frames)):
            depth_data = self.frames[f].decompress_depth(
                self.depth_compression_type)
            depth = np.fromstring(depth_data, dtype=np.uint16).reshape(
                self.depth_height, self.depth_width)
            imageio.imwrite(
                os.path.join(output_path,
                             self.index_to_str(self.index[f]) + '.png'), depth)

    def export_color_images(self, output_path):
        """Export RGB images to the output path."""
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        for f in range(len(self.frames)):
            color = self.frames[f].decompress_color(
                self.color_compression_type)
            imageio.imwrite(
                os.path.join(output_path,
                             self.index_to_str(self.index[f]) + '.jpg'), color)

    @staticmethod
    def index_to_str(index):
        """Convert the sample index to string."""
        return str(index).zfill(5)

    @staticmethod
    def save_mat_to_file(matrix, filename):
        """Save a matrix to file."""
        with open(filename, 'w') as f:
            for line in matrix:
                np.savetxt(f, line[np.newaxis], fmt='%f')

    def export_poses(self, output_path):
        """Export camera poses to the output path."""
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        for f in range(len(self.frames)):
            self.save_mat_to_file(
                self.frames[f].camera_to_world,
                os.path.join(output_path,
                             self.index_to_str(self.index[f]) + '.txt'))

    def export_intrinsics(self, output_path):
        """Export the intrinsic matrix to the output path."""
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        self.save_mat_to_file(self.intrinsic_color,
                              os.path.join(output_path, 'intrinsic.txt'))

    def export_depth_intrinsics(self, output_path):
        """Export the depth intrinsic matrix to the output path."""
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        self.save_mat_to_file(self.intrinsic_depth,
                              os.path.join(output_path, 'depth_intrinsic.txt'))


def process_scene(path, fast, idx):
    """Process single ScanNet scene.

    Extract RGB images, poses and camera intrinsics.
    """
    try:
        print(path, fast, idx)
        data = SensorData(os.path.join(path, idx, f'{idx}.sens'), fast)
        # output_path = os.path.join('/bucket/output/robot_lab/users/xuewu.lin/scannet_posed_images', idx)
        output_path = os.path.join('/horizon-bucket/robot_lab/users/xuewu.lin/scannet_posed_images', idx)
        data.export_color_images(output_path)
        data.export_intrinsics(output_path)
        data.export_poses(output_path)
        data.export_depth_images(output_path)
        data.export_depth_intrinsics(output_path)
    except Exception as e:
        print("bug info: ", path, fast, idx)
        raise e


def process_directory(path, fast, nproc):
    """Process the files in a directory with parallel support."""
    mmengine.track_parallel_progress(func=partial(process_scene, path, fast),
                                     # tasks=['scene0175_00'],
                                     tasks=['scene0288_01'],
                                     # tasks=['scene0288_01', 'scene0175_00'],
                                     # tasks=os.listdir(path),
                                     nproc=nproc)


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--dataset_folder',
                        default=None,
                        help='folder of the dataset.')
    parser.add_argument('--nproc', type=int, default=8)
    parser.add_argument('--fast', action='store_true')
    args = parser.parse_args()

    if args.dataset_folder is not None:
        os.chdir(args.dataset_folder)

    # process train and val scenes
    if os.path.exists('scans'):
        process_directory('scans', args.fast, args.nproc)


================================================
FILE: bip3d/datasets/__init__.py
================================================
from .embodiedscan_det_grounding_dataset import EmbodiedScanDetGroundingDataset
from .transforms import *  # noqa: F401,F403

__all__ = [
    'EmbodiedScanDetGroundingDataset'
]


================================================
FILE: bip3d/datasets/embodiedscan_det_grounding_dataset.py
================================================
import math
import pickle
import copy
import tqdm
import os
import warnings
from typing import Callable, List, Optional, Union

import mmengine
import numpy as np
from mmengine.dataset import BaseDataset, force_full_init
from mmengine.fileio import load
from mmengine.logging import print_log

from bip3d.registry import DATASETS
from bip3d.structures import get_box_type
from .utils import sample

class_names = (
    'adhesive tape', 'air conditioner', 'alarm', 'album', 'arch', 'backpack',
    'bag', 'balcony', 'ball', 'banister', 'bar', 'barricade', 'baseboard',
    'basin', 'basket', 'bathtub', 'beam', 'beanbag', 'bed', 'bench', 'bicycle',
    'bidet', 'bin', 'blackboard', 'blanket', 'blinds', 'board', 'body loofah',
    'book', 'boots', 'bottle', 'bowl', 'box', 'bread', 'broom', 'brush',
    'bucket', 'cabinet', 'calendar', 'camera', 'can', 'candle', 'candlestick',
    'cap', 'car', 'carpet', 'cart', 'case', 'chair', 'chandelier', 'cleanser',
    'clock', 'clothes', 'clothes dryer', 'coat hanger', 'coffee maker', 'coil',
    'column', 'commode', 'computer', 'conducting wire', 'container', 'control',
    'copier', 'cosmetics', 'couch', 'counter', 'countertop', 'crate', 'crib',
    'cube', 'cup', 'curtain', 'cushion', 'decoration', 'desk', 'detergent',
    'device', 'dish rack', 'dishwasher', 'dispenser', 'divider', 'door',
    'door knob', 'doorframe', 'doorway', 'drawer', 'dress', 'dresser', 'drum',
    'duct', 'dumbbell', 'dustpan', 'dvd', 'eraser', 'excercise equipment',
    'fan', 'faucet', 'fence', 'file', 'fire extinguisher', 'fireplace',
    'flowerpot', 'flush', 'folder', 'food', 'footstool', 'frame', 'fruit',
    'furniture', 'garage door', 'garbage', 'glass', 'globe', 'glove',
    'grab bar', 'grass', 'guitar', 'hair dryer', 'hamper', 'handle', 'hanger',
    'hat', 'headboard', 'headphones', 'heater', 'helmets', 'holder', 'hook',
    'humidifier', 'ironware', 'jacket', 'jalousie', 'jar', 'kettle',
    'keyboard', 'kitchen island', 'kitchenware', 'knife', 'label', 'ladder',
    'lamp', 'laptop', 'ledge', 'letter', 'light', 'luggage', 'machine',
    'magazine', 'mailbox', 'map', 'mask', 'mat', 'mattress', 'menu',
    'microwave', 'mirror', 'molding', 'monitor', 'mop', 'mouse', 'napkins',
    'notebook', 'ottoman', 'oven', 'pack', 'package', 'pad', 'pan', 'panel',
    'paper', 'paper cutter', 'partition', 'pedestal', 'pen', 'person', 'piano',
    'picture', 'pillar', 'pillow', 'pipe', 'pitcher', 'plant', 'plate',
    'player', 'plug', 'plunger', 'pool', 'pool table', 'poster', 'pot',
    'price tag', 'printer', 'projector', 'purse', 'rack', 'radiator', 'radio',
    'rail', 'range hood', 'refrigerator', 'remote control', 'ridge', 'rod',
    'roll', 'roof', 'rope', 'sack', 'salt', 'scale', 'scissors', 'screen',
    'seasoning', 'shampoo', 'sheet', 'shelf', 'shirt', 'shoe', 'shovel',
    'shower', 'sign', 'sink', 'soap', 'soap dish', 'soap dispenser', 'socket',
    'speaker', 'sponge', 'spoon', 'stairs', 'stall', 'stand', 'stapler',
    'statue', 'steps', 'stick', 'stool', 'stopcock', 'stove', 'structure',
    'sunglasses', 'support', 'switch', 'table', 'tablet', 'teapot',
    'telephone', 'thermostat', 'tissue', 'tissue box', 'toaster', 'toilet',
    'toilet paper', 'toiletry', 'tool', 'toothbrush', 'toothpaste', 'towel',
    'toy', 'tray', 'treadmill', 'trophy', 'tube', 'tv', 'umbrella', 'urn',
    'utensil', 'vacuum cleaner', 'vanity', 'vase', 'vent', 'ventilation',
    'wardrobe', 'washbasin', 'washing machine', 'water cooler', 'water heater',
    'window', 'window frame', 'windowsill', 'wine', 'wire', 'wood', 'wrap')
head_labels = [
    48, 177, 82, 179, 37, 243, 28, 277, 32, 84, 215, 145, 182, 170, 22, 72, 30,
    141, 65, 257, 221, 225, 52, 75, 231, 158, 236, 156, 47, 74, 6, 18, 71, 242,
    217, 251, 66, 263, 5, 45, 14, 73, 278, 198, 24, 23, 196, 252, 19, 135, 26,
    229, 183, 200, 107, 272, 246, 269, 125, 59, 279, 15, 163, 258, 57, 195, 51,
    88, 97, 58, 102, 36, 137, 31, 80, 160, 155, 61, 238, 96, 190, 25, 219, 152,
    142, 201, 274, 249, 178, 192
]
common_labels = [
    189, 164, 101, 205, 273, 233, 131, 180, 86, 220, 67, 268, 224, 270, 53,
    203, 237, 226, 10, 133, 248, 41, 55, 16, 199, 134, 99, 185, 2, 20, 234,
    194, 253, 35, 174, 8, 223, 13, 91, 262, 230, 121, 49, 63, 119, 162, 79,
    168, 245, 267, 122, 104, 100, 1, 176, 280, 140, 209, 259, 143, 165, 147,
    117, 85, 105, 95, 109, 207, 68, 175, 106, 60, 4, 46, 171, 204, 111, 211,
    108, 120, 157, 222, 17, 264, 151, 98, 38, 261, 123, 78, 118, 127, 240, 124
]
tail_labels = [
    76, 149, 173, 250, 275, 255, 34, 77, 266, 283, 112, 115, 186, 136, 256, 40,
    254, 172, 9, 212, 213, 181, 154, 94, 191, 193, 3, 130, 146, 70, 128, 167,
    126, 81, 7, 11, 148, 228, 239, 247, 21, 42, 89, 153, 161, 244, 110, 0, 29,
    114, 132, 159, 218, 232, 260, 56, 92, 116, 282, 33, 113, 138, 12, 188, 44,
    150, 197, 271, 169, 206, 90, 235, 103, 281, 184, 208, 216, 202, 214, 241,
    129, 210, 276, 64, 27, 87, 139, 227, 187, 62, 43, 50, 69, 93, 144, 166,
    265, 54, 83, 39
]


@DATASETS.register_module()
class EmbodiedScanDetGroundingDataset(BaseDataset):
    def __init__(
        self,
        data_root: str,
        ann_file: str,
        vg_file=None,
        metainfo: Optional[dict] = None,
        pipeline: List[Union[dict, Callable]] = [],
        test_mode: bool = False,
        load_eval_anns: bool = True,
        filter_empty_gt: bool = True,
        remove_dontcare: bool = False,
        box_type_3d: str = "Euler-Depth",
        dataset_length=None,
        mode="detection",
        max_n_images=50,
        n_images_per_sample=1,
        drop_last_per_scene=False,
        part=None,
        temporal=False,
        num_text=1,
        tokens_positive_rebuild=True,
        sep_token="[SEP]",
        **kwargs,
    ):
        self.box_type_3d, self.box_mode_3d = get_box_type(box_type_3d)
        self.filter_empty_gt = filter_empty_gt
        self.remove_dontcare = remove_dontcare
        self.load_eval_anns = load_eval_anns
        self.dataset_length = dataset_length
        self.part = part
        self.mode = mode
        assert self.mode in ["detection", "continuous", "grounding"]
        super().__init__(
            ann_file=ann_file,
            metainfo=metainfo,
            data_root=data_root,
            pipeline=pipeline,
            test_mode=test_mode,
            serialize_data=self.mode == "detection",
            **kwargs,
        )
        if self.mode == "continuous":
            self.max_n_images = max_n_images
            self.n_images_per_sample = n_images_per_sample
            self.drop_last_per_scene = drop_last_per_scene
            self.convert_to_continuous()
        elif self.mode == "grounding":
            self.vg_file = vg_file
            self.num_text = num_text
            self.tokens_positive_rebuild = tokens_positive_rebuild
            self.sep_token = sep_token
            self.load_language_data()
            self.data_bytes, self.data_address = self._serialize_data()
            self.serialize_data = True
        print_log(f"dataset length : {self.__len__()}")

    def process_metainfo(self):
        assert "categories" in self._metainfo

        if "classes" not in self._metainfo:
            self._metainfo.setdefault(
                "classes", list(self._metainfo["categories"].keys())
            )

        self.label_mapping = np.full(
            max(list(self._metainfo["categories"].values())) + 1, -1, dtype=int
        )
        for key, value in self._metainfo["categories"].items():
            if key in self._metainfo["classes"]:
                self.label_mapping[value] = self._metainfo["classes"].index(
                    key
                )

    def parse_data_info(self, info: dict):
        info["box_type_3d"] = self.box_type_3d
        info["axis_align_matrix"] = self._get_axis_align_matrix(info)
        info["scan_id"] = info["sample_idx"]
        ann_dataset = info["sample_idx"].split("/")[0]
        if ann_dataset == "matterport3d":
            info["depth_shift"] = 4000.0
        else:
            info["depth_shift"] = 1000.0
        # Because multi-view settings are different from original designs
        # we temporarily follow the ori design in ImVoxelNet
        info["img_path"] = []
        info["depth_img_path"] = []
        if "cam2img" in info:
            cam2img = info["cam2img"].astype(np.float32)
        else:
            cam2img = []

        extrinsics = []
        for i in range(len(info["images"])):
            img_path = os.path.join(
                self.data_prefix.get("img_path", ""),
                info["images"][i]["img_path"],
            )
            depth_img_path = os.path.join(
                self.data_prefix.get("img_path", ""),
                info["images"][i]["depth_path"],
            )

            info["img_path"].append(img_path)
            info["depth_img_path"].append(depth_img_path)
            align_global2cam = np.linalg.inv(
                info["axis_align_matrix"] @ info["images"][i]["cam2global"]
            )
            extrinsics.append(align_global2cam.astype(np.float32))
            if "cam2img" not in info:
                cam2img.append(info["images"][i]["cam2img"].astype(np.float32))

        info["depth2img"] = dict(
            extrinsic=extrinsics,
            intrinsic=cam2img,
            origin=np.array([0.0, 0.0, 0.5]).astype(np.float32),
        )

        if "depth_cam2img" not in info:
            info["depth_cam2img"] = cam2img

        if not self.test_mode:
            info["ann_info"] = self.parse_ann_info(info)

        if self.test_mode and self.load_eval_anns:
            info["ann_info"] = self.parse_ann_info(info)
            info["eval_ann_info"] = self._remove_dontcare(info["ann_info"])

        return info

    def parse_ann_info(self, info: dict):
        """Process the `instances` in data info to `ann_info`.

        Args:
            info (dict): Info dict.

        Returns:
            dict: Processed `ann_info`.
        """

        ann_info = None
        if "instances" in info and len(info["instances"]) > 0:
            ann_info = dict(
                gt_bboxes_3d=np.zeros(
                    (len(info["instances"]), 9), dtype=np.float32
                ),
                gt_labels_3d=np.zeros(
                    (len(info["instances"]),), dtype=np.int64
                ),
                gt_names=[],
                bbox_id=np.zeros((len(info["instances"]),), dtype=np.int64) - 1,
            )
            for idx, instance in enumerate(info["instances"]):
                ann_info["gt_bboxes_3d"][idx] = instance["bbox_3d"]
                ann_info["gt_labels_3d"][idx] = self.label_mapping[
                    instance["bbox_label_3d"]
                ]
                ann_info["gt_names"].append(
                    self._metainfo["classes"][ann_info["gt_labels_3d"][idx]]
                    if ann_info["gt_labels_3d"][idx] >= 0
                    else "others"
                )
                ann_info["bbox_id"][idx] = instance["bbox_id"]

        # pack ann_info for return
        if ann_info is None:
            ann_info = dict()
            ann_info["gt_bboxes_3d"] = np.zeros((0, 9), dtype=np.float32)
            ann_info["gt_labels_3d"] = np.zeros((0,), dtype=np.int64)
            ann_info["bbox_id"] = np.zeros((0,), dtype=np.int64) - 1
            ann_info["gt_names"] = []

        # post-processing/filtering ann_info if not empty gt
        if "visible_instance_ids" in info["images"][0]:
            ids = []
            for i in range(len(info["images"])):
                ids.append(info["images"][i]["visible_instance_ids"])
            mask_length = ann_info["gt_labels_3d"].shape[0]
            ann_info["visible_instance_masks"] = self._ids2masks(
                ids, mask_length
            )

        if self.remove_dontcare:
            ann_info = self._remove_dontcare(ann_info)

        ann_dataset = info["sample_idx"].split("/")[0]
        ann_info["gt_bboxes_3d"] = self.box_type_3d(
            ann_info["gt_bboxes_3d"],
            box_dim=ann_info["gt_bboxes_3d"].shape[-1],
            with_yaw=True,
            origin=(0.5, 0.5, 0.5),
        )
        return ann_info

    @staticmethod
    def _get_axis_align_matrix(info: dict):
        """Get axis_align_matrix from info. If not exist, return identity mat.

        Args:
            info (dict): Info of a single sample data.

        Returns:
            np.ndarray: 4x4 transformation matrix.
        """
        if "axis_align_matrix" in info:
            return np.array(info["axis_align_matrix"])
        else:
            warnings.warn(
                "axis_align_matrix is not found in ScanNet data info, please "
                "use new pre-process scripts to re-generate ScanNet data"
            )
            return np.eye(4).astype(np.float32)

    def _ids2masks(self, ids, mask_length):
        """Change visible_instance_ids to visible_instance_masks."""
        masks = []
        for idx in range(len(ids)):
            mask = np.zeros((mask_length,), dtype=bool)
            mask[ids[idx]] = 1
            masks.append(mask)
        return masks

    def _remove_dontcare(self, ann_info: dict):
        """Remove annotations that do not need to be cared.

        -1 indicates dontcare in MMDet3d.

        Args:
            ann_info (dict): Dict of annotation infos. The
                instance with label `-1` will be removed.

        Returns:
            dict: Annotations after filtering.
        """
        img_filtered_annotations = {}
        filter_mask = ann_info["gt_labels_3d"] > -1
        for key in ann_info.keys():
            if key == "instances":
                img_filtered_annotations[key] = ann_info[key]
            elif key == "visible_instance_masks":
                img_filtered_annotations[key] = []
                for idx in range(len(ann_info[key])):
                    img_filtered_annotations[key].append(
                        ann_info[key][idx][filter_mask]
                    )
            elif key == "gt_names":
                img_filtered_annotations[key] = [
                    x for i, x in enumerate(ann_info[key]) if filter_mask[i]
                ]
            else:
                img_filtered_annotations[key] = ann_info[key][filter_mask]
        return img_filtered_annotations

    def load_data_list(self):
        annotations = load(self.ann_file)
        if not isinstance(annotations, dict):
            raise TypeError(
                f"The annotations loaded from annotation file "
                f"should be a dict, but got {type(annotations)}!"
            )
        if "data_list" not in annotations or "metainfo" not in annotations:
            raise ValueError(
                "Annotation must have data_list and metainfo " "keys"
            )
        metainfo = annotations["metainfo"]
        raw_data_list = annotations["data_list"]

        # Meta information load from annotation file will not influence the
        # existed meta information load from `BaseDataset.METAINFO` and
        # `metainfo` arguments defined in constructor.
        for k, v in metainfo.items():
            self._metainfo.setdefault(k, v)

        self.process_metainfo()

        # load and parse data_infos.
        data_list = []
        for raw_data_info in tqdm.tqdm(
            raw_data_list,
            mininterval=10,
            desc=f"Loading {'Test' if self.test_mode else 'Train'} dataset",
        ):
            if self.part is not None:
                valid = False
                for x in self.part:
                    if x in raw_data_info["sample_idx"]:
                        valid = True
                        break
                if not valid:
                    continue

            data_info = self.parse_data_info(raw_data_info)
            if data_info is None:
                continue
            assert isinstance(data_info, dict)
            data_list.append(data_info)

            if (
                self.dataset_length is not None
                and len(data_list) >= self.dataset_length
            ):
                break
        return data_list

    @staticmethod
    def _get_axis_align_matrix(info: dict):
        if 'axis_align_matrix' in info:
            return np.array(info['axis_align_matrix'])
        else:
            warnings.warn(
                'axis_align_matrix is not found in ScanNet data info, please '
                'use new pre-process scripts to re-generate ScanNet data')
            return np.eye(4).astype(np.float32)

    @staticmethod
    def _is_view_dep(text):
        """Check whether to augment based on sr3d utterance."""
        rels = [
            'front', 'behind', 'back', 'left', 'right', 'facing', 'leftmost',
            'rightmost', 'looking', 'across'
        ]
        words = set(text.split())
        return any(rel in words for rel in rels)

    def convert_info_to_scan(self):
        self.scans = dict()
        for data in self.data_list:
            scan_id = data['scan_id']
            self.scans[scan_id] = data
        self.scan_ids = list(self.scans.keys())

    def load_language_data(self):
        self.convert_info_to_scan()
        if isinstance(self.vg_file, str):
            language_annotations = load(os.path.join(self.data_root, self.vg_file))
        else:
            language_annotations = []
            for x in self.vg_file:
                language_annotations.extend(load(os.path.join(self.data_root, x)))
        if self.dataset_length is not None:
            interval = len(language_annotations) / self.dataset_length
            output = []
            for i in range(self.dataset_length):
                output.append(ids[int(interval*i)])
            language_annotations = output
        self.data_list = language_annotations
        self.scan_id_to_data_idx = {}
        for scan_id in self.scan_ids:
            self.scan_id_to_data_idx[scan_id] = []
        for i, d in enumerate(self.data_list):
            self.scan_id_to_data_idx[d["scan_id"]].append(i)

    def convert_to_continuous(self):
        self.convert_info_to_scan()
        data_list = []
        self.flag = []
        self.image_id_dict = {}
        for flag, scan_id in enumerate(self.scan_ids):
            total_n = len(self.scans[scan_id]["images"])
            sample_n = min(self.max_n_images, total_n)
            ids = sample(total_n, sample_n, True).tolist()
            self.image_id_dict[scan_id] = ids
            if self.n_images_per_sample > 1:
                if self.drop_last_per_scene:
                    sample_n = math.floor(sample_n / self.n_images_per_sample)
                else:
                    sample_n = math.ceil(sample_n / self.n_images_per_sample)

                ids = [
                    ids[i*self.n_images_per_sample : (i+1)*self.n_images_per_sample]
                    for i in range(sample_n)
                ]
            data_list.extend(
                [dict(scan_id=scan_id, image_id=i) for i in ids]
            )
            self.flag.extend([flag] * len(ids))
        self.data_list = data_list
        self.flag = np.array(self.flag)

    def get_data_info_continuous(self, data_info):
        scan_id = data_info["scan_id"]
        data = copy.deepcopy(self.scans[scan_id])
        img_idx = data_info["image_id"]
        if isinstance(img_idx, int):
            img_idx = [img_idx]
        for k in ["images", "img_path", "depth_img_path"]:
            data[k] = index(data[k], img_idx)

        seen_image_id = self.image_id_dict[scan_id]
        seen_image_id = seen_image_id[:seen_image_id.index(img_idx[0])]

        if len(seen_image_id) != 0:
            last_visible_mask = index(
                data["ann_info"]["visible_instance_masks"], seen_image_id
            )
            last_visible_mask = np.stack(last_visible_mask).any(axis=0)
        else:
            last_visible_mask = data["ann_info"]["visible_instance_masks"][0] * False

        visible_instance_masks = index(
            data["ann_info"]["visible_instance_masks"], img_idx
        )

        current_visible_mask = np.stack(visible_instance_masks).any(axis=0)
        ignore_mask = np.logical_and(
            last_visible_mask, ~current_visible_mask
        )
        all_visible_mask = np.logical_or(
            last_visible_mask, current_visible_mask,
        )
        data["visible_instance_masks"] = visible_instance_masks
        data["all_visible_mask"] = all_visible_mask
        data["ignore_mask"] = ignore_mask

        data["depth2img"]["extrinsic"] = index(
            data["depth2img"]["extrinsic"], img_idx
        )
        if isinstance(data["depth2img"]["intrinsic"], list):
            data["depth2img"]["intrinsic"] = index(
                data["depth2img"]["intrinsic"], img_idx
            )
        return data

    def merge_grounding_data(self, data_infos):
        output = dict(
            text="",
        )
        for key in ["target_id", "distractor_ids", "target", "anchors", "anchor_ids", "tokens_positive"]:
            if key in data_infos[0]:
                output[key] = []
        for data_info in data_infos:
            if "target_id" in data_info and data_info["target_id"] in output["target_id"]:
                continue

            if self.tokens_positive_rebuild and "target" in data_info:
                start_idx = data_info["text"].find(data_info["target"])
                end_idx = start_idx + len(data_info["target"])
                tokens_positive = [[start_idx, end_idx]]
            elif "tokens_positive" in data_info:
                tokens_positive = data_info["tokens_positive"]
            else:
                tokens_positive = None

            if len(output["text"]) == 0:
                output["text"] = data_info["text"]
            else:
                if tokens_positive is not None:
                    tokens_positive = np.array(tokens_positive)
                    tokens_positive += len(output["text"]) + len(self.sep_token)
                    tokens_positive = tokens_positive.tolist()
                output["text"] += self.sep_token + data_info["text"]
            if tokens_positive is not None:
                output["tokens_positive"].append(tokens_positive)
            for k in ["target_id", "distractor_ids", "target", "anchors", "anchor_ids"]:
                if k not in data_info:
                    continue
                output[k].append(data_info[k])
        output["scan_id"] = data_infos[0]["scan_id"]
        return output

    def get_data_info_grounding(self, data_info):

        flags = {}
        if "distractor_ids" in data_info:
            flags["is_unique"] = len(data_info["distractor_ids"]) == 0
            flags["is_hard"] = len(data_info["distractor_ids"]) > 3
        if "text" in data_info:
            flags["is_view_dep"] = self._is_view_dep(data_info["text"])

        scan_id = data_info["scan_id"]
        scan_data = copy.deepcopy(self.scans[scan_id])
        data_info = [data_info]
        if self.num_text > 1:
            data_idx = self.scan_id_to_data_idx[scan_id]
            sample_idx = sample(
                len(data_idx),
                max(min(int(np.random.rand()*self.num_text), len(data_idx))-1, 1),
                fix_interval=False
            )
            for i in sample_idx:
                data_info.append(super().get_data_info(data_idx[i]))
        data_info = self.merge_grounding_data(data_info)
        scan_data["text"] = data_info["text"]

        if "ann_info" in scan_data and "target" in data_info:
            tokens_positive = []
            obj_idx = []
            for i, (target_name, id) in enumerate(
                zip(data_info["target"], data_info["target_id"])
            ):
                mask = np.logical_and(
                    scan_data["ann_info"]["bbox_id"] == id,
                    np.array(scan_data["ann_info"]["gt_names"]) == target_name
                )
                if np.sum(mask) != 1:
                    continue
                obj_idx.append(np.where(mask)[0][0])
                tokens_positive.append(data_info["tokens_positive"][i])
            obj_idx = np.array(obj_idx, dtype=np.int32)
            scan_data["ann_info"]["gt_bboxes_3d"] = scan_data["ann_info"]["gt_bboxes_3d"][obj_idx]
            scan_data["ann_info"]["gt_labels_3d"] = scan_data["ann_info"]["gt_labels_3d"][obj_idx]
            scan_data["ann_info"]["gt_names"] = [
                scan_data["ann_info"]["gt_names"][i] for i in obj_idx
            ]
            if "visible_instance_masks" in scan_data["ann_info"]:
                scan_data["ann_info"]["visible_instance_masks"] = [
                    visible_instance_mask[obj_idx]
                    for visible_instance_mask in scan_data["ann_info"]["visible_instance_masks"]
                ]
            scan_data["tokens_positive"] = tokens_positive
            scan_data["eval_ann_info"] = scan_data["ann_info"]
            scan_data["eval_ann_info"].update(flags)
        elif "tokens_positive" in data_info:
            scan_data["tokens_positive"] = data_info.get("tokens_positive")
        return scan_data

    @force_full_init
    def get_data_info(self, idx):
        data_info = super().get_data_info(idx)
        if self.mode == "detection":
            return data_info
        elif self.mode == "continuous":
            return self.get_data_info_continuous(data_info)
        elif self.mode == "grounding":
            return self.get_data_info_grounding(data_info)


def index(input, idx):
    if isinstance(idx, int):
        idx = [idx]
    output = []
    for i in idx:
        output.append(input[i])
    return output


================================================
FILE: bip3d/datasets/transforms/__init__.py
================================================
from .augmentation import (
    GlobalRotScaleTrans,
    RandomFlip3D,
    ResizeCropFlipImage,
)
from .formatting import Pack3DDetInputs
from .loading import LoadAnnotations3D, LoadDepthFromFile
from .multiview import MultiViewPipeline

from .transform import (
    CategoryGroundingDataPrepare,
    CamIntrisicStandardization,
    CustomResize,
    DepthProbLabelGenerator,
)


================================================
FILE: bip3d/datasets/transforms/augmentation.py
================================================
from typing import List, Union

import cv2
import numpy as np
from mmcv.transforms import BaseTransform
from mmdet.datasets.transforms import RandomFlip

from bip3d.registry import TRANSFORMS


@TRANSFORMS.register_module()
class RandomFlip3D(RandomFlip):
    """Flip the points & bbox.

    If the input dict contains the key "flip", then the flag will be used,
    otherwise it will be randomly decided by a ratio specified in the init
    method.

    Required Keys:

    - points (np.float32)
    - gt_bboxes_3d (np.float32)

    Modified Keys:

    - points (np.float32)
    - gt_bboxes_3d (np.float32)

    Added Keys:

    - points (np.float32)
    - pcd_trans (np.float32)
    - pcd_rotation (np.float32)
    - pcd_rotation_angle (np.float32)
    - pcd_scale_factor (np.float32)

    Args:
        sync_2d (bool): Whether to apply flip according to the 2D
            images. If True, it will apply the same flip as that to 2D images.
            If False, it will decide whether to flip randomly and independently
            to that of 2D images. Defaults to True.
        flip_2d (bool): Whether to apply flip for the img data.
            If True, it will adopt the flip augmentation for the img.
            False occurs on bev augmentation for bev-based image 3d det.
            Defaults to True.
        flip_3d (bool): Whether to apply flip for the 3d point cloud data.
            If True, it will adopt the flip augmentation for the point cloud.
            Defaults to True.
        flip_ratio_bev_horizontal (float): The flipping probability
            in horizontal direction. Defaults to 0.0.
        flip_ratio_bev_vertical (float): The flipping probability
            in vertical direction. Defaults to 0.0.
        flip_box3d (bool): Whether to flip bounding box. In most of the case,
            the box should be fliped. In cam-based bev detection, this is set
            to False, since the flip of 2D images does not influence the 3D
            box. Defaults to True.
    """

    def __init__(self,
                 sync_2d: bool = True,
                 flip_2d: bool = True,
                 flip_3d: bool = True,
                 flip_ratio_bev_horizontal: float = 0.0,
                 flip_ratio_bev_vertical: float = 0.0,
                 flip_box3d: bool = True,
                 update_lidar2cam: bool = False,
                 **kwargs) -> None:
        # `flip_ratio_bev_horizontal` is equal to
        # for flip prob of 2d image when
        # `sync_2d` is True
        super(RandomFlip3D, self).__init__(prob=flip_ratio_bev_horizontal,
                                           direction='horizontal',
                                           **kwargs)
        self.sync_2d = sync_2d
        self.flip_2d = flip_2d
        self.flip_3d = flip_3d
        self.flip_ratio_bev_horizontal = flip_ratio_bev_horizontal
        self.flip_ratio_bev_vertical = flip_ratio_bev_vertical
        self.flip_box3d = flip_box3d
        self.update_lidar2cam = update_lidar2cam
        if flip_ratio_bev_horizontal is not None:
            assert isinstance(flip_ratio_bev_horizontal, (int, float)) \
                    and 0 <= flip_ratio_bev_horizontal <= 1
        if flip_ratio_bev_vertical is not None:
            assert isinstance(flip_ratio_bev_vertical, (int, float)) \
                    and 0 <= flip_ratio_bev_vertical <= 1

    def transform(self, input_dict: dict) -> dict:
        """Call function to flip points, values in the ``bbox3d_fields`` and
        also flip 2D image and its annotations.

        Args:
            input_dict (dict): Result dict from loading pipeline.

        Returns:
            dict: Flipped results, 'flip', 'flip_direction',
            'pcd_horizontal_flip' and 'pcd_vertical_flip' keys are added
            into result dict.
        """
        # flip 2D image and its annotations
        if self.flip_2d:
            # only handle the 2D image
            if 'img' in input_dict:
                super(RandomFlip3D, self).transform(input_dict)
            flip = input_dict.get('flip', False)
            if flip:
                input_dict = self.random_flip_data_2d(input_dict)

        if self.flip_3d:
            # only handle the 3D points
            if self.sync_2d and 'img' in input_dict:
                # TODO check if this is necessary in FOCS3D
                input_dict['pcd_horizontal_flip'] = input_dict['flip']
                input_dict['pcd_vertical_flip'] = False
            else:
                if 'pcd_horizontal_flip' not in input_dict:
                    if np.random.rand() < self.flip_ratio_bev_horizontal:
                        flip_horizontal = True
                    else:
                        flip_horizontal = False
                    input_dict['pcd_horizontal_flip'] = flip_horizontal
                if 'pcd_vertical_flip' not in input_dict:
                    if np.random.rand() < self.flip_ratio_bev_vertical:
                        flip_vertical = True
                    else:
                        flip_vertical = False
                    input_dict['pcd_vertical_flip'] = flip_vertical

            if 'transformation_3d_flow' not in input_dict:
                input_dict['transformation_3d_flow'] = []

            if input_dict['pcd_horizontal_flip']:
                self.random_flip_data_3d(input_dict, 'horizontal')
                input_dict['transformation_3d_flow'].extend(['HF'])
            if input_dict['pcd_vertical_flip']:
                self.random_flip_data_3d(input_dict, 'vertical')
                input_dict['transformation_3d_flow'].extend(['VF'])
            if self.update_lidar2cam:
                self._transform_lidar2cam(input_dict)
        return input_dict

    def random_flip_data_3d(self,
                            input_dict: dict,
                            direction: str = 'horizontal') -> None:
        """Flip 3D data randomly.

        `random_flip_data_3d` should take these situations into consideration:

        - 1. LIDAR-based 3d detection
        - 2. LIDAR-based 3d segmentation
        - 3. vision-only detection
        - 4. multi-modality 3d detection.

        Args:
            input_dict (dict): Result dict from loading pipeline.
            direction (str): Flip direction. Defaults to 'horizontal'.

        Returns:
            dict: Flipped results, 'points', 'bbox3d_fields' keys are
            updated in the result dict.
        """
        assert direction in ['horizontal', 'vertical']
        if self.flip_box3d:
            if 'gt_bboxes_3d' in input_dict:
                if 'points' in input_dict:
                    input_dict['points'] = input_dict['gt_bboxes_3d'].flip(
                        direction, points=input_dict['points'])
                else:
                    # vision-only detection
                    input_dict['gt_bboxes_3d'].flip(direction)
            else:
                input_dict['points'].flip(direction)

    def random_flip_data_2d(self,
                            input_dict: dict,
                            direction: str = 'horizontal') -> dict:
        if 'centers_2d' in input_dict:
            assert self.sync_2d is True and direction == 'horizontal', \
                'Only support sync_2d=True and horizontal flip with images'
            w = input_dict['img_shape'][1]
            input_dict['centers_2d'][..., 0] = \
                w - input_dict['centers_2d'][..., 0]
            # need to modify the horizontal position of camera center
            # along u-axis in the image (flip like centers2d)
            # ['cam2img'][0][2] = c_u
            # see more details and examples at
            # https://github.com/open-mmlab/mmdetection3d/pull/744
            input_dict['cam2img'][0][2] = w - input_dict['cam2img'][0][2]

        if 'fov_ori2aug' not in input_dict:
            fov_ori2aug = np.eye(4, 4)
        else:
            fov_ori2aug = input_dict['fov_ori2aug']
        # get the value of w
        w = input_dict['img_shape'][1]
        # flip_matrix[0,0] = -1
        # flip_matrix[0,3] = w
        # fov_ori2aug = np.matmul(fov_ori2aug, flip_matrix)
        fov_ori2aug[0] *= -1
        fov_ori2aug[0, 3] += w
        input_dict['fov_ori2aug'] = fov_ori2aug
        return input_dict

    def _flip_on_direction(self, results: dict) -> None:
        """Function to flip images, bounding boxes, semantic segmentation map
        and keypoints.

        Add the override feature that if 'flip' is already in results, use it
        to do the augmentation.
        """
        if 'flip' not in results:
            cur_dir = self._choose_direction()
        else:
            # `flip_direction` works only when `flip` is True.
            # For example, in `MultiScaleFlipAug3D`, `flip_direction` is
            # 'horizontal' but `flip` is False.
            if results['flip']:
                assert 'flip_direction' in results, 'flip and flip_direction '
                'must exist simultaneously'
                cur_dir = results['flip_direction']
            else:
                cur_dir = None
        if cur_dir is None:
            results['flip'] = False
            results['flip_direction'] = None
        else:
            results['flip'] = True
            results['flip_direction'] = cur_dir
            self._flip(results)

    def _transform_lidar2cam(self, results: dict) -> None:
        """TODO."""
        aug_matrix = np.eye(4)
        if results.get('pcd_horizontal_flip', False):
            aug_matrix[1, 1] *= -1
        if results.get('pcd_vertical_flip', False):
            aug_matrix[0, 0] *= -1
        lidar2cam_list = []
        for lidar2cam in results['lidar2cam']:
            lidar2cam = np.array(lidar2cam)
            lidar2cam = np.matmul(lidar2cam, aug_matrix)
            lidar2cam_list.append(lidar2cam.tolist())
        results['lidar2cam'] = lidar2cam_list

    def __repr__(self) -> str:
        """str: Return a string that describes the module."""
        repr_str = self.__class__.__name__
        repr_str += f'(sync_2d={self.sync_2d},'
        repr_str += f' flip_ratio_bev_vertical={self.flip_ratio_bev_vertical})'
        return repr_str


@TRANSFORMS.register_module()
class GlobalRotScaleTrans(BaseTransform):
    """Apply global rotation, scaling and translation to a 3D scene.

    Required Keys:

    - points (np.float32)
    - gt_bboxes_3d (np.float32)

    Modified Keys:

    - points (np.float32)
    - gt_bboxes_3d (np.float32)

    Added Keys:

    - points (np.float32)
    - pcd_trans (np.float32)
    - pcd_rotation (np.float32)
    - pcd_rotation_angle (np.float32)
    - pcd_scale_factor (np.float32)

    Args:
        rot_range (list[float]): Range of rotation angle.
            Defaults to [-0.78539816, 0.78539816] (close to [-pi/4, pi/4]).
        rot_dof (int): DoF of rotation noise. Defaults to 1.
        scale_ratio_range (list[float]): Range of scale ratio.
            Defaults to [0.95, 1.05].
        translation_std (list[float]): The standard deviation of
            translation noise applied to a scene, which
            is sampled from a gaussian distribution whose standard deviation
            is set by ``translation_std``. Defaults to [0, 0, 0].
        shift_height (bool): Whether to shift height.
            (the fourth dimension of indoor points) when scaling.
            Defaults to False.
    """

    def __init__(self,
                 rot_range: Union[List[float], int,
                                  float] = [-0.78539816, 0.78539816],
                 rot_dof: int = 1,
                 scale_ratio_range: List[float] = [0.95, 1.05],
                 translation_std: List[int] = [0, 0, 0],
                 shift_height: bool = False,
                 update_lidar2cam: bool = False) -> None:
        seq_types = (list, tuple, np.ndarray)
        if not isinstance(rot_range, seq_types):
            assert isinstance(rot_range, (int, float)), \
                f'unsupported rot_range type {type(rot_range)}'
            rot_range = [-rot_range, rot_range]
        self.rot_range = rot_range
        self.rot_dof = rot_dof
        self.update_lidar2cam = update_lidar2cam

        assert isinstance(scale_ratio_range, seq_types), \
            f'unsupported scale_ratio_range type {type(scale_ratio_range)}'

        self.scale_ratio_range = scale_ratio_range

        if not isinstance(translation_std, seq_types):
            assert isinstance(translation_std, (int, float)), \
                f'unsupported translation_std type {type(translation_std)}'
            translation_std = [
                translation_std, translation_std, translation_std
            ]
        assert all([std >= 0 for std in translation_std]), \
            'translation_std should be positive'
        self.translation_std = translation_std
        self.shift_height = shift_height

    def transform(self, input_dict: dict) -> dict:
        """Private function to rotate, scale and translate bounding boxes and
        points.

        Args:
            input_dict (dict): Result dict from loading pipeline.

        Returns:
            dict: Results after scaling, 'points', 'pcd_rotation',
            'pcd_scale_factor', 'pcd_trans' and `gt_bboxes_3d` are updated
            in the result dict.
        """
        if 'transformation_3d_flow' not in input_dict:
            input_dict['transformation_3d_flow'] = []

        self._rot_bbox_points(input_dict)

        if 'pcd_scale_factor' not in input_dict:
            self._random_scale(input_dict)
        self._scale_bbox_points(input_dict)

        self._trans_bbox_points(input_dict)

        input_dict['transformation_3d_flow'].extend(['R', 'S', 'T'])
        if self.update_lidar2cam:
            self._transform_lidar2cam(input_dict)
        return input_dict

    def _trans_bbox_points(self, input_dict: dict) -> None:
        """Private function to translate bounding boxes and points.

        Args:
            input_dict (dict): Result dict from loading pipeline.

        Returns:
            dict: Results after translation, 'points', 'pcd_trans'
            and `gt_bboxes_3d` is updated in the result dict.
        """
        translation_std = np.array(self.translation_std, dtype=np.float32)
        trans_factor = np.random.normal(scale=translation_std, size=3).T

        if 'points' in input_dict:
            input_dict['points'].translate(trans_factor)
        input_dict['pcd_trans'] = trans_factor
        if 'gt_bboxes_3d' in input_dict:
            input_dict['gt_bboxes_3d'].translate(trans_factor)

    def _rot_bbox_points(self, input_dict: dict) -> None:
        """Private function to rotate bounding boxes and points.

        Args:
            input_dict (dict): Result dict from loading pipeline.

        Returns:
            dict: Results after rotation, 'points', 'pcd_rotation'
            and `gt_bboxes_3d` is updated in the result dict.
        """
        rotation = self.rot_range
        if self.rot_dof == 1:
            noise_rotation = np.random.uniform(rotation[0], rotation[1])
            noise_rotation *= -1
        elif self.rot_dof > 1:
            noise_rotation = np.array([
                -np.random.uniform(rotation[0], rotation[1]),
                -np.random.uniform(rotation[0], rotation[1]),
                -np.random.uniform(rotation[0], rotation[1])
            ])
        else:
            raise NotImplementedError
        # TODO delete this. And -1 is to align the rotation with
        # the version of 0.17.
        # if 'gt_bboxes_3d' in input_dict and \
        #         len(input_dict['gt_bboxes_3d'].tensor) != 0:
        if 'gt_bboxes_3d' in input_dict:
            # rotate points with bboxes
            if 'points' in input_dict:
                points, rot_mat_T = input_dict['gt_bboxes_3d'].rotate(
                    noise_rotation, input_dict['points'])
                input_dict['points'] = points
            else:
                rot_mat_T = input_dict['gt_bboxes_3d'].rotate(noise_rotation)
        elif 'points' in input_dict:
            # if no bbox in input_dict, only rotate points
            rot_mat_T = input_dict['points'].rotate(noise_rotation)

        input_dict['pcd_rotation'] = rot_mat_T
        input_dict['pcd_rotation_angle'] = noise_rotation

    def _scale_bbox_points(self, input_dict: dict) -> None:
        """Private function to scale bounding boxes and points.

        Args:
            input_dict (dict): Result dict from loading pipeline.

        Returns:
            dict: Results after scaling, 'points' and
            `gt_bboxes_3d` is updated in the result dict.
        """
        scale = input_dict['pcd_scale_factor']
        if 'points' in input_dict:
            points = input_dict['points']
            points.scale(scale)
            if self.shift_height:
                assert 'height' in points.attribute_dims.keys(), \
                  'setting shift_height=True \
                   but points have no height attribute'

                points.tensor[:, points.attribute_dims['height']] *= scale
            input_dict['points'] = points

        if 'gt_bboxes_3d' in input_dict and \
                len(input_dict['gt_bboxes_3d'].tensor) != 0:
            input_dict['gt_bboxes_3d'].scale(scale)

    def _random_scale(self, input_dict: dict) -> None:
        """Private function to randomly set the scale factor.

        Args:
            input_dict (dict): Result dict from loading pipeline.

        Returns:
            dict: Results after scaling, 'pcd_scale_factor'
            are updated in the result dict.
        """
        scale_factor = np.random.uniform(self.scale_ratio_range[0],
                                         self.scale_ratio_range[1])
        input_dict['pcd_scale_factor'] = scale_factor

    def _transform_lidar2cam(self, input_dict: dict) -> None:
        aug_matrix = np.eye(4)

        if 'pcd_rotation' in input_dict:
            aug_matrix[:3, :3] = input_dict['pcd_rotation'].T.numpy(
            ) * input_dict['pcd_scale_factor']
        else:
            aug_matrix[:3, :3] = np.eye(3).view(
                1, 3, 3) * input_dict['pcd_scale_factor']
        aug_matrix[:3, -1] = input_dict['pcd_trans'].reshape(1, 3)
        aug_matrix[-1, -1] = 1.0
        aug_matrix = np.linalg.inv(aug_matrix)
        lidar2cam_list = []
        for lidar2cam in input_dict["depth2img"]["extrinsic"]:
        # for lidar2cam in input_dict['lidar2cam']:
            lidar2cam = np.array(lidar2cam)
            lidar2cam = np.matmul(lidar2cam, aug_matrix)
            lidar2cam_list.append(lidar2cam.tolist())
        input_dict["depth2img"]["extrinsic"] = lidar2cam_list
        # input_dict['lidar2cam'] = lidar2cam_list

    def __repr__(self) -> str:
        """str: Return a string that describes the module."""
        repr_str = self.__class__.__name__
        repr_str += f'(rot_range={self.rot_range},'
        repr_str += f' scale_ratio_range={self.scale_ratio_range},'
        repr_str += f' translation_std={self.translation_std},'
        repr_str += f' shift_height={self.shift_height})'
        return repr_str


@TRANSFORMS.register_module()
class ResizeCropFlipImage(object):
    def __init__(self, data_aug_conf, test_mode=False):
        self.data_aug_conf = data_aug_conf
        self.test_mode = test_mode

    def get_augmentation(self):
        if self.data_aug_conf is None:
            return None
        H, W = self.data_aug_conf["H"], self.data_aug_conf["W"]
        fH, fW = self.data_aug_conf["final_dim"]
        if not self.test_mode:
            resize = np.random.uniform(*self.data_aug_conf["resize_lim"])
            resize_dims = (int(W * resize), int(H * resize))
            newW, newH = resize_dims
            crop_h = (
                int(
                    (1 - np.random.uniform(*self.data_aug_conf["bot_pct_lim"]))
                    * newH
                )
                - fH
            )
            crop_w = int(np.random.uniform(0, max(0, newW - fW)))
            crop = (crop_w, crop_h, crop_w + fW, crop_h + fH)
            flip = False
            if self.data_aug_conf["rand_flip"] and np.random.choice([0, 1]):
                flip = True
            rotate = np.random.uniform(*self.data_aug_conf["rot_lim"])
            # rotate_3d = np.random.uniform(*self.data_aug_conf["rot3d_range"])
            rotate_3d = 0
        else:
            resize = max(fH / H, fW / W)
            resize_dims = (int(W * resize), int(H * resize))
            newW, newH = resize_dims
            crop_h = (
                int((1 - np.mean(self.data_aug_conf["bot_pct_lim"])) * newH)
                - fH
            )
            crop_w = int(max(0, newW - fW) / 2)
            crop = (crop_w, crop_h, crop_w + fW, crop_h + fH)
            flip = False
            rotate = 0
            rotate_3d = 0
        aug_config = {
            "resize": resize,
            "resize_dims": resize_dims,
            "crop": crop,
            "flip": flip,
            "rotate": rotate,
            "rotate_3d": rotate_3d,
        }
        return aug_config

    def __call__(self, results):
        img = results["img"]
        aug_configs = self.get_augmentation()
        resize = aug_configs.get("resize", 1)
        results["img"], extend_matrix = self._transform(img, aug_configs)
        results["trans_mat"] = extend_matrix
        if "depth_img" in results:
            results["depth_img"], _ = self._transform(results["depth_img"], aug_configs)
        return results

    def _transform(self, img, aug_configs):
        H, W = img.shape[:2]
        resize = aug_configs.get("resize", 1)
        resize_dims = (int(W * resize), int(H * resize))
        crop = aug_configs.get("crop", [0, 0, *resize_dims])
        flip = aug_configs.get("flip", False)
        rotate = aug_configs.get("rotate", 0)

        transform_matrix = np.eye(3)
        transform_matrix[:2, :2] *= resize
        transform_matrix[:2, 2] -= np.array(crop[:2])
        if flip:
            flip_matrix = np.array(
                [[-1, 0, crop[2] - crop[0]], [0, 1, 0], [0, 0, 1]]
            )
            transform_matrix = flip_matrix @ transform_matrix
        rotate = rotate / 180 * np.pi
        rot_matrix = np.array(
            [
                [np.cos(rotate), np.sin(rotate), 0],
                [-np.sin(rotate), np.cos(rotate), 0],
                [0, 0, 1],
            ]
        )
        rot_center = np.array([crop[2] - crop[0], crop[3] - crop[1]]) / 2
        rot_matrix[:2, 2] = -rot_matrix[:2, :2] @ rot_center + rot_center
        transform_matrix = rot_matrix @ transform_matrix
        extend_matrix = np.eye(4)
        extend_matrix[:3, :3] = transform_matrix

        img = cv2.warpAffine(img, extend_matrix[:2, :3], img.shape[:2][::-1])
        return img, extend_matrix


================================================
FILE: bip3d/datasets/transforms/formatting.py
================================================
from typing import List, Sequence, Union

import mmengine
import numpy as np
import torch
from mmcv.transforms import BaseTransform
from mmengine.structures import InstanceData, PixelData

from bip3d.registry import TRANSFORMS
from bip3d.structures.bbox_3d import BaseInstance3DBoxes
from bip3d.structures.points import BasePoints
from bip3d.utils.typing_config import Det3DDataElement, PointData


def to_tensor(
    data: Union[torch.Tensor, np.ndarray, Sequence, int,
                float]) -> torch.Tensor:
    """Convert objects of various python types to :obj:`torch.Tensor`.

    Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,
    :class:`Sequence`, :class:`int` and :class:`float`.

    Args:
        data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to
            be converted.

    Returns:
        torch.Tensor: the converted data.
    """

    if isinstance(data, torch.Tensor):
        return data
    elif isinstance(data, np.ndarray):
        if data.dtype is np.dtype('float64'):
            data = data.astype(np.float32)
        return torch.from_numpy(data)
    elif isinstance(data, Sequence) and not mmengine.is_str(data):
        return torch.tensor(data)
    elif isinstance(data, int):
        return torch.LongTensor([data])
    elif isinstance(data, float):
        return torch.FloatTensor([data])
    else:
        raise TypeError(f'type {type(data)} cannot be converted to tensor.')


@TRANSFORMS.register_module()
class Pack3DDetInputs(BaseTransform):
    INPUTS_KEYS = ['points', 'img', "depth_img"]
    # to be compatible with depths in bevdepth
    INSTANCEDATA_3D_KEYS = [
        'gt_bboxes_3d', 'gt_labels_3d', 'attr_labels', 'depths', 'centers_2d'
    ]
    INSTANCEDATA_2D_KEYS = [
        'gt_bboxes',
        'gt_bboxes_labels',
    ]

    SEG_KEYS = [
        'gt_seg_map', 'pts_instance_mask', 'pts_semantic_mask',
        'gt_semantic_seg'
    ]

    def __init__(
        self,
        keys: dict,
        meta_keys: dict = (
            'img_path', 'ori_shape', 'img_shape', 'lidar2img', 'depth2img',
            'cam2img', 'pad_shape', 'depth_map_path', 'scale_factor', 'flip',
            'pcd_horizontal_flip', 'pcd_vertical_flip', 'box_mode_3d',
            'box_type_3d', 'img_norm_cfg', 'num_pts_feats', 'pcd_trans',
            'sample_idx', 'pcd_scale_factor', 'pcd_rotation',
            'pcd_rotation_angle', 'lidar_path', 'transformation_3d_flow',
            'trans_mat', 'affine_aug', 'sweep_img_metas', 'ori_cam2img',
            'cam2global', 'crop_offset', 'img_crop_offset', 'resize_img_shape',
            'lidar2cam', 'ori_lidar2img', 'num_ref_frames', 'num_views',
            'ego2global', 'fov_ori2aug', 'ego2cam', 'axis_align_matrix',
            'text', 'tokens_positive', 'scan_id', "distractor_bboxes", "anchor_bboxes",
            "trans_mat", "ignore_mask",
        )):
        self.keys = keys
        self.meta_keys = meta_keys

    def _remove_prefix(self, key: str) -> str:
        if key.startswith('gt_'):
            key = key[3:]
        return key

    def transform(self, results: Union[dict,
                                       List[dict]]):
        """Method to pack the input data. when the value in this dict is a
        list, it usually is in Augmentations Testing.

        Args:
            results (dict | list[dict]): Result dict from the data pipeline.

        Returns:
            dict | List[dict]:

            - 'inputs' (dict): The forward data of models. It usually contains
              following keys:

                - points
                - img

            - 'data_samples' (:obj:`Det3DDataSample`): The annotation info of
              the sample.
        """
        # augtest
        if isinstance(results, list):
            if len(results) == 1:
                # simple test
                return self.pack_single_results(results[0])
            pack_results = []
            for single_result in results:
                pack_results.append(self.pack_single_results(single_result))
            return pack_results
        # norm training and simple testing
        elif isinstance(results, dict):
            return self.pack_single_results(results)
        else:
            raise NotImplementedError

    def pack_single_results(self, results: dict) -> dict:
        """Method to pack the single input data. when the value in this dict is
        a list, it usually is in Augmentations Testing.

        Args:
            results (dict): Result dict from the data pipeline.

        Returns:
            dict: A dict contains

            - 'inputs' (dict): The forward data of models. It usually contains
              following keys:

                - points
                - img

            - 'data_samples' (:obj:`Det3DDataSample`): The annotation info
              of the sample.
        """

        if 'points' in results:
            if isinstance(results['points'], BasePoints):
                results['points'] = results['points'].tensor
            # multi-sweep points
            elif isinstance(results['points'], list):
                if isinstance(results['points'][0], BasePoints):
                    for idx in range(len(results['points'])):
                        results['points'][idx] = results['points'][idx].tensor

        for key in ["img", "depth_img"]:
            if key not in results:
                continue
            if isinstance(results[key], list):
                # process multiple imgs in single frame
                imgs = np.stack(results[key], axis=0)
                if len(imgs.shape) <= 3:
                    imgs = imgs[..., None]
                if imgs.flags.c_contiguous:
                    imgs = to_tensor(imgs).permute(0, 3, 1, 2).contiguous()
                else:
                    imgs = to_tensor(
                        np.ascontiguousarray(imgs.transpose(0, 3, 1, 2)))
                results[key] = imgs
            else:
                img = results[key]
                if len(img.shape) < 3:
                    img = np.expand_dims(img, -1)
                # To improve the computational speed by by 3-5 times, apply:
                # `torch.permute()` rather than `np.transpose()`.
                # Refer to https://github.com/open-mmlab/mmdetection/pull/9533
                # for more details
                if img.flags.c_contiguous:
                    img = to_tensor(img).permute(2, 0, 1).contiguous()
                else:
                    img = to_tensor(
                        np.ascontiguousarray(img.transpose(2, 0, 1)))
                results[key] = img

        for key in [
                'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels',
                'gt_bboxes_labels', 'attr_labels', 'pts_instance_mask',
                'pts_semantic_mask', 'centers_2d', 'depths', 'gt_labels_3d'
        ]:
            if key not in results:
                continue
            if isinstance(results[key], list):
                results[key] = [to_tensor(res) for res in results[key]]
            else:
                results[key] = to_tensor(results[key])
        if 'gt_bboxes_3d' in results:
            # multi-sweep version
            if isinstance(results['gt_bboxes_3d'], list):
                if not isinstance(results['gt_bboxes_3d'][0],
                                  BaseInstance3DBoxes):
                    for idx in range(len(results['gt_bboxes_3d'])):
                        results['gt_bboxes_3d'][idx] = to_tensor(
                            results['gt_bboxes_3d'][idx])
            elif not isinstance(results['gt_bboxes_3d'], BaseInstance3DBoxes):
                results['gt_bboxes_3d'] = to_tensor(results['gt_bboxes_3d'])

        if 'gt_semantic_seg' in results:
            results['gt_semantic_seg'] = to_tensor(
                results['gt_semantic_seg'][None])
        if 'gt_seg_map' in results:
            results['gt_seg_map'] = results['gt_seg_map'][None, ...]
        if 'depth_map' in results:
            results['depth_map'] = to_tensor(results['depth_map'])

        data_sample = Det3DDataElement()
        gt_instances_3d = InstanceData()
        gt_instances = InstanceData()
        gt_pts_seg = PointData()
        gt_depth_map = PixelData()

        data_metas = {}
        for key in self.meta_keys:
            if key in results:
                data_metas[key] = results[key]
            # TODO: unify ScanNet multi-view info with nuScenes and Waymo
            elif 'images' in results and isinstance(results['images'], dict):
                if len(results['images'].keys()) == 1:
                    cam_type = list(results['images'].keys())[0]
                    # single-view image
                    if key in results['images'][cam_type]:
                        data_metas[key] = results['images'][cam_type][key]
                else:
                    # multi-view image
                    img_metas = []
                    cam_types = list(results['images'].keys())
                    for cam_type in cam_types:
                        if key in results['images'][cam_type]:
                            img_metas.append(results['images'][cam_type][key])
                    if len(img_metas) > 0:
                        data_metas[key] = img_metas
            elif 'lidar_points' in results and isinstance(
                    results['lidar_points'], dict):
                if key in results['lidar_points']:
                    data_metas[key] = results['lidar_points'][key]
        data_sample.set_metainfo(data_metas)

        inputs = {}
        for key in self.keys:
            if key in results:
                if key in self.INPUTS_KEYS:
                    inputs[key] = results[key]
                elif key in self.INSTANCEDATA_3D_KEYS:
                    gt_instances_3d[self._remove_prefix(key)] = results[key]
                elif key in self.INSTANCEDATA_2D_KEYS:
                    if key == 'gt_bboxes_labels':
                        gt_instances['labels'] = results[key]
                    else:
                        gt_instances[self._remove_prefix(key)] = results[key]
                elif key in self.SEG_KEYS:
                    gt_pts_seg[self._remove_prefix(key)] = results[key]
                elif key == 'depth_map':
                    gt_depth_map.set_data(dict(data=results[key]))
                elif key == 'gt_occupancy':
                    data_sample.gt_occupancy = to_tensor(
                        results['gt_occupancy'])
                    if isinstance(results['gt_occupancy_masks'], list):
                        data_sample.gt_occupancy_masks = [
                            to_tensor(mask)
                            for mask in results['gt_occupancy_masks']
                        ]
                    else:
                        data_sample.gt_occupancy_masks = to_tensor(
                            results['gt_occupancy_masks'])
                else:
                    raise NotImplementedError(f'Please modified '
                                              f'`Pack3DDetInputs` '
                                              f'to put {key} to '
                                              f'corresponding field')

        data_sample.gt_instances_3d = gt_instances_3d
        data_sample.gt_instances = gt_instances
        data_sample.gt_pts_seg = gt_pts_seg
        data_sample.gt_depth_map = gt_depth_map

        if 'eval_ann_info' in results:
            data_sample.eval_ann_info = results['eval_ann_info']
        else:
            data_sample.eval_ann_info = None

        packed_results = dict()
        packed_results['data_samples'] = data_sample
        packed_results['inputs'] = inputs
        return packed_results

    def __repr__(self) -> str:
        """str: Return a string that describes the module."""
        repr_str = self.__class__.__name__
        repr_str += f'(keys={self.keys})'
        repr_str += f'(meta_keys={self.meta_keys})'
        return repr_str


================================================
FILE: bip3d/datasets/transforms/loading.py
================================================
from typing import Optional

import mmcv
import mmengine
import numpy as np
from mmcv.transforms import BaseTransform
from mmdet.datasets.transforms import LoadAnnotations

from bip3d.registry import TRANSFORMS


@TRANSFORMS.register_module()
class LoadDepthFromFile(BaseTransform):
    """Load a depth image from file.

    Required Keys:

    - depth_img_path

    Modified Keys:

    - depth_img
    - depth_img_shape

    Args:
        imdecode_backend (str): The image decoding backend type. The backend
            argument for :func:`mmcv.imfrombytes`.
            See :func:`mmcv.imfrombytes` for details.
            Defaults to 'cv2'.
        ignore_empty (bool): Whether to allow loading empty image or file path
            not existent. Defaults to False.
        backend_args (dict, optional): Instantiates the corresponding file
            backend. It may contain `backend` key to specify the file
            backend. If it contains, the file backend corresponding to this
            value will be used and initialized with the remaining values,
            otherwise the corresponding file backend will be selected
            based on the prefix of the file path. Defaults to None.
            New in version 2.0.0rc4.
    """

    def __init__(self,
                 imdecode_backend: str = 'cv2',
                 ignore_empty: bool = False,
                 *,
                 backend_args: Optional[dict] = None):
        self.ignore_empty = ignore_empty
        self.imdecode_backend = imdecode_backend

        self.backend_args = None
        if backend_args is not None:
            self.backend_args = backend_args.copy()

    def transform(self, results: dict):
        """Functions to load image.

        Args:
            results (dict): Result dict from
                :class:`mmengine.dataset.BaseDataset`.

        Returns:
            dict: The dict contains loaded image and meta information.
        """

        filename = results['depth_img_path']
        depth_shift = results['depth_shift']

        try:
            depth_img_bytes = mmengine.fileio.get(
                filename, backend_args=self.backend_args)
            depth_img = mmcv.imfrombytes(depth_img_bytes,
                                         flag='unchanged',
                                         backend=self.imdecode_backend).astype(
                                             np.float32) / depth_shift
        except Exception as e:
            if self.ignore_empty:
                return None
            else:
                raise e
        results['depth_img'] = depth_img
        return results

    def __repr__(self):
        repr_str = (f'{self.__class__.__name__}('
                    f'ignore_empty={self.ignore_empty}, '
                    f"imdecode_backend='{self.imdecode_backend}', ")

        if self.backend_args is not None:
            repr_str += f'backend_args={self.backend_args})'
        else:
            repr_str += f'backend_args={self.backend_args})'

        return repr_str


# TODO : refine
@TRANSFORMS.register_module()
class LoadAnnotations3D(LoadAnnotations):
    """Load Annotations3D.

    Load instance mask and semantic mask of points and
    encapsulate the items into related fields.

    Required Keys:

    - ann_info (dict)

        - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes` |
          :obj:`DepthInstance3DBoxes` | :obj:`CameraInstance3DBoxes`):
          3D ground truth bboxes. Only when `with_bbox_3d` is True
        - gt_labels_3d (np.int64): Labels of ground truths.
          Only when `with_label_3d` is True.
        - gt_bboxes (np.float32): 2D ground truth bboxes.
          Only when `with_bbox` is True.
        - gt_labels (np.ndarray): Labels of ground truths.
          Only when `with_label` is True.
        - depths (np.ndarray): Only when
          `with_bbox_depth` is True.
        - centers_2d (np.ndarray): Only when
          `with_bbox_depth` is True.
        - attr_labels (np.ndarray): Attribute labels of instances.
          Only when `with_attr_label` is True.

    - pts_instance_mask_path (str): Path of instance mask file.
      Only when `with_mask_3d` is True.
    - pts_semantic_mask_path (str): Path of semantic mask file.
      Only when `with_seg_3d` is True.
    - pts_panoptic_mask_path (str): Path of panoptic mask file.
      Only when both `with_panoptic_3d` is True.

    Added Keys:

    - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes` |
      :obj:`DepthInstance3DBoxes` | :obj:`CameraInstance3DBoxes`):
      3D ground truth bboxes. Only when `with_bbox_3d` is True
    - gt_labels_3d (np.int64): Labels of ground truths.
      Only when `with_label_3d` is True.
    - gt_bboxes (np.float32): 2D ground truth bboxes.
      Only when `with_bbox` is True.
    - gt_labels (np.int64): Labels of ground truths.
      Only when `with_label` is True.
    - depths (np.float32): Only when
      `with_bbox_depth` is True.
    - centers_2d (np.ndarray): Only when
      `with_bbox_depth` is True.
    - attr_labels (np.int64): Attribute labels of instances.
      Only when `with_attr_label` is True.
    - pts_instance_mask (np.int64): Instance mask of each point.
      Only when `with_mask_3d` is True.
    - pts_semantic_mask (np.int64): Semantic mask of each point.
      Only when `with_seg_3d` is True.

    Args:
        with_bbox_3d (bool): Whether to load 3D boxes. Defaults to True.
        with_label_3d (bool): Whether to load 3D labels. Defaults to True.
        with_attr_label (bool): Whether to load attribute label.
            Defaults to False.
        with_mask_3d (bool): Whether to load 3D instance masks for points.
            Defaults to False.
        with_seg_3d (bool): Whether to load 3D semantic masks for points.
            Defaults to False.
        with_bbox (bool): Whether to load 2D boxes. Defaults to False.
        with_label (bool): Whether to load 2D labels. Defaults to False.
        with_mask (bool): Whether to load 2D instance masks. Defaults to False.
        with_seg (bool): Whether to load 2D semantic masks. Defaults to False.
        with_bbox_depth (bool): Whether to load 2.5D boxes. Defaults to False.
        with_panoptic_3d (bool): Whether to load 3D panoptic masks for points.
            Defaults to False.
        poly2mask (bool): Whether to convert polygon annotations to bitmasks.
            Defaults to True.
        seg_3d_dtype (str): String of dtype of 3D semantic masks.
            Defaults to 'np.int64'.
        seg_offset (int): The offset to split semantic and instance labels from
            panoptic labels. Defaults to None.
        dataset_type (str): Type of dataset used for splitting semantic and
            instance labels. Defaults to None.
        backend_args (dict, optional): Arguments to instantiate the
            corresponding backend. Defaults to None.
    """

    def __init__(self,
                 with_bbox_3d: bool = True,
                 with_label_3d: bool = True,
                 with_depth_map: bool = False,
                 with_attr_label: bool = False,
                 with_mask_3d: bool = False,
                 with_seg_3d: bool = False,
                 with_bbox: bool = False,
                 with_label: bool = False,
                 with_mask: bool = False,
                 with_seg: bool = False,
                 with_bbox_depth: bool = False,
                 with_panoptic_3d: bool = False,
                 with_visible_instance_masks: bool = False,
                 with_occupancy: bool = False,
                 with_visible_occupancy_masks: bool = False,
                 poly2mask: bool = True,
                 seg_3d_dtype: str = 'np.int64',
                 seg_offset: int = None,
                 dataset_type: str = None,
                 backend_args: Optional[dict] = None):
        super().__init__(with_bbox=with_bbox,
                         with_label=with_label,
                         with_mask=with_mask,
                         with_seg=with_seg,
                         poly2mask=poly2mask,
                         backend_args=backend_args)
        self.with_bbox_3d = with_bbox_3d
        self.with_bbox_depth = with_bbox_depth
        self.with_label_3d = with_label_3d
        self.with_depth_map = with_depth_map
        self.with_attr_label = with_attr_label
        self.with_mask_3d = with_mask_3d
        self.with_seg_3d = with_seg_3d
        self.with_panoptic_3d = with_panoptic_3d
        self.with_visible_instance_masks = with_visible_instance_masks
        self.with_occupancy = with_occupancy
        self.with_visible_occupancy_masks = with_visible_occupancy_masks
        self.seg_3d_dtype = eval(seg_3d_dtype)
        self.seg_offset = seg_offset
        self.dataset_type = dataset_type

    def _load_bboxes_3d(self, results: dict):
        """Private function to move the 3D bounding box annotation from
        `ann_info` field to the root of `results`.

        Args:
            results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.

        Returns:
            dict: The dict containing loaded 3D bounding box annotations.
        """

        results['gt_bboxes_3d'] = results['ann_info']['gt_bboxes_3d']
        return results

    def _load_bboxes_depth(self, results: dict):
        """Private function to load 2.5D bounding box annotations.

        Args:
            results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.

        Returns:
            dict: The dict containing loaded 2.5D bounding box annotations.
        """

        results['depths'] = results['ann_info']['depths']
        results['centers_2d'] = results['ann_info']['centers_2d']
        return results

    def _load_labels_3d(self, results: dict):
        """Private function to load label annotations.

        Args:
            results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.

        Returns:
            dict: The dict containing loaded label annotations.
        """

        results['gt_labels_3d'] = results['ann_info']['gt_labels_3d']
        return results

    def _load_attr_labels(self, results: dict):
        """Private function to load label annotations.

        Args:
            results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.

        Returns:
            dict: The dict containing loaded label annotations.
        """
        results['attr_labels'] = results['ann_info']['attr_labels']
        return results

    def _load_depth_map(self, results: dict):

        img_filename = results['img_path']
        pts_filename = img_filename.replace('samples', 'depth_points') + '.bin'
        results['depth_map_path'] = pts_filename
        if self.file_client is None:
            self.file_client = mmengine.FileClient(**self.backend_args)
        try:
            pts_bytes = self.file_client.get(pts_filename)
            points = np.frombuffer(pts_bytes, dtype=np.float32)
        except ConnectionError:
            mmengine.check_file_exist(pts_filename)
            if pts_filename.endswith('.npy'):
                points = np.load(pts_filename)
            else:
                points = np.fromfile(pts_filename, dtype=np.float32)
        pts_img = points.reshape(-1, 3)
        img_shape = results['ori_shape']
        depth_img = np.zeros(img_shape, dtype=np.float32)
        iy = np.round(pts_img[:, 1]).astype(np.int64)
        ix = np.round(pts_img[:, 0]).astype(np.int64)
        depth_img[iy, ix] = pts_img[:, 2]
        results['depth_map'] = depth_img

        return results

    def _load_masks_3d(self, results: dict):
        """Private function to load 3D mask annotations.

        Args:
            results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.

        Returns:
            dict: The dict containing loaded 3D mask annotations.
        """
        pts_instance_mask_path = results['pts_instance_mask_path']

        try:
            mask_bytes = mmengine.fileio.get(pts_instance_mask_path,
                                             backend_args=self.backend_args)
            pts_instance_mask = np.frombuffer(mask_bytes, dtype=np.int64)
        except ConnectionError:
            mmengine.check_file_exist(pts_instance_mask_path)
            pts_instance_mask = np.fromfile(pts_instance_mask_path,
                                            dtype=np.int64)

        results['pts_instance_mask'] = pts_instance_mask
        # 'eval_ann_info' will be passed to evaluator
        if 'eval_ann_info' in results:
            results['eval_ann_info']['pts_instance_mask'] = pts_instance_mask
        return results

    def _load_semantic_seg_3d(self, results: dict):
        """Private function to load 3D semantic segmentation annotations.

        Args:
            results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.

        Returns:
            dict: The dict containing the semantic segmentation annotations.
        """
        pts_semantic_mask_path = results['pts_semantic_mask_path']

        try:
            mask_bytes = mmengine.fileio.get(pts_semantic_mask_path,
                                             backend_args=self.backend_args)
            # add .copy() to fix read-only bug
            pts_semantic_mask = np.frombuffer(mask_bytes,
                                              dtype=self.seg_3d_dtype).copy()
        except ConnectionError:
            mmengine.check_file_exist(pts_semantic_mask_path)
            pts_semantic_mask = np.fromfile(pts_semantic_mask_path,
                                            dtype=np.int64)

        if self.dataset_type == 'semantickitti':
            pts_semantic_mask = pts_semantic_mask.astype(np.int64)
            pts_semantic_mask = pts_semantic_mask % self.seg_offset
        # nuScenes loads semantic and panoptic labels from different files.

        results['pts_semantic_mask'] = pts_semantic_mask

        # 'eval_ann_info' will be passed to evaluator
        if 'eval_ann_info' in results:
            results['eval_ann_info']['pts_semantic_mask'] = pts_semantic_mask
        return results

    def _load_panoptic_3d(self, results: dict):
        """Private function to load 3D panoptic segmentation annotations.

        Args:
            results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.

        Returns:
            dict: The dict containing the panoptic segmentation annotations.
        """
        pts_panoptic_mask_path = results['pts_panoptic_mask_path']

        try:
            mask_bytes = mmengine.fileio.get(pts_panoptic_mask_path,
                                             backend_args=self.backend_args)
            # add .copy() to fix read-only bug
            pts_panoptic_mask = np.frombuffer(mask_bytes,
                                              dtype=self.seg_3d_dtype).copy()
        except ConnectionError:
            mmengine.check_file_exist(pts_panoptic_mask_path)
            pts_panoptic_mask = np.fromfile(pts_panoptic_mask_path,
                                            dtype=np.int64)

        if self.dataset_type == 'semantickitti':
            pts_semantic_mask = pts_panoptic_mask.astype(np.int64)
            pts_semantic_mask = pts_semantic_mask % self.seg_offset
        elif self.dataset_type == 'nuscenes':
            pts_semantic_mask = pts_semantic_mask // self.seg_offset

        results['pts_semantic_mask'] = pts_semantic_mask

        # We can directly take panoptic labels as instance ids.
        pts_instance_mask = pts_panoptic_mask.astype(np.int64)
        results['pts_instance_mask'] = pts_instance_mask

        # 'eval_ann_info' will be passed to evaluator
        if 'eval_ann_info' in results:
            results['eval_ann_info']['pts_semantic_mask'] = pts_semantic_mask
            results['eval_ann_info']['pts_instance_mask'] = pts_instance_mask
        return results

    def _load_visible_instance_masks(self, results: dict):
        """Private function to move the 3D bounding box annotation from
        `ann_info` field to the root of `results`.

        Args:
            results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.

        Returns:
            dict: The dict containing loaded 3D bounding box annotations.
        """

        results['visible_instance_masks'] = results['ann_info'][
            'visible_instance_masks']
        return results

    def _load_occupancy(self, results: dict):
        """Private function to move the 3D bounding box annotation from
        `ann_info` field to the root of `results`.

        Args:
            results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.

        Returns:
            dict: The dict containing loaded 3D bounding box annotations.
        """

        results['gt_occupancy'] = results['ann_info']['gt_occupancy']
        return results

    def _load_visible_occupancy_masks(self, results: dict):
        """Private function to move the 3D bounding box annotation from
        `ann_info` field to the root of `results`.

        Args:
            results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.

        Returns:
            dict: The dict containing loaded 3D bounding box annotations.
        """

        results['visible_occupancy_masks'] = results['ann_info'][
            'visible_occupancy_masks']
        return results

    def _load_bboxes(self, results: dict):
        """Private function to load bounding box annotations.

        The only difference is it remove the proceess for
        `ignore_flag`

        Args:
            results (dict): Result dict from :obj:`mmcv.BaseDataset`.

        Returns:
            dict: The dict contains loaded bounding box annotations.
        """

        results['gt_bboxes'] = results['ann_info']['gt_bboxes']

    def _load_labels(self, results: dict):
        """Private function to load label annotations.

        Args:
            results (dict): Result dict from :obj :obj:`mmcv.BaseDataset`.

        Returns:
            dict: The dict contains loaded label annotations.
        """
        results['gt_bboxes_labels'] = results['ann_info']['gt_bboxes_labels']

    def transform(self, results: dict):
        """Function to load multiple types annotations.

        Args:
            results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.

        Returns:
            dict: The dict containing loaded 3D bounding box, label, mask and
            semantic segmentation annotations.
        """
        results = super().transform(results)
        if self.with_bbox_3d:
            results = self._load_bboxes_3d(results)
        if self.with_bbox_depth:
            results = self._load_bboxes_depth(results)
        if self.with_label_3d:
            results = self._load_labels_3d(results)
        if self.with_depth_map:
            results = self._load_depth_map(results)
        if self.with_attr_label:
            results = self._load_attr_labels(results)
        if self.with_panoptic_3d:
            results = self._load_panoptic_3d(results)
        if self.with_mask_3d:
            results = self._load_masks_3d(results)
        if self.with_seg_3d:
            results = self._load_semantic_seg_3d(results)
        if self.with_visible_instance_masks:
            results = self._load_visible_instance_masks(results)
        if self.with_occupancy:
            results = self._load_occupancy(results)
        if self.with_visible_occupancy_masks:
            results = self._load_visible_occupancy_masks(results)
        return results

    def __repr__(self):
        """str: Return a string that describes the module."""
        indent_str = '    '
        repr_str = self.__class__.__name__ + '(\n'
        repr_str += f'{indent_str}with_bbox_3d={self.with_bbox_3d}, '
        repr_str += f'{indent_str}with_label_3d={self.with_label_3d}, '
        repr_str += f'{indent_str}with_attr_label={self.with_attr_label}, '
        repr_str += f'{indent_str}with_mask_3d={self.with_mask_3d}, '
        repr_str += f'{indent_str}with_seg_3d={self.with_seg_3d}, '
        repr_str += f'{indent_str}with_panoptic_3d={self.with_panoptic_3d}, '
        repr_str += f'{indent_str}with_bbox={self.with_bbox}, '
        repr_str += f'{indent_str}with_label={self.with_label}, '
        repr_str += f'{indent_str}with_mask={self.with_mask}, '
        repr_str += f'{indent_str}with_seg={self.with_seg}, '
        repr_str += f'{indent_str}with_bbox_depth={self.with_bbox_depth}, '
        repr_str += f'{indent_str}poly2mask={self.poly2mask}), '
        repr_str += f'{indent_str}seg_offset={self.seg_offset}), '
        repr_str += f'{indent_str}with_visible_instance_masks='
        repr_str += f'{self.with_visible_instance_masks}), '
        repr_str += f'{indent_str}with_occupancy={self.with_occupancy}), '
        repr_str += f'{indent_str}with_visible_occupancy_masks='
        repr_str += f'{self.with_visible_occupancy_masks})'

        return repr_str


================================================
FILE: bip3d/datasets/transforms/multiview.py
================================================
import copy
import random

import numpy as np
import torch
from mmcv.transforms import BaseTransform, Compose

from bip3d.registry import TRANSFORMS
from bip3d.structures.points import get_points_type
from ..utils import sample


@TRANSFORMS.register_module()
class MultiViewPipeline(BaseTransform):
    """HARD CODE"""
    def __init__(
        self,
        transforms,
        n_images,
        max_n_images=None,
        ordered=False,
        rotate_3rscan=False,
    ):
        super().__init__()
        self.transforms = Compose(transforms)
        self.n_images = n_images
        self.max_n_images = (
            max_n_images if max_n_images is not None else n_images
        )
        self.ordered = ordered
        self.rotate_3rscan = rotate_3rscan

    def transform(self, results: dict):
        """Transform function.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: output dict after transformation.
        """
        imgs = []
        img_paths = []
        points = []
        intrinsics = []
        extrinsics = []
        depth_imgs = []
        trans_mat = []
        
        total_n = len(results["img_path"])
        sample_n = min(max(self.n_images, total_n), self.max_n_images)
        ids = sample(total_n, sample_n, self.ordered)
        for i in ids.tolist():
            _results = dict()
            _results["img_path"] = results["img_path"][i]
            if "depth_img_path" in results:
                _results["depth_img_path"] = results["depth_img_path"][i]
                if isinstance(results["depth_cam2img"], list):
                    _results["depth_cam2img"] = results["depth_cam2img"][i]
                    _results["cam2img"] = results["depth2img"]["intrinsic"][i]
                else:
                    _results["depth_cam2img"] = results["depth_cam2img"]
                    _results["cam2img"] = results["cam2img"]
                _results["depth_shift"] = results["depth_shift"]
            _results = self.transforms(_results)
            if "depth_shift" in _results:
                _results.pop("depth_shift")
            if "img" in _results:
                imgs.append(_results["img"])
                img_paths.append(_results["img_path"])
            if "depth_img" in _results:
                depth_imgs.append(_results["depth_img"])
            if "points" in _results:
                points.append(_results["points"])
            if _results.get("modify_cam2img"):
                intrinsics.append(_results["cam2img"])
            elif isinstance(results["depth2img"]["intrinsic"], list):
                intrinsics.append(results["depth2img"]["intrinsic"][i])
            else:
                intrinsics.append(results["depth2img"]["intrinsic"])
            extrinsics.append(results["depth2img"]["extrinsic"][i])
            if "trans_mat" in _results:
                trans_mat.append(_results["trans_mat"])
        for key in _results.keys():
            if key not in ["img", "points", "img_path"]:
                results[key] = _results[key]
        if len(imgs):
            if self.rotate_3rscan and "3rscan" in img_paths[0]:
                imgs = [np.transpose(x, (1, 0, 2)) for x in imgs]
                rot_mat = np.array(
                    [
                        [0, 1, 0, 0],
                        [1, 0, 0, 0],
                        [0, 0, 1, 0],
                        [0, 0, 0, 1],
                    ]
                )
                rot_mat = [np.linalg.inv(x) @ rot_mat @ x for x in intrinsics]
                extrinsics = [x @ y for x, y in zip(rot_mat, extrinsics)]
                results["scale_factor"] = results["scale_factor"][::-1]
                results["ori_shape"] = results["ori_shape"][::-1]
            results["img"] = imgs
            results["img_path"] = img_paths
        if len(depth_imgs):
            if self.rotate_3rscan and "3rscan" in img_paths[0]:
                depth_imgs = [np.transpose(x, (1, 0)) for x in depth_imgs]
            results["depth_img"] = depth_imgs

        if len(points):
            results["points"] = points
        if (
            "ann_info" in results
            and "visible_instance_masks" in results["ann_info"]
        ):
            results["visible_instance_masks"] = [
                results["ann_info"]["visible_instance_masks"][i] for i in ids
            ]
            results["ann_info"]["visible_instance_masks"] = results[
                "visible_instance_masks"
            ]
        elif "visible_instance_masks" in results:
            results["visible_instance_masks"] = [
                results["visible_instance_masks"][i] for i in ids
            ]
        results["depth2img"]["intrinsic"] = intrinsics
        results["depth2img"]["extrinsic"] = extrinsics
        if len(trans_mat) != 0:
            results["depth2img"]["trans_mat"] = trans_mat
        return results


================================================
FILE: bip3d/datasets/transforms/transform.py
================================================
from typing import List, Optional, Tuple, Union
import cv2
import copy

import numpy as np
from numpy import random
import mmcv
import torch
from PIL import Image
from mmcv.transforms import BaseTransform, Resize

from bip3d.registry import TRANSFORMS
from bip3d.structures.bbox_3d import points_cam2img, points_img2cam
from bip3d.structures.points import BasePoints, get_points_type


@TRANSFORMS.register_module()
class CategoryGroundingDataPrepare(BaseTransform):
    def __init__(
        self,
        classes,
        training,
        max_class=None,
        sep_token="[SEP]",
        filter_others=True,
        z_range=None,
        filter_invisible=False,
        instance_mask_key="visible_instance_masks",
    ):
        self.classes = list(classes)
        self.training = training
        self.max_class = max_class
        self.sep_token = sep_token
        self.filter_others = filter_others
        self.z_range = z_range
        self.filter_invisible = filter_invisible
        self.instance_mask_key = instance_mask_key

    def transform(self, results):
        visible_instance_masks = results.get(self.instance_mask_key)
        if isinstance(visible_instance_masks, (list, tuple)):
            visible_instance_masks = np.stack(visible_instance_masks).any(axis=0)
            
        ignore_mask = results.get("ignore_mask")
        gt_names = results["ann_info"]["gt_names"]
        if self.filter_others and "gt_labels_3d" in results:
            mask = results["gt_labels_3d"] >= 0
            results["gt_labels_3d"] = results["gt_labels_3d"][mask]
            results["gt_bboxes_3d"] = results["gt_bboxes_3d"][mask]
            visible_instance_masks = visible_instance_masks[mask]
            gt_names = [x for i, x in enumerate(gt_names) if mask[i]]
            if ignore_mask is not None:
                ignore_mask = ignore_mask[mask]

        if (
            self.z_range is not None
            and "gt_labels_3d" in results
            and self.training
        ):
            mask = torch.logical_and(
                results["gt_bboxes_3d"].tensor[..., 2] >= self.z_range[0],
                results["gt_bboxes_3d"].tensor[..., 2] <= self.z_range[1],
            ).numpy()
            results["gt_labels_3d"] = results["gt_labels_3d"][mask]
            results["gt_bboxes_3d"] = results["gt_bboxes_3d"][mask]
            visible_instance_masks = visible_instance_masks[mask]
            gt_names = [x for i, x in enumerate(gt_names) if mask[i]]
            if ignore_mask is not None:
                ignore_mask = ignore_mask[mask]

        if self.training or self.filter_invisible:
            results["gt_labels_3d"] = results["gt_labels_3d"][
                visible_instance_masks
            ]
            results["gt_bboxes_3d"] = results["gt_bboxes_3d"][
                visible_instance_masks
            ]
            gt_names = [
                x for i, x in enumerate(gt_names) if visible_instance_masks[i]
            ]
            if ignore_mask is not None:
                ignore_mask = ignore_mask[visible_instance_masks]
                results["ignore_mask"] = ignore_mask

        if self.training:
            if (
                self.max_class is not None
                and len(self.classes) > self.max_class
            ):
                classes = copy.deepcopy(gt_names)
                random.shuffle(self.classes)
                for c in self.classes:
                    if c in classes:
                        continue
                    classes.append(c)
                    if len(classes) >= self.max_class:
                        break
                random.shuffle(classes)
            else:
                classes = copy.deepcopy(self.classes)
        else:
            classes = copy.deepcopy(self.classes)
            gt_names = classes

        results["text"] = self.sep_token.join(classes)
        tokens_positive = []
        for name in gt_names:
            start = results["text"].find(
                self.sep_token + name + self.sep_token
            )
            if start == -1:
                if results["text"].startswith(name + self.sep_token):
                    start = 0
                else:
                    start = results["text"].find(self.sep_token + name) + len(
                        self.sep_token
                    )
            else:
                start += len(self.sep_token)
            end = start + len(name)
            tokens_positive.append([[start, end]])
        results["tokens_positive"] = tokens_positive
        return results


@TRANSFORMS.register_module()
class CamIntrisicStandardization(BaseTransform):
    def __init__(self, dst_intrinsic, dst_wh):
        if not isinstance(dst_intrinsic, np.ndarray):
            dst_intrinsic = np.array(dst_intrinsic)
        if dst_intrinsic.shape[0] == 3:
            tmp = np.eye(4)
            tmp[:3, :3] = dst_intrinsic
            dst_intrinsic = tmp
        self.dst_intrinsic = dst_intrinsic
        self.dst_wh = dst_wh
        u, v = np.arange(dst_wh[0]), np.arange(dst_wh[1])
        u = np.repeat(u[None], dst_wh[1], 0)
        v = np.repeat(v[:, None], dst_wh[0], 1)
        uv = np.stack([u, v, np.ones_like(u)], axis=-1)
        self.dst_pts = uv @ np.linalg.inv(self.dst_intrinsic[:3, :3]).T

    def transform(self, results):
        src_intrinsic = results["cam2img"][:3, :3]
        src_uv = self.dst_pts @ src_intrinsic.T
        src_uv = src_uv.astype(np.float32)
        if "depth_img" in results and results["img"].shape[:2] != results["depth_img"].shape[:2]:
            results["depth_img"] = cv2.resize(
                results["depth_img"], results["img"].shape[:2][::-1],
                interpolation=cv2.INTER_LINEAR,
            )
        for key in ["img", "depth_img"]:
            if key not in results:
                continue
            results[key] = cv2.remap(
                results[key],
                src_uv[..., 0],
                src_uv[..., 1],
                cv2.INTER_NEAREST,
            )
            # warp_mat = self.dst_intrinsic[:3, :3] @ np.linalg.inv(src_intrinsic)
            # cv2.warpAffine(results[key], warp_mat[:2, :3], self.dst_wh)
        results["cam2img"] = copy.deepcopy(self.dst_intrinsic)
        results["depth_cam2img"] = copy.deepcopy(self.dst_intrinsic)
        results["scale"] = self.dst_wh
        results['img_shape'] = results["img"].shape[:2]
        results['scale_factor'] = (1, 1)
        results['keep_ratio'] = False
        results["modify_cam2img"] = True
        return results


@TRANSFORMS.register_module()
class CustomResize(Resize):
    def _resize_img(self, results: dict, key="img"):
        """Resize images with ``results['scale']``."""

        if results.get(key, None) is not None:
            if self.keep_ratio:
                img, scale_factor = mmcv.imrescale(
                    results[key],
                    results['scale'],
                    interpolation=self.interpolation,
                    return_scale=True,
                    backend=self.backend)
                # the w_scale and h_scale has minor difference
                # a real fix should be done in the mmcv.imrescale in the future
                new_h, new_w = img.shape[:2]
                h, w = results[key].shape[:2]
                w_scale = new_w / w
                h_scale = new_h / h
            else:
                img, w_scale, h_scale = mmcv.imresize(
                    results[key],
                    results['scale'],
                    interpolation=self.interpolation,
                    return_scale=True,
                    backend=self.backend)
            results[key] = img
            if key == "img":
                results['img_shape'] = img.shape[:2]
                results['scale_factor'] = (w_scale, h_scale)
                results['keep_ratio'] = self.keep_ratio

    def transform(self, results: dict):
        if self.scale:
            results['scale'] = self.scale
        else:
            img_shape = results['img'].shape[:2]
            results['scale'] = _scale_size(img_shape[::-1],
                                           self.scale_factor)  # type: ignore
        self._resize_img(results)
        self._resize_bboxes(results)
        self._resize_seg(results)
        self._resize_keypoints(results)
        if "depth_img" in results:
            self._resize_img(results, key="depth_img")
        return results


@TRANSFORMS.register_module()
class DepthProbLabelGenerator(BaseTransform):
    def __init__(
        self,
        max_depth=10,
        min_depth=0.25,
        num_depth=64,
        stride=[8, 16, 32, 64],
        origin_stride=1,
    ):
        self.max_depth = max_depth
        self.min_depth = min_depth
        self.num_depth = num_depth
        self.stride = [x//origin_stride for x in stride]
        self.origin_stride = origin_stride

    def transform(self, input_dict):
        depth = input_dict["inputs"]["depth_img"].cpu().numpy()
        if self.origin_stride != 1:
            H, W = depth.shape[-2:]
            depth = np.transpose(depth, (0, 2, 3, 1))
            depth = [mmcv.imresize(
                x, (W//self.origin_stride, H//self.origin_stride),
                interpolation="nearest",
            ) for x in depth]
            depth = np.stack(depth)[:,None]
        depth = np.clip(
            depth,
            a_min=self.min_depth,
            a_max=self.max_depth,
        )
        depth_anchor = np.linspace(
            self.min_depth, self.max_depth, self.num_depth)[:, None, None]
        distance = np.abs(depth - depth_anchor)
        mask = distance < (depth_anchor[1] - depth_anchor[0])
        depth_gt = np.where(mask, depth_anchor, 0)
        y = depth_gt.sum(axis=1, keepdims=True) - depth_gt
        depth_valid_mask = depth > 0
        depth_prob_gt = np.where(
            (depth_gt != 0) & depth_valid_mask,
            (depth - y) / (depth_gt - y),
            0,
        )
        views, _, H, W = depth.shape
        gt = []
        gt_map = []
        for s in self.stride:
            gt_tmp = np.reshape(
                depth_prob_gt, (views, self.num_depth, H//s, s, W//s, s))
            gt_tmp = gt_tmp.sum(axis=-1).sum(axis=3)
            mask_tmp = depth_valid_mask.reshape(views, 1, H//s, s, W//s, s)
            mask_tmp = mask_tmp.sum(axis=-1).sum(axis=3)
            gt_tmp /= np.clip(mask_tmp, a_min=1, a_max=None)
            # gt_map.append(np.transpose(gt_tmp, (0, 2, 3, 1)))
            gt_tmp = gt_tmp.reshape(views, self.num_depth, -1)
            gt_tmp = np.transpose(gt_tmp, (0, 2, 1))
            gt.append(gt_tmp)
        gt = np.concatenate(gt, axis=1)
        gt = np.clip(gt, a_min=0.0, a_max=1.0)
        input_dict["inputs"]["depth_prob_gt"] = torch.from_numpy(gt).to(
            input_dict["inputs"]["depth_img"])
        return input_dict


================================================
FILE: bip3d/datasets/utils.py
================================================
import numpy as np
from scipy.spatial.transform import Rotation


def sample(total_n, sample_n, fix_interval=True):
    ids = np.arange(total_n)
    if sample_n == total_n:
        return ids
    elif sample_n > total_n:
        return np.concatenate([
            ids, sample(total_n, sample_n - total_n, fix_interval)
        ])
    elif fix_interval:
        interval = total_n / sample_n
        output = []
        for i in range(sample_n):
            output.append(ids[int(interval * i)])
        return np.array(output)
    return np.random.choice(ids, sample_n, replace=False)


def xyzrpy_to_matrix(input):
    output = np.eye(4)
    output[:3, :3] = Rotation.from_euler("xyz", np.array(input[3:])).as_matrix()
    output[:3, 3] = input[:3]
    return output


================================================
FILE: bip3d/eval/__init__.py
================================================
from .metrics import GroundingMetric, IndoorDetMetric

__all__ = ['IndoorDetMetric', 'GroundingMetric']


================================================
FILE: bip3d/eval/indoor_eval.py
================================================
# Copyright (c) OpenRobotLab. All rights reserved.
import numpy as np
import torch
from mmengine.logging import print_log
from terminaltables import AsciiTable
# from bip3d.visualization.default_color_map import DEFAULT_COLOR_MAP
import matplotlib.pyplot as plt


def average_precision(recalls, precisions, mode='area'):
    """Calculate average precision (for single or multiple scales).

    Args:
        recalls (np.ndarray): Recalls with shape of (num_scales, num_dets)
            or (num_dets, ).
        precisions (np.ndarray): Precisions with shape of
            (num_scales, num_dets) or (num_dets, ).
        mode (str): 'area' or '11points', 'area' means calculating the area
            under precision-recall curve, '11points' means calculating
            the average precision of recalls at [0, 0.1, ..., 1]

    Returns:
        float or np.ndarray: Calculated average precision.
    """
    if recalls.ndim == 1:
        recalls = recalls[np.newaxis, :]
        precisions = precisions[np.newaxis, :]

    assert recalls.shape == precisions.shape
    assert recalls.ndim == 2

    num_scales = recalls.shape[0]
    ap = np.zeros(num_scales, dtype=np.float32)
    if mode == 'area':
        zeros = np.zeros((num_scales, 1), dtype=recalls.dtype)
        ones = np.ones((num_scales, 1), dtype=recalls.dtype)
        mrec = np.hstack((zeros, recalls, ones))
        mpre = np.hstack((zeros, precisions, zeros))
        for i in range(mpre.shape[1] - 1, 0, -1):
            mpre[:, i - 1] = np.maximum(mpre[:, i - 1], mpre[:, i])
        for i in range(num_scales):
            ind = np.where(mrec[i, 1:] != mrec[i, :-1])[0]
            ap[i] = np.sum(
                (mrec[i, ind + 1] - mrec[i, ind]) * mpre[i, ind + 1])
    elif mode == '11points':
        for i in range(num_scales):
            for thr in np.arange(0, 1 + 1e-3, 0.1):
                precs = precisions[i, recalls[i, :] >= thr]
                prec = precs.max() if precs.size > 0 else 0
                ap[i] += prec
            ap /= 11
    else:
        raise ValueError(
            'Unrecognized mode, only "area" and "11points" are supported')
    return ap


def eval_det_cls(pred, gt, iou_thr=None, area_range=None):
    """Generic functions to compute precision/recall for object detection for a
    single class.

    Args:
        pred (dict): Predictions mapping from image id to bounding boxes
            and scores.
        gt (dict): Ground truths mapping from image id to bounding boxes.
        iou_thr (list[float]): A list of iou thresholds.

    Return:
        tuple (np.ndarray, np.ndarray, float): Recalls, precisions and
            average precision.
    """

    # {img_id: {'bbox': box structure, 'det': matched list}}
    class_recs = {}
    npos = 0
    # figure out the bbox code size first
    gt_bbox_code_size = 9
    pred_bbox_code_size = 9
    for img_id in gt.keys():
        if len(gt[img_id]) != 0:
            gt_bbox_code_size = gt[img_id][0].tensor.shape[1]
            break
    for img_id in pred.keys():
        if len(pred[img_id][0]) != 0:
            pred_bbox_code_size = pred[img_id][0][0].tensor.shape[1]
            break
    assert gt_bbox_code_size == pred_bbox_code_size
    for img_id in gt.keys():
        cur_gt_num = len(gt[img_id])
        if cur_gt_num != 0:
            gt_cur = torch.zeros([cur_gt_num, gt_bbox_code_size],
                                 dtype=torch.float32)
            for i in range(cur_gt_num):
                gt_cur[i] = gt[img_id][i].tensor
            bbox = gt[img_id][0].new_box(gt_cur)
        else:
            bbox = gt[img_id]
        det = [[False] * len(bbox) for i in iou_thr]
        npos += len(bbox)
        class_recs[img_id] = {'bbox': bbox, 'det': det}
        if area_range is not None and len(bbox) > 0:
            area = bbox.tensor[:, 3:6].cumprod(dim=-1)[..., -1]
            ignore = torch.logical_or(area < area_range[0], area > area_range[1])
            npos -= ignore.sum().item()
            class_recs[img_id]["ignore"] = ignore

    # construct dets
    image_ids = []
    confidence = []
    ious = []
    for img_id in pred.keys():
        cur_num = len(pred[img_id])
        if cur_num == 0:
            continue
        pred_cur = torch.zeros((cur_num, pred_bbox_code_size),
                               dtype=torch.float32)
        box_idx = 0
        for box, score in pred[img_id]:
            image_ids.append(img_id)
            confidence.append(score)
            # handle outlier (too thin) predicted boxes
            w, l, h = box.tensor[0, 3:6]
            faces = [w * l, w * h, h * l]
            if torch.any(box.tensor.new_tensor(faces) < 2e-4):
                # print('Find small predicted boxes,',
                #       'and clamp short edges to 2e-2 meters.')
                box.tensor[:, 3:6] = torch.clamp(box.tensor[:, 3:6], min=2e-2)
            pred_cur[box_idx] = box.tensor
            box_idx += 1
        pred_cur = box.new_box(pred_cur)
        gt_cur = class_recs[img_id]['bbox']
        if len(gt_cur) > 0:
            # calculate iou in each image
            iou_cur = pred_cur.overlaps(pred_cur, gt_cur)
            for i in range(cur_num):
                ious.append(iou_cur[i])
        else:
            for i in range(cur_num):
                ious.append(np.zeros(1))

    confidence = np.array(confidence)

    # sort by confidence
    sorted_ind = np.argsort(-confidence)
    image_ids = [image_ids[x] for x in sorted_ind]
    ious = [ious[x] for x in sorted_ind]

    # go down dets and mark TPs and FPs
    num_images = len(image_ids)
    tp_thr = [np.zeros(num_images) for i in iou_thr]
    fp_thr = [np.zeros(num_images) for i in iou_thr]
    for d in range(num_images):
        R = class_recs[image_ids[d]]
        iou_max = -np.inf
        BBGT = R['bbox']
        cur_iou = ious[d]

        if len(BBGT) > 0:
            # compute overlaps
            for j in range(len(BBGT)):
                # iou = get_iou_main(get_iou_func, (bb, BBGT[j,...]))
                iou = cur_iou[j]
                if iou > iou_max:
                    iou_max = iou
                    jmax = j

        for iou_idx, thresh in enumerate(iou_thr):
            if iou_max > thresh:
                if "ignore" in R and R["ignore"][jmax]:
                    continue
                if not R['det'][iou_idx][jmax]:
                    tp_thr[iou_idx][d] = 1.
                    R['det'][iou_idx][jmax] = 1
                else:
                    fp_thr[iou_idx][d] = 1.
            else:
                fp_thr[iou_idx][d] = 1.

    ret = []
    for iou_idx, thresh in enumerate(iou_thr):
        # compute precision recall
        fp = np.cumsum(fp_thr[iou_idx])
        tp = np.cumsum(tp_thr[iou_idx])
        recall = tp / float(npos)
        # avoid divide by zero in case the first detection matches a difficult
        # ground truth
        precision = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
        ap = average_precision(recall, precision)
        ret.append((recall, precision, ap))

    return ret


def eval_map_recall(pred, gt, ovthresh=None, area_range=None):
    """Evaluate mAP and recall.

    Generic functions to compute precision/recall for object detection
        for multiple classes.

    Args:
        pred (dict): Information of detection results,
            which maps class_id and predictions.
        gt (dict): Information of ground truths, which maps class_id and
            ground truths.
        ovthresh (list[float], optional): iou threshold. Default: None.

    Return:
        tuple[dict]: dict results of recall, AP, and precision for all classes.
    """

    ret_values = {}
    for classname in gt.keys():
        if classname in pred:
            ret_values[classname] = eval_det_cls(pred[classname],
                                                 gt[classname], ovthresh, area_range)
    recall = [{} for i in ovthresh]
    precision = [{} for i in ovthresh]
    ap = [{} for i in ovthresh]

    for label in gt.keys():
        for iou_idx, thresh in enumerate(ovthresh):
            if label in pred:
                recall[iou_idx][label], precision[iou_idx][label], ap[iou_idx][
                    label] = ret_values[label][iou_idx]
            else:
                recall[iou_idx][label] = np.zeros(1)
                precision[iou_idx][label] = np.zeros(1)
                ap[iou_idx][label] = np.zeros(1)

    return recall, precision, ap


def save_pr_curve(recs, precs, metrics):
    color = list(DEFAULT_COLOR_MAP.values())
    for rec, prec, metric in zip(recs, precs, metrics):
        plt.figure(figsize=(20, 18))
        fontsize = 50
        font = 'Times New Roman'
        for k in rec.keys():
            if np.isnan(rec[k]).any():
                continue
            plt.plot(rec[k], prec[k])
        plt.xlabel('Recall', fontsize=fontsize)
        plt.ylabel('Precision', fontsize=fontsize)
        plt.savefig(f'./pr_curver_{metric}.png',dpi=500.)


def indoor_eval(gt_annos,
                dt_annos,
                metric,
                label2cat,
                logger=None,
                box_mode_3d=None,
                classes_split=None,
                part="Overall"):
    """Indoor Evaluation.

    Evaluate the result of the detection.

    Args:
        gt_annos (list[dict]): Ground truth annotations.
        dt_annos (list[dict]): Detection annotations. the dict
            includes the following keys

            - labels_3d (torch.Tensor): Labels of boxes.
            - bboxes_3d (:obj:`BaseInstance3DBoxes`):
                3D bounding boxes in Depth coordinate.
            - scores_3d (torch.Tensor): Scores of boxes.
        metric (list[float]): IoU thresholds for computing average precisions.
        label2cat (tuple): Map from label to category.
        logger (logging.Logger | str, optional): The way to print the mAP
            summary. See `mmdet.utils.print_log()` for details. Default: None.

    Return:
        dict[str, float]: Dict of results.
    """
    assert len(dt_annos) == len(gt_annos)
    pred = {}  # map {class_id: pred}
    gt = {}  # map {class_id: gt}
    for img_id in range(len(dt_annos)):
        # parse detected annotations
        det_anno = dt_annos[img_id]
        for i in range(len(det_anno['labels_3d'])):
            label = det_anno['labels_3d'].numpy()[i]
            bbox = det_anno['bboxes_3d'].convert_to(box_mode_3d)[i]
            score = det_anno['scores_3d'].numpy()[i]
            if label not in pred:
                pred[int(label)] = {}
            if img_id not in pred[label]:
                pred[int(label)][img_id] = []
            if label not in gt:
                gt[int(label)] = {}
            if img_id not in gt[label]:
                gt[int(label)][img_id] = []
            pred[int(label)][img_id].append((bbox, score))

        # parse gt annotations
        gt_anno = gt_annos[img_id]

        gt_boxes = gt_anno['gt_bboxes_3d']
        labels_3d = gt_anno['gt_labels_3d']

        for i in range(len(labels_3d)):
            label = labels_3d[i]
            bbox = gt_boxes[i]
            if label not in gt:
                gt[label] = {}
            if img_id not in gt[label]:
                gt[label][img_id] = []
            gt[label][img_id].append(bbox)

    rec, prec, ap = eval_map_recall(pred, gt, metric)

    # filter nan results
    ori_keys = list(ap[0].keys())
    for key in ori_keys:
        if np.isnan(ap[0][key][0]):
            for r in rec:
                del r[key]
            for p in prec:
                del p[key]
            for a in ap:
                del a[key]

    ret_dict = dict()
    header = ['classes']
    table_columns = [[label2cat[label]
                      for label in ap[0].keys()] + [part]]

    for i, iou_thresh in enumerate(metric):
        header.append(f'AP_{iou_thresh:.2f}')
        header.append(f'AR_{iou_thresh:.2f}')
        rec_list = []
        for label in ap[i].keys():
            ret_dict[f'{label2cat[label]}_AP_{iou_thresh:.2f}'] = float(
                ap[i][label][0])
        ret_dict[f'mAP_{iou_thresh:.2f}'] = float(np.mean(list(
            ap[i].values())))

        table_columns.append(list(map(float, list(ap[i].values()))))
        table_columns[-1] += [ret_dict[f'mAP_{iou_thresh:.2f}']]
        table_columns[-1] = [f'{x:.4f}' for x in table_columns[-1]]

        for label in rec[i].keys():
            ret_dict[f'{label2cat[label]}_rec_{iou_thresh:.2f}'] = float(
                rec[i][label][-1])
            rec_list.append(rec[i][label][-1])
        ret_dict[f'mAR_{iou_thresh:.2f}'] = float(np.mean(rec_list))

        table_columns.append(list(map(float, rec_list)))
        table_columns[-1] += [ret_dict[f'mAR_{iou_thresh:.2f}']]
        table_columns[-1] = [f'{x:.4f}' for x in table_columns[-1]]

    table_data = [header]
    table_rows = list(zip(*table_columns))
    table_data += table_rows
    table = AsciiTable(table_data)
    table.inner_footing_row_border = True
    print_log('\n' + table.table, logger=logger)

    if classes_split is not None:
        splits = ['head', 'common', 'tail']
        for idx in range(len(splits)):
            header = [f'{splits[idx]}_classes']
            # init the category list/column
            cat_list = []
            for label in classes_split[idx]:
                if label in ap[0]:
                    cat_list.append(label2cat[label])
            table_columns = [cat_list + [part]]

            for i, iou_thresh in enumerate(metric):
                header.append(f'AP_{iou_thresh:.2f}')
                header.append(f'AR_{iou_thresh:.2f}')
                ap_list = []
                for label in classes_split[idx]:
                    if label in ap[i]:
                        ap_list.append(float(ap[i][label][0]))
                mean_ap = float(np.mean(ap_list))

                table_columns.append(list(map(float, ap_list)))
                table_columns[-1] += [mean_ap]
                table_columns[-1] = [f'{x:.4f}' for x in table_columns[-1]]

                rec_list = []
                for label in classes_split[idx]:
                    if label in rec[i]:
                        rec_list.append(rec[i][label][-1])
                mean_rec = float(np.mean(rec_list))

                table_columns.append(list(map(float, rec_list)))
                table_columns[-1] += [mean_rec]
                table_columns[-1] = [f'{x:.4f}' for x in table_columns[-1]]

            table_data = [header]
            table_rows = list(zip(*table_columns))
            table_data += table_rows
            table = AsciiTable(table_data)
            table.inner_footing_row_border = True
            print_log('\n' + table.table, logger=logger)

    area_split = {
        "small": [0, 0.2**3],
        "medium": [0.2**3, 1.0**3],
        "large": [1.0**3, float("inf")],
    }
    if area_split is not None:
        header = ["area"]
        for i, iou_thresh in enumerate(metric):
            header.append(f'AP_{iou_thresh:.2f}')
            header.append(f'AR_{iou_thresh:.2f}')
        table_rows = []
        area_ret_dict = dict()
        for idx, (area, area_range) in enumerate(area_split.items()):
            # header = [area]
            rec, prec, ap = eval_map_recall(pred, gt, metric, area_range)
            ori_keys = list(ap[0].keys())
            for key in ori_keys:
                if np.isnan(ap[0][key][0]):
                    for r in rec:
                        del r[key]
                    for p in prec:
                        del p[key]
                    for a in ap:
                        del a[key]

            # table_columns = [[f"{area}_{part}"]]
            table_rows.append([f"{area}_{part}"])
            for i, iou_thresh in enumerate(metric):
                # header.append(f'AP_{iou_thresh:.2f}')
                # header.append(f'AR_{iou_thresh:.2f}')
                rec_list = []
                for label in ap[i].keys():
                    area_ret_dict[f'{label2cat[label]}_AP_{iou_thresh:.2f}'] = float(
                        ap[i][label][0])
                area_ret_dict[f'mAP_{iou_thresh:.2f}'] = float(np.mean(list(
                    ap[i].values())))

                table_rows[-1].append(f"{area_ret_dict[f'mAP_{iou_thresh:.2f}']:.4f}")
                # table_columns.append([f"{area_ret_dict[f'mAP_{iou_thresh:.2f}']:.4f}"])

                for label in rec[i].keys():
                    area_ret_dict[f'{label2cat[label]}_rec_{iou_thresh:.2f}'] = float(
                        rec[i][label][-1])
                    rec_list.append(rec[i][label][-1])
                area_ret_dict[f'mAR_{iou_thresh:.2f}'] = float(np.mean(rec_list))

                table_rows[-1].append(f"{area_ret_dict[f'mAR_{iou_thresh:.2f}']:.4f}")
                # table_columns.append([f"{area_ret_dict[f'mAR_{iou_thresh:.2f}']:.4f}"])

            # table_data = [header]
            # table_rows = list(zip(*table_columns))
            # table_data += table_rows
            # table = AsciiTable(table_data)
            # table.inner_footing_row_border = True
            # print_log('\n' + table.table, logger=logger)
        table_data = [header]
        table_data += table_rows
        table = AsciiTable(table_data)
        table.inner_footing_row_border = False
        print_log('\n' + table.table, logger=logger)
    return ret_dict


================================================
FILE: bip3d/eval/metrics/__init__.py
================================================
from .det_metric import IndoorDetMetric
from .grounding_metric import GroundingMetric
# from .occupancy_metric import OccupancyMetric

__all__ = ['IndoorDetMetric', 'GroundingMetric']


================================================
FILE: bip3d/eval/metrics/det_metric.py
================================================
# Copyright (c) OpenRobotLab. All rights reserved.
import logging
from collections import OrderedDict
from typing import Dict, List, Optional, Sequence, Union

import numpy as np
from terminaltables import AsciiTable
from mmdet.evaluation import eval_map
from mmengine.dist import (broadcast_object_list, collect_results,
                           is_main_process, get_rank)
from mmengine.evaluator import BaseMetric
from mmengine.evaluator.metric import _to_cpu
from mmengine.logging import MMLogger, print_log

from bip3d.registry import METRICS
from bip3d.structures import get_box_type

from ..indoor_eval import indoor_eval


@METRICS.register_module()
class IndoorDetMetric(BaseMetric):
    """Indoor scene evaluation metric.

    Args:
        iou_thr (float or List[float]): List of iou threshold when calculate
            the metric. Defaults to [0.25, 0.5].
        collect_device (str): Device name used for collecting results from
            different ranks during distributed training. Must be 'cpu' or
            'gpu'. Defaults to 'cpu'.
        prefix (str, optional): The prefix that will be added in the metric
            names to disambiguate homonymous metrics of different evaluators.
            If prefix is not provided in the argument, self.default_prefix will
            be used instead. Defaults to None.
    """

    def __init__(self,
                 iou_thr: List[float] = [0.25, 0.5],
                 collect_device: str = 'cpu',
                 prefix: Optional[str] = None,
                 batchwise_anns: bool = False,
                 save_result_path=None,
                 eval_part=None,
                 **kwargs):
        super(IndoorDetMetric, self).__init__(prefix=prefix,
                                              collect_device=collect_device,
                                              **kwargs)
        self.iou_thr = [iou_thr] if isinstance(iou_thr, float) else iou_thr
        self.batchwise_anns = batchwise_anns
        self.save_result_path = save_result_path
        if eval_part is None:
            eval_part = ["scannet", "3rscan", "matterport3d", "arkit"]
        self.eval_part = eval_part

    def process(self, data_batch: dict, data_samples: Sequence[dict]):
        """Process one batch of data samples and predictions.

        The processed results should be stored in ``self.results``, which will
        be used to compute the metrics when all batches have been processed.

        Args:
            data_batch (dict): A batch of data from the dataloader.
            data_samples (Sequence[dict]): A batch of outputs from the model.
        """
        for data_sample in data_samples:
            pred_3d = data_sample['pred_instances_3d']
            eval_ann_info = data_sample['eval_ann_info']
            eval_ann_info["scan_id"] = data_sample["scan_id"]
            cpu_pred_3d = dict()
            for k, v in pred_3d.items():
                if hasattr(v, 'to'):
                    cpu_pred_3d[k] = v.to('cpu')
                else:
                    cpu_pred_3d[k] = v
            self.results.append((eval_ann_info, cpu_pred_3d))

    def compute_metrics(self, results: list, part="Overall", split=True):
        """Compute the metrics from processed results.

        Args:
            results (list): The processed results of each batch.

        Returns:
            Dict[str, float]: The computed metrics. The keys are the names of
            the metrics, and the values are corresponding results.
        """
        logger: MMLogger = MMLogger.get_current_instance()
        ann_infos = []
        pred_results = []

        for eval_ann, sinlge_pred_results in results:
            ann_infos.append(eval_ann)
            pred_results.append(sinlge_pred_results)

        # some checkpoints may not record the key "box_type_3d"
        box_type_3d, box_mode_3d = get_box_type(
            self.dataset_meta.get('box_type_3d', 'depth'))

        print_log(f"eval : {len(ann_infos)}, {len(pred_results)}, rank: {get_rank()}")
        ret_dict = indoor_eval(ann_infos,
                               pred_results,
                               self.iou_thr,
                               self.dataset_meta['classes'],
                               logger=logger,
                               box_mode_3d=box_mode_3d,
                               classes_split=self.dataset_meta.get(
                                   'classes_split', None) if split else None,
                               part=part)

        return ret_dict

    def evaluate(self, size: int):
        """Evaluate the model performance of the whole dataset after processing
        all batches.

        Args:
            size (int): Length of the entire validation dataset. When batch
                size > 1, the dataloader may pad some data samples to make
                sure all ranks have the same length of dataset slice. The
                ``collect_results`` function will drop the padded data based on
                this size.

        Returns:
            dict: Evaluation metrics dict on the val dataset. The keys are the
            names of the metrics, and the values are corresponding results.
        """
        if len(self.results) == 0:
            print_log(
                f'{self.__class__.__name__} got empty `self.results`. Please '
                'ensure that the processed results are properly added into '
                '`self.results` in `process` method.',
                logger='current',
                level=logging.WARNING)
        print_log(
            f"number of results: "
            f"{len(self.results)}, size: {size}, "
            f"batchwise_anns: {self.batchwise_anns}, rank: {get_rank()}, "
            f"collect_dir: {self.collect_dir}"
        )
        if self.batchwise_anns:
            # the actual dataset length/size is the len(self.results)
            if self.collect_device == 'cpu':
                results = collect_results(self.results,
                                          len(self.results),
                                          self.collect_device,
                                          tmpdir=self.collect_dir)
            else:
                results = collect_results(self.results, len(self.results),
                                          self.collect_device)
        else:
            if self.collect_device == 'cpu':
                results = collect_results(self.results,
                                          size,
                                          self.collect_device,
                                          tmpdir=self.collect_dir)
            else:
                results = collect_results(self.results, size,
                                          self.collect_device)

        print_log(
            f"number of collected results: "
            f"{len(results) if results is not None else 'None'},"
            f"rank: {get_rank()}"
        )
        if is_main_process():
            # cast all tensors in results list to cpu
            results = _to_cpu(results)
            # import pdb; pdb.set_trace()
            if self.save_result_path is not None:
                import pickle
                import copy
                _ret = []
                for x in results:
                    x = list(copy.deepcopy(x))
                    x[0]["gt_bboxes_3d"] = x[0]["gt_bboxes_3d"].cpu().numpy()
                    x[1]["bboxes_3d"] = x[1]["bboxes_3d"].cpu().numpy()
                    x[1]["scores_3d"] = x[1]["scores_3d"].cpu().numpy()
                    x[1]["labels_3d"] = x[1]["labels_3d"].cpu().numpy()
                    x[1]["target_scores_3d"] = x[1]["target_scores_3d"].cpu().numpy()
                    _ret.append(x)
                pickle.dump(_ret, open(os.path.join(self.save_result_path, "results.pkl"), "wb"))

            _metrics = self.compute_metrics(results)  # type: ignore
            for part in self.eval_part:
            # for part in []:
                print(f"=================== {part} =========================")
                part_ret = [x for x in results if part in x[0]["scan_id"]]
                if len(part_ret) > 0:
                    _metrics_part = self.compute_metrics(part_ret, part=part)
                    _metrics_part = {f"{part}--{k}": v for k, v in _metrics_part.items()}
                    _metrics.update(_metrics_part)
            # Add prefix to metric names
            if self.prefix:
                _metrics = {
                    '/'.join((self.prefix, k)): v
                    for k, v in _metrics.items()
                }
            metrics = [_metrics]
        else:
            metrics = [None]  # type: ignore

        broadcast_object_list(metrics)

        if is_main_process():
            table = []
            header = ["Summary"]
            for i, iou_thresh in enumerate(self.iou_thr):
                header.append(f'AP_{iou_thresh:.2f}')
                header.append(f'AR_{iou_thresh:.2f}')
            table.append(header)

            for part in self.eval_part + ["Overall"]:
                table_data = [part]
                for i, iou_thresh in enumerate(self.iou_thr):
                    key = f"{part+'--' if part!='Overall' else ''}mAP_{iou_thresh:.2f}"
                    if key in _metrics:
                        table_data.append(f"{_metrics[key]:.4f}")
                    else:
                        table_data.append(f"0.xxxx")
                    key = f"{part+'--' if part!='Overall' else ''}mAR_{iou_thresh:.2f}"
                    if key in _metrics:
                        table_data.append(f"{_metrics[key]:.4f}")
                    else:
                        table_data.append(f"0.xxxx")
                table.append(table_data)
            table = AsciiTable(table)
            table.inner_footing_row_border = True
            logger: MMLogger = MMLogger.get_current_instance()
            print_log('\n' + table.table, logger=logger)

        # reset the results list
        self.results.clear()
        return metrics[0]


@METRICS.register_module()
class Indoor2DMetric(BaseMetric):
    """indoor 2d predictions evaluation metric.

    Args:
        iou_thr (float or List[float]): List of iou threshold when calculate
            the metric. Defaults to [0.5].
        collect_device (str): Device name used for collecting results from
            different ranks during distributed training. Must be 'cpu' or
            'gpu'. Defaults to 'cpu'.
        prefix (str, optional): The prefix that will be added in the metric
            names to disambiguate homonymous metrics of different evaluators.
            If prefix is not provided in the argument, self.default_prefix will
            be used instead. Defaults to None.
    """

    def __init__(self,
                 iou_thr: Union[float, List[float]] = [0.5],
                 collect_device: str = 'cpu',
                 prefix: Optional[str] = None):
        super(Indoor2DMetric, self).__init__(prefix=prefix,
                                             collect_device=collect_device)
        self.iou_thr = [iou_thr] if isinstance(iou_thr, float) else iou_thr

    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
        """Process one batch of data samples and predictions.

        The processed results should be stored in ``self.results``, which will
        be used to compute the metrics when all batches have been processed.

        Args:
            data_batch (dict): A batch of data from the dataloader.
            data_samples (Sequence[dict]): A batch of outputs from the model.
        """
        for data_sample in data_samples:
            pred = data_sample['pred_instances']
            eval_ann_info = data_sample['eval_ann_info']
            ann = dict(labels=eval_ann_info['gt_bboxes_labels'],
                       bboxes=eval_ann_info['gt_bboxes'])

            pred_bboxes = pred['bboxes'].cpu().numpy()
            pred_scores = pred['scores'].cpu().numpy()
            pred_labels = pred['labels'].cpu().numpy()

            dets = []
            for label in range(len(self.dataset_meta['classes'])):
                index = np.where(pred_labels == label)[0]
                pred_bbox_scores = np.hstack(
                    [pred_bboxes[index], pred_scores[index].reshape((-1, 1))])
                dets.append(pred_bbox_scores)

            self.results.append((ann, dets))

    def compute_metrics(self, results: list) -> Dict[str, float]:
        """Compute the metrics from processed results.

        Args:
            results (list): The processed results of each batch.

        Returns:
            Dict[str, float]: The computed metrics. The keys are the names of
            the metrics, and the values are corresponding results.
        """
        logger: MMLogger = MMLogger.get_current_instance()
        annotations, preds = zip(*results)
        eval_results = OrderedDict()
        for iou_thr_2d_single in self.iou_thr:
            mean_ap, _ = eval_map(preds,
                                  annotations,
                                  scale_ranges=None,
                                  iou_thr=iou_thr_2d_single,
                                  dataset=self.dataset_meta['classes'],
                                  logger=logger)
            eval_results['mAP_' + str(iou_thr_2d_single)] = mean_ap
        return eval_results


================================================
FILE: bip3d/eval/metrics/grounding_metric.py
================================================
# Copyright (c) OpenRobotLab. All rights reserved.
import os
from typing import Dict, List, Optional, Sequence
import pickle

import mmengine
from mmengine.evaluator import BaseMetric
from mmengine.logging import MMLogger, print_log
from terminaltables import AsciiTable

from bip3d.registry import METRICS
from bip3d.structures import EulerDepthInstance3DBoxes


@METRICS.register_module()
class GroundingMetric(BaseMetric):
    """Lanuage grounding evaluation metric. We calculate the grounding
    performance based on the alignment score of each bbox with the input
    prompt.

    Args:
        iou_thr (float or List[float]): List of iou threshold when calculate
            the metric. Defaults to [0.25, 0.5].
        collect_device (str): Device name used for collecting results from
            different ranks during distributed training. Must be 'cpu' or
            'gpu'. Defaults to 'cpu'.
        prefix (str, optional): The prefix that will be added in the metric
            names to disambiguate homonymous metrics of different evaluators.
            If prefix is not provided in the argument, self.default_prefix will
            be used instead. Defaults to None.
        format_only (bool): Whether to only inference the predictions without
            evaluation. Defaults to False.
        result_dir (str): Dir to save results, e.g., if result_dir = './',
            the result file will be './test_results.json'. Defaults to ''.
    """

    def __init__(self,
                 iou_thr: List[float] = [0.25, 0.5],
                 collect_device: str = 'cpu',
                 prefix: Optional[str] = None,
                 format_only=False,
                 submit_info=None,
                 result_dir='', **kwargs) -> None:
        super(GroundingMetric, self).__init__(prefix=prefix,
                                              collect_device=collect_device,
                                              **kwargs)
        self.iou_thr = [iou_thr] if isinstance(iou_thr, float) else iou_thr
        self.prefix = prefix
        self.format_only = format_only
        self.result_dir = result_dir
        self.submit_info = submit_info if submit_info is not None else {}

    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
        """Process one batch of data samples and predictions.

        The processed results should be stored in ``self.results``, which will
        be used to compute the metrics when all batches have been processed.

        Args:
            data_batch (dict): A batch of data from the dataloader.
            data_samples (Sequence[dict]): A batch of outputs from the model.
        """
        for data_sample in data_samples:
            pred_3d = data_sample['pred_instances_3d']
            eval_ann_info = data_sample['eval_ann_info']
            eval_ann_info["scan_id"] = data_sample["scan_id"]
            cpu_pred_3d = dict()
            for k, v in pred_3d.items():
                if hasattr(v, 'to'):
                    cpu_pred_3d[k] = v.to('cpu')
                else:
                    cpu_pred_3d[k] = v
            self.results.append((eval_ann_info, cpu_pred_3d))

    def ground_eval(self, gt_annos, det_annos, logger=None, part="Overall"):

        assert len(det_annos) == len(gt_annos)

        pred = {}
        gt = {}

        object_types = [
            'Easy', 'Hard', 'View-Dep', 'View-Indep', 'Unique', 'Multi',
            'Overall'
        ]

        for t in self.iou_thr:
            for object_type in object_types:
                pred.update({object_type + '@' + str(t): 0})
                gt.update({object_type + '@' + str(t): 1e-14})

        for sample_id in range(len(det_annos)):
            det_anno = det_annos[sample_id]
            gt_anno = gt_annos[sample_id]
            target_scores = det_anno['target_scores_3d']  # (num_query, )

            bboxes = det_anno['bboxes_3d']
            gt_bboxes = gt_anno['gt_bboxes_3d']
            bboxes = EulerDepthInstance3DBoxes(bboxes.tensor,
                                               origin=(0.5, 0.5, 0.5))
            gt_bboxes = EulerDepthInstance3DBoxes(gt_bboxes.tensor,
                                                  origin=(0.5, 0.5, 0.5))

            view_dep = gt_anno['is_view_dep']
            hard = gt_anno['is_hard']
            unique = gt_anno['is_unique']

            box_index = target_scores.argsort(dim=-1, descending=True)[:10]
            top_bbox = bboxes[box_index]

            iou = top_bbox.overlaps(top_bbox, gt_bboxes)  # (num_query, 1)

            for t in self.iou_thr:
                threshold = iou > t
                found = int(threshold.any())
                if view_dep:
                    gt['View-Dep@' + str(t)] += 1
                    pred['View-Dep@' + str(t)] += found
                else:
                    gt['View-Indep@' + str(t)] += 1
                    pred['View-Indep@' + str(t)] += found
                if hard:
                    gt['Hard@' + str(t)] += 1
                    pred['Hard@' + str(t)] += found
                else:
                    gt['Easy@' + str(t)] += 1
                    pred['Easy@' + str(t)] += found
                if unique:
                    gt['Unique@' + str(t)] += 1
                    pred['Unique@' + str(t)] += found
                else:
                    gt['Multi@' + str(t)] += 1
                    pred['Multi@' + str(t)] += found

                gt['Overall@' + str(t)] += 1
                pred['Overall@' + str(t)] += found

        header = ['Type']
        header.extend(object_types)
        ret_dict = {}

        for t in self.iou_thr:
            if part == "Overall":
                table_columns = [[f'AP@{t:.2f}']]
            else:
                table_columns = [[f'{part}_AP@{t:.2f}']]

            for object_type in object_types:
                metric = object_type + '@' + str(t)
                value = pred[metric] / max(gt[metric], 1)
                ret_dict[metric] = value
                table_columns.append([f'{value:.4f}'])

            table_data = [header]
            table_rows = list(zip(*table_columns))
            table_data += table_rows
            table = AsciiTable(table_data)
            table.inner_footing_row_border = True
            print_log('\n' + table.table, logger=logger)

        return ret_dict

    def compute_metrics(self, results: list) -> Dict[str, float]:
        """Compute the metrics from processed results after all batches have
        been processed.

        Args:
            results (list): The processed results of each batch.

        Returns:
            Dict[str, float]: The computed metrics. The keys are the names of
            the metrics, and the values are corresponding results.
        """
        logger: MMLogger = MMLogger.get_current_instance()  # noqa
        annotations, preds = zip(*results)
        ret_dict = {}
        if self.format_only:
            # preds is a list of dict
            results = []
            for pred in preds:
                result = dict()
                # convert the Euler boxes to the numpy array to save
                bboxes_3d = pred['bboxes_3d'].tensor
                scores_3d = pred['scores_3d']
                # Note: hard-code save top-20 predictions
                # eval top-10 predictions during the test phase by default
                box_index = scores_3d.argsort(dim=-1, descending=True)[:20]
                top_bboxes_3d = bboxes_3d[box_index]
                top_scores_3d = scores_3d[box_index]
                result['bboxes_3d'] = top_bboxes_3d.numpy()
                result['scores_3d'] = top_scores_3d.numpy()
                results.append(result)
            mmengine.dump(results,
                          os.path.join(self.result_dir, 'test_results.json'))
            for x in results:
                x["bboxes_3d"] = x["bboxes_3d"].tolist()
                x["scores_3d"] = x["scores_3d"].tolist()
            submission = {
                'method': 'xxx',
                'team': 'xxx',
                'authors': ['xxx'],
                'e-mail': 'xxx',
                'institution': 'xxx',
                'country': 'xxx',
                'results': results,
            }
            submission.update(self.submit_info)
            pickle.dump(submission, open(os.path.join(self.result_dir, 'submission.pkl'), "wb"))
            return ret_dict

        ret_dict = self.ground_eval(annotations, preds)
        for part in ["scannet", "3rscan", "matterport3d", "arkit"]:
            ann = [x for x in annotations if part in x["scan_id"]]
            pred = [y for x,y in zip(annotations, preds) if part in x["scan_id"]]
            if len(pred) > 0:
                part_ret_dict= self.ground_eval(ann, pred, part=part)
                part_ret_dict = {f"{part}--{k}": v for k, v in part_ret_dict.items()}
                ret_dict.update(part_ret_dict)

        return ret_dict


================================================
FILE: bip3d/grid_mask.py
================================================
import torch
import torch.nn as nn
import numpy as np
from PIL import Image


class Grid(object):
    def __init__(
        self, use_h, use_w, rotate=1, offset=False, ratio=0.5, mode=0, prob=1.0
    ):
        self.use_h = use_h
        self.use_w = use_w
        self.rotate = rotate
        self.offset = offset
        self.ratio = ratio
        self.mode = mode
        self.st_prob = prob
        self.prob = prob

    def set_prob(self, epoch, max_epoch):
        self.prob = self.st_prob * epoch / max_epoch

    def __call__(self, img, label):
        if np.random.rand() > self.prob:
            return img, label
        h = img.size(1)
        w = img.size(2)
        self.d1 = 2
        self.d2 = min(h, w)
        hh = int(1.5 * h)
        ww = int(1.5 * w)
        d = np.random.randint(self.d1, self.d2)
        if self.ratio == 1:
            self.l = np.random.randint(1, d)
        else:
            self.l = min(max(int(d * self.ratio + 0.5), 1), d - 1)
        mask = np.ones((hh, ww), np.float32)
        st_h = np.random.randint(d)
        st_w = np.random.randint(d)
        if self.use_h:
            for i in range(hh // d):
                s = d * i + st_h
                t = min(s + self.l, hh)
                mask[s:t, :] *= 0
        if self.use_w:
            for i in range(ww // d):
                s = d * i + st_w
                t = min(s + self.l, ww)
                mask[:, s:t] *= 0

        r = np.random.randint(self.rotate)
        mask = Image.fromarray(np.uint8(mask))
        mask = mask.rotate(r)
        mask = np.asarray(mask)
        mask = mask[
            (hh - h) // 2 : (hh - h) // 2 + h,
            (ww - w) // 2 : (ww - w) // 2 + w,
        ]

        mask = torch.from_numpy(mask).float()
        if self.mode == 1:
            mask = 1 - mask

        mask = mask.expand_as(img)
        if self.offset:
            offset = torch.from_numpy(2 * (np.random.rand(h, w) - 0.5)).float()
            offset = (1 - mask) * offset
            img = img * mask + offset
        else:
            img = img * mask

        return img, label


class GridMask(object):
    def __init__(
        self, use_h, use_w, rotate=1, offset=False, ratio=0.5, mode=0, prob=1.0
    ):
        super(GridMask, self).__init__()
        self.use_h = use_h
        self.use_w = use_w
        self.rotate = rotate
        self.offset = offset
        self.ratio = ratio
        self.mode = mode
        self.st_prob = prob
        self.prob = prob

    def set_prob(self, epoch, max_epoch):
        self.prob = self.st_prob * epoch / max_epoch  # + 1.#0.5

    def __call__(self, x, offset=None):
        if np.random.rand() > self.prob:
            return x
        n, c, h, w = x.size()
        x = x.view(-1, h, w)
        hh = int(1.5 * h)
        ww = int(1.5 * w)
        d = np.random.randint(2, h)
        self.l = min(max(int(d * self.ratio + 0.5), 1), d - 1)
        mask = np.ones((hh, ww), np.float32)
        st_h = np.random.randint(d)
        st_w = np.random.randint(d)
        if self.use_h:
            for i in range(hh // d):
                s = d * i + st_h
                t = min(s + self.l, hh)
                mask[s:t, :] *= 0
        if self.use_w:
            for i in range(ww // d):
                s = d * i + st_w
                t = min(s + self.l, ww)
                mask[:, s:t] *= 0

        r = np.random.randint(self.rotate)
        mask = Image.fromarray(np.uint8(mask))
        mask = mask.rotate(r)
        mask = np.asarray(mask)
        mask = mask[
            (hh - h) // 2 : (hh - h) // 2 + h,
            (ww - w) // 2 : (ww - w) // 2 + w,
        ]

        mask = torch.tensor(mask).float().cuda()
        if self.mode == 1:
            mask = 1 - mask
        mask = mask.expand_as(x)
        if offset is not None:
            x = x.view(n, c, h, w)
            mask = mask.view(n, c, h, w)
            x = x * mask + offset * (1 - mask)
            return x
        elif self.offset:
            offset = (
                torch.from_numpy(2 * (np.random.rand(h, w) - 0.5))
                .float()
                .cuda()
            )
            x = x * mask + offset * (1 - mask)
        else:
            x = x * mask

        return x.view(n, c, h, w)


================================================
FILE: bip3d/models/__init__.py
================================================
from .structure import *
from .feature_enhancer import *
from .spatial_enhancer import *
from .bbox3d_decoder import *
from .instance_bank import *
from .target import *
from .data_preprocessors import *
from .deformable_aggregation import *
from .bert import *


================================================
FILE: bip3d/models/base_target.py
================================================
from torch import nn
from abc import ABC, abstractmethod


__all__ = ["BaseTargetWithDenoising"]


class BaseTargetWithDenoising(ABC, nn.Module):
    def __init__(self, num_dn=0, num_temp_dn_groups=0):
        super(BaseTargetWithDenoising, self).__init__()
        self.num_dn = num_dn
        self.num_temp_dn_groups = num_temp_dn_groups
        self.dn_metas = None

    @abstractmethod
    def sample(self, cls_pred, box_pred, cls_target, box_target):
        """
        Perform Hungarian matching between predictions and ground truth,
        returning the matched ground truth corresponding to the predictions
        along with the corresponding regression weights.
        """

    def get_dn_anchors(self, cls_target, box_target, *args, **kwargs):
        """
        Generate noisy instances for the current frame, with a total of
        'self.num_dn_groups' groups.
        """
        return None

    def update_dn(self, instance_feature, anchor, *args, **kwargs):
        """
        Insert the previously saved 'self.dn_metas' into the noisy instances
        of the current frame.
        """

    def cache_dn(
        self,
        dn_instance_feature,
        dn_anchor,
        dn_cls_target,
        valid_mask,
        dn_id_target,
    ):
        """
        Randomly save information for 'self.num_temp_dn_groups' groups of
        temporal noisy instances to 'self.dn_metas'.
        """
        if self.num_temp_dn_groups < 0:
            return
        self.dn_metas = dict(dn_anchor=dn_anchor[:, : self.num_temp_dn_groups])


================================================
FILE: bip3d/models/bbox3d_decoder.py
================================================
import math
from typing import List, Optional, Tuple, Union

import numpy as np
import torch
from torch import nn
from torch.cuda.amp.autocast_mode import autocast

from mmcv.cnn import Linear, Scale
from mmcv.ops import nms3d_normal, nms3d
from mmengine.model import BaseModel
from mmengine.structures import InstanceData
from mmdet.models.layers import SinePositionalEncoding
from mmdet.models.layers.transformer.deformable_detr_layers import (
    DeformableDetrTransformerEncoder as DDTE,
)
from mmdet.models.layers.transformer.utils import MLP

from mmdet.utils import reduce_mean
from mmdet.models.detectors.glip import create_positive_map_label_to_token
from mmdet.models.dense_heads.atss_vlfusion_head import (
    convert_grounding_to_cls_scores,
)

from bip3d.registry import MODELS, TASK_UTILS
from bip3d.structures.bbox_3d.utils import rotation_3d_in_euler
from .utils import (
    deformable_format,
    wasserstein_distance,
    permutation_corner_distance,
    center_distance,
    get_positive_map,
    get_entities,
    linear_act_ln,
)

__all__ = ["BBox3DDecoder"]


X, Y, Z, W, L, H, ALPHA, BETA, GAMMA = range(9)


def decode_box(box, min_size=None, max_size=None):
    size = box[..., 3:6].exp()
    # size = box[..., 3:6]
    if min_size is not None or max_size is not None:
        size = size.clamp(min=min_size, max=max_size)
    box = torch.cat(
        [box[..., :3], size, box[..., 6:]],
        dim=-1,
    )
    return box


@MODELS.register_module()
class BBox3DDecoder(BaseModel):
    def __init__(
        self,
        instance_bank: dict,
        anchor_encoder: dict,
        graph_model: dict,
        norm_layer: dict,
        ffn: dict,
        deformable_model: dict,
        refine_layer: dict,
        num_decoder: int = 6,
        num_single_frame_decoder: int = -1,
        temp_graph_model: dict = None,
        text_cross_attn: dict = None,
        loss_cls: dict = None,
        loss_reg: dict = None,
        post_processor: dict = None,
        sampler: dict = None,
        gt_cls_key: str = "gt_labels_3d",
        gt_reg_key: str = "gt_bboxes_3d",
        gt_id_key: str = "instance_id",
        with_instance_id: bool = True,
        task_prefix: str = "det",
        reg_weights: List = None,
        operation_order: Optional[List[str]] = None,
        cls_threshold_to_reg: float = -1,
        look_forward_twice: bool = False,
        init_cfg: dict = None,
        **kwargs,
    ):
        super().__init__(init_cfg)
        self.num_decoder = num_decoder
        self.num_single_frame_decoder = num_single_frame_decoder
        self.gt_cls_key = gt_cls_key
        self.gt_reg_key = gt_reg_key
        self.gt_id_key = gt_id_key
        self.with_instance_id = with_instance_id
        self.task_prefix = task_prefix
        self.cls_threshold_to_reg = cls_threshold_to_reg
        self.look_forward_twice = look_forward_twice

        if reg_weights is None:
            self.reg_weights = [1.0] * 9
        else:
            self.reg_weights = reg_weights

        if operation_order is None:
            operation_order = [
                "gnn",
                "norm",
                "text_cross_attn",
                "norm",
                "deformable",
                "norm",
                "ffn",
                "norm",
                "refine",
            ] * num_decoder
        self.operation_order = operation_order

        # =========== build modules ===========
        def build(cfg, registry=MODELS):
            if cfg is None:
                return None
            return registry.build(cfg)

        self.instance_bank = build(instance_bank)
        self.anchor_encoder = build(anchor_encoder)
        self.sampler = build(sampler, TASK_UTILS)
        self.post_processor = build(post_processor, TASK_UTILS)
        self.loss_cls = build(loss_cls)
        self.loss_reg = build(loss_reg)
        self.op_config_map = {
            "temp_gnn": temp_graph_model,
            "gnn": graph_model,
            "norm": norm_layer,
            "ffn": ffn,
            "deformable": deformable_model,
            "text_cross_attn": text_cross_attn,
            "refine": refine_layer,
        }
        self.layers = nn.ModuleList(
            [
                build(self.op_config_map.get(op, None))
                for op in self.operation_order
            ]
        )
        self.embed_dims = self.instance_bank.embed_dims
        self.norm = nn.LayerNorm(self.embed_dims)

    def init_weights(self):
        from mmengine.model import constant_init
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
        for i, op in enumerate(self.operation_order):
            if op == "refine":
                m = self.layers[i]
                constant_init(m.layers[-2], 0, bias=0)
                constant_init(m.layers[-1], 1)
                nn.init.constant_(m.layers[-2].bias.data[2:], 0.0)

    def forward(
        self,
        feature_maps,
        text_dict=None,
        batch_inputs=None,
        depth_prob=None,
        **kwargs,
    ):
        batch_size = feature_maps[0].shape[0]
        feature_maps = list(deformable_format(feature_maps))

        # ========= get instance info ============
        if (
            self.sampler.dn_metas is not None
            and self.sampler.dn_metas["dn_anchor"].shape[0] != batch_size
        ):
            self.sampler.dn_metas = None
        (
            instance_feature,
            anchor,
            temp_instance_feature,
            temp_anchor,
            time_interval,
        ) = self.instance_bank.get(
            batch_size, batch_inputs, dn_metas=self.sampler.dn_metas
        )

        # ========= prepare for denosing training ============
        # 1. get dn metas: noisy-anchors and corresponding GT
        # 2. concat learnable instances and noisy instances
        # 3. get attention mask
        attn_mask = None
        dn_metas = None
        temp_dn_reg_target = None
        if self.training and hasattr(self.sampler, "get_dn_anchors"):
            dn_metas = self.sampler.get_dn_anchors(
                batch_inputs[self.gt_cls_key],
                batch_inputs[self.gt_reg_key],
                text_dict=text_dict,
                label=batch_inputs["gt_labels_3d"],
            )
        if dn_metas is not None:
            (
                dn_anchor,
                dn_reg_target,
                dn_cls_target,
                dn_attn_mask,
                valid_mask,
                dn_query,
            ) = dn_metas
            num_dn_anchor = dn_anchor.shape[1]
            if dn_anchor.shape[-1] != anchor.shape[-1]:
                remain_state_dims = anchor.shape[-1] - dn_anchor.shape[-1]
                dn_anchor = torch.cat(
                    [
                        dn_anchor,
                        dn_anchor.new_zeros(
                            batch_size, num_dn_anchor, remain_state_dims
                        ),
                    ],
                    dim=-1,
                )
            anchor = torch.cat([anchor, dn_anchor], dim=1)
            if dn_query is None:
                dn_query = instance_feature.new_zeros(
                    batch_size, num_dn_anchor, instance_feature.shape[-1]
                ),
            instance_feature = torch.cat(
                [instance_feature, dn_query], dim=1,
            )
            num_instance = instance_feature.shape[1]
            num_free_instance = num_instance - num_dn_anchor
            attn_mask = anchor.new_ones(
                (num_instance, num_instance), dtype=torch.bool
            )
            attn_mask[:num_free_instance, :num_free_instance] = False
            attn_mask[num_free_instance:, num_free_instance:] = dn_attn_mask
        else:
            num_dn_anchor = None
            num_free_instance = None

        anchor_embed = self.anchor_encoder(anchor)
        if temp_anchor is not None:
            temp_anchor_embed = self.anchor_encoder(temp_anchor)
        else:
            temp_anchor_embed = None

        # =================== forward the layers ====================
        prediction = []
        classification = []
        quality = []
        _anchor = None
        for i, (op, layer) in enumerate(
            zip(self.operation_order, self.layers)
        ):
            if self.layers[i] is None:
                continue
            elif op == "temp_gnn":
                instance_feature = layer(
                    query=instance_feature,
                    key=temp_instance_feature,
                    value=temp_instance_feature,
                    query_pos=anchor_embed,
                    key_pos=temp_anchor_embed,
                    attn_mask=(
                        attn_mask if temp_instance_feature is None else None
                    ),
                )
            elif op == "gnn":
                instance_feature = layer(
                    query=instance_feature,
                    key=instance_feature,
                    value=instance_feature,
                    query_pos=anchor_embed,
                    key_pos=anchor_embed,
                    attn_mask=attn_mask,
                )
            elif op == "norm" or op == "ffn":
                instance_feature = layer(instance_feature)
            elif op == "deformable":
                instance_feature = layer(
                    instance_feature,
                    anchor,
                    anchor_embed,
                    feature_maps,
                    batch_inputs,
                    depth_prob=depth_prob,
                )
            elif op == "text_cross_attn":
                text_feature = text_dict["embedded"]
                instance_feature = layer(
                    query=instance_feature,
                    key=text_feature,
                    value=text_feature,
                    query_pos=anchor_embed,
                    key_padding_mask=~text_dict["text_token_mask"],
                    key_pos=0,
                )
            elif op == "refine":
                _instance_feature = self.norm(instance_feature)
                if self.look_forward_twice:
                    if _anchor is None:
                        _anchor = anchor.clone()
                    _anchor, cls, qt = layer(
                        _instance_feature,
                        _anchor,
                        anchor_embed,
                        time_interval=time_interval,
                        text_feature=text_feature,
                        text_token_mask=text_dict["text_token_mask"],
                    )
                    prediction.append(_anchor)
                    anchor = layer(
                        instance_feature,
                        anchor,
                        anchor_embed,
                        time_interval=time_interval,
                    )[0]
                    anchor_embed = self.anchor_encoder(anchor)
                    _anchor = anchor
                    anchor = anchor.detach()
                else:
                    anchor, cls, qt = layer(
                        _instance_feature,
                        anchor,
                        anchor_embed,
                        time_interval=time_interval,
                        text_feature=text_feature,
                        text_token_mask=text_dict["text_token_mask"],
                    )
                    anchor_embed = self.anchor_encoder(anchor)
                    prediction.append(anchor)
                classification.append(cls)
                quality.append(qt)

                if len(prediction) == self.num_single_frame_decoder:
                    instance_feature, anchor = self.instance_bank.update(
                        instance_feature, anchor if _anchor is None else _anchor,
                        cls, num_dn_anchor
                    )
                    anchor_embed = self.anchor_encoder(anchor)
                    if self.look_forward_twice:
                        _anchor = anchor
                        anchor = anchor.detach()
                    if dn_metas is not None:
                        num_instance = instance_feature.shape[1]
                        attn_mask = anchor.new_ones(
                            (num_instance, num_instance), dtype=torch.bool
                        )
                        attn_mask[:-num_dn_anchor, :-num_dn_anchor] = False
                        attn_mask[-num_dn_anchor:, -num_dn_anchor:] = dn_attn_mask

                if (
                    len(prediction) > self.num_single_frame_decoder
                    and temp_anchor_embed is not None
                ):
                    temp_anchor_embed = anchor_embed[
                        :, : self.instance_bank.num_temp_instances
                    ]
            else:
                raise NotImplementedError(f"{op} is not supported.")

        output = {}

        # split predictions of learnable instances and noisy instances
        if dn_metas is not None:
            dn_classification = [
                x[:, -num_dn_anchor:] for x in classification
            ]
            classification = [x[:, :-num_dn_anchor] for x in classification]
            dn_prediction = [x[:, -num_dn_anchor:] for x in prediction]
            prediction = [x[:, :-num_dn_anchor] for x in prediction]
            quality = [
                x[:, :-num_dn_anchor] if x is not None else None
                for x in quality
            ]
            output.update(
                {
                    "dn_prediction": dn_prediction,
                    "dn_classification": dn_classification,
                    "dn_reg_target": dn_reg_target,
                    "dn_cls_target": dn_cls_target,
                    "dn_valid_mask": valid_mask,
                }
            )
            if temp_dn_reg_target is not None:
                output.update(
                    {
                        "temp_dn_reg_target": temp_dn_reg_target,
                        "temp_dn_cls_target": temp_dn_cls_target,
                        "temp_dn_valid_mask": temp_valid_mask,
                        # "dn_id_target": dn_id_target,
                    }
                )
                dn_cls_target = temp_dn_cls_target
                valid_mask = temp_valid_mask
            dn_instance_feature = instance_feature[:, -num_dn_anchor:]
            dn_anchor = anchor[:, -num_dn_anchor:]
            instance_feature = instance_feature[:, :-num_dn_anchor]
            anchor_embed = anchor_embed[:, :-num_dn_anchor]
            anchor = anchor[:, :-num_dn_anchor]
            cls = cls[:, :-num_dn_anchor]

        output.update(
            {
                "classification": classification,
                "prediction": prediction,
                "quality": quality,
                "instance_feature": instance_feature,
                "anchor_embed": anchor_embed,
            }
        )

        # cache current instances for temporal modeling
        self.instance_bank.cache(
            instance_feature, anchor, cls, batch_inputs, feature_maps
        )
        if self.with_instance_id:
            instance_id = self.instance_bank.get_instance_id(
                cls, anchor, self.decoder.score_threshold
            )
            output["instance_id"] = instance_id
        return output

    def loss(self, model_outs, data, text_dict=None):
        # ===================== prediction losses ======================
        cls_scores = model_outs["classification"]
        reg_preds = model_outs["prediction"]
        quality = model_outs["quality"]
        output = {}
        for decoder_idx, (cls, reg, qt) in enumerate(
            zip(cls_scores, reg_preds, quality)
        ):
            reg = reg[..., : len(self.reg_weights)]
            reg = decode_box(reg)
            cls_target, reg_target, reg_weights, ignore_mask = self.sampler.sample(
                cls,
                reg,
                data[self.gt_cls_key],
                data[self.gt_reg_key],
                text_dict=text_dict,
                ignore_mask=data.get("ignore_mask"),
            )
            reg_target = reg_target[..., : len(self.reg_weights)]
            reg_target_full = reg_target.clone()
            mask = torch.logical_not(torch.all(reg_target == 0, dim=-1))
            mask = mask.reshape(-1)
            if ignore_mask is not None:
                ignore_mask = ~ignore_mask.reshape(-1)
                mask = torch.logical_and(mask, ignore_mask)
                ignore_mask = ignore_mask.tile(1, cls.shape[-1])
                
            mask_valid = mask.clone()

            num_pos = max(
                reduce_mean(torch.sum(mask).to(dtype=reg.dtype)), 1.0
            )

            cls = cls.flatten(end_dim=1)
            cls_target = cls_target.flatten(end_dim=1)
            token_mask = torch.logical_not(cls.isinf())
            cls = cls[token_mask]
            cls_target = cls_target[token_mask]

            if ignore_mask is None:
                cls_loss = self.loss_cls(cls, cls_target, avg_factor=num_pos)
            else:
                ignore_mask = ignore_mask[token_mask]
                cls_loss = self.loss_cls(
                    cls[ignore_mask],
                    cls_target[ignore_mask],
                    avg_factor=num_pos,
                    weight=cls_mask[ignore_mask],
                )

            reg_weights = reg_weights * reg.new_tensor(self.reg_weights)
            reg_target = reg_target.flatten(end_dim=1)[mask]
            reg = reg.flatten(end_dim=1)[mask]
            reg_weights = reg_weights.flatten(end_dim=1)[mask]
            reg_target = torch.where(
                reg_target.isnan(), reg.new_tensor(0.0), reg_target
            )
            # cls_target = cls_target[mask]
            if qt is not None:
                qt = qt.flatten(end_dim=1)[mask]

            reg_loss = self.loss_reg(
                reg,
                reg_target,
                weight=reg_weights,
                avg_factor=num_pos,
                prefix=f"{self.task_prefix}_",
                suffix=f"_{decoder_idx}",
                quality=qt,
                # cls_target=cls_target,
            )

            output[f"{self.task_prefix}_loss_cls_{decoder_idx}"] = cls_loss
            output.update(reg_loss)

        if "dn_prediction" not in model_outs:
            return output

        # ===================== denoising losses ======================
        dn_cls_scores = model_outs["dn_classification"]
        dn_reg_preds = model_outs["dn_prediction"]

        (
            dn_valid_mask,
            dn_cls_target,
            dn_reg_target,
            dn_pos_mask,
            reg_weights,
            num_dn_pos,
        ) = self.prepare_for_dn_loss(model_outs)

        for decoder_idx, (cls, reg) in enumerate(
            zip(dn_cls_scores, dn_reg_preds)
        ):
            if (
                "temp_dn_valid_mask" in model_outs
                and decoder_idx == self.num_single_frame_decoder
            ):
                (
                    dn_valid_mask,
                    dn_cls_target,
                    dn_reg_target,
                    dn_pos_mask,
                    reg_weights,
                    num_dn_pos,
                ) = self.prepare_for_dn_loss(model_outs, prefix="temp_")

            cls = cls.flatten(end_dim=1)[dn_valid_mask]
            mask = torch.logical_not(cls.isinf())
            cls_loss = self.loss_cls(
                cls[mask],
                dn_cls_target[mask],
                avg_factor=num_dn_pos,
            )

            reg = reg.flatten(end_dim=1)[dn_valid_mask][dn_pos_mask][
                ..., : len(self.reg_weights)
            ]
            reg = decode_box(reg)
            reg_loss = self.loss_reg(
                reg,
                dn_reg_target,
                avg_factor=num_dn_pos,
                weight=reg_weights,
                prefix=f"{self.task_prefix}_",
                suffix=f"_dn_{decoder_idx}",
            )
            output[f"{self.task_prefix}_loss_cls_dn_{decoder_idx}"] = cls_loss
            output.update(reg_loss)
        return output

    def prepare_for_dn_loss(self, model_outs, prefix=""):
        dn_valid_mask = model_outs[f"{prefix}dn_valid_mask"].flatten(end_dim=1)
        dn_cls_target = model_outs[f"{prefix}dn_cls_target"].flatten(
            end_dim=1
        )[dn_valid_mask]
        dn_reg_target = model_outs[f"{prefix}dn_reg_target"].flatten(
            end_dim=1
        )[dn_valid_mask][..., : len(self.reg_weights)]
        dn_pos_mask = dn_cls_target.sum(dim=-1) > 0
        dn_reg_target = dn_reg_target[dn_pos_mask]
        reg_weights = dn_reg_target.new_tensor(self.reg_weights)[None].tile(
            dn_reg_target.shape[0], 1
        )
        num_dn_pos = max(
            reduce_mean(torch.sum(dn_valid_mask).to(dtype=reg_weights.dtype)),
            1.0,
        )
        return (
            dn_valid_mask,
            dn_cls_target,
            dn_reg_target,
            dn_pos_mask,
            reg_weights,
            num_dn_pos,
        )

    def post_process(
        self,
        model_outs,
        text_dict,
        batch_inputs,
        batch_data_samples,
        results_in_data_samples=True,
    ):
        results = self.post_processor(
            model_outs["classification"],
            model_outs["prediction"],
            model_outs.get("instance_id"),
            model_outs.get("quality"),
            text_dict=text_dict,
            batch_inputs=batch_inputs,
        )
        if results_in_data_samples:
            for i, ret in enumerate(results):
                instances = InstanceData()
                for k, v in ret.items():
                    if k == "bboxes_3d":
                        type = batch_data_samples[i].metainfo["box_type_3d"]
                        v = type(
                            v,
                            box_dim=v.shape[1],
                            origin=(0.5, 0.5, 0.5),
                        )
                    instances.__setattr__(k, v)
                batch_data_samples[i].pred_instances_3d = instances
            return batch_data_samples
        return results


@MODELS.register_module()
class GroundingRefineClsHead(BaseModel):
    def __init__(
        self,
        embed_dims=256,
        output_dim=9,
        scale=None,
        cls_layers=False,
        cls_bias=True,
    ):
        super().__init__()
        self.embed_dims = embed_dims
        self.output_dim = output_dim
        self.refine_state = list(range(output_dim))
        self.scale = scale
        self.layers = nn.Sequential(
            *linear_act_ln(embed_dims, 2, 2),
            # MLP(embed_dims, embed_dims, embed_dims, 2),
            # nn.LayerNorm(self.embed_dims),
            # MLP(embed_dims, embed_dims, embed_dims, 2),
            Linear(self.embed_dims, self.output_dim),
            Scale([1.0] * self.output_dim),
        )
        if cls_layers:
            self.cls_layers = nn.Sequential(
                MLP(embed_dims, embed_dims, embed_dims, 2),
                nn.LayerNorm(self.embed_dims),
            )
        else:
            self.cls_layers = nn.Identity()
        if cls_bias:
            bias_value = -math.log((1 - 0.01) / 0.01)
            self.bias = nn.Parameter(
                torch.Tensor([bias_value]), requires_grad=True
            )
        else:
            self.bias = None

    def forward(
        self,
        instance_feature: torch.Tensor,
        anchor: torch.Tensor = None,
        anchor_embed: torch.Tensor = None,
        time_interval: torch.Tensor = 1.0,
        text_feature=None,
        text_token_mask=None,
        **kwargs,
    ):
        if anchor_embed is not None:
            feature = instance_feature + anchor_embed
        else:
            feature = instance_feature
        output = self.layers(feature)
        if self.scale is not None:
            output = output * output.new_tensor(self.scale)
        if anchor is not None:
            output = output + anchor

        if text_feature is not None:
            cls = self.cls_layers(
                instance_feature) @ text_feature.transpose(-1, -2)
            cls = cls / math.sqrt(instance_feature.shape[-1])
            if self.bias is not None:
                cls = cls + self.bias
            if text_token_mask is not None:
                cls.masked_fill_(~text_token_mask[:, None, :], float("-inf"))
        else:
            cls = None
        return output, cls, None


@MODELS.register_module()
class DoF9BoxLoss(nn.Module):
    def __init__(
        self,
        loss_weight_wd=1.0,
        loss_weight_pcd=0.0,
        loss_weight_cd=0.8,
        decode_pred=False,
    ):
        super().__init__()
        self.loss_weight_wd = loss_weight_wd
        self.loss_weight_pcd = loss_weight_pcd
        self.loss_weight_cd = loss_weight_cd
        self.decode_pred = decode_pred

    def forward(
        self,
        box,
        box_target,
        weight=None,
        avg_factor=None,
        prefix="",
        suffix="",
        **kwargs,
    ):
        if box_target.shape[0] == 0:
            loss = box.sum() * 0
            return {f"{prefix}loss_box{suffix}": loss}
        if self.decode_pred:
            box = decode_box(box)
        loss = 0
        if self.loss_weight_wd > 0:
            loss += self.loss_weight_wd * wasserstein_distance(box, box_target)
        if self.loss_weight_pcd > 0:
            loss += self.loss_weight_pcd * permutation_corner_distance(
                box, box_target
            )
        if self.loss_weight_cd > 0:
            loss += self.loss_weight_cd * center_distance(box, box_target)

        if avg_factor is None:
            loss = loss.mean()
        else:
            loss = loss.sum() / avg_factor
        output = {f"{prefix}loss_box{suffix}": loss}
        return output


@TASK_UTILS.register_module()
class GroundingBox3DPostProcess:
    def __init__(
        self,
        num_output: int = 300,
        score_threshold: Optional[float] = None,
        sorted: bool = True,
    ):
        super(GroundingBox3DPostProcess, self).__init__()
        self.num_output = num_output
        self.score_threshold = score_threshold
        self.sorted = sorted

    def __call__(
        self,
        cls_scores,
        box_preds,
        instance_id=None,
        quality=None,
        output_idx=-1,
        text_dict=None,
        batch_inputs=None,
    ):
        cls_scores = cls_scores[output_idx].sigmoid()
        if "tokens_positive" in batch_inputs:
            tokens_positive_maps = get_positive_map(
                batch_inputs["tokens_positive"],
                text_dict,
            )
            label_to_token = [
                create_positive_map_label_to_token(x, plus=1)
                for x in tokens_positive_maps
            ]
            cls_scores = convert_grounding_to_cls_scores(
                cls_scores, label_to_token
            )
            entities = get_entities(
                batch_inputs["text"],
                batch_inputs["tokens_positive"],
            )
        else:
            cls_scores, _ = cls_scores.max(dim=-1, keepdim=True)
            entities = batch_inputs["text"]

        # if squeeze_cls:
        #     cls_scores, cls_ids = cls_scores.max(dim=-1)
        #     cls_scores = cls_scores.unsqueeze(dim=-1)

        box_preds = box_preds[output_idx]
        bs, num_pred, num_cls = cls_scores.shape
        num_output = min(self.num_output, num_pred*num_cls)
        cls_scores, indices = cls_scores.flatten(start_dim=1).topk(
            num_output, dim=1, sorted=self.sorted
        )
        # if not squeeze_cls:
        cls_ids = indices % num_cls
        if self.score_threshold is not None:
            mask = cls_scores >= self.score_threshold

        if quality[output_idx] is None:
            quality = None
        if quality is not None:
            centerness = quality[output_idx][..., CNS]
            centerness = torch.gather(centerness, 1, indices // num_cls)
            cls_scores_origin = cls_scores.clone()
            cls_scores *= centerness.sigmoid()
            cls_scores, idx = torch.sort(cls_scores, dim=1, descending=True)
            # if not squeeze_cls:
            cls_ids = torch.gather(cls_ids, 1, idx)
            if self.score_threshold is not None:
                mask = torch.gather(mask, 1, idx)
            indices = torch.gather(indices, 1, idx)

        output = []
        for i in range(bs):
            category_ids = cls_ids[i]
            # if squeeze_cls:
            #     category_ids = category_ids[indices[i]]
            scores = cls_scores[i]
            box = box_preds[i, indices[i] // num_cls]
            if self.score_threshold is not None:
                category_ids = category_ids[mask[i]]
                scores = scores[mask[i]]
                box = box[mask[i]]

            # nms_idx = nms3d(
            #     box[..., :7],
            #     scores,
            #     iou_threshold=0.4
            # )
            # box = box[nms_idx]
            # scores = scores[nms_idx]
            # category_ids = category_ids[nms_idx]

            if quality is not None:
                scores_origin = cls_scores_origin[i]
                if self.score_threshold is not None:
                    scores_origin = scores_origin[mask[i]]

            box = decode_box(box, 0.1, 20)
            category_ids = category_ids.cpu()

            label_names = []
            for id in category_ids.tolist():
                if isinstance(entities[i], (tuple, list)):
                    label_names.append(entities[i][id])
                else:
                    label_names.append(entities[i])

            output.append(
                {
                    "bboxes_3d": box.cpu(),
                    "scores_3d": scores.cpu(),
                    "labels_3d": category_ids,
                    "target_scores_3d": scores.cpu(),
                    "label_names": label_names,
                }
            )
            if quality is not None:
                output[-1]["cls_scores"] = scores_origin.cpu()
            if instance_id is not None:
                ids = instance_id[i, indices[i]]
                if self.score_threshold is not None:
                    ids = ids[mask[i]]
                output[-1]["instance_ids"] = ids
        return output


@MODELS.register_module()
class DoF9BoxEncoder(nn.Module):
    def __init__(
        self,
        embed_dims,
        rot_dims=3,
        output_fc=True,
        in_loops=1,
        out_loops=2,
    ):
        super().__init__()
        self.embed_dims = embed_dims

        def embedding_layer(input_dims, output_dims):
            return nn.Sequential(
                *linear_act_ln(output_dims, in_loops, out_loops, input_dims)
            )

        if not isinstance(embed_dims, (list, tuple)):
            embed_dims = [embed_dims] * 5
        self.pos_fc = embedding_layer(3, embed_dims[0])
        self.size_fc = embedding_layer(3, embed_dims[1])
        self.yaw_fc = embedding_layer(rot_dims, embed_dims[2])
        self.rot_dims = rot_dims
        if output_fc:
            self.output_fc = embedding_layer(embed_dims[-1], embed_dims[-1])
        else:
            self.output_fc = None

    def forward(self, box_3d: torch.Tensor):
        pos_feat = self.pos_fc(box_3d[..., [X, Y, Z]])
        if box_3d.shape[-1] == 3:
            return pos_feat
        size_feat = self.size_fc(box_3d[..., [W, L, H]])
        yaw_feat = self.yaw_fc(box_3d[..., ALPHA : ALPHA + self.rot_dims])
        output = pos_feat + size_feat + yaw_feat
        if self.output_fc is not None:
            output = self.output_fc(output)
        return output


@MODELS.register_module()
class SparseBox3DKeyPointsGenerator(nn.Module):
    def __init__(
        self,
        embed_dims=256,
        num_learnable_pts=0,
        fix_scale=None,
    ):
        super(SparseBox3DKeyPointsGenerator, self).__init__()
        self.embed_dims = embed_dims
        self.num_learnable_pts = num_learnable_pts
        if fix_scale is None:
            fix_scale = ((0.0, 0.0, 0.0),)
        self.fix_scale = torch.tensor(fix_scale)
        self.num_pts = len(self.fix_scale) + num_learnable_pts
        if num_learnable_pts > 0:
            self.learnable_fc = Linear(self.embed_dims, num_learnable_pts * 3)

    def forward(
        self,
        anchor,
        instance_feature=None,
        T_cur2temp_list=None,
        cur_timestamp=None,
        temp_timestamps=None,
    ):
        bs, num_anchor = anchor.shape[:2]
        size = anchor[..., None, [W, L, H]].exp()
        key_points = self.fix_scale.to(anchor) * size
        if self.num_learnable_pts > 0 and instance_feature is not None:
            learnable_scale = (
                self.learnable_fc(instance_feature)
                .reshape(bs, num_anchor, self.num_learnable_pts, 3)
                .sigmoid()
                - 0.5
            )
            key_points = torch.cat(
                [key_points, learnable_scale * size], dim=-2
            )

        key_points = rotation_3d_in_euler(
            key_points.flatten(0, 1),
            anchor[..., [ALPHA, BETA, GAMMA]].flatten(0, 1),
        ).unflatten(0, (bs, num_anchor))
        key_points = key_points + anchor[..., None, [X, Y, Z]]

        if (
            cur_timestamp is None
            or temp_timestamps is None
            or len(temp_timestamps) == 0
        ) and T_cur2temp_list is None:
            return key_points

        temp_key_points_list = []
        velocity = anchor[..., VX:]
        for i, t_time in enumerate(temp_timestamps):
            time_interval = cur_timestamp - t_time
            translation = (
                velocity
                * time_interval.to(dtype=velocity.dtype)[:, None, None]
            )
            temp_key_points = key_points - translation[:, :, None]
            if T_cur2temp_list is not None:
                T_cur2temp = T_cur2temp_list[i].to(dtype=key_points.dtype)
                temp_key_points = T_cur2temp[:, None, None, :3] @ torch.cat(
                    [
                        temp_key_points,
                        torch.ones_like(temp_key_points[..., :1]),
                    ],
                    dim=-1,
                ).unsqueeze(-1)
            temp_key_points = temp_key_points.squeeze(-1)
            temp_key_points_list.append(temp_key_points)
        return key_points, temp_key_points_list

    @staticmethod
    def anchor_projection(
        anchor,
        T_src2dst_list,
        src_timestamp=None,
        dst_timestamps=None,
        time_intervals=None,
    ):
        dst_anchors = []
        for i in range(len(T_src2dst_list)):
            vel = anchor[..., VX:]
            vel_dim = vel.shape[-1]
            T_src2dst = torch.unsqueeze(
                T_src2dst_list[i].to(dtype=anchor.dtype), dim=1
            )

            center = anchor[..., [X, Y, Z]]
            if time_intervals is not None:
                time_interval = time_intervals[i]
            elif src_timestamp is not None and dst_timestamps is not None:
                time_interval = (src_timestamp - dst_timestamps[i]).to(
                    dtype=vel.dtype
                )
            else:
                time_interval = None
            if time_interval is not None:
                translation = vel.transpose(0, -1) * time_interval
                translation = translation.transpose(0, -1)
                center = center - translation
            center = (
                torch.matmul(
                    T_src2dst[..., :3, :3], center[..., None]
                ).squeeze(dim=-1)
                + T_src2dst[..., :3, 3]
            )
            size = anchor[..., [W, L, H]]
            yaw = torch.matmul(
                T_src2dst[..., :2, :2],
                anchor[..., [COS_YAW, SIN_YAW], None],
            ).squeeze(-1)
            yaw = yaw[..., [1, 0]]
            vel = torch.matmul(
                T_src2dst[..., :vel_dim, :vel_dim], vel[..., None]
            ).squeeze(-1)
            dst_anchor = torch.cat([center, size, yaw, vel], dim=-1)
            dst_anchors.append(dst_anchor)
        return dst_anchors

    @staticmethod
    def distance(anchor):
        return torch.norm(anchor[..., :2], p=2, dim=-1)


================================================
FILE: bip3d/models/bert.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from collections import OrderedDict
from typing import Sequence

import torch
from mmengine.model import BaseModel
from torch import nn

try:
    from transformers import AutoTokenizer, BertConfig
    from transformers import BertModel as HFBertModel
except ImportError:
    AutoTokenizer = None
    HFBertModel = None

from bip3d.registry import MODELS


def generate_masks_with_special_tokens_and_transfer_map(
    tokenized, special_tokens_list
):
    """Generate attention mask between each pair of special tokens.

    Only token pairs in between two special tokens are attended to
    and thus the attention mask for these pairs is positive.

    Args:
        input_ids (torch.Tensor): input ids. Shape: [bs, num_token]
        special_tokens_mask (list): special tokens mask.

    Returns:
        Tuple(Tensor, Tensor):
        - attention_mask is the attention mask between each tokens.
          Only token pairs in between two special tokens are positive.
          Shape: [bs, num_token, num_token].
        - position_ids is the position id of tokens within each valid sentence.
          The id starts from 0 whenenver a special token is encountered.
          Shape: [bs, num_token]
    """
    input_ids = tokenized["input_ids"]
    bs, num_token = input_ids.shape
    # special_tokens_mask:
    # bs, num_token. 1 for special tokens. 0 for normal tokens
    special_tokens_mask = torch.zeros(
        (bs, num_token), device=input_ids.device
    ).bool()

    for special_token in special_tokens_list:
        special_tokens_mask |= input_ids == special_token

    # idxs: each row is a list of indices of special tokens
    idxs = torch.nonzero(special_tokens_mask)

    # generate attention mask and positional ids
    attention_mask = (
        torch.eye(num_token, device=input_ids.device)
        .bool()
        .unsqueeze(0)
        .repeat(bs, 1, 1)
    )
    position_ids = torch.zeros((bs, num_token), device=input_ids.device)
    previous_col = 0
    for i in range(idxs.shape[0]):
        row, col = idxs[i]
        if col == 0:
            # if (col == 0) or (col == num_token - 1):
            attention_mask[row, col, col] = True
            position_ids[row, col] = 0
        else:
            # attention_mask[row, previous_col + 1:col,
            #                previous_col + 1:col] = True
            # position_ids[row, previous_col + 1:col] = torch.arange(
            #     0, col - previous_col - 1, device=input_ids.device)
            attention_mask[
                row, previous_col + 1 : col + 1, previous_col + 1 : col + 1
            ] = True
            position_ids[row, previous_col + 1 : col + 1] = torch.arange(
                0, col - previous_col, device=input_ids.device
            )
        previous_col = col

    return attention_mask, position_ids.to(torch.long)


@MODELS.register_module()
class BertModel(BaseModel):
    """BERT model for language embedding only encoder.

    Args:
        name (str, optional): name of the pretrained BERT model from
            HuggingFace. Defaults to bert-base-uncased.
        max_tokens (int, optional): maximum number of tokens to be
            used for BERT. Defaults to 256.
        pad_to_max (bool, optional): whether to pad the tokens to max_tokens.
             Defaults to True.
        use_sub_sentence_represent (bool, optional): whether to use sub
            sentence represent introduced in `Grounding DINO
            <https://arxiv.org/abs/2303.05499>`. Defaults to False.
        special_tokens_list (list, optional): special tokens used to split
            subsentence. It cannot be None when `use_sub_sentence_represent`
            is True. Defaults to None.
        add_pooling_layer (bool, optional): whether to adding pooling
            layer in bert encoder. Defaults to False.
        num_layers_of_embedded (int, optional): number of layers of
            the embedded model. Defaults to 1.
        use_checkpoint (bool, optional): whether to use gradient checkpointing.
             Defaults to False.
    """

    def __init__(
        self,
        name: str = "bert-base-uncased",
        max_tokens: int = 256,
        pad_to_max: bool = True,
        use_sub_sentence_represent: bool = False,
        special_tokens_list: list = None,
        add_pooling_layer: bool = False,
        num_layers_of_embedded: int = 1,
        use_checkpoint: bool = False,
        return_tokenized: bool = False,
        **kwargs
    ) -> None:

        super().__init__(**kwargs)
        self.max_tokens = max_tokens
        self.pad_to_max = pad_to_max
        self.return_tokenized = return_tokenized

        if AutoTokenizer is None:
            raise RuntimeError(
                "transformers is not installed, please install it by: "
                "pip install transformers."
            )

        self.tokenizer = AutoTokenizer.from_pretrained(name)
        self.language_backbone = nn.Sequential(
            OrderedDict(
                [
                    (
                        "body",
                        BertEncoder(
                            name,
                            add_pooling_layer=add_pooling_layer,
                            num_layers_of_embedded=num_layers_of_embedded,
                            use_checkpoint=use_checkpoint,
                        ),
                    )
                ]
            )
        )

        self.use_sub_sentence_represent = use_sub_sentence_represent
        if self.use_sub_sentence_represent:
            assert (
                special_tokens_list is not None
            ), "special_tokens should not be None \
                    if use_sub_sentence_represent is True"

            self.special_tokens = self.tokenizer.convert_tokens_to_ids(
                special_tokens_list
            )

    def forward(self, captions: Sequence[str], **kwargs) -> dict:
        """Forward function."""
        device = next(self.language_backbone.parameters()).device
        tokenized = self.tokenizer.batch_encode_plus(
            captions,
            max_length=self.max_tokens,
            padding="max_length" if self.pad_to_max else "longest",
            return_special_tokens_mask=True,
            return_tensors="pt",
            truncation=True,
        ).to(device)
        input_ids = tokenized.input_ids
        if self.use_sub_sentence_represent:
            attention_mask, position_ids = (
                generate_masks_with_special_tokens_and_transfer_map(
                    tokenized, self.special_tokens
                )
            )
            token_type_ids = tokenized["token_type_ids"]

        else:
            attention_mask = tokenized.attention_mask
            position_ids = None
            token_type_ids = None

        tokenizer_input = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "position_ids": position_ids,
            "token_type_ids": token_type_ids,
        }
        language_dict_features = self.language_backbone(tokenizer_input)
        if self.use_sub_sentence_represent:
            language_dict_features["position_ids"] = position_ids
            language_dict_features["text_token_mask"] = (
                tokenized.attention_mask.bool()
            )
        if self.return_tokenized:
            language_dict_features["tokenized"] = tokenized
        return language_dict_features


class BertEncoder(nn.Module):
    """BERT encoder for language embedding.

    Args:
        name (str): name of the pretrained BERT model from HuggingFace.
                Defaults to bert-base-uncased.
        add_pooling_layer (bool): whether to add a pooling layer.
        num_layers_of_embedded (int): number of layers of the embedded model.
                Defaults to 1.
        use_checkpoint (bool): whether to use gradient checkpointing.
                Defaults to False.
    """

    def __init__(
        self,
        name: str,
        add_pooling_layer: bool = False,
        num_layers_of_embedded: int = 1,
        use_checkpoint: bool = False,
    ):
        super().__init__()
        if BertConfig is None:
            raise RuntimeError(
                "transformers is not installed, please install it by: "
                "pip install transformers."
            )
        config = BertConfig.from_pretrained(name)
        config.gradient_checkpointing = use_checkpoint
        # only encoder
        self.model = HFBertModel.from_pretrained(
            name, add_pooling_layer=add_pooling_layer, config=config
        )
        self.language_dim = config.hidden_size
        self.num_layers_of_embedded = num_layers_of_embedded

    def forward(self, x) -> dict:
        mask = x["attention_mask"]

        outputs = self.model(
            input_ids=x["input_ids"],
            attention_mask=mask,
            position_ids=x["position_ids"],
            token_type_ids=x["token_type_ids"],
            output_hidden_states=True,
        )

        # outputs has 13 layers, 1 input layer and 12 hidden layers
        encoded_layers = outputs.hidden_states[1:]
        features = torch.stack(
            encoded_layers[-self.num_layers_of_embedded :], 1
        ).mean(1)
        # language embedding has shape [len(phrase), seq_len, language_dim]
        features = features / self.num_layers_of_embedded
        if mask.dim() == 2:
            embedded = features * mask.unsqueeze(-1).float()
        else:
            embedded = features

        results = {
            "embedded": embedded,
            "masks": mask,
            "hidden": encoded_layers[-1],
        }
        return results


================================================
FILE: bip3d/models/data_preprocessors/__init__.py
================================================
from .custom_data_preprocessor import CustomDet3DDataPreprocessor

__all__ = ["CustomDet3DDataPreprocessor"]


================================================
FILE: bip3d/models/data_preprocessors/custom_data_preprocessor.py
================================================
import math
from numbers import Number
from typing import Dict, List, Optional, Sequence, Tuple, Union

import numpy as np
import torch
from mmdet.models import DetDataPreprocessor
from mmdet.models.utils.misc import samplelist_boxtype2tensor
from mmengine.model import stack_batch
from mmengine.structures import InstanceData
from mmengine.utils import is_seq_of
from torch import Tensor
from torch.nn import functional as F

from bip3d.registry import MODELS
from bip3d.utils.typing_config import ConfigType, SampleList
from bip3d.structures.bbox_3d import get_proj_mat_by_coord_type

from .utils import multiview_img_stack_batch


@MODELS.register_module()
class CustomDet3DDataPreprocessor(DetDataPreprocessor):
    """Points / Image pre-processor for point clouds / vision-only / multi-
    modality 3D detection tasks.

    It provides the data pre-processing as follows

    - Collate and move image and point cloud data to the target device.

    - 1) For image data:

      - Pad images in inputs to the maximum size of current batch with defined
        ``pad_value``. The padding size can be divisible by a defined
        ``pad_size_divisor``.
      - Stack images in inputs to batch_imgs.
      - Convert images in inputs from bgr to rgb if the shape of input is
        (3, H, W).
      - Normalize images in inputs with defined std and mean.
      - Do batch augmentations during training.

    - 2) For point cloud data:

      - If no voxelization, directly return list of point cloud data.
      - If voxelization is applied, voxelize point cloud according to
        ``voxel_type`` and obtain ``voxels``.

    Args:
        voxel (bool): Whether to apply voxelization to point cloud.
            Defaults to False.
        voxel_type (str): Voxelization type. Two voxelization types are
            provided: 'hard' and 'dynamic', respectively for hard voxelization
            and dynamic voxelization. Defaults to 'hard'.
        voxel_layer (dict or :obj:`ConfigDict`, optional): Voxelization layer
            config. Defaults to None.
        batch_first (bool): Whether to put the batch dimension to the first
            dimension when getting voxel coordinates. Defaults to True.
        max_voxels (int, optional): Maximum number of voxels in each voxel
            grid. Defaults to None.
        mean (Sequence[Number], optional): The pixel mean of R, G, B channels.
            Defaults to None.
        std (Sequence[Number], optional): The pixel standard deviation of
            R, G, B channels. Defaults to None.
        pad_size_divisor (int): The size of padded image should be divisible by
            ``pad_size_divisor``. Defaults to 1.
        pad_value (float or int): The padded pixel value. Defaults to 0.
        pad_mask (bool): Whether to pad instance masks. Defaults to False.
        mask_pad_value (int): The padded pixel value for instance masks.
            Defaults to 0.
        pad_seg (bool): Whether to pad semantic segmentation maps.
            Defaults to False.
        seg_pad_value (int): The padded pixel value for semantic segmentation
            maps. Defaults to 255.
        bgr_to_rgb (bool): Whether to convert image from BGR to RGB.
            Defaults to False.
        rgb_to_bgr (bool): Whether to convert image from RGB to BGR.
            Defaults to False.
        boxtype2tensor (bool): Whether to convert the ``BaseBoxes`` type of
            bboxes data to ``Tensor`` type. Defaults to True.
        non_blocking (bool): Whether to block current process when transferring
            data to device. Defaults to False.
        batch_augments (List[dict], optional): Batch-level augmentations.
            Defaults to None.
        batchwise_inputs (bool): Pack the input as a batch of samples
            with 1-N frames for the continuous 3D perception setting.
            Defaults to False.
    """

    def __init__(
        self,
        voxel: bool = False,
        voxel_type: str = "hard",
        voxel_layer: Optional[ConfigType] = None,
        batch_first: bool = True,
        max_voxels: Optional[int] = None,
        mean: Sequence[Number] = None,
        std: Sequence[Number] = None,
        pad_size_divisor: int = 1,
        pad_value: Union[float, int] = 0,
        pad_mask: bool = False,
        mask_pad_value: int = 0,
        pad_seg: bool = False,
        seg_pad_value: int = 255,
        bgr_to_rgb: bool = False,
        rgb_to_bgr: bool = False,
        boxtype2tensor: bool = True,
        non_blocking: bool = False,
        batch_augments: Optional[List[dict]] = None,
        batchwise_inputs: bool = False,
    ):
        super().__init__(
            mean=mean,
            std=std,
            pad_size_divisor=pad_size_divisor,
            pad_value=pad_value,
            pad_mask=pad_mask,
            mask_pad_value=mask_pad_value,
            pad_seg=pad_seg,
            seg_pad_value=seg_pad_value,
            bgr_to_rgb=bgr_to_rgb,
            rgb_to_bgr=rgb_to_bgr,
            boxtype2tensor=boxtype2tensor,
            non_blocking=non_blocking,
            batch_augments=batch_augments,
        )
        self.voxel = voxel
        self.voxel_type = voxel_type
        self.batch_first = batch_first
        self.max_voxels = max_voxels
        self.batchwise_inputs = batchwise_inputs
        if voxel:
            self.voxel_layer = VoxelizationByGridShape(**voxel_layer)

    def forward(self, data: Union[dict, List[dict]], training: bool = False):
        """Perform normalization, padding and bgr2rgb conversion based on
        ``BaseDataPreprocessor``.

        Args:
            data (dict or List[dict]): Data from dataloader. The dict contains
                the whole batch data, when it is a list[dict], the list
                indicates test time augmentation.
            training (bool): Whether to enable training time augmentation.
                Defaults to False.

        Returns:
            dict or List[dict]: Data in the same format as the model input.
        """
        if isinstance(data, list):
            num_augs = len(data)
            aug_batch_data = []
            for aug_id in range(num_augs):
                single_aug_batch_data = self.simple_process(
                    data[aug_id], training
                )
                aug_batch_data.append(single_aug_batch_data)
            return aug_batch_data
        else:
            return self.simple_process(data, training)

    def simple_process(self, data: dict, training: bool = False):
        """Perform normalization, padding and bgr2rgb conversion for img data
        based on ``BaseDataPreprocessor``, and voxelize point cloud if `voxel`
        is set to be True.

        Args:
            data (dict): Data sampled from dataloader.
            training (bool): Whether to enable training time augmentation.
                Defaults to False.

        Returns:
            dict: Data in the same format as the model input.
        """
        if "img" in data["inputs"]:
            batch_pad_shape = self._get_pad_shape(data)

        if self.batchwise_inputs:
            data_samples = data["data_samples"]
            batchwise_data_samples = []
            if "bboxes_3d" in data_samples[0].gt_instances_3d:
                assert isinstance(
                    data_samples[0].gt_instances_3d.labels_3d, list
                )
                bboxes_3d = data_samples[0].gt_instances_3d.bboxes_3d
                labels_3d = data_samples[0].gt_instances_3d.labels_3d
            if "gt_occupancy_masks" in data_samples[0]:
                gt_occupancy_masks = [
                    mask.clone() for mask in data_samples[0].gt_occupancy_masks
                ]
            if (
                "eval_ann_info" in data_samples[0]
                and data_samples[0].eval_ann_info is not None
            ):
                eval_ann_info = data_samples[0].eval_ann_info
            for idx in range(len(labels_3d)):
                data_sample = data_samples[0].clone()
                if "bboxes_3d" in data_sample.gt_instances_3d:
                    data_sample.gt_instances_3d = InstanceData()
                    data_sample.gt_instances_3d.bboxes_3d = bboxes_3d[idx]
                    data_sample.gt_instances_3d.labels_3d = labels_3d[idx]
                if "gt_occupancy_masks" in data_sample:
                    data_sample.gt_occupancy_masks = gt_occupancy_masks[idx]
                if "eval_ann_info" in data_sample:
                    if data_sample.eval_ann_info is not None:
                        data_sample.eval_ann_info = dict()
                        data_sample.eval_ann_info["gt_bboxes_3d"] = (
                            eval_ann_info["gt_bboxes_3d"][idx]
                        )
                        data_sample.eval_ann_info["gt_labels_3d"] = (
                            eval_ann_info["gt_labels_3d"][idx]
                        )
                batchwise_data_samples.append(data_sample)
            data["data_samples"] = batchwise_data_samples

        data = self.collate_data(data)
        inputs, data_samples = data["inputs"], data["data_samples"]
        batch_inputs = dict()
        batch_inputs.update(self.process_camera_params(data_samples))

        for key in ["depth_img", "depth_prob_gt"]:
            if key not in inputs:
                continue
            batch_inputs[key] = torch.stack(inputs[key])

        for key in ["text", "scan_id", "tokens_positive", "ignore_mask"]:
            if not hasattr(data_samples[0], key):
                continue
            batch_inputs[key] = [getattr(x, key) for x in data_samples]
        if hasattr(data_samples[0], "gt_instances_3d"):
            batch_inputs["gt_bboxes_3d"] = [
                x.gt_instances_3d.bboxes_3d for x in data_samples
            ]
            batch_inputs["gt_labels_3d"] = [
                x.gt_instances_3d.labels_3d for x in data_samples
            ]

        if "imgs" in inputs:
            imgs = inputs["imgs"]

            if data_samples is not None:
                # NOTE the batched image size information may be useful, e.g.
                # in DETR, this is needed for the construction of masks, which
                # is then used for the transformer_head.
                batch_input_shape = tuple(imgs[0].size()[-2:])
                for data_sample, pad_shape in zip(
                    data_samples, batch_pad_shape
                ):
                    data_sample.set_metainfo(
                        {
                            "batch_input_shape": batch_input_shape,
                            "pad_shape": pad_shape,
                        }
                    )

                if self.boxtype2tensor:
                    samplelist_boxtype2tensor(data_samples)
                if self.pad_mask:
                    self.pad_gt_masks(data_samples)
                if self.pad_seg:
                    self.pad_gt_sem_seg(data_samples)

            if training and self.batch_augments is not None:
                for batch_aug in self.batch_augments:
                    imgs, data_samples = batch_aug(imgs, data_samples)
            batch_inputs["imgs"] = imgs

        return {"inputs": batch_inputs, "data_samples": data_samples}

    def process_camera_params(self, data_samples):
        projection_mat = []
        extrinsic_list = []
        intrinsic_list = []
        image_wh = []
        for data_sample in data_samples:
            proj_mat = get_proj_mat_by_coord_type(
                data_sample.metainfo, "DEPTH"
            )

            img_scale_factor = data_sample.metainfo.get("scale_factor", [1, 1])
            img_flip = data_sample.metainfo.get("flip", False)
            img_crop_offset = data_sample.metainfo.get(
                "img_crop_offset", [0, 0]
            )
            trans_mat = np.eye(4)
            trans_mat[0, 0] = img_scale_factor[0]
            trans_mat[1, 1] = img_scale_factor[1]
            trans_mat[0, 2] = -img_crop_offset[0]
            trans_mat[1, 2] = -img_crop_offset[1]
            if img_flip:
                assert False
            if "trans_mat" in proj_mat:
                trans_mat = np.stack(proj_mat["trans_mat"]) @ trans_mat

            if isinstance(proj_mat, dict):
                extrinsic = np.stack(proj_mat["extrinsic"])
                intrinsic = np.stack(proj_mat["intrinsic"])
                proj_mat = intrinsic @ extrinsic
                extrinsic_list.append(extrinsic)
                intrinsic_list.append(trans_mat @ intrinsic)
            else:
                extrinsic_list.append(
                    np.tile(np.eye(4)[None], (proj_mat.shape[0], 1, 1))
                )
                intrinsic_list.append(
                    np.tile(np.eye(4)[None], (proj_mat.shape[0], 1, 1))
                )
            proj_mat = trans_mat @ proj_mat
            projection_mat.append(proj_mat)
            image_wh.append(data_sample.metainfo["img_shape"][:2])

        to_tensor = lambda x: torch.from_numpy(x).cuda().to(torch.float32)
        projection_mat = to_tensor(np.stack(projection_mat))
        image_wh = to_tensor(np.array(image_wh))
        image_wh = image_wh[:, None].tile(1, projection_mat.shape[1], 1)
        extrinsic = to_tensor(np.stack(extrinsic_list))
        intrinsic = to_tensor(np.stack(intrinsic_list))
        return {
            "projection_mat": projection_mat,
            "image_wh": image_wh,
            "extrinsic": extrinsic,
            "intrinsic": intrinsic,
        }

    def preprocess_img(self, _batch_img: Tensor):
        # channel transform
        if self._channel_conversion:
            _batch_img = _batch_img[[2, 1, 0], ...]
        # Convert to float after channel conversion to ensure
        # efficiency
        _batch_img = _batch_img.float()
        # Normalization.
        if self._enable_normalize:
            if self.mean.shape[0] == 3:
                assert _batch_img.dim() == 3 and _batch_img.shape[0] == 3, (
                    "If the mean has 3 values, the input tensor "
                    "should in shape of (3, H, W), but got the "
                    f"tensor with shape {_batch_img.shape}"
                )
            _batch_img = (_batch_img - self.mean) / self.std
        return _batch_img

    def collate_data(self, data: dict):
        """Copy data to the target device and perform normalization, padding
        and bgr2rgb conversion and stack based on ``BaseDataPreprocessor``.

        Collates the data sampled from dataloader into a list of dict and list
        of labels, and then copies tensor to the target device.

        Args:
            data (dict): Data sampled from dataloader.

        Returns:
            dict: Data in the same format as the model input.
        """
        data = self.cast_data(data)  # type: ignore

        if "img" in data["inputs"]:
            _batch_imgs = data["inputs"]["img"]
            # Process data with `pseudo_collate`.
            if is_seq_of(_batch_imgs, torch.Tensor):
                batch_imgs = []
                img_dim = _batch_imgs[0].dim()
                for _batch_img in _batch_imgs:
                    if img_dim == 3:  # standard img
                        _batch_img = self.preprocess_img(_batch_img)
                    elif img_dim == 4:
                        _batch_img = [
                            self.preprocess_img(_img) for _img in _batch_img
                        ]

                        _batch_img = torch.stack(_batch_img, dim=0)

                    batch_imgs.append(_batch_img)

                # Pad and stack Tensor.
                if img_dim == 3:
                    batch_imgs = stack_batch(
                        batch_imgs, self.pad_size_divisor, self.pad_value
                    )
                elif img_dim == 4:
                    batch_imgs = multiview_img_stack_batch(
                        batch_imgs, self.pad_size_divisor, self.pad_value
                    )

            # Process data with `default_collate`.
            elif isinstance(_batch_imgs, torch.Tensor):
                assert _batch_imgs.dim() == 4, (
                    "The input of `ImgDataPreprocessor` should be a NCHW "
                    "tensor or a list of tensor, but got a tensor with "
                    f"shape: {_batch_imgs.shape}"
                )
                if self._channel_conversion:
                    _batch_imgs = _batch_imgs[:, [2, 1, 0], ...]
                # Convert to float after channel conversion to ensure
                # efficiency
                _batch_imgs = _batch_imgs.float()
                if self._enable_normalize:
                    _batch_imgs = (_batch_imgs - self.mean) / self.std
                h, w = _batch_imgs.shape[2:]
                target_h = (
                    math.ceil(h / self.pad_size_divisor)
                    * self.pad_size_divisor
                )
                target_w = (
                    math.ceil(w / self.pad_size_divisor)
                    * self.pad_size_divisor
                )
                pad_h = target_h - h
                pad_w = target_w - w
                batch_imgs = F.pad(
                    _batch_imgs,
                    (0, pad_w, 0, pad_h),
                    "constant",
                    self.pad_value,
                )
            else:
                raise TypeError(
                    "Output of `cast_data` should be a list of dict "
                    "or a tuple with inputs and data_samples, but got "
                    f"{type(data)}: {data}"
                )

            data["inputs"]["imgs"] = batch_imgs

        data.setdefault("data_samples", None)

        return data

    def _get_pad_shape(self, data: dict):
        """Get the pad_shape of each image based on data and
        pad_size_divisor."""
        # rewrite `_get_pad_shape` for obtaining image inputs.
        _batch_inputs = data["inputs"]["img"]
        # Process data with `pseudo_collate`.
        if is_seq_of(_batch_inputs, torch.Tensor):
            batch_pad_shape = []
            for ori_input in _batch_inputs:
                if ori_input.dim() == 4:
                    # mean multiview input, select one of the
                    # image to calculate the pad shape
                    ori_input = ori_input[0]
                pad_h = (
                    int(np.ceil(ori_input.shape[1] / self.pad_size_divisor))
                    * self.pad_size_divisor
                )
                pad_w = (
                    int(np.ceil(ori_input.shape[2] / self.pad_size_divisor))
                    * self.pad_size_divisor
                )
                batch_pad_shape.append((pad_h, pad_w))
        # Process data with `default_collate`.
        elif isinstance(_batch_inputs, torch.Tensor):
            assert _batch_inputs.dim() == 4, (
                "The input of `ImgDataPreprocessor` should be a NCHW tensor "
                "or a list of tensor, but got a tensor with shape: "
                f"{_batch_inputs.shape}"
            )
            pad_h = (
                int(np.ceil(_batch_inputs.shape[1] / self.pad_size_divisor))
                * self.pad_size_divisor
            )
            pad_w = (
                int(np.ceil(_batch_inputs.shape[2] / self.pad_size_divisor))
                * self.pad_size_divisor
            )
            batch_pad_shape = [(pad_h, pad_w)] * _batch_inputs.shape[0]
        else:
            raise TypeError(
                "Output of `cast_data` should be a list of dict "
                "or a tuple with inputs and data_samples, but got "
                f"{type(data)}: {data}"
            )
        return batch_pad_shape


================================================
FILE: bip3d/models/data_preprocessors/utils.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Union

import torch
import torch.nn.functional as F
from torch import Tensor


def multiview_img_stack_batch(
    tensor_list: List[Tensor],
    pad_size_divisor: int = 1,
    pad_value: Union[int, float] = 0,
) -> Tensor:
    """Compared to the ``stack_batch`` in `mmengine.model.utils`,
    multiview_img_stack_batch further handle the multiview images.

    See diff of padded_sizes[:, :-2] = 0 vs padded_sizes[:, 0] = 0 in line 47.

    Stack multiple tensors to form a batch and pad the tensor to the max shape
    use the right bottom padding mode in these images. If
    ``pad_size_divisor > 0``, add padding to ensure the shape of each dim is
    divisible by ``pad_size_divisor``.

    Args:
        tensor_list (List[Tensor]): A list of tensors with the same dim.
        pad_size_divisor (int): If ``pad_size_divisor > 0``, add padding to
            ensure the shape of each dim is divisible by ``pad_size_divisor``.
            This depends on the model, and many models need to be divisible by
            32. Defaults to 1.
        pad_value (int or float): The padding value. Defaults to 0.

    Returns:
        Tensor: The n dim tensor.
    """
    assert isinstance(
        tensor_list, list
    ), f"Expected input type to be list, but got {type(tensor_list)}"
    assert tensor_list, "`tensor_list` could not be an empty list"
    assert len({tensor.ndim for tensor in tensor_list}) == 1, (
        "Expected the dimensions of all tensors must be the same, "
        f"but got {[tensor.ndim for tensor in tensor_list]}"
    )

    dim = tensor_list[0].dim()
    num_img = len(tensor_list)
    all_sizes: torch.Tensor = torch.Tensor(
        [tensor.shape for tensor in tensor_list]
    )
    max_sizes = (
        torch.ceil(torch.max(all_sizes, dim=0)[0] / pad_size_divisor)
        * pad_size_divisor
    )
    padded_sizes = max_sizes - all_sizes
    # The first dim normally means channel, which should not be padded.
    padded_sizes[:, :-2] = 0
    if padded_sizes.sum() == 0:
        return torch.stack(tensor_list)
    # `pad` is the second arguments of `F.pad`. If pad is (1, 2, 3, 4),
    # it means that padding the last dim with 1(left) 2(right), padding the
    # penultimate dim to 3(top) 4(bottom). The order of `pad` is opposite of
    # the `padded_sizes`. Therefore, the `padded_sizes` needs to be reversed,
    # and only odd index of pad should be assigned to keep padding "right" and
    # "bottom".
    pad = torch.zeros(num_img, 2 * dim, dtype=torch.int)
    pad[:, 1::2] = padded_sizes[:, range(dim - 1, -1, -1)]
    batch_tensor = []
    for idx, tensor in enumerate(tensor_list):
        batch_tensor.append(
            F.pad(tensor, tuple(pad[idx].tolist()), value=pad_value)
        )
    return torch.stack(batch_tensor)


================================================
FILE: bip3d/models/deformable_aggregation.py
================================================
from typing import List, Optional, Tuple

import numpy as np
import torch
import torch.nn as nn
from torch.cuda.amp.autocast_mode import autocast

from mmcv.cnn import build_norm_layer
from mmcv.cnn import Linear
from mmengine.model import (
    Sequential,
    BaseModule,
    xavier_init,
    constant_init,
    ModuleList,
)
from mmcv.cnn.bricks.transformer import FFN

from ..ops import deformable_aggregation_func
from .utils import linear_act_ln
from bip3d.registry import MODELS

__all__ = [
    "DeformableFeatureAggregation",
]


@MODELS.register_module()
class DeformableFeatureAggregation(BaseModule):
    def __init__(
        self,
        embed_dims: int = 256,
        num_groups: int = 8,
        num_levels: int = 4,
        num_cams: int = 6,
        proj_drop: float = 0.0,
        attn_drop: float = 0.0,
        kps_generator: dict = None,
        temporal_fusion_module=None,
        use_temporal_anchor_embed=True,
        use_deformable_func=False,
        use_camera_embed=False,
        residual_mode="add",
        batch_first=True,
        ffn_cfg=None,
        with_value_proj=False,
        filter_outlier=False,
        with_depth=False,
        min_depth=None,
        max_depth=None,
    ):
        super(DeformableFeatureAggregation, self).__init__()
        assert batch_first
        if embed_dims % num_groups != 0:
            raise ValueError(
                f"embed_dims must be divisible by num_groups, "
                f"but got {embed_dims} and {num_groups}"
            )
        self.group_dims = int(embed_dims / num_groups)
        self.embed_dims = embed_dims
        self.num_levels = num_levels
        self.num_groups = num_groups
        self.num_cams = num_cams
        self.use_temporal_anchor_embed = use_temporal_anchor_embed
        self.use_deformable_func = use_deformable_func
        self.attn_drop = attn_drop
        self.residual_mode = residual_mode
        self.proj_drop = nn.Dropout(proj_drop)
        if kps_generator is not None:
            kps_generator["embed_dims"] = embed_dims
            self.kps_generator = MODELS.build(kps_generator)
            self.num_pts = self.kps_generator.num_pts
        else:
            self.kps_generator = None
            self.num_pts = 1
        if temporal_fusion_module is not None:
            if "embed_dims" not in temporal_fusion_module:
                temporal_fusion_module["embed_dims"] = embed_dims
            self.temp_module = MODELS.build(temporal_fusion_module)
        else:
            self.temp_module = None
        self.output_proj = Linear(embed_dims, embed_dims)

        if use_camera_embed:
            self.camera_encoder = Sequential(
                *linear_act_ln(embed_dims, 1, 2, 12)
            )
            self.weights_fc = Linear(
                embed_dims, num_groups * num_levels * self.num_pts
            )
        else:
            self.camera_encoder = None
            self.weights_fc = Linear(
                embed_dims, num_groups * num_cams * num_levels * self.num_pts
            )
        if ffn_cfg is not None:
            ffn_cfg["embed_dims"] = embed_dims
            self.ffn = FFN(**ffn_cfg)
            self.norms = ModuleList(
                build_norm_layer(dict(type="LN"), embed_dims)[1]
                for _ in range(2)
            )
        else:
            self.ffn = None
        self.with_value_proj = with_value_proj
        self.filter_outlier = filter_outlier
        if self.with_value_proj:
            self.value_proj = Linear(embed_dims, embed_dims)
        self.with_depth = with_depth
        if self.with_depth:
            assert min_depth is not None and max_depth is not None
            self.min_depth = min_depth
            self.max_depth = max_depth

    def init_weight(self):
        constant_init(self.weights_fc, val=0.0, bias=0.0)
        xavier_init(self.output_proj, distribution="uniform", bias=0.0)

    def get_spatial_shape_3D(self, spatial_shape, depth_dim):
        spatial_shape_depth = spatial_shape.new_ones(*spatial_shape.shape[:-1], 1) * depth_dim
        spatial_shape_3D = torch.cat([spatial_shape, spatial_shape_depth], dim=-1)
        return spatial_shape_3D.contiguous()

    def forward(
        self,
        instance_feature: torch.Tensor,
        anchor: torch.Tensor,
        anchor_embed: torch.Tensor,
        feature_maps: List[torch.Tensor],
        metas: dict,
        depth_prob=None,
        **kwargs: dict,
    ):
        bs, num_anchor = instance_feature.shape[:2]
        if self.kps_generator is not None:
            key_points = self.kps_generator(anchor, instance_feature)
        else:
            key_points = anchor[:, :, None]

        points_2d, depth, mask = self.project_points(
            key_points,
            metas["projection_mat"],
            metas.get("image_wh"),
        )
        weights = self._get_weights(
            instance_feature, anchor_embed, metas, mask
        )

        if self.use_deformable_func:
            if self.with_value_proj:
                feature_maps[0] = self.value_proj(feature_maps[0])

            points_2d = points_2d.permute(0, 2, 3, 1, 4).reshape(
                bs, num_anchor * self.num_pts, -1, 2
            )
            weights = (
                weights.permute(0, 1, 4, 2, 3, 5)
                .contiguous()
                .reshape(
                    bs,
                    num_anchor * self.num_pts,
                    -1,
                    self.num_levels,
                    self.num_groups,
                )
            )
            if self.with_depth:
                depth = depth.permute(0, 2, 3, 1).reshape(
                    bs, num_anchor * self.num_pts, -1, 1
                )
                # normalize depth to [0, depth_prob.shape[-1]-1]
                depth = (depth - self.min_depth) / (self.max_depth - self.min_depth)
                depth = depth * (depth_prob.shape[-1] - 1)
                features = deformable_aggregation_func(
                    *feature_maps, points_2d, weights, depth_prob, depth
                )
            else:
                features = deformable_aggregation_func(*feature_maps, points_2d, weights)
            features = features.reshape(bs, num_anchor, self.num_pts, self.embed_dims)
            features = features.sum(dim=2)
        else:
            assert False
            assert not self.with_value_proj
            features = self.feature_sampling(
                feature_maps,
                key_points,
                metas["projection_mat"],
                metas.get("image_wh"),
            )
            features = self.multi_view_level_fusion(features, weights)
            features = features.sum(dim=2)  # fuse multi-point features
        output = self.proj_drop(self.output_proj(features))
        if self.residual_mode == "add":
            output = output + instance_feature
        elif self.residual_mode == "cat":
            output = torch.cat([output, instance_feature], dim=-1)
        if self.ffn is not None:
            output = self.norms[0](output)
            output = self.ffn(output)
            output = self.norms[1](output)
        return output

    def _get_weights(
        self, instance_feature, anchor_embed, metas=None, mask=None
    ):
        bs, num_anchor = instance_feature.shape[:2]
        feature = instance_feature + anchor_embed
        if self.camera_encoder is not None:
            num_cams = metas["projection_mat"].shape[1]
            camera_embed = self.camera_encoder(
                metas["projection_mat"][:, :, :3].reshape(bs, num_cams, -1)
            )
            feature = feature[:, :, None] + camera_embed[:, None]

        weights = self.weights_fc(feature)
        if mask is not None and self.filter_outlier:
            num_cams = weights.shape[2]
            mask = mask.permute(0, 2, 1, 3)[..., None, :, None]
            weights = weights.reshape(
                bs,
                num_anchor,
                num_cams,
                self.num_levels,
                self.num_pts,
                self.num_groups,
            )
            weights = weights.masked_fill(
                torch.logical_and(~mask, mask.sum(dim=2, keepdim=True) != 0),
                float("-inf"),
            )
        weights = (
            weights.reshape(bs, num_anchor, -1, self.num_groups)
            .softmax(dim=-2)
            .reshape(
                bs,
                num_anchor,
                -1,
                self.num_levels,
                self.num_pts,
                self.num_groups,
            )
        )
        if self.training and self.attn_drop > 0:
            mask = torch.rand(bs, num_anchor, -1, 1, self.num_pts, 1)
            mask = mask.to(device=weights.device, dtype=weights.dtype)
            weights = ((mask > self.attn_drop) * weights) / (
                1 - self.attn_drop
            )
        return weights

    @staticmethod
    def project_points(key_points, projection_mat, image_wh=None):
        bs, num_anchor, num_pts = key_points.shape[:3]

        pts_extend = torch.cat(
            [key_points, torch.ones_like(key_points[..., :1])], dim=-1
        )
        points_2d = torch.matmul(
            projection_mat[:, :, None, None], pts_extend[:, None, ..., None]
        ).squeeze(-1)
        depth = points_2d[..., 2]
        mask = depth > 1e-5
        points_2d = points_2d[..., :2] / torch.clamp(
            points_2d[..., 2:3], min=1e-5
        )
        mask = mask & (points_2d[..., 0] > 0) & (points_2d[..., 1] > 0)
        if image_wh is not None:
            points_2d = points_2d / image_wh[:, :, None, None]
            mask = mask & (points_2d[..., 0] < 1) & (points_2d[..., 1] < 1)
        return points_2d, depth, mask

    @staticmethod
    def feature_sampling(
        feature_maps: List[torch.Tensor],
        key_points: torch.Tensor,
        projection_mat: torch.Tensor,
        image_wh: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        num_levels = len(feature_maps)
        num_cams = feature_maps[0].shape[1]
        bs, num_anchor, num_pts = key_points.shape[:3]

        points_2d = DeformableFeatureAggregation.project_points(
            key_points, projection_mat, image_wh
        )
        points_2d = points_2d * 2 - 1
        points_2d = points_2d.flatten(end_dim=1)

        features = []
        for fm in feature_maps:
            features.append(
                torch.nn.functional.grid_sample(
                    fm.flatten(end_dim=1), points_2d
                )
            )
        features = torch.stack(features, dim=1)
        features = features.reshape(
            bs, num_cams, num_levels, -1, num_anchor, num_pts
        ).permute(
            0, 4, 1, 2, 5, 3
        )  # bs, num_anchor, num_cams, num_levels, num_pts, embed_dims

        return features

    def multi_view_level_fusion(
        self,
        features: torch.Tensor,
        weights: torch.Tensor,
    ):
        bs, num_anchor = weights.shape[:2]
        features = weights[..., None] * features.reshape(
            features.shape[:-1] + (self.num_groups, self.group_dims)
        )
        features = features.sum(dim=2).sum(dim=2)
        features = features.reshape(
            bs, num_anchor, self.num_pts, self.embed_dims
        )
        return features


================================================
FILE: bip3d/models/feature_enhancer.py
================================================
import torch
from torch import nn

from mmengine.model import BaseModel
from mmdet.models.layers.transformer.utils import get_text_sine_pos_embed
from mmdet.models.layers import SinePositionalEncoding
from mmdet.models.layers.transformer.deformable_detr_layers import (
    DeformableDetrTransformerEncoder as DDTE,
)
from mmdet.models.layers.transformer.deformable_detr_layers import (
    DeformableDetrTransformerEncoderLayer,
)
from mmdet.models.layers.transformer.detr_layers import (
    DetrTransformerEncoderLayer,
)
from mmdet.models.utils.vlfuse_helper import SingleScaleBiAttentionBlock
from bip3d.registry import MODELS

from .utils import deformable_format


@MODELS.register_module()
class TextImageDeformable2DEnhancer(BaseModel):
    def __init__(
        self,
        num_layers,
        text_img_attn_block,
        img_attn_block,
        text_attn_block=None,
        embed_dims=256,
        num_feature_levels=4,
        positional_encoding=None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.num_layers = num_layers
        self.num_feature_levels = num_feature_levels
        self.embed_dims = embed_dims
        self.positional_encoding = positional_encoding
        self.text_img_attn_blocks = nn.ModuleList()
        self.img_attn_blocks = nn.ModuleList()
        self.text_attn_blocks = nn.ModuleList()
        for _ in range(self.num_layers):
            self.text_img_attn_blocks.append(
                SingleScaleBiAttentionBlock(**text_img_attn_block)
            )
            self.img_attn_blocks.append(
                DeformableDetrTransformerEncoderLayer(**img_attn_block)
            )
            self.text_attn_blocks.append(
                DetrTransformerEncoderLayer(**text_attn_block)
            )
        self.positional_encoding = SinePositionalEncoding(
            **self.positional_encoding
        )
        self.level_embed = nn.Parameter(
            torch.Tensor(self.num_feature_levels, self.embed_dims)
        )

    def forward(
        self,
        feature_maps,
        text_dict=None,
        **kwargs,
    ):
        with_cams = feature_maps[0].dim() == 5
        if with_cams:
            bs, num_cams = feature_maps[0].shape[:2]
            feature_maps = [x.flatten(0, 1) for x in feature_maps]
        else:
            bs = feature_maps[0].shape[0]
            num_cams = 1
        pos_2d = self.get_2d_position_embed(feature_maps)
        feature_2d, spatial_shapes, level_start_index = deformable_format(
            feature_maps
        )

        reference_points = DDTE.get_encoder_reference_points(
            spatial_shapes,
            valid_ratios=feature_2d.new_ones(
                [bs * num_cams, self.num_feature_levels, 2]
            ),
            device=feature_2d.device,
        )

        text_feature = text_dict["embedded"]
        pos_text = get_text_sine_pos_embed(
            text_dict["position_ids"][..., None],
            num_pos_feats=self.embed_dims,
            exchange_xy=False,
        )

        for layer_id in range(self.num_layers):
            feature_2d_fused = feature_2d[:, level_start_index[-1] :]
            if with_cams:
                feature_2d_fused = feature_2d_fused.unflatten(
                    0, (bs, num_cams)
                )
                feature_2d_fused = feature_2d_fused.flatten(1, 2)
            feature_2d_fused, text_feature = self.text_img_attn_blocks[
                layer_id
            ](
                feature_2d_fused,
                text_feature,
                attention_mask_l=text_dict["text_token_mask"],
            )
            if with_cams:
                feature_2d_fused = feature_2d_fused.unflatten(
                    1, (num_cams, -1)
                )
                feature_2d_fused = feature_2d_fused.flatten(0, 1)
            feature_2d = torch.cat(
                [feature_2d[:, : level_start_index[-1]], feature_2d_fused],
                dim=1,
            )

            feature_2d = self.img_attn_blocks[layer_id](
                query=feature_2d,
                query_pos=pos_2d,
                reference_points=reference_points,
                spatial_shapes=spatial_shapes,
                level_start_index=level_start_index,
                key_padding_mask=None,
            )

            text_attn_mask = text_dict.get("masks")
            if text_attn_mask is not None:
                text_num_heads = self.text_attn_blocks[
                    layer_id
                ].self_attn_cfg.num_heads
                text_attn_mask = ~text_attn_mask.repeat(text_num_heads, 1, 1)
            text_feature = self.text_attn_blocks[layer_id](
                query=text_feature,
                query_pos=pos_text,
                attn_mask=text_attn_mask,
                key_padding_mask=None,
            )
        feature_2d = deformable_format(
            feature_2d, spatial_shapes, batch_size=bs if with_cams else None
        )
        return feature_2d, text_feature

    def get_2d_position_embed(self, feature_maps):
        pos_2d = []
        for lvl, feat in enumerate(feature_maps):
            batch_size, c, h, w = feat.shape
            pos = self.positional_encoding(None, feat)
            pos = pos.view(batch_size, c, h * w).permute(0, 2, 1)
            pos = pos + self.level_embed[lvl]
            pos_2d.append(pos)
        pos_2d = torch.cat(pos_2d, 1)
        return pos_2d


================================================
FILE: bip3d/models/instance_bank.py
================================================
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np

from bip3d.registry import MODELS

__all__ = ["InstanceBank"]


def topk(confidence, k, *inputs):
    bs, N = confidence.shape[:2]
    confidence, indices = torch.topk(confidence, k, dim=1)
    indices = (
        indices + torch.arange(bs, device=indices.device)[:, None] * N
    ).reshape(-1)
    outputs = []
    for input in inputs:
        outputs.append(input.flatten(end_dim=1)[indices].reshape(bs, k, -1))
    return confidence, outputs


@MODELS.register_module()
class InstanceBank(nn.Module):
    def __init__(
        self,
        num_anchor,
        embed_dims,
        anchor,
        anchor_handler=None,
        num_current_instance=None,
        num_temp_instances=0,
        default_time_interval=0.5,
        confidence_decay=0.6,
        anchor_grad=True,
        feat_grad=True,
        max_time_interval=2,
        anchor_in_camera=True,
    ):
        super(InstanceBank, self).__init__()
        self.embed_dims = embed_dims
        self.num_current_instance = num_current_instance
        self.num_temp_instances = num_temp_instances
        self.default_time_interval = default_time_interval
        self.confidence_decay = confidence_decay
        self.max_time_interval = max_time_interval

        if anchor_handler is not None:
            anchor_handler = MODELS.build(anchor_handler)
            assert hasattr(anchor_handler, "anchor_projection")
        self.anchor_handler = anchor_handler
        if isinstance(anchor, str):
            anchor = np.load(anchor)
        elif isinstance(anchor, (list, tuple)):
            anchor = np.array(anchor)
        if len(anchor.shape) == 3:  # for map
            anchor = anchor.reshape(anchor.shape[0], -1)
        self.num_anchor = min(len(anchor), num_anchor)
        self.anchor = anchor[:num_anchor]
        # self.anchor = nn.Parameter(
        #     torch.tensor(anchor, dtype=torch.float32),
        #     requires_grad=anchor_grad,
        # )
        self.anchor_init = anchor
        self.instance_feature = nn.Parameter(
            torch.zeros([1, self.embed_dims]),
            # torch.zeros([self.anchor.shape[0], self.embed_dims]),
            requires_grad=feat_grad,
        )
        self.anchor_in_camera = anchor_in_camera
        self.reset()

    def init_weight(self):
        self.anchor.data = self.anchor.data.new_tensor(self.anchor_init)
        if self.instance_feature.requires_grad:
            torch.nn.init.xavier_uniform_(self.instance_feature.data, gain=1)

    def reset(self):
        self.cached_feature = None
        self.cached_anchor = None
        self.metas = None
        self.mask = None
        self.confidence = None
        self.temp_confidence = None
        self.instance_id = None
        self.prev_id = 0

    def bbox_transform(self, bbox, matrix):
        # bbox: bs, n, 9
        # matrix: bs, cam, 4, 4
        # output: bs, n*cam, 9
        bbox = bbox.unsqueeze(dim=2)
        matrix = matrix.unsqueeze(dim=1)
        points = bbox[..., :3]
        points_extend = torch.concat(
            [points, torch.ones_like(points[..., :1])], dim=-1
        )
        points_trans = torch.matmul(matrix, points_extend[..., None])[
            ..., :3, 0
        ]

        size = bbox[..., 3:6].tile(1, 1, points_trans.shape[2], 1)
        angle = bbox[..., 6:].tile(1, 1, points_trans.shape[2], 1)

        bbox = torch.cat([points_trans, size, angle], dim=-1)
        bbox = bbox.flatten(1, 2)
        return bbox

    def get(self, batch_size, metas=None, dn_metas=None):
        instance_feature = torch.tile(
            self.instance_feature[None], (batch_size, self.anchor.shape[0], 1)
        )
        anchor = torch.tile(
            instance_feature.new_tensor(self.anchor)[None], (batch_size, 1, 1)
        )

        if self.anchor_in_camera:
            cam2global = np.linalg.inv(metas["extrinsic"].cpu().numpy())
            cam2global = torch.from_numpy(cam2global).to(anchor)
            anchor = self.bbox_transform(anchor, cam2global)
            instance_feature = instance_feature.tile(1, cam2global.shape[1], 1)

        if (
            self.cached_anchor is not None
            and batch_size == self.cached_anchor.shape[0]
        ):
            # assert False, "TODO: linxuewu"
            # history_time = self.metas["timestamp"]
            # time_interval = metas["timestamp"] - history_time
            # time_interval = time_interval.to(dtype=instance_feature.dtype)
            # self.mask = torch.abs(time_interval) <= self.max_time_interval

            last_scan_id = self.metas["scan_id"]
            current_scan_id = metas["scan_id"]
            self.mask = torch.tensor(
                [x==y for x,y in zip(last_scan_id, current_scan_id)],
                device=anchor.device,
            )
            assert self.mask.shape[0] == 1
            if not self.mask:
                self.reset()
            time_interval = instance_feature.new_tensor(
                [self.default_time_interval] * batch_size
            )
        else:
            self.reset()
            time_interval = instance_feature.new_tensor(
                [self.default_time_interval] * batch_size
            )

        return (
            instance_feature,
            anchor,
            self.cached_feature,
            self.cached_anchor,
            time_interval,
        )

    def update(self, instance_feature, anchor, confidence, num_dn=None):
        if self.cached_feature is None:
            return instance_feature, anchor

        if num_dn is not None and num_dn > 0:
            dn_instance_feature = instance_feature[:, -num_dn:]
            dn_anchor = anchor[:, -num_dn:]
            instance_feature = instance_feature[:, : -num_dn]
            anchor = anchor[:, : -num_dn]
            confidence = confidence[:, : -num_dn]

        N = self.num_current_instance
        if N is not None and N < confidence.shape[1]:
            confidence = confidence.max(dim=-1).values
            _, (selected_feature, selected_anchor) = topk(
                confidence, N, instance_feature, anchor
            )
        else:
            selected_feature, selected_anchor = instance_feature, anchor
        instance_feature = torch.cat(
            [self.cached_feature, selected_feature], dim=1
        )
        anchor = torch.cat(
            [self.cached_anchor, selected_anchor], dim=1
        )

        if num_dn is not None and num_dn > 0:
            instance_feature = torch.cat(
                [instance_feature, dn_instance_feature], dim=1
            )
            anchor = torch.cat([anchor, dn_anchor], dim=1)
        return instance_feature, anchor

    def cache(
        self,
        instance_feature,
        anchor,
        confidence,
        metas=None,
        feature_maps=None,
    ):
        if self.num_temp_instances <= 0:
            return
        instance_feature = instance_feature.detach()
        anchor = anchor.detach()
        confidence = confidence.detach()

        self.metas = metas
        confidence = confidence.max(dim=-1).values.sigmoid()
        if self.confidence is not None:
            N = self.confidence.shape[1]
            confidence[:, : N] = torch.maximum(
                self.confidence * self.confidence_decay,
                confidence[:, : N],
            )
        self.temp_confidence = confidence

        if self.num_temp_instances < confidence.shape[1]:
            (
                self.confidence,
                (self.cached_feature, self.cached_anchor),
            ) = topk(
                confidence, self.num_temp_instances, instance_feature, anchor
            )
        else:
            self.confidence, self.cached_feature, self.cached_anchor = (
                confidence, instance_feature, anchor
            )

    def get_instance_id(self, confidence, anchor=None, threshold=None):
        confidence = confidence.max(dim=-1).values.sigmoid()
        instance_id = confidence.new_full(confidence.shape, -1).long()

        if (
            self.instance_id is not None
            and self.instance_id.shape[0] == instance_id.shape[0]
        ):
            instance_id[:, : self.instance_id.shape[1]] = self.instance_id

        mask = instance_id < 0
        if threshold is not None:
            mask = mask & (confidence >= threshold)
        num_new_instance = mask.sum()
        new_ids = torch.arange(num_new_instance).to(instance_id) + self.prev_id
        instance_id[torch.where(mask)] = new_ids
        self.prev_id += num_new_instance
        self.update_instance_id(instance_id, confidence)
        return instance_id

    def update_instance_id(self, instance_id=None, confidence=None):
        if self.temp_confidence is None:
            if confidence.dim() == 3:  # bs, num_anchor, num_cls
                temp_conf = confidence.max(dim=-1).values
            else:  # bs, num_anchor
                temp_conf = confidence
        else:
            temp_conf = self.temp_confidence
        instance_id = topk(temp_conf, self.num_temp_instances, instance_id)[1][
            0
        ]
        instance_id = instance_id.squeeze(dim=-1)
        self.instance_id = F.pad(
            instance_id,
            (0, self.num_anchor - self.num_temp_instances),
            value=-1,
        )


================================================
FILE: bip3d/models/spatial_enhancer.py
================================================
import torch
from torch import nn

from mmengine.model import BaseModel
from mmdet.models.layers.transformer.utils import MLP
from mmcv.cnn.bricks.transformer import FFN

from .utils import deformable_format
from bip3d.registry import MODELS


@MODELS.register_module()
class DepthFusionSpatialEnhancer(BaseModel):
    def __init__(
        self,
        embed_dims=256,
        feature_3d_dim=32,
        num_depth_layers=2,
        min_depth=0.25,
        max_depth=10,
        num_depth=64,
        with_feature_3d=True,
        loss_depth_weight=-1,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.embed_dims = embed_dims
        self.feature_3d_dim = feature_3d_dim
        self.num_depth_layers = num_depth_layers
        self.min_depth = min_depth
        self.max_depth = max_depth
        self.num_depth = num_depth
        self.with_feature_3d = with_feature_3d
        self.loss_depth_weight = loss_depth_weight

        fusion_dim = self.embed_dims + self.feature_3d_dim
        if self.with_feature_3d:
            self.pts_prob_pre_fc = nn.Linear(
                self.embed_dims, self.feature_3d_dim
            )
            dim = self.feature_3d_dim * 2
            fusion_dim += self.feature_3d_dim
        else:
            dim = self.embed_dims
        self.pts_prob_fc = MLP(
            dim,
            dim,
            self.num_depth,
            self.num_depth_layers,
        )
        self.pts_fc = nn.Linear(3, self.feature_3d_dim)
        self.fusion_fc = nn.Sequential(
            FFN(embed_dims=fusion_dim, feedforward_channels=1024),
            nn.Linear(fusion_dim, self.embed_dims),
        )
        self.fusion_norm = nn.LayerNorm(self.embed_dims)

    def forward(
        self,
        feature_maps,
        feature_3d=None,
        batch_inputs=None,
        **kwargs,
    ):
        with_cams = feature_maps[0].dim() == 5
        if with_cams:
            bs, num_cams = feature_maps[0].shape[:2]
        else:
            bs = feature_maps[0].shape[0]
            num_cams = 1

        feature_2d, spatial_shapes, _ = deformable_format(feature_maps)
        pts = self.get_pts(
            spatial_shapes,
            batch_inputs["image_wh"][0, 0],
            batch_inputs["projection_mat"],
            feature_2d.device,
            feature_2d.dtype,
        )

        if self.with_feature_3d:
            feature_3d = deformable_format(feature_3d)[0]
            depth_prob_feat = self.pts_prob_pre_fc(feature_2d)
            depth_prob_feat = torch.cat([depth_prob_feat, feature_3d], dim=-1)
            depth_prob = self.pts_prob_fc(depth_prob_feat).softmax(dim=-1)
            feature_fused = [feature_2d, feature_3d]
        else:
            depth_prob = self.pts_prob_fc(feature_2d).softmax(dim=-1)
            feature_fused = [feature_2d]

        pts_feature = self.pts_fc(pts)
        pts_feature = (depth_prob.unsqueeze(dim=-1) * pts_feature).sum(dim=-2)
        feature_fused.append(pts_feature)
        feature_fused = torch.cat(feature_fused, dim=-1)
        feature_fused = self.fusion_fc(feature_fused) + feature_2d
        feature_fused = self.fusion_norm(feature_fused)
        feature_fused = deformable_format(feature_fused, spatial_shapes)
        if self.loss_depth_weight > 0 and self.training:
            loss_depth = self.depth_prob_loss(depth_prob, batch_inputs)
        else:
            loss_depth = None
        return feature_fused, depth_prob, loss_depth

    def get_pts(self, spatial_shapes, image_wh, projection_mat, device, dtype):
        pixels = []
        for i, shape in enumerate(spatial_shapes):
            stride = image_wh[0] / shape[1]
            u = torch.linspace(
                0, image_wh[0] - stride, shape[1], device=device, dtype=dtype
            )
            v = torch.linspace(
                0, image_wh[1] - stride, shape[0], device=device, dtype=dtype
            )
            u = u[None].tile(shape[0], 1)
            v = v[:, None].tile(1, shape[1])
            uv = torch.stack([u, v], dim=-1).flatten(0, 1)
            pixels.append(uv)
        pixels = torch.cat(pixels, dim=0)[:, None]
        depths = torch.linspace(
            self.min_depth,
            self.max_depth,
            self.num_depth,
            device=device,
            dtype=dtype,
        )
        depths = depths[None, :, None]
        pts = pixels * depths
        depths = depths.tile(pixels.shape[0], 1, 1)
        pts = torch.cat([pts, depths, torch.ones_like(depths)], dim=-1)

        pts = torch.linalg.solve(
            projection_mat.mT.unsqueeze(dim=2), pts, left=False
        )[
            ..., :3
        ]  # b,cam,N,3
        return pts

    def depth_prob_loss(self, depth_prob, batch_inputs):
        mask = batch_inputs["depth_prob_gt"][..., 0] != 1
        loss_depth = (
            torch.nn.functional.binary_cross_entropy(
                depth_prob[mask], batch_inputs["depth_prob_gt"][mask]
            )
            * self.loss_depth_weight
        )
        return loss_depth


================================================
FILE: bip3d/models/structure.py
================================================
from torch import nn

from mmdet.models.detectors import BaseDetector
from mmdet.models.detectors.deformable_detr import (
    MultiScaleDeformableAttention,
)

from bip3d.registry import MODELS


@MODELS.register_module()
class BIP3D(BaseDetector):
    def __init__(
        self,
        backbone,
        decoder,
        neck=None,
        text_encoder=None,
        feature_enhancer=None,
        spatial_enhancer=None,
        data_preprocessor=None,
        backbone_3d=None,
        neck_3d=None,
        init_cfg=None,
        input_2d="imgs",
        input_3d="depth_img",
        embed_dims=256,
        use_img_grid_mask=False,
        use_depth_grid_mask=False,
    ):
        super().__init__(data_preprocessor, init_cfg)

        def build(cfg):
            if cfg is None:
                return None
            return MODELS.build(cfg)

        self.backbone = build(backbone)
        self.decoder = build(decoder)
        self.neck = build(neck)
        self.text_encoder = build(text_encoder)
        self.feature_enhancer = build(feature_enhancer)
        self.spatial_enhancer = build(spatial_enhancer)
        self.backbone_3d = build(backbone_3d)
        self.neck_3d = build(neck_3d)
        self.input_2d = input_2d
        self.input_3d = input_3d
        self.embed_dims = embed_dims
        if text_encoder is not None:
            self.text_feat_map = nn.Linear(
                self.text_encoder.language_backbone.body.language_dim,
                self.embed_dims,
                bias=True,
            )

        self.use_img_grid_mask = use_img_grid_mask
        self.use_depth_grid_mask = use_depth_grid_mask
        if use_depth_grid_mask or use_img_grid_mask:
            from ..grid_mask import GridMask

            self.grid_mask = GridMask(
                True, True, rotate=1, offset=False, ratio=0.5, mode=1, prob=0.7
            )

    def init_weights(self):
        """Initialize weights for Transformer and other components."""
        for p in self.feature_enhancer.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
        self.decoder.init_weights()
        for m in self.modules():
            if isinstance(m, MultiScaleDeformableAttention):
                m.init_weights()
        nn.init.normal_(self.feature_enhancer.level_embed)
        nn.init.constant_(self.text_feat_map.bias.data, 0)
        nn.init.xavier_uniform_(self.text_feat_map.weight.data)

    def extract_feat(self, batch_inputs_dict, batch_data_samples):
        imgs = batch_inputs_dict.get(self.input_2d)
        if imgs.dim() == 5:
            bs, num_cams = imgs.shape[:2]
            imgs = imgs.flatten(end_dim=1)
        else:
            bs = imgs.shape[0]
            num_cams = 1

        if self.use_img_grid_mask and self.training:
            img = self.grid_mask(
                img,
                offset=-self.data_preprocessor.mean
                / self.data_preprocessor.std,
            )
        feature_maps = self.backbone(imgs)
        if self.neck is not None:
            feature_maps = self.neck(feature_maps)
        feature_maps = [x.unflatten(0, (bs, num_cams)) for x in feature_maps]

        input_3d = batch_inputs_dict.get(self.input_3d)
        if self.backbone_3d is not None and input_3d is not None:
            if self.input_3d == "depth_img" and input_3d.dim() == 5:
                assert input_3d.shape[1] == num_cams
                input_3d = input_3d.flatten(end_dim=1)
            if self.use_depth_grid_mask and self.training:
                input_3d = self.grid_mask(input_3d)
            feature_3d = self.backbone_3d(input_3d)
            if self.neck_3d is not None:
                feature_3d = self.neck_3d(feature_3d)
            feature_3d = [x.unflatten(0, (bs, num_cams)) for x in feature_3d]
        else:
            feature_3d = None
        return feature_maps, feature_3d

    def extract_text_feature(self, batch_inputs_dict):
        if self.text_encoder is not None:
            text_dict = self.text_encoder(batch_inputs_dict["text"])
            text_dict["embedded"] = self.text_feat_map(text_dict["embedded"])
        else:
            text_dict = None
        return text_dict

    def loss(self, batch_inputs, batch_data_samples):
        model_outs, text_dict, loss_depth = self._forward(
            batch_inputs, batch_data_samples
        )
        loss = self.decoder.loss(model_outs, batch_inputs, text_dict=text_dict)
        if loss_depth is not None:
            loss["loss_depth"] = loss_depth
        return loss

    def predict(self, batch_inputs, batch_data_samples):
        model_outs, text_dict = self._forward(batch_inputs, batch_data_samples)
        results = self.decoder.post_process(
            model_outs, text_dict, batch_inputs, batch_data_samples
        )
        return results

    def _forward(self, batch_inputs, batch_data_samples):
        feature_maps, feature_3d = self.extract_feat(
            batch_inputs, batch_data_samples
        )
        text_dict = self.extract_text_feature(batch_inputs)
        if self.feature_enhancer is not None:
            feature_maps, text_feature = self.feature_enhancer(
                feature_maps=feature_maps,
                feature_3d=feature_3d,
                text_dict=text_dict,
                batch_inputs=batch_inputs,
                batch_data_samples=batch_data_samples,
            )
            text_dict["embedded"] = text_feature
        if self.spatial_enhancer is not None:
            feature_maps, depth_prob, loss_depth = self.spatial_enhancer(
                feature_maps=feature_maps,
                feature_3d=feature_3d,
                text_dict=text_dict,
                batch_inputs=batch_inputs,
                batch_data_samples=batch_data_samples,
            )
        else:
            loss_depth = depth_prob = None
        model_outs = self.decoder(
            feature_maps=feature_maps,
            feature_3d=feature_3d,
            text_dict=text_dict,
            batch_inputs=batch_inputs,
            batch_data_samples=batch_data_samples,
            depth_prob=depth_prob,
        )
        if self.training:
            return model_outs, text_dict, loss_depth
        return model_outs, text_dict


================================================
FILE: bip3d/models/target.py
================================================
import torch
from torch import nn
import numpy as np
import torch.nn.functional as F
from scipy.optimize import linear_sum_assignment

from .base_target import BaseTargetWithDenoising
from .utils import (
    wasserstein_distance,
    permutation_corner_distance,
    center_distance,
    get_positive_map,
)
from bip3d.registry import TASK_UTILS


__all__ = ["Grounding3DTarget"]


X, Y, Z, W, L, H, ALPHA, BETA, GAMMA = range(9)


@TASK_UTILS.register_module()
class Grounding3DTarget(BaseTargetWithDenoising):
    def __init__(
        self,
        cls_weight=1.0,
        alpha=0.25,
        gamma=2,
        eps=1e-12,
        box_weight=1.0,
        cls_wise_reg_weights=None,
        num_dn=0,
        dn_noise_scale=0.5,
        add_neg_dn=True,
        num_temp_dn_groups=0,
        with_dn_query=False,
        num_classes=None,
        embed_dims=256,
        label_noise_scale=0.5,
        cost_weight_wd=1.0,
        cost_weight_pcd=0.0,
        cost_weight_cd=0.8,
        use_ignore_mask=False,
    ):
        super(Grounding3DTarget, self).__init__(num_dn, num_temp_dn_groups)
        self.cls_weight = cls_weight
        self.box_weight = box_weight
        self.alpha = alpha
        self.gamma = gamma
        self.eps = eps
        self.cls_wise_reg_weights = cls_wise_reg_weights
        self.dn_noise_scale = dn_noise_scale
        self.add_neg_dn = add_neg_dn
        self.with_dn_query = with_dn_query
        self.cost_weight_wd = cost_weight_wd
        self.cost_weight_pcd = cost_weight_pcd
        self.cost_weight_cd = cost_weight_cd
        self.use_ignore_mask = use_ignore_mask
        if self.with_dn_query:
            self.num_classes = num_classes
            self.embed_dims = embed_dims
            self.label_noise_scale = label_noise_scale
            self.label_embedding = nn.Embedding(self.num_classes, self.embed_dims)

    def encode_reg_target(self, box_target, device=None):
        outputs = []
        for box in box_target:
            if not isinstance(box, torch.Tensor):
                box = torch.cat(
                    [box.gravity_center, box.tensor[..., 3:]], dim=-1
                )
            output = torch.cat(
                [box[..., [X, Y, Z]], box[..., [W, L, H]], box[..., ALPHA:]],
                dim=-1,
            )
            if device is not None:
                output = output.to(device=device)
            outputs.append(output)
        return outputs

    @torch.no_grad()
    def sample(
        self,
        cls_pred,
        box_pred,
        char_positives,
        box_target,
        text_dict,
        ignore_mask=None,
    ):
        bs, num_pred, num_cls = cls_pred.shape

        token_positive_maps = get_positive_map(char_positives, text_dict)
        token_positive_maps = [
            x.to(cls_pred).bool().float() for x in token_positive_maps
        ]
        cls_cost = self._cls_cost(cls_pred, token_positive_maps, text_dict["text_token_mask"])
        box_target = self.encode_reg_target(box_target, box_pred.device)
        box_cost = self._box_cost(box_pred, box_target)

        indices = []
        for i in range(bs):
            if cls_cost[i] is not None and box_cost[i] is not None:
                cost = (cls_cost[i] + box_cost[i]).detach().cpu().numpy()
                cost = np.where(np.isneginf(cost) | np.isnan(cost), 1e8, cost)
                assign = linear_sum_assignment(cost)
                indices.append(
                    [cls_pred.new_tensor(x, dtype=torch.int64) for x in assign]
                )
            else:
                indices.append([None, None])

        output_cls_target = torch.zeros_like(cls_pred)
        output_box_target = torch.zeros_like(box_pred)
        output_reg_weights = torch.ones_like(box_pred)
        if self.use_ignore_mask:
            output_ignore_mask = torch.zeros_like(cls_pred[..., 0], dtype=torch.bool)
            ignore_mask = [output_ignore_mask.new_tensor(x) for x in ignore_mask]
        else:
            output_ignore_mask = None
        for i, (pred_idx, target_idx) in enumerate(indices):
            if len(box_target[i]) == 0:
                continue
            output_cls_target[i, pred_idx] = token_positive_maps[i][target_idx]
            output_box_target[i, pred_idx] = box_target[i][target_idx]
            if self.use_ignore_mask:
                output_ignore_mask[i, pred_idx] = ignore_mask[i][target_idx]
        self.indices = indices
        return output_cls_target, output_box_target, output_reg_weights, output_ignore_mask

    def _cls_cost(self, cls_pred, token_positive_maps, text_token_mask):
        bs = cls_pred.shape[0]
        cls_pred = cls_pred.sigmoid()
        cost = []
        for i in range(bs):
            if len(token_positive_maps[i]) > 0:
                pred = cls_pred[i][:, text_token_mask[i]]
                neg_cost = (
                    -(1 - pred + self.eps).log()
                    * (1 - self.alpha)
                    * pred.pow(self.gamma)
                )
                pos_cost = (
                    -(pred + self.eps).log()
                    * self.alpha
                    * (1 - pred).pow(self.gamma)
                )
                gt = token_positive_maps[i][:, text_token_mask[i]]
                cls_cost = torch.einsum(
                    "nc,mc->nm", pos_cost, gt
                ) + torch.einsum("nc,mc->nm", neg_cost, (1 - gt))
                cost.append(cls_cost)
            else:
                cost.append(None)
        return cost

    def _box_cost(self, box_pred, box_target):
        bs = box_pred.shape[0]
        cost = []
        for i in range(bs):
            if len(box_target[i]) > 0:
                pred = box_pred[i].unsqueeze(dim=-2)
                gt = box_target[i].unsqueeze(dim=-3)
                _cost = 0
                if self.cost_weight_wd > 0:
                    _cost += self.cost_weight_wd * wasserstein_distance(
                        pred, gt
                    )
                if self.cost_weight_pcd > 0:
                    _cost += (
                        self.cost_weight_pcd
                        * permutation_corner_distance(pred, gt)
                    )
                if self.cost_weight_cd > 0:
                    _cost += self.cost_weight_cd * center_distance(pred, gt)
                _cost *= self.box_weight
                cost.append(_cost)
            else:
                cost.append(None)
        return cost

    def get_dn_anchors(self, char_positives, box_target, text_dict, label=None, ignore_mask=None):
        if self.num_dn <= 0:
            return None
        if self.num_temp_dn_groups <= 0:
            gt_instance_id = None

        char_positives = [x[: self.num_dn] for x in char_positives]
        box_target = [x[: self.num_dn] for x in box_target]

        max_dn_gt = max([len(x) for x in char_positives] + [1])
        token_positive_maps = get_positive_map(char_positives, text_dict)
        token_positive_maps = torch.stack(
            [
                F.pad(x, (0, 0, 0, max_dn_gt - x.shape[0]), value=-1)
                for x in token_positive_maps
            ]
        )
        box_target = self.encode_reg_target(box_target, box_target[0].device)
        box_target = torch.stack(
            [F.pad(x, (0, 0, 0, max_dn_gt - x.shape[0])) for x in box_target]
        )
        token_positive_maps = token_positive_maps.to(box_target)
        box_target = torch.where(
            (token_positive_maps == -1).all(dim=-1, keepdim=True),
            box_target.new_tensor(0),
            box_target,
        )

        bs, num_gt, state_dims = box_target.shape
        num_dn_groups = self.num_dn // max(num_gt, 1)

        if num_dn_groups > 1:
            token_positive_maps = token_positive_maps.tile(num_dn_groups, 1, 1)
            box_target = box_target.tile(num_dn_groups, 1, 1)

        noise = torch.rand_like(box_target) * 2 - 1
        noise *= box_target.new_tensor(self.dn_noise_scale)
        noise[..., :3] *= box_target[..., 3:6]
        noise[..., 3:6] *= box_target[..., 3:6]
        dn_anchor = box_target + noise
        if self.add_neg_dn:
            noise_neg = torch.rand_like(box_target) + 1
            flag = torch.where(
                torch.rand_like(box_target) > 0.5,
                noise_neg.new_tensor(1),
                noise_neg.new_tensor(-1),
            )
            noise_neg *= flag
            noise_neg *= box_target.new_tensor(self.dn_noise_scale)
            noise_neg[..., :3] *= box_target[..., 3:6]
            noise_neg[..., 3:6] *= box_target[..., 3:6]
            dn_anchor = torch.cat([dn_anchor, box_target + noise_neg], dim=1)
            num_gt *= 2

        box_cost = self._box_cost(dn_anchor, box_target)
        dn_box_target = torch.zeros_like(dn_anchor)
        dn_token_positive_maps = -torch.ones_like(token_positive_maps) * 3
        if self.add_neg_dn:
            dn_token_positive_maps = torch.cat(
                [dn_token_positive_maps, dn_token_positive_maps], dim=1
            )

        for i in range(dn_anchor.shape[0]):
            if box_cost[i] is None:
                continue
            cost = box_cost[i].cpu().numpy()
            anchor_idx, gt_idx = linear_sum_assignment(cost)
            anchor_idx = dn_anchor.new_tensor(anchor_idx, dtype=torch.int64)
            gt_idx = dn_anchor.new_tensor(gt_idx, dtype=torch.int64)
            dn_box_target[i, anchor_idx] = box_target[i, gt_idx]
            dn_token_positive_maps[i, anchor_idx] = token_positive_maps[
                i, gt_idx
            ]
        dn_anchor = (
            dn_anchor.reshape(num_dn_groups, bs, num_gt, state_dims)
            .permute(1, 0, 2, 3)
            .flatten(1, 2)
        )
        dn_box_target = (
            dn_box_target.reshape(num_dn_groups, bs, num_gt, state_dims)
            .permute(1, 0, 2, 3)
            .flatten(1, 2)
        )
        text_length = dn_token_positive_maps.shape[-1]
        dn_token_positive_maps = (
            dn_token_positive_maps.reshape(
                num_dn_groups, bs, num_gt, text_length
            )
            .permute(1, 0, 2, 3)
            .flatten(1, 2)
        )

        valid_mask = (dn_token_positive_maps >= 0).all(dim=-1)
        if self.add_neg_dn:
            token_positive_maps = (
                torch.cat([token_positive_maps, token_positive_maps], dim=1)
                .reshape(num_dn_groups, bs, num_gt, text_length)
                .permute(1, 0, 2, 3)
                .flatten(1, 2)
            )
            valid_mask = torch.logical_or(
                valid_mask,
                (
                    (token_positive_maps >= 0).all(dim=-1)
                    & (dn_token_positive_maps == -3).all(dim=-1)
                ),
            )  # valid denotes the items is not from pad.

        attn_mask = dn_box_target.new_ones(
            num_gt * num_dn_groups, num_gt * num_dn_groups
        )
        dn_token_positive_maps = torch.clamp(dn_token_positive_maps, min=0)
        for i in range(num_dn_groups):
            start = num_gt * i
            end = start + num_gt
            attn_mask[start:end, start:end] = 0
        attn_mask = attn_mask == 1
        dn_anchor[..., 3:6] = torch.clamp(dn_anchor[..., 3:6], min=0.01).log()

        if label is not None and self.with_dn_query:
            label = torch.stack(
                [F.pad(x, (0, max_dn_gt - x.shape[0]), value=-1) for x in label]
            )
            label = label.tile(num_dn_groups, 1)
            if self.add_neg_dn:
                label = torch.cat([label, label], dim=1)
            label = label.reshape(num_dn_groups, bs, num_gt).permute(1, 0, 2).flatten(1, 2)

            label_noise_mask = torch.logical_or(
                torch.rand_like(label.float()) < self.label_noise_scale * 0.5,
                label == -1
            )
            label = torch.where(
                label_noise_mask,
                torch.randint_like(label, 0, self.num_classes),
                label,
            )
            dn_query = self.label_embedding(label)
        else:
            dn_query = None

        return (
            dn_anchor,
            dn_box_target,
            dn_token_positive_maps,
            attn_mask,
            valid_mask,
            dn_query,
        )


================================================
FILE: bip3d/models/utils.py
================================================
import torch
from torch import nn
from pytorch3d.transforms import euler_angles_to_matrix


def deformable_format(
    feature_maps,
    spatial_shapes=None,
    level_start_index=None,
    flat_batch=False,
    batch_size=None,
):
    if spatial_shapes is None:
        if flat_batch and feature_maps[0].dim() > 4:
            feature_maps = [x.flatten(end_dim=-4) for x in feature_maps]
        feat_flatten = []
        spatial_shapes = []
        for lvl, feat in enumerate(feature_maps):
            spatial_shape = torch._shape_as_tensor(feat)[-2:].to(feat.device)
            feat = feat.flatten(start_dim=-2).transpose(-1, -2)
            feat_flatten.append(feat)
            spatial_shapes.append(spatial_shape)

        # (bs, num_feat_points, dim)
        feat_flatten = torch.cat(feat_flatten, -2)
        spatial_shapes = torch.cat(spatial_shapes).view(-1, 2)
        level_start_index = torch.cat(
            (
                spatial_shapes.new_zeros((1,)),  # (num_level)
                spatial_shapes.prod(1).cumsum(0)[:-1],
            )
        )
        return feat_flatten, spatial_shapes, level_start_index
    else:
        split_size = (spatial_shapes[:, 0] * spatial_shapes[:, 1]).tolist()
        feature_maps = feature_maps.transpose(-1, -2)
        feature_maps = list(torch.split(feature_maps, split_size, dim=-1))
        for i, feat in enumerate(feature_maps):
            feature_maps[i] = feature_maps[i].unflatten(
                -1, (spatial_shapes[i, 0], spatial_shapes[i, 1])
            )
            if batch_size is not None:
                if isinstance(batch_size, int):
                    feature_maps[i] = feature_maps[i].unflatten(
                        0, (batch_size, -1)
                    )
                else:
                    feature_maps[i] = feature_maps[i].unflatten(
                        0, batch_size + (-1,)
                    )
        return feature_maps


def bbox_to_corners(bbox, permutation=False):
    assert (
        len(bbox.shape) == 2
    ), "bbox must be 2D tensor of shape (N, 6) or (N, 7) or (N, 9)"
    if bbox.shape[-1] == 6:
        rot_mat = (
            torch.eye(3, device=bbox.device)
            .unsqueeze(0)
            .repeat(bbox.shape[0], 1, 1)
        )
    elif bbox.shape[-1] == 7:
        angles = bbox[:, 6:]
        fake_angles = torch.zeros_like(angles).repeat(1, 2)
        angles = torch.cat((angles, fake_angles), dim=1)
        rot_mat = euler_angles_to_matrix(angles, "ZXY")
    elif bbox.shape[-1] == 9:
        rot_mat = euler_angles_to_matrix(bbox[:, 6:], "ZXY")
    else:
        raise NotImplementedError
    centers = bbox[:, :3].unsqueeze(1).repeat(1, 8, 1)  # shape (N, 8, 3)
    half_sizes = (
        bbox[:, 3:6].unsqueeze(1).repeat(1, 8, 1) / 2
    )  # shape (N, 8, 3)
    eight_corners_x = (
        torch.tensor([1, 1, 1, 1, -1, -1, -1, -1], device=bbox.device)
        .unsqueeze(0)
        .repeat(bbox.shape[0], 1)
    )  # shape (N, 8)
    eight_corners_y = (
        torch.tensor([1, 1, -1, -1, 1, 1, -1, -1], device=bbox.device)
        .unsqueeze(0)
        .repeat(bbox.shape[0], 1)
    )  # shape (N, 8)
    eight_corners_z = (
        torch.tensor([1, -1, 1, -1, 1, -1, 1, -1], device=bbox.device)
        .unsqueeze(0)
        .repeat(bbox.shape[0], 1)
    )  # shape (N, 8)
    eight_corners = torch.stack(
        (eight_corners_x, eight_corners_y, eight_corners_z), dim=-1
    )  # shape (N, 8, 3)
    eight_corners = eight_corners * half_sizes  # shape (N, 8, 3)
    # rot_mat: (N, 3, 3), eight_corners: (N, 8, 3)
    rotated_corners = torch.matmul(
        eight_corners, rot_mat.transpose(1, 2)
    )  # shape (N, 8, 3)
    corners = rotated_corners + centers

    if permutation:
        corners = corners[:, PERMUTE_INDEX]
    return corners


def wasserstein_distance(source, target):
    rot_mat_src = euler_angles_to_matrix(source[..., 6:9], "ZXY")
    sqrt_sigma_src = rot_mat_src @ (
        source[..., 3:6, None] * rot_mat_src.transpose(-2, -1)
    )

    rot_mat_tgt = euler_angles_to_matrix(target[..., 6:9], "ZXY")
    sqrt_sigma_tgt = rot_mat_tgt @ (
        target[..., 3:6, None] * rot_mat_tgt.transpose(-2, -1)
    )

    sigma_distance = sqrt_sigma_src - sqrt_sigma_tgt
    sigma_distance = sigma_distance.pow(2).sum(dim=-1).sum(dim=-1)
    center_distance = ((source[..., :3] - target[..., :3]) ** 2).sum(dim=-1)
    distance = sigma_distance + center_distance
    distance = distance.clamp(1e-7).sqrt()
    return distance


def permutation_corner_distance(source, target):
    source_corners = bbox_to_corners(source)  # N, 8, 3
    target_corners = bbox_to_corners(target, True)  # N, 48, 8, 3
    distance = torch.norm(
        source_corners.unsqueeze(dim=-2) - target_corners, p=2, dim=-1
    )
    distance = distance.mean(dim=-1).min(dim=-1).values
    return distance


def center_distance(source, target):
    return torch.norm(source[..., :3] - target[..., :3], p=2, dim=-1)


def get_positive_map(char_positive, text_dict):
    bs, text_length = text_dict["embedded"].shape[:2]
    tokenized = text_dict["tokenized"]
    positive_maps = []
    for i in range(bs):
        num_target = len(char_positive[i])
        positive_map = torch.zeros(
            (num_target, text_length), dtype=torch.float
        )
        for j, tok_list in enumerate(char_positive[i]):
            for beg, end in tok_list:
                try:
                    beg_pos = tokenized.char_to_token(i, beg)
                    end_pos = tokenized.char_to_token(i, end - 1)
                except Exception as e:
                    print("beg:", beg, "end:", end)
                    print("token_positive:", char_positive[i])
                    raise e
                if beg_pos is None:
                    try:
                        beg_pos = tokenized.char_to_token(i, beg + 1)
                        if beg_pos is None:
                            beg_pos = tokenized.char_to_token(i, beg + 2)
                    except Exception:
                        beg_pos = None
                if end_pos is None:
                    try:
                        end_pos = tokenized.char_to_token(i, end - 2)
                        if end_pos is None:
                            end_pos = tokenized.char_to_token(i, end - 3)
                    except Exception:
                        end_pos = None
                if beg_pos is None or end_pos is None:
                    continue

                assert beg_pos is not None and end_pos is not None
                positive_map[j, beg_pos : end_pos + 1].fill_(1)
        positive_map /= (positive_map.sum(-1)[:, None] + 1e-6)
        positive_maps.append(positive_map)
    return positive_maps


def get_entities(text, char_positive, sep_token="[SEP]"):
    batch_entities = []
    for bs_idx in range(len(char_positive)):
        entities = []
        for obj_idx in range(len(char_positive[bs_idx])):
            entity = ""
            for beg, end in char_positive[bs_idx][obj_idx]:
                if len(entity) == 0:
                    entity = text[bs_idx][beg:end]
                else:
                    entity += sep_token + text[bs_idx][beg:end]
            entities.append(entity)
        batch_entities.append(entities)
    return batch_entities


def linear_act_ln(
    embed_dims, in_loops, out_loops, input_dims=None,
    act_cfg=None,
):
    if act_cfg is None:
        act_cfg = dict(type='ReLU', inplace=True)
    from mmcv.cnn import build_activation_layer
    if input_dims is None:
        input_dims = embed_dims
    layers = []
    for _ in range(out_loops):
        for _ in range(in_loops):
            layers.append(nn.Linear(input_dims, embed_dims))
            layers.append(build_activation_layer(act_cfg))
            input_dims = embed_dims
        layers.append(nn.LayerNorm(embed_dims))
    return layers


================================================
FILE: bip3d/ops/__init__.py
================================================
from .deformable_aggregation import (
    deformable_aggregation_func,
    feature_maps_format
)


================================================
FILE: bip3d/ops/deformable_aggregation.py
================================================
import torch
from torch.autograd.function import Function, once_differentiable
from torch.cuda.amp import custom_fwd, custom_bwd

from . import deformable_aggregation_ext as da
from . import deformable_aggregation_with_depth_ext as dad


class DeformableAggregationFunction(Function):
    @staticmethod
    @custom_fwd
    def forward(
        ctx,
        mc_ms_feat,
        spatial_shape,
        scale_start_index,
        sampling_location,
        weights,
    ):
        # output: [bs, num_pts, num_embeds]
        mc_ms_feat = mc_ms_feat.contiguous().float()
        spatial_shape = spatial_shape.contiguous().int()
        scale_start_index = scale_start_index.contiguous().int()
        sampling_location = sampling_location.contiguous().float()
        weights = weights.contiguous().float()
        output = da.deformable_aggregation_forward(
            mc_ms_feat,
            spatial_shape,
            scale_start_index,
            sampling_location,
            weights,
        )
        ctx.save_for_backward(
            mc_ms_feat,
            spatial_shape,
            scale_start_index,
            sampling_location,
            weights,
        )
        return output

    @staticmethod
    @once_differentiable
    @custom_bwd
    def backward(ctx, grad_output):
        (
            mc_ms_feat,
            spatial_shape,
            scale_start_index,
            sampling_location,
            weights,
        ) = ctx.saved_tensors
        mc_ms_feat = mc_ms_feat.contiguous().float()
        spatial_shape = spatial_shape.contiguous().int()
        scale_start_index = scale_start_index.contiguous().int()
        sampling_location = sampling_location.contiguous().float()
        weights = weights.contiguous().float()

        grad_mc_ms_feat = torch.zeros_like(mc_ms_feat)
        grad_sampling_location = torch.zeros_like(sampling_location)
        grad_weights = torch.zeros_like(weights)
        da.deformable_aggregation_backward(
            mc_ms_feat,
            spatial_shape,
            scale_start_index,
            sampling_location,
            weights,
            grad_output.contiguous(),
            grad_mc_ms_feat,
            grad_sampling_location,
            grad_weights,
        )
        # print(grad_mc_ms_feat.abs().mean(), grad_sampling_location.abs().mean(), grad_weights.abs().mean())
        return (
            grad_mc_ms_feat,
            None,
            None,
            grad_sampling_location,
            grad_weights,
        )

class DeformableAggregationWithDepthFunction(Function):
    @staticmethod
    @custom_fwd
    def forward(
        ctx,
        mc_ms_feat,
        spatial_shape,
        scale_start_index,
        sampling_location,
        weights,
        num_depths,
    ):
        # output: [bs, num_pts, num_embeds]
        mc_ms_feat = mc_ms_feat.contiguous().float()
        spatial_shape = spatial_shape.contiguous().int()
        scale_start_index = scale_start_index.contiguous().int()
        sampling_location = sampling_location.contiguous().float()
        weights = weights.contiguous().float()
        output = dad.deformable_aggregation_with_depth_forward(
            mc_ms_feat,
            spatial_shape,
            scale_start_index,
            sampling_location,
            weights,
            num_depths,
        )
        ctx.save_for_backward(
            mc_ms_feat,
            spatial_shape,
            scale_start_index,
            sampling_location,
            weights,
        )
        ctx._num_depths = num_depths
        return output

    @staticmethod
    @once_differentiable
    @custom_bwd
    def backward(ctx, grad_output):
        (
            mc_ms_feat,
            spatial_shape,
            scale_start_index,
            sampling_location,
            weights,
        ) = ctx.saved_tensors
        num_depths = ctx._num_depths
        mc_ms_feat = mc_ms_feat.contiguous().float()
        spatial_shape = spatial_shape.contiguous().int()
        scale_start_index = scale_start_index.contiguous().int()
        sampling_location = sampling_location.contiguous().float()
        weights = weights.contiguous().float()

        grad_mc_ms_feat = torch.zeros_like(mc_ms_feat)
        grad_sampling_location = torch.zeros_like(sampling_location)
        grad_weights = torch.zeros_like(weights)
        dad.deformable_aggregation_with_depth_backward(
            mc_ms_feat,
            spatial_shape,
            scale_start_index,
            sampling_location,
            weights,
            num_depths,
            grad_output.contiguous(),
            grad_mc_ms_feat,
            grad_sampling_location,
            grad_weights,
        )
        # print(grad_mc_ms_feat.abs().mean(), grad_sampling_location.abs().mean(), grad_weights.abs().mean())
        # print(grad_mc_ms_feat.abs().max(), grad_sampling_location.abs().max(), grad_weights.abs().max())
        # print("")
        return (
            grad_mc_ms_feat,
            None,
            None,
            grad_sampling_location,
            grad_weights,
            None,
        )


def deformable_aggregation_func(
    mc_ms_feat,
    spatial_shape,
    scale_start_index,
    sampling_location,
    weights,
    depth_prob=None,
    depth=None
):
    if depth_prob is not None and depth is not None:
        mc_ms_feat = torch.cat([mc_ms_feat, depth_prob], dim=-1)
        sampling_location = torch.cat([sampling_location, depth], dim=-1)
        return DeformableAggregationWithDepthFunction.apply(
            mc_ms_feat,
            spatial_shape,
            scale_start_index,
            sampling_location,
            weights,
            depth_prob.shape[-1],
        )
    else:
        return DeformableAggregationFunction.apply(
            mc_ms_feat,
            spatial_shape,
            scale_start_index,
            sampling_location,
            weights,
        )
 
def feature_maps_format(feature_maps, inverse=False):
    bs, num_cams = feature_maps[0].shape[:2]
    if not inverse:
        spatial_shape = []
        scale_start_index = [0]

        col_feats = []
        for i, feat in enumerate(feature_maps):
            spatial_shape.append(feat.shape[-2:])
            scale_start_index.append(
                feat.shape[-1] * feat.shape[-2] + scale_start_index[-1]
            )
            col_feats.append(torch.reshape(
                feat, (bs, num_cams, feat.shape[2], -1)
            ))
        scale_start_index.pop()
        col_feats = torch.cat(col_feats, dim=-1).permute(0, 1, 3, 2)
        feature_maps = [
            col_feats,
            torch.tensor(
                spatial_shape,
                dtype=torch.int64,
                device=col_feats.device,
            ),
            torch.tensor(
                scale_start_index,
                dtype=torch.int64,
                device=col_feats.device,
            ),
        ]
    else:
        spatial_shape = feature_maps[1].int()
        split_size = (spatial_shape[:, 0] * spatial_shape[:, 1]).tolist()
        feature_maps = feature_maps[0].permute(0, 1, 3, 2)
        feature_maps = list(torch.split(feature_maps, split_size, dim=-1))
        for i, feat in enumerate(feature_maps):
            feature_maps[i] = feat.reshape(
                feat.shape[:3] + (spatial_shape[i, 0], spatial_shape[i, 1])
            )
    return feature_maps


================================================
FILE: bip3d/ops/gcc.sh
================================================
export CC=/horizon-bucket/robot_lab/users/xuewu.lin/devtoolset-9/root/usr/bin/gcc
export CXX=/horizon-bucket/robot_lab/users/xuewu.lin/devtoolset-9/root/usr/bin/g++
export LD_LIBRARY_PATH=/horizon-bucket/robot_lab/users/xuewu.lin/devtoolset-9/root/usr/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/horizon-bucket/robot_lab/users/xuewu.lin/devtoolset-9/root/usr/lib/dyninst:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/horizon-bucket/robot_lab/users/xuewu.lin/devtoolset-9/root/usr/lib64:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/horizon-bucket/robot_lab/users/xuewu.lin/devtoolset-9/root/usr/lib64/dyninst:$LD_LIBRARY_PATH


================================================
FILE: bip3d/ops/setup.py
================================================
import os

import torch
from setuptools import setup
from torch.utils.cpp_extension import (
    BuildExtension,
    CppExtension,
    CUDAExtension,
)


def make_cuda_ext(
    name,
    module,
    sources,
    sources_cuda=[],
    extra_args=[],
    extra_include_path=[],
):

    define_macros = []
    extra_compile_args = {"cxx": [] + extra_args}

    if torch.cuda.is_available() or os.getenv("FORCE_CUDA", "0") == "1":
        define_macros += [("WITH_CUDA", None)]
        extension = CUDAExtension
        extra_compile_args["nvcc"] = extra_args + [
            "-D__CUDA_NO_HALF_OPERATORS__",
            "-D__CUDA_NO_HALF_CONVERSIONS__",
            "-D__CUDA_NO_HALF2_OPERATORS__",
        ]
        sources += sources_cuda
    else:
        print("Compiling {} without CUDA".format(name))
        extension = CppExtension

    return extension(
        name="{}.{}".format(module, name),
        sources=[os.path.join(*module.split("."), p) for p in sources],
        include_dirs=extra_include_path,
        define_macros=define_macros,
        extra_compile_args=extra_compile_args,
    )


if __name__ == "__main__":
    setup(
        name="deformable_aggregation_with_depth_ext",
        ext_modules=[
            make_cuda_ext(
                "deformable_aggregation_with_depth_ext",
                module=".",
                sources=[
                    f"src/deformable_aggregation_with_depth.cpp",
                    f"src/deformable_aggregation_with_depth_cuda.cu",
                ],
            ),
        ],
        cmdclass={"build_ext": BuildExtension},
    )
    setup(
        name="deformable_aggregation_ext",
        ext_modules=[
            make_cuda_ext(
                "deformable_aggregation_ext",
                module=".",
                sources=[
                    f"src/deformable_aggregation.cpp",
                    f"src/deformable_aggregation_cuda.cu",
                ],
            ),
        ],
        cmdclass={"build_ext": BuildExtension},
    )


================================================
FILE: bip3d/ops/src/deformable_aggregation.cpp
================================================
#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>

void deformable_aggregation(
  float* output,
  const float* mc_ms_feat,
  const int* spatial_shape,
  const int* scale_start_index,
  const float* sample_location,
  const float* weights,
  int batch_size,
  int num_cams,
  int num_feat,
  int num_embeds,
  int num_scale,
  int num_pts,
  int num_groups
);
  

void deformable_aggregation_grad(
  const float* mc_ms_feat,
  const int* spatial_shape,
  const int* scale_start_index,
  const float* sample_location,
  const float* weights,
  const float* grad_output,
  float* grad_mc_ms_feat,
  float* grad_sampling_location,
  float* grad_weights,
  int batch_size,
  int num_cams,
  int num_feat,
  int num_embeds,
  int num_scale,
  int num_pts,
  int num_groups
);

/* _mc_ms_feat: b, cam, feat, C */
/* _spatial_shape: scale, 2 */
/* _scale_start_index: scale */
/* _sampling_location: bs, pts, */ 


at::Tensor deformable_aggregation_forward(
  const at::Tensor &_mc_ms_feat,
  const at::Tensor &_spatial_shape,
  const at::Tensor &_scale_start_index,
  const at::Tensor &_sampling_location,
  const at::Tensor &_weights
) {
  at::DeviceGuard guard(_mc_ms_feat.device());
  const at::cuda::OptionalCUDAGuard device_guard(device_of(_mc_ms_feat));
  int batch_size = _mc_ms_feat.size(0);
  int num_cams = _mc_ms_feat.size(1);
  int num_feat = _mc_ms_feat.size(2);
  int num_embeds = _mc_ms_feat.size(3);
  int num_scale = _spatial_shape.size(0);
  int num_pts = _sampling_location.size(1);
  int num_groups = _weights.size(4);

  const float* mc_ms_feat = _mc_ms_feat.data_ptr<float>();
  const int* spatial_shape = _spatial_shape.data_ptr<int>();
  const int* scale_start_index = _scale_start_index.data_ptr<int>();
  const float* sampling_location = _sampling_location.data_ptr<float>();
  const float* weights = _weights.data_ptr<float>();

  auto output = at::zeros({batch_size, num_pts, num_embeds}, _mc_ms_feat.options());
  deformable_aggregation(
    output.data_ptr<float>(),
    mc_ms_feat, spatial_shape, scale_start_index, sampling_location, weights,
    batch_size, num_cams, num_feat, num_embeds, num_scale, num_pts, num_groups
  );
  return output;
}

void deformable_aggregation_backward(
  const at::Tensor &_mc_ms_feat,
  const at::Tensor &_spatial_shape,
  const at::Tensor &_scale_start_index,
  const at::Tensor &_sampling_location,
  const at::Tensor &_weights,
  const at::Tensor &_grad_output,
  at::Tensor &_grad_mc_ms_feat,
  at::Tensor &_grad_sampling_location,
  at::Tensor &_grad_weights
) {
  at::DeviceGuard guard(_mc_ms_feat.device());
  const at::cuda::OptionalCUDAGuard device_guard(device_of(_mc_ms_feat));
  int batch_size = _mc_ms_feat.size(0);
  int num_cams = _mc_ms_feat.size(1);
  int num_feat = _mc_ms_feat.size(2);
  int num_embeds = _mc_ms_feat.size(3);
  int num_scale = _spatial_shape.size(0);
  int num_pts = _sampling_location.size(1);
  int num_groups = _weights.size(4);

  const float* mc_ms_feat = _mc_ms_feat.data_ptr<float>();
  const int* spatial_shape = _spatial_shape.data_ptr<int>();
  const int* scale_start_index = _scale_start_index.data_ptr<int>();
  const float* sampling_location = _sampling_location.data_ptr<float>();
  const float* weights = _weights.data_ptr<float>();
  const float* grad_output = _grad_output.data_ptr<float>();

  float* grad_mc_ms_feat = _grad_mc_ms_feat.data_ptr<float>();
  float* grad_sampling_location = _grad_sampling_location.data_ptr<float>();
  float* grad_weights = _grad_weights.data_ptr<float>();

  deformable_aggregation_grad(
    mc_ms_feat, spatial_shape, scale_start_index, sampling_location, weights,
    grad_output, grad_mc_ms_feat, grad_sampling_location, grad_weights,
    batch_size, num_cams, num_feat, num_embeds, num_scale, num_pts, num_groups
  );
}


PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def(
    "deformable_aggregation_forward",
    &deformable_aggregation_forward,
    "deformable_aggregation_forward"
  );
  m.def(
    "deformable_aggregation_backward",
    &deformable_aggregation_backward,
    "deformable_aggregation_backward"
  );
}


================================================
FILE: bip3d/ops/src/deformable_aggregation_cuda.cu
================================================
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_runtime.h>

#include <THC/THCAtomics.cuh>


__device__ float bilinear_sampling(
    const float *&bottom_data, const int &height, const int &width,
    const int &num_embeds, const float &h_im, const float &w_im,
    const int &base_ptr
) {
  const int h_low = floorf(h_im);
  const int w_low = floorf(w_im);
  const int h_high = h_low + 1;
  const int w_high = w_low + 1;

  const float lh = h_im - h_low;
  const float lw = w_im - w_low;
  const float hh = 1 - lh, hw = 1 - lw;

  const int w_stride = num_embeds;
  const int h_stride = width * w_stride;
  const int h_low_ptr_offset = h_low * h_stride;
  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
  const int w_low_ptr_offset = w_low * w_stride;
  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;

  float v1 = 0;
  if (h_low >= 0 && w_low >= 0) {
    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
    v1 = bottom_data[ptr1];
  }
  float v2 = 0;
  if (h_low >= 0 && w_high <= width - 1) {
    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
    v2 = bottom_data[ptr2];
  }
  float v3 = 0;
  if (h_high <= height - 1 && w_low >= 0) {
    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
    v3 = bottom_data[ptr3];
  }
  float v4 = 0;
  if (h_high <= height - 1 && w_high <= width - 1) {
    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
    v4 = bottom_data[ptr4];
  }

  const float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;

  const float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
  return val;
}


__device__ void bilinear_sampling_grad(
    const float *&bottom_data, const float &weight,
    const int &height, const int &width,
    const int &num_embeds, const float &h_im, const float &w_im,
    const int &base_ptr,
    const float &grad_output,
    float *&grad_mc_ms_feat, float *grad_sampling_location, float *grad_weights) {
  const int h_low = floorf(h_im);
  const int w_low = floorf(w_im);
  const int h_high = h_low + 1;
  const int w_high = w_low + 1;

  const float lh = h_im - h_low;
  const float lw = w_im - w_low;
  const float hh = 1 - lh, hw = 1 - lw;

  const int w_stride = num_embeds;
  const int h_stride = width * w_stride;
  const int h_low_ptr_offset = h_low * h_stride;
  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
  const int w_low_ptr_offset = w_low * w_stride;
  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;

  const float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
  const float top_grad_mc_ms_feat = grad_output * weight;
  float grad_h_weight = 0, grad_w_weight = 0;

  float v1 = 0;
  if (h_low >= 0 && w_low >= 0) {
    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
    v1 = bottom_data[ptr1];
    grad_h_weight -= hw * v1;
    grad_w_weight -= hh * v1;
    atomicAdd(grad_mc_ms_feat + ptr1, w1 * top_grad_mc_ms_feat);
  }
  float v2 = 0;
  if (h_low >= 0 && w_high <= width - 1) {
    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
    v2 = bottom_data[ptr2];
    grad_h_weight -= lw * v2;
    grad_w_weight += hh * v2;
    atomicAdd(grad_mc_ms_feat + ptr2, w2 * top_grad_mc_ms_feat);
  }
  float v3 = 0;
  if (h_high <= height - 1 && w_low >= 0) {
    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
    v3 = bottom_data[ptr3];
    grad_h_weight += hw * v3;
    grad_w_weight -= lh * v3;
    atomicAdd(grad_mc_ms_feat + ptr3, w3 * top_grad_mc_ms_feat);
  }
  float v4 = 0;
  if (h_high <= height - 1 && w_high <= width - 1) {
    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
    v4 = bottom_data[ptr4];
    grad_h_weight += lw * v4;
    grad_w_weight += lh * v4;
    atomicAdd(grad_mc_ms_feat + ptr4, w4 * top_grad_mc_ms_feat);
  }

  const float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
  atomicAdd(grad_weights, grad_output * val);
  atomicAdd(grad_sampling_location, width * grad_w_weight * top_grad_mc_ms_feat);
  atomicAdd(grad_sampling_location + 1, height * grad_h_weight * top_grad_mc_ms_feat);
}


__global__ void deformable_aggregation_kernel(
    const int num_kernels,
    float* output,
    const float* mc_ms_feat,
    const int* spatial_shape,
    const int* scale_start_index,
    const float* sample_location,
    const float* weights,
    int batch_size,
    int num_cams,
    int num_feat,
    int num_embeds,
    int num_scale,
    int num_pts,
    int num_groups
) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= num_kernels) return;

    float *output_ptr = output + idx;
    const int channel_index = idx % num_embeds;
    const int groups_index = channel_index / (num_embeds / num_groups);
    idx /= num_embeds;
    const int pts_index = idx % num_pts;
    idx /= num_pts;
    const int batch_index = idx;

    const int value_cam_stride = num_feat * num_embeds;
    const int weight_cam_stride = num_scale * num_groups;
    int loc_offset = (batch_index * num_pts + pts_index) * num_cams << 1;
    int value_offset = batch_index * num_cams * value_cam_stride + channel_index;
    int weight_offset = (
        (batch_index * num_pts + pts_index) * num_cams * weight_cam_stride
        + groups_index
    );

    float result = 0;
    for (int cam_index = 0; cam_index < num_cams; ++cam_index, loc_offset += 2) {
        const float loc_w = sample_location[loc_offset];
        const float loc_h = sample_location[loc_offset + 1];
        
        if (loc_w > 0 && loc_w < 1 && loc_h > 0 && loc_h < 1) {
            for (int scale_index = 0; scale_index < num_scale; ++scale_index) {
                const int scale_offset = scale_start_index[scale_index] * num_embeds;

                const int spatial_shape_ptr = scale_index << 1;
                const int h = spatial_shape[spatial_shape_ptr];
                const int w = spatial_shape[spatial_shape_ptr + 1];

                const float h_im = loc_h * h - 0.5;
                const float w_im = loc_w * w - 0.5;

                const int value_ptr = value_offset + scale_offset + value_cam_stride * cam_index;
                const float *weights_ptr = (
                    weights + weight_offset + scale_index * num_groups
                    + weight_cam_stride * cam_index
                );
                result += bilinear_sampling(mc_ms_feat, h, w, num_embeds, h_im, w_im, value_ptr) * *weights_ptr;
            }
        }
    }
    *output_ptr = result;
}


__global__ void deformable_aggregation_grad_kernel(
    const int num_kernels,
    const float* mc_ms_feat,
    const int* spatial_shape,
    const int* scale_start_index,
    const float* sample_location,
    const float* weights,
    const float* grad_output,
    float* grad_mc_ms_feat,
    float* grad_sampling_location,
    float* grad_weights,
    int batch_size,
    int num_cams,
    int num_feat,
    int num_embeds,
    int num_scale,
    int num_pts,
    int num_groups
) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= num_kernels) return;
    const float grad = grad_output[idx];

    const int channel_index = idx % num_embeds;
    const int groups_index = channel_index / (num_embeds / num_groups);
    idx /= num_embeds;
    const int pts_index = idx % num_pts;
    idx /= num_pts;
    const int batch_index = idx;

    const int value_cam_stride = num_feat * num_embeds;
    const int weight_cam_stride = num_scale * num_groups;
    int loc_offset = (batch_index * num_pts + pts_index) * num_cams << 1;
    int value_offset = batch_index * num_cams * value_cam_stride + channel_index;
    int weight_offset = (
        (batch_index * num_pts + pts_index) * num_cams * weight_cam_stride
        + groups_index
    );

    for (int cam_index = 0; cam_index < num_cams; ++cam_index, loc_offset += 2) {
        const float loc_w = sample_location[loc_offset];
        const float loc_h = sample_location[loc_offset + 1];
        
        if (loc_w > 0 && loc_w < 1 && loc_h > 0 && loc_h < 1) {
            for (int scale_index = 0; scale_index < num_scale; ++scale_index) {
                const int scale_offset = scale_start_index[scale_index] * num_embeds;

                const int spatial_shape_ptr = scale_index << 1;
                const int h = spatial_shape[spatial_shape_ptr];
                const int w = spatial_shape[spatial_shape_ptr + 1];

                const float h_im = loc_h * h - 0.5;
                const float w_im = loc_w * w - 0.5;

                const int value_ptr = value_offset + scale_offset + value_cam_stride * cam_index;
                const int weights_ptr = weight_offset + scale_index * num_groups + weight_cam_stride * cam_index;
                const float weight = weights[weights_ptr];

                float *grad_location_ptr = grad_sampling_location + loc_offset;
                float *grad_weights_ptr = grad_weights + weights_ptr;
                bilinear_sampling_grad(
                    mc_ms_feat, weight, h, w, num_embeds, h_im, w_im,
                    value_ptr,
                    grad,
                    grad_mc_ms_feat, grad_location_ptr, grad_weights_ptr
                );
            }
        }
    }
}


void deformable_aggregation(
    float* output,
    const float* mc_ms_feat,
    const int* spatial_shape,
    const int* scale_start_index,
    const float* sample_location,
    const float* weights,
    int batch_size,
    int num_cams,
    int num_feat,
    int num_embeds,
    int num_scale,
    int num_pts,
    int num_groups
) {
    const int num_kernels = batch_size * num_pts * num_embeds;
    deformable_aggregation_kernel
        <<<(int)ceil(((double)num_kernels/512)), 512>>>(
        num_kernels, output,
        mc_ms_feat, spatial_shape, scale_start_index, sample_location, weights,
        batch_size, num_cams, num_feat, num_embeds, num_scale, num_pts, num_groups
    );
}


void deformable_aggregation_grad(
  const float* mc_ms_feat,
  const int* spatial_shape,
  const int* scale_start_index,
  const float* sample_location,
  const float* weights,
  const float* grad_output,
  float* grad_mc_ms_feat,
  float* grad_sampling_location,
  float* grad_weights,
  int batch_size,
  int num_cams,
  int num_feat,
  int num_embeds,
  int num_scale,
  int num_pts,
  int num_groups
) {
    const int num_kernels = batch_size * num_pts * num_embeds;
    deformable_aggregation_grad_kernel
        <<<(int)ceil(((double)num_kernels/512)), 512>>>(
        num_kernels,
        mc_ms_feat, spatial_shape, scale_start_index, sample_location, weights,
        grad_output, grad_mc_ms_feat, grad_sampling_location, grad_weights,
        batch_size, num_cams, num_feat, num_embeds, num_scale, num_pts, num_groups
    );
}


================================================
FILE: bip3d/ops/src/deformable_aggregation_with_depth.cpp
================================================
#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>

void deformable_aggregation_with_depth(
  float* output,
  const float* mc_ms_feat,
  const int* spatial_shape,
  const int* scale_start_index,
  const float* sample_location,
  const float* weights,
  int batch_size,
  int num_cams,
  int num_feat,
  int num_embeds,
  int num_scale,
  int num_pts,
  int num_groups,
  int num_depths
);
  

void deformable_aggregation_with_depth_grad(
  const float* mc_ms_feat,
  const int* spatial_shape,
  const int* scale_start_index,
  const float* sample_location,
  const float* weights,
  const float* grad_output,
  float* grad_mc_ms_feat,
  float* grad_sampling_location,
  float* grad_weights,
  int batch_size,
  int num_cams,
  int num_feat,
  int num_embeds,
  int num_scale,
  int num_pts,
  int num_groups,
  int num_depths
);

/* _mc_ms_feat: b, cam, feat, C+D */
/* _spatial_shape: scale, 2 */
/* _scale_start_index: scale */
/* _sampling_location: bs, pts, */ 


at::Tensor deformable_aggregation_with_depth_forward(
  const at::Tensor &_mc_ms_feat,
  const at::Tensor &_spatial_shape,
  const at::Tensor &_scale_start_index,
  const at::Tensor &_sampling_location,
  const at::Tensor &_weights,
  const int num_depths
) {
  at::DeviceGuard guard(_mc_ms_feat.device());
  const at::cuda::OptionalCUDAGuard device_guard(device_of(_mc_ms_feat));
  int batch_size = _mc_ms_feat.size(0);
  int num_cams = _mc_ms_feat.size(1);
  int num_feat = _mc_ms_feat.size(2);
  int num_embeds = _mc_ms_feat.size(3) - num_depths;
  int num_scale = _spatial_shape.size(0);
  int num_pts = _sampling_location.size(1);
  int num_groups = _weights.size(4);

  const float* mc_ms_feat = _mc_ms_feat.data_ptr<float>();
  const int* spatial_shape = _spatial_shape.data_ptr<int>();
  const int* scale_start_index = _scale_start_index.data_ptr<int>();
  const float* sampling_location = _sampling_location.data_ptr<float>();
  const float* weights = _weights.data_ptr<float>();

  auto output = at::zeros({batch_size, num_pts, num_embeds}, _mc_ms_feat.options());
  deformable_aggregation_with_depth(
    output.data_ptr<float>(),
    mc_ms_feat, spatial_shape, scale_start_index, sampling_location, weights,
    batch_size, num_cams, num_feat, num_embeds, num_scale, num_pts, num_groups,
    num_depths
  );
  return output;
}

void deformable_aggregation_with_depth_backward(
  const at::Tensor &_mc_ms_feat,
  const at::Tensor &_spatial_shape,
  const at::Tensor &_scale_start_index,
  const at::Tensor &_sampling_location,
  const at::Tensor &_weights,
  const int num_depths,
  const at::Tensor &_grad_output,
  at::Tensor &_grad_mc_ms_feat,
  at::Tensor &_grad_sampling_location,
  at::Tensor &_grad_weights
) {
  at::DeviceGuard guard(_mc_ms_feat.device());
  const at::cuda::OptionalCUDAGuard device_guard(device_of(_mc_ms_feat));
  int batch_size = _mc_ms_feat.size(0);
  int num_cams = _mc_ms_feat.size(1);
  int num_feat = _mc_ms_feat.size(2);
  int num_embeds = _mc_ms_feat.size(3) - num_depths;
  int num_scale = _spatial_shape.size(0);
  int num_pts = _sampling_location.size(1);
  int num_groups = _weights.size(4);

  const float* mc_ms_feat = _mc_ms_feat.data_ptr<float>();
  const int* spatial_shape = _spatial_shape.data_ptr<int>();
  const int* scale_start_index = _scale_start_index.data_ptr<int>();
  const float* sampling_location = _sampling_location.data_ptr<float>();
  const float* weights = _weights.data_ptr<float>();
  const float* grad_output = _grad_output.data_ptr<float>();

  float* grad_mc_ms_feat = _grad_mc_ms_feat.data_ptr<float>();
  float* grad_sampling_location = _grad_sampling_location.data_ptr<float>();
  float* grad_weights = _grad_weights.data_ptr<float>();

  deformable_aggregation_with_depth_grad(
    mc_ms_feat, spatial_shape, scale_start_index, sampling_location, weights,
    grad_output, grad_mc_ms_feat, grad_sampling_location, grad_weights,
    batch_size, num_cams, num_feat, num_embeds, num_scale, num_pts, num_groups,
    num_depths
  );
}


PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def(
    "deformable_aggregation_with_depth_forward",
    &deformable_aggregation_with_depth_forward,
    "deformable_aggregation_with_depth_forward"
  );
  m.def(
    "deformable_aggregation_with_depth_backward",
    &deformable_aggregation_with_depth_backward,
    "deformable_aggregation_with_depth_backward"
  );
}


================================================
FILE: bip3d/ops/src/deformable_aggregation_with_depth_cuda.cu
================================================
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_runtime.h>

#include <THC/THCAtomics.cuh>

__device__ float bilinear_sampling(const float *&bottom_data, const int &height,
                                   const int &width,
                                   const int &num_embeds_with_depth,
                                   const int &num_depths, const float &h_im,
                                   const float &w_im, const float &loc_d,
                                   const int &base_ptr, const int &depth_ptr) {
  const int h_low = floorf(h_im);
  const int w_low = floorf(w_im);
  const int h_high = h_low + 1;
  const int w_high = w_low + 1;
  const int d_low = floorf(loc_d);
  const int d_high = d_low + 1;

  const float lh = h_im - h_low;
  const float lw = w_im - w_low;
  const float hh = 1 - lh, hw = 1 - lw;

  const float ld = loc_d - d_low;
  const float hd = 1 - ld;

  const int w_stride = num_embeds_with_depth;
  const int h_stride = width * w_stride;
  const int h_low_ptr_offset = h_low * h_stride;
  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
  const int w_low_ptr_offset = w_low * w_stride;
  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;

  float v1 = 0;
  float dp_low1 = 0;
  float dp_high1 = 0;
  const bool flag_d_low = d_low >= 0 && d_low < num_depths;
  const bool flag_d_high = d_high >= 0 && d_high < num_depths;
  if (h_low >= 0 && w_low >= 0) {
    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset;
    v1 = bottom_data[ptr1 + base_ptr];
    const int ptr_d1 = ptr1 + depth_ptr + d_low;
    if (flag_d_low) {
      dp_low1 = bottom_data[ptr_d1];
    }
    if (flag_d_high) {
      dp_high1 = bottom_data[ptr_d1 + 1];
    }
  }

  float v2 = 0;
  float dp_low2 = 0;
  float dp_high2 = 0;
  if (h_low >= 0 && w_high <= width - 1) {
    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset;
    v2 = bottom_data[ptr2 + base_ptr];
    const int ptr_d2 = ptr2 + depth_ptr + d_low;
    if (flag_d_low) {
      dp_low2 = bottom_data[ptr_d2];
    }
    if (flag_d_high) {
      dp_high2 = bottom_data[ptr_d2 + 1];
    }
  }

  float v3 = 0;
  float dp_low3 = 0;
  float dp_high3 = 0;
  if (h_high <= height - 1 && w_low >= 0) {
    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset;
    v3 = bottom_data[ptr3 + base_ptr];
    const int ptr_d3 = ptr3 + depth_ptr + d_low;
    if (flag_d_low) {
      dp_low3 = bottom_data[ptr_d3];
    }
    if (flag_d_high) {
      dp_high3 = bottom_data[ptr_d3 + 1];
    }
  }

  float v4 = 0;
  float dp_low4 = 0;
  float dp_high4 = 0;
  if (h_high <= height - 1 && w_high <= width - 1) {
    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset;
    v4 = bottom_data[ptr4 + base_ptr];
    const int ptr_d4 = ptr4 + depth_ptr + d_low;
    if (flag_d_low) {
      dp_low4 = bottom_data[ptr_d4];
    }
    if (flag_d_high) {
      dp_high4 = bottom_data[ptr_d4 + 1];
    }
  }

  const float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
  const float val = hd * (w1 * v1 * dp_low1 + w2 * v2 * dp_low2 +
                          w3 * v3 * dp_low3 + w4 * v4 * dp_low4) +
                    ld * (w1 * v1 * dp_high1 + w2 * v2 * dp_high2 +
                          w3 * v3 * dp_high3 + w4 * v4 * dp_high4);
  return val;
}

__device__ void
bilinear_sampling_grad(const float *&bottom_data, const float &weight,
                       const int &height, const int &width,
                       const int &num_embeds_with_depth, const int &num_depths,
                       const float &h_im, const float &w_im, const float &loc_d,
                       const int &base_ptr, const int &depth_ptr,
                       const float &grad_output, float *&grad_mc_ms_feat,
                       float *grad_sampling_location, float *grad_weights) {
  const int h_low = floorf(h_im);
  const int w_low = floorf(w_im);
  const int h_high = h_low + 1;
  const int w_high = w_low + 1;
  const int d_low = floorf(loc_d);
  const int d_high = d_low + 1;

  const float lh = h_im - h_low;
  const float lw = w_im - w_low;
  const float hh = 1 - lh, hw = 1 - lw;
  const float ld = loc_d - d_low;
  const float hd = 1 - ld;

  const int w_stride = num_embeds_with_depth;
  const int h_stride = width * w_stride;
  const int h_low_ptr_offset = h_low * h_stride;
  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
  const int w_low_ptr_offset = w_low * w_stride;
  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;

  const float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
  const float top_grad_mc_ms_feat = grad_output * weight;

  const bool flag_d_low = d_low >= 0 && d_low < num_depths;
  const bool flag_d_high = d_high >= 0 && d_high < num_depths;

  float v1 = 0;
  float dp_low1 = 0;
  float dp_high1 = 0;
  if (h_low >= 0 && w_low >= 0) {
    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset;
    v1 = bottom_data[ptr1 + base_ptr];
    const int ptr_d1 = ptr1 + depth_ptr + d_low;
    if (flag_d_low) {
      dp_low1 = bottom_data[ptr_d1];
      atomicAdd(grad_mc_ms_feat + ptr_d1, w1 * top_grad_mc_ms_feat * v1 * hd);
    }
    if (flag_d_high) {
      dp_high1 = bottom_data[ptr_d1 + 1];
      atomicAdd(grad_mc_ms_feat + ptr_d1 + 1,
                w1 * top_grad_mc_ms_feat * v1 * ld);
    }
    atomicAdd(grad_mc_ms_feat + ptr1 + base_ptr,
              w1 * top_grad_mc_ms_feat * (dp_low1 * hd + dp_high1 * ld));
  }
  float v2 = 0;
  float dp_low2 = 0;
  float dp_high2 = 0;
  if (h_low >= 0 && w_high <= width - 1) {
    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset;
    v2 = bottom_data[ptr2 + base_ptr];
    const int ptr_d2 = ptr2 + depth_ptr + d_low;
    if (flag_d_low) {
      dp_low2 = bottom_data[ptr_d2];
      atomicAdd(grad_mc_ms_feat + ptr_d2, w2 * top_grad_mc_ms_feat * v2 * hd);
    }
    if (flag_d_high) {
      dp_high2 = bottom_data[ptr_d2 + 1];
      atomicAdd(grad_mc_ms_feat + ptr_d2 + 1,
                w2 * top_grad_mc_ms_feat * v2 * ld);
    }

    atomicAdd(grad_mc_ms_feat + ptr2 + base_ptr,
              w2 * top_grad_mc_ms_feat * (dp_low2 * hd + dp_high2 * ld));
  }
  float v3 = 0;
  float dp_low3 = 0;
  float dp_high3 = 0;
  if (h_high <= height - 1 && w_low >= 0) {
    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset;
    v3 = bottom_data[ptr3 + base_ptr];
    const int ptr_d3 = ptr3 + depth_ptr + d_low;
    if (flag_d_low) {
      dp_low3 = bottom_data[ptr_d3];
      atomicAdd(grad_mc_ms_feat + ptr_d3, w3 * top_grad_mc_ms_feat * v3 * hd);
    }
    if (flag_d_high) {
      dp_high3 = bottom_data[ptr_d3 + 1];
      atomicAdd(grad_mc_ms_feat + ptr_d3 + 1,
                w3 * top_grad_mc_ms_feat * v3 * ld);
    }

    atomicAdd(grad_mc_ms_feat + ptr3 + base_ptr,
              w3 * top_grad_mc_ms_feat * (dp_low3 * hd + dp_high3 * ld));
  }
  float v4 = 0;
  float dp_low4 = 0;
  float dp_high4 = 0;
  if (h_high <= height - 1 && w_high <= width - 1) {
    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset;
    v4 = bottom_data[ptr4 + base_ptr];
    const int ptr_d4 = ptr4 + depth_ptr + d_low;
    if (flag_d_low) {
      dp_low4 = bottom_data[ptr_d4];
      atomicAdd(grad_mc_ms_feat + ptr_d4, w4 * top_grad_mc_ms_feat * v4 * hd);
    }
    if (flag_d_high) {
      dp_high4 = bottom_data[ptr_d4 + 1];
      atomicAdd(grad_mc_ms_feat + ptr_d4 + 1,
                w4 * top_grad_mc_ms_feat * v4 * ld);
    }
    atomicAdd(grad_mc_ms_feat + ptr4 + base_ptr,
              w4 * top_grad_mc_ms_feat * (dp_low4 * hd + dp_high4 * ld));
  }

  const float val1 = w1 * v1 * dp_low1 + w2 * v2 * dp_low2 + w3 * v3 * dp_low3 +
                     w4 * v4 * dp_low4;
  const float val2 = w1 * v1 * dp_high1 + w2 * v2 * dp_high2 +
                     w3 * v3 * dp_high3 + w4 * v4 * dp_high4;

  const float val = hd * val1 + ld * val2;

  atomicAdd(grad_weights, grad_output * val);

  const float grad_w_weight = hd * (-hh * v1 * dp_low1 + hh * v2 * dp_low2 -
                                    lh * v3 * dp_low3 + lh * v4 * dp_low4) +
                              ld * (-hh * v1 * dp_high1 + hh * v2 * dp_high2 -
                                    lh * v3 * dp_high3 + lh * v4 * dp_high4);
  const float grad_h_weight = hd * (-hw * v1 * dp_low1 - lw * v2 * dp_low2 +
                                    hw * v3 * dp_low3 + lw * v4 * dp_low4) +
                              ld * (-hw * v1 * dp_high1 - lw * v2 * dp_high2 +
                                    hw * v3 * dp_high3 + lw * v4 * dp_high4);

  /* const float grad_d_weight = -1 * (w1 * v1 * dp_low1 + w2 * v2 * dp_low2 +
   * w3 * v3 * dp_low3 + w4 * v4 * dp_low4) */
  /*     + (w1 * v1 * dp_high1 + w2 * v2 * dp_high2 + w3 * v3 * dp_high3 + w4 *
   * v4 * dp_high4); */
  const float grad_d_weight = val2 - val1;
  /* const float grad_d_weight = w1 * v1 * (dp_high1 - dp_low1) */
  /*     + w2 * v2 * (dp_high2 - dp_low2) */
  /*     + w3 * v3 * (dp_high3 - dp_low3) */
  /*     + w4 * v4 * (dp_high4 - dp_low4); */

  atomicAdd(grad_sampling_location,
            width * top_grad_mc_ms_feat * grad_w_weight);
  atomicAdd(grad_sampling_location + 1,
            height * top_grad_mc_ms_feat * grad_h_weight);
  atomicAdd(grad_sampling_location + 2, top_grad_mc_ms_feat * grad_d_weight);
}

__global__ void deformable_aggregation_with_depth_kernel(
    const int num_kernels, float *output, const float *mc_ms_feat,
    const int *spatial_shape, const int *scale_start_index,
    const float *sample_location, const float *weights, int batch_size,
    int num_cams, int num_feat, int num_embeds, int num_scale, int num_pts,
    int num_groups, int num_depths) {
  long int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= num_kernels)
    return;

  float *output_ptr = output + idx;
  const int channel_index = idx % num_embeds;
  const int groups_index = channel_index / (num_embeds / num_groups);
  idx /= num_embeds;
  const int pts_index = idx % num_pts;
  idx /= num_pts;
  const int batch_index = idx;

  const int num_embeds_with_depth = num_embeds + num_depths;
  const int value_cam_stride = num_feat * num_embeds_with_depth;
  const int weight_cam_stride = num_scale * num_groups;
  int loc_offset = (batch_index * num_pts + pts_index) * num_cams * 3;
  int value_offset = batch_index * num_cams * value_cam_stride;
  int depth_offset = value_offset + num_embeds;
  value_offset = value_offset + channel_index;
  int weight_offset =
      ((batch_index * num_pts + pts_index) * num_cams * weight_cam_stride +
       groups_index);

  float result = 0;
  for (int cam_index = 0; cam_index < num_cams; ++cam_index, loc_offset += 3) {
    const float loc_w = sample_location[loc_offset];
    const float loc_h = sample_location[loc_offset + 1];
    const float loc_d = sample_location[loc_offset + 2];

    if (loc_w > 0 && loc_w < 1 && loc_h > 0 && loc_h < 1 && loc_d > -1 &&
        loc_d < num_depths) {
      for (int scale_index = 0; scale_index < num_scale; ++scale_index) {
        const int scale_offset =
            scale_start_index[scale_index] * num_embeds_with_depth;

        const int spatial_shape_ptr = scale_index << 1;
        const int h = spatial_shape[spatial_shape_ptr];
        const int w = spatial_shape[spatial_shape_ptr + 1];

        const float h_im = loc_h * h - 0.5;
        const float w_im = loc_w * w - 0.5;

        const int value_ptr =
            value_offset + scale_offset + value_cam_stride * cam_index;
        const int depth_ptr =
            depth_offset + scale_offset + value_cam_stride * cam_index;

        const float *weights_ptr =
            (weights + weight_offset + scale_index * num_groups +
             weight_cam_stride * cam_index);
        result += bilinear_sampling(mc_ms_feat, h, w, num_embeds_with_depth,
                                    num_depths, h_im, w_im, loc_d, value_ptr,
                                    depth_ptr) *
                  *weights_ptr;
      }
    }
  }
  *output_ptr = result;
}

__global__ void deformable_aggregation_with_depth_grad_kernel(
    const int num_kernels, const float *mc_ms_feat, const int *spatial_shape,
    const int *scale_start_index, const float *sample_location,
    const float *weights, const float *grad_output, float *grad_mc_ms_feat,
    float *grad_sampling_location, float *grad_weights, int batch_size,
    int num_cams, int num_feat, int num_embeds, int num_scale, int num_pts,
    int num_groups, int num_depths) {
  long int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= num_kernels)
    return;
  const float grad = grad_output[idx];

  const int channel_index = idx % num_embeds;
  const int groups_index = channel_index / (num_embeds / num_groups);
  idx /= num_embeds;
  const int pts_index = idx % num_pts;
  idx /= num_pts;
  const int batch_index = idx;

  const int num_embeds_with_depth = num_embeds + num_depths;
  const int value_cam_stride = num_feat * num_embeds_with_depth;
  const int weight_cam_stride = num_scale * num_groups;
  int loc_offset = (batch_index * num_pts + pts_index) * num_cams * 3;
  int value_offset = batch_index * num_cams * value_cam_stride;
  int depth_offset = value_offset + num_embeds;
  value_offset = value_offset + channel_index;
  int weight_offset =
      ((batch_index * num_pts + pts_index) * num_cams * weight_cam_stride +
       groups_index);

  for (int cam_index = 0; cam_index < num_cams; ++cam_index, loc_offset += 3) {
    const float loc_w = sample_location[loc_offset];
    const float loc_h = sample_location[loc_offset + 1];
    const float loc_d = sample_location[loc_offset + 2];

    if (loc_w > 0 && loc_w < 1 && loc_h > 0 && loc_h < 1 && loc_d > -1 &&
        loc_d < num_depths) {
      for (int scale_index = 0; scale_index < num_scale; ++scale_index) {
        const int scale_offset =
            scale_start_index[scale_index] * num_embeds_with_depth;

        const int spatial_shape_ptr = scale_index << 1;
        const int h = spatial_shape[spatial_shape_ptr];
        const int w = spatial_shape[spatial_shape_ptr + 1];

        const float h_im = loc_h * h - 0.5;
        const float w_im = loc_w * w - 0.5;

        const int value_ptr =
            value_offset + scale_offset + value_cam_stride * cam_index;
        const int depth_ptr =
            depth_offset + scale_offset + value_cam_stride * cam_index;
        const int weights_ptr = weight_offset + scale_index * num_groups +
                                weight_cam_stride * cam_index;
        const float weight = weights[weights_ptr];

        float *grad_location_ptr = grad_sampling_location + loc_offset;
        float *grad_weights_ptr = grad_weights + weights_ptr;
        bilinear_sampling_grad(mc_ms_feat, weight, h, w, num_embeds_with_depth,
                               num_depths, h_im, w_im, loc_d, value_ptr,
                               depth_ptr, grad, grad_mc_ms_feat,
                               grad_location_ptr, grad_weights_ptr);
      }
    }
  }
}

void deformable_aggregation_with_depth(
    float *output, const float *mc_ms_feat, const int *spatial_shape,
    const int *scale_start_index, const float *sample_location,
    const float *weights, int batch_size, int num_cams, int num_feat,
    int num_embeds, int num_scale, int num_pts, int num_groups,
    int num_depths) {
  const long int num_kernels = batch_size * num_pts * num_embeds;
  deformable_aggregation_with_depth_kernel<<<
      (int)ceil(((double)num_kernels / 512)), 512>>>(
      num_kernels, output, mc_ms_feat, spatial_shape, scale_start_index,
      sample_location, weights, batch_size, num_cams, num_feat, num_embeds,
      num_scale, num_pts, num_groups, num_depths);
}

void deformable_aggregation_with_depth_grad(
    const float *mc_ms_feat, const int *spatial_shape,
    const int *scale_start_index, const float *sample_location,
    const float *weights, const float *grad_output, float *grad_mc_ms_feat,
    float *grad_sampling_location, float *grad_weights, int batch_size,
    int num_cams, int num_feat, int num_embeds, int num_scale, int num_pts,
    int num_groups, int num_depths) {
  const long int num_kernels = batch_size * num_pts * num_embeds;
  deformable_aggregation_with_depth_grad_kernel<<<
      (int)ceil(((double)num_kernels / 512)), 512>>>(
      num_kernels, mc_ms_feat, spatial_shape, scale_start_index,
      sample_location, weights, grad_output, grad_mc_ms_feat,
      grad_sampling_location, grad_weights, batch_size, num_cams, num_feat,
      num_embeds, num_scale, num_pts, num_groups, num_depths);
}


================================================
FILE: bip3d/registry.py
================================================
from mmengine import DATASETS as MMENGINE_DATASETS
from mmengine import METRICS as MMENGINE_METRICS
from mmengine import MODELS as MMENGINE_MODELS
from mmengine import TASK_UTILS as MMENGINE_TASK_UTILS
from mmengine import TRANSFORMS as MMENGINE_TRANSFORMS
from mmengine import VISBACKENDS as MMENGINE_VISBACKENDS
from mmengine import VISUALIZERS as MMENGINE_VISUALIZERS
from mmengine import Registry

MODELS = Registry('model',
                  parent=MMENGINE_MODELS,
                  locations=['bip3d.models'])
DATASETS = Registry('dataset',
                    parent=MMENGINE_DATASETS,
                    locations=['bip3d.datasets'])
TRANSFORMS = Registry('transform',
                      parent=MMENGINE_TRANSFORMS,
                      locations=['bip3d.datasets.transforms'])
METRICS = Registry('metric',
                   parent=MMENGINE_METRICS,
                   locations=['bip3d.eval'])
TASK_UTILS = Registry('task util',
                      parent=MMENGINE_TASK_UTILS,
                      locations=['bip3d.models'])


================================================
FILE: bip3d/structures/__init__.py
================================================
# Copyright (c) OpenRobotLab. All rights reserved.
from .bbox_3d import (BaseInstance3DBoxes, Box3DMode, Coord3DMode,
                      EulerDepthInstance3DBoxes, EulerInstance3DBoxes,
                      get_box_type, get_proj_mat_by_coord_type, limit_period,
                      mono_cam_box2vis, points_cam2img, points_img2cam,
                      rotation_3d_in_axis, rotation_3d_in_euler, xywhr2xyxyr)

__all__ = [
    'BaseInstance3DBoxes', 'Box3DMode', 'Coord3DMode', 'EulerInstance3DBoxes',
    'EulerDepthInstance3DBoxes', 'get_box_type', 'get_proj_mat_by_coord_type',
    'limit_period', 'mono_cam_box2vis', 'points_cam2img', 'points_img2cam',
    'rotation_3d_in_axis', 'rotation_3d_in_euler', 'xywhr2xyxyr'
]


================================================
FILE: bip3d/structures/bbox_3d/__init__.py
================================================
# Copyright (c) OpenRobotLab. All rights reserved.
from .base_box3d import BaseInstance3DBoxes
from .box_3d_mode import Box3DMode
from .coord_3d_mode import Coord3DMode
from .euler_box3d import EulerInstance3DBoxes
from .euler_depth_box3d import EulerDepthInstance3DBoxes
from .utils import (batch_points_cam2img, get_box_type,
                    get_proj_mat_by_coord_type, limit_period, mono_cam_box2vis,
                    points_cam2img, points_img2cam, rotation_3d_in_axis,
                    rotation_3d_in_euler, xywhr2xyxyr)

__all__ = [
    'Box3DMode', 'BaseInstance3DBoxes', 'EulerInstance3DBoxes',
    'EulerDepthInstance3DBoxes', 'xywhr2xyxyr', 'get_box_type',
    'rotation_3d_in_axis', 'rotation_3d_in_euler', 'limit_period',
    'points_cam2img', 'points_img2cam', 'Coord3DMode', 'mono_cam_box2vis',
    'batch_points_cam2img', 'get_proj_mat_by_coord_type'
]


================================================
FILE: bip3d/structures/bbox_3d/base_box3d.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from abc import abstractmethod
from typing import Iterator, Optional, Sequence, Tuple, Union

import numpy as np
import torch
from torch import Tensor

from bip3d.structures.points.base_points import BasePoints

from .utils import limit_period


class BaseInstance3DBoxes:
    """Base class for 3D Boxes.

    Note:
        The box is bottom centered, i.e. the relative position of origin in the
        box is (0.5, 0.5, 0).

    Args:
        tensor (Tensor or np.ndarray or Sequence[Sequence[float]]): The boxes
            data with shape (N, box_dim).
        box_dim (int): Number of the dimension of a box. Each row is
            (x, y, z, x_size, y_size, z_size, yaw). Defaults to 7.
        with_yaw (bool): Whether the box is with yaw rotation. If False, the
            value of yaw will be set to 0 as minmax boxes. Defaults to True.
        origin (Tuple[float]): Relative position of the box origin.
            Defaults to (0.5, 0.5, 0). This will guide the box be converted to
            (0.5, 0.5, 0) mode.

    Attributes:
        tensor (Tensor): Float matrix with shape (N, box_dim).
        box_dim (int): Integer indicating the dimension of a box. Each row is
            (x, y, z, x_size, y_size, z_size, yaw, ...).
        with_yaw (bool): If True, the value of yaw will be set to 0 as minmax
            boxes.
    """

    YAW_AXIS: int = 0

    def __init__(
        self,
        tensor: Union[Tensor, np.ndarray, Sequence[Sequence[float]]],
        box_dim: int = 7,
        with_yaw: bool = True,
        origin: Tuple[float, float, float] = (0.5, 0.5, 0)
    ) -> None:
        if isinstance(tensor, Tensor):
            device = tensor.device
        else:
            device = torch.device('cpu')
        tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
        if tensor.numel() == 0:
            # Use reshape, so we don't end up creating a new tensor that does
            # not depend on the inputs (and consequently confuses jit)
            tensor = tensor.reshape((-1, box_dim))
        assert tensor.dim() == 2 and tensor.size(-1) == box_dim, \
            ('The box dimension must be 2 and the length of the last '
             f'dimension must be {box_dim}, but got boxes with shape '
             f'{tensor.shape}.')

        if tensor.shape[-1] == 6:
            # If the dimension of boxes is 6, we expand box_dim by padding 0 as
            # a fake yaw and set with_yaw to False
            assert box_dim == 6
            fake_rot = tensor.new_zeros(tensor.shape[0], 1)
            tensor = torch.cat((tensor, fake_rot), dim=-1)
            self.box_dim = box_dim + 1
            self.with_yaw = False
        else:
            self.box_dim = box_dim
            self.with_yaw = with_yaw
        self.tensor = tensor.clone()

        if origin != (0.5, 0.5, 0):
            dst = self.tensor.new_tensor((0.5, 0.5, 0))
            src = self.tensor.new_tensor(origin)
            self.tensor[:, :3] += self.tensor[:, 3:6] * (dst - src)

    @property
    def shape(self) -> torch.Size:
        """torch.Size: Shape of boxes."""
        return self.tensor.shape

    @property
    def volume(self) -> Tensor:
        """Tensor: A vector with volume of each box in shape (N, )."""
        return self.tensor[:, 3] * self.tensor[:, 4] * self.tensor[:, 5]

    @property
    def dims(self) -> Tensor:
        """Tensor: Size dimensions of each box in shape (N, 3)."""
        return self.tensor[:, 3:6]

    @property
    def yaw(self) -> Tensor:
        """Tensor: A vector with yaw of each box in shape (N, )."""
        return self.tensor[:, 6]

    @property
    def height(self) -> Tensor:
        """Tensor: A vector with height of each box in shape (N, )."""
        return self.tensor[:, 5]

    @property
    def top_height(self) -> Tensor:
        """Tensor: A vector with top height of each box in shape (N, )."""
        return self.bottom_height + self.height

    @property
    def bottom_height(self) -> Tensor:
        """Tensor: A vector with bottom height of each box in shape (N, )."""
        return self.tensor[:, 2]

    @property
    def center(self) -> Tensor:
        """Calculate the center of all the boxes.

        Note:
            In MMDetection3D's convention, the bottom center is usually taken
            as the default center.

            The relative position of the centers in different kinds of boxes
            are different, e.g., the relative center of a boxes is
            (0.5, 1.0, 0.5) in camera and (0.5, 0.5, 0) in lidar. It is
            recommended to use ``bottom_center`` or ``gravity_center`` for
            clearer usage.

        Returns:
            Tensor: A tensor with center of each box in shape (N, 3).
        """
        return self.bottom_center

    @property
    def bottom_center(self) -> Tensor:
        """Tensor: A tensor with center of each box in shape (N, 3)."""
        return self.tensor[:, :3]

    @property
    def gravity_center(self) -> Tensor:
        """Tensor: A tensor with center of each box in shape (N, 3)."""
        bottom_center = self.bottom_center
        gravity_center = torch.zeros_like(bottom_center)
        gravity_center[:, :2] = bottom_center[:, :2]
        gravity_center[:, 2] = bottom_center[:, 2] + self.tensor[:, 5] * 0.5
        return gravity_center

    @property
    def corners(self) -> Tensor:
        """Tensor: A tensor with 8 corners of each box in shape (N, 8, 3)."""
        pass

    @property
    def bev(self) -> Tensor:
        """Tensor: 2D BEV box of each box with rotation in XYWHR format, in
        shape (N, 5)."""
        return self.tensor[:, [0, 1, 3, 4, 6]]

    @property
    def nearest_bev(self) -> Tensor:
        """Tensor: A tensor of 2D BEV box of each box without rotation."""
        # Obtain BEV boxes with rotation in XYWHR format
        bev_rotated_boxes = self.bev
        # convert the rotation to a valid range
        rotations = bev_rotated_boxes[:, -1]
        normed_rotations = torch.abs(limit_period(rotations, 0.5, np.pi))

        # find the center of boxes
        conditions = (normed_rotations > np.pi / 4)[..., None]
        bboxes_xywh = torch.where(conditions, bev_rotated_boxes[:,
                                                                [0, 1, 3, 2]],
                                  bev_rotated_boxes[:, :4])

        centers = bboxes_xywh[:, :2]
        dims = bboxes_xywh[:, 2:]
        bev_boxes = torch.cat([centers - dims / 2, centers + dims / 2], dim=-1)
        return bev_boxes

    def in_range_bev(
            self, box_range: Union[Tensor, np.ndarray,
                                   Sequence[float]]) -> Tensor:
        """Check whether the boxes are in the given range.

        Args:
            box_range (Tensor or np.ndarray or Sequence[float]): The range of
                box in order of (x_min, y_min, x_max, y_max).

        Note:
            The original implementation of SECOND checks whether boxes in a
            range by checking whether the points are in a convex polygon, we
            reduce the burden for simpler cases.

        Returns:
            Tensor: A binary vector indicating whether each box is inside the
            reference range.
        """
        in_range_flags = ((self.bev[:, 0] > box_range[0])
                          & (self.bev[:, 1] > box_range[1])
                          & (self.bev[:, 0] < box_range[2])
                          & (self.bev[:, 1] < box_range[3]))
        return in_range_flags

    @abstractmethod
    def rotate(
        self,
        angle: Union[Tensor, np.ndarray, float],
        points: Optional[Union[Tensor, np.ndarray, BasePoints]] = None
    ) -> Union[Tuple[Tensor, Tensor], Tuple[np.ndarray, np.ndarray], Tuple[
            BasePoints, Tensor], None]:
        """Rotate boxes with points (optional) with the given angle or rotation
        matrix.

        Args:
            angle (Tensor or np.ndarray or float): Rotation angle or rotation
                matrix.
            points (Tensor or np.ndarray or :obj:`BasePoints`, optional):
                Points to rotate. Defaults to None.

        Returns:
            tuple or None: When ``points`` is None, the function returns None,
            otherwise it returns the rotated points and the rotation matrix
            ``rot_mat_T``.
        """
        pass

    @abstractmethod
    def flip(
        self,
        bev_direction: str = 'horizontal',
        points: Optional[Union[Tensor, np.ndarray, BasePoints]] = None
    ) -> Union[Tensor, np.ndarray, BasePoints, None]:
        """Flip the boxes in BEV along given BEV direction.

        Args:
            bev_direction (str): Direction by which to flip. Can be chosen from
                'horizontal' and 'vertical'. Defaults to 'horizontal'.
            points (Tensor or np.ndarray or :obj:`BasePoints`, optional):
                Points to flip. Defaults to None.

        Returns:
            Tensor or np.ndarray or :obj:`BasePoints` or None: When ``points``
            is None, the function returns None, otherwise it returns the
            flipped points.
        """
        pass

    def translate(self, trans_vector: Union[Tensor, np.ndarray]) -> None:
        """Translate boxes with the given translation vector.

        Args:
            trans_vector (Tensor or np.ndarray): Translation vector of size
                1x3.
        """
        if not isinstance(trans_vector, Tensor):
            trans_vector = self.tensor.new_tensor(trans_vector)
        self.tensor[:, :3] += trans_vector

    def in_range_3d(
            self, box_range: Union[Tensor, np.ndarray,
                                   Sequence[float]]) -> Tensor:
        """Check whether the boxes are in the given range.

        Args:
            box_range (Tensor or np.ndarray or Sequence[float]): The range of
                box (x_min, y_min, z_min, x_max, y_max, z_max).

        Note:
            In the original implementation of SECOND, checking whether a box in
            the range checks whether the points are in a convex polygon, we try
            to reduce the burden for simpler cases.

        Returns:
            Tensor: A binary vector indicating whether each point is inside the
            reference range.
        """
        in_range_flags = ((self.tensor[:, 0] > box_range[0])
                          & (self.tensor[:, 1] > box_range[1])
                          & (self.tensor[:, 2] > box_range[2])
                          & (self.tensor[:, 0] < box_range[3])
                          & (self.tensor[:, 1] < box_range[4])
                          & (self.tensor[:, 2] < box_range[5]))
        return in_range_flags

    @abstractmethod
    def convert_to(self,
                   dst: int,
                   rt_mat: Optional[Union[Tensor, np.ndarray]] = None,
                   correct_yaw: bool = False) -> 'BaseInstance3DBoxes':
        """Convert self to ``dst`` mode.

        Args:
            dst (int): The target Box mode.
            rt_mat (Tensor or np.ndarray, optional): The rotation and
                translation matrix between different coordinates.
                Defaults to None. The conversion from ``src`` coordinates to
                ``dst`` coordinates usually comes along the change of sensors,
                e.g., from camera to LiDAR. This requires a transformation
                matrix.
            correct_yaw (bool): Whether to convert the yaw angle to the target
                coordinate. Defaults to False.

        Returns:
            :obj:`BaseInstance3DBoxes`: The converted box of the same type in
            the ``dst`` mode.
        """
        pass

    def scale(self, scale_factor: float) -> None:
        """Scale the box with horizontal and vertical scaling factors.

        Args:
            scale_factors (float): Scale factors to scale the boxes.
        """
        self.tensor[:, :6] *= scale_factor
        self.tensor[:, 7:] *= scale_factor  # velocity

    def limit_yaw(self, offset: float = 0.5, period: float = np.pi) -> None:
        """Limit the yaw to a given period and offset.

        Args:
            offset (float): The offset of the yaw. Defaults to 0.5.
            period (float): The expected period. Defaults to np.pi.
        """
        self.tensor[:, 6] = limit_period(self.tensor[:, 6], offset, period)

    def nonempty(self, threshold: float = 0.0) -> Tensor:
        """Find boxes that are non-empty.

        A box is considered empty if either of its side is no larger than
        threshold.

        Args:
            threshold (float): The threshold of minimal sizes. Defaults to 0.0.

        Returns:
            Tensor: A binary vector which represents whether each box is empty
            (False) or non-empty (True).
        """
        box = self.tensor
        size_x = box[..., 3]
        size_y = box[..., 4]
        size_z = box[..., 5]
        keep = ((size_x > threshold)
                & (size_y > threshold) & (size_z > threshold))
        return keep

    def __getitem__(
            self, item: Union[int, slice, np.ndarray,
                              Tensor]) -> 'BaseInstance3DBoxes':
        """
        Args:
            item (int or slice or np.ndarray or Tensor): Index of boxes.

        Note:
            The following usage are allowed:

            1. `new_boxes = boxes[3]`: Return a `Boxes` that contains only one
               box.
            2. `new_boxes = boxes[2:10]`: Return a slice of boxes.
            3. `new_boxes = boxes[vector]`: Where vector is a
               torch.BoolTensor with `length = len(boxes)`. Nonzero elements in
               the vector will be selected.

            Note that the returned Boxes might share storage with this Boxes,
            subject to PyTorch's indexing semantics.

        Returns:
            :obj:`BaseInstance3DBoxes`: A new object of
            :class:`BaseInstance3DBoxes` after indexing.
        """
        original_type = type(self)
        if isinstance(item, int):
            return original_type(self.tensor[item].view(1, -1),
                                 box_dim=self.box_dim,
                                 with_yaw=self.with_yaw)
        b = self.tensor[item]
        assert b.dim() == 2, \
            f'Indexing on Boxes with {item} failed to return a matrix!'
        return original_type(b, box_dim=self.box_dim, with_yaw=self.with_yaw)

    def __len__(self) -> int:
        """int: Number of boxes in the current object."""
        return self.tensor.shape[0]

    def __repr__(self) -> str:
        """str: Return a string that describes the object."""
        return self.__class__.__name__ + '(\n    ' + str(self.tensor) + ')'

    @classmethod
    def cat(cls, boxes_list: Sequence['BaseInstance3DBoxes']
            ) -> 'BaseInstance3DBoxes':
        """Concatenate a list of Boxes into a single Boxes.

        Args:
            boxes_list (Sequence[:obj:`BaseInstance3DBoxes`]): List of boxes.

        Returns:
            :obj:`BaseInstance3DBoxes`: The concatenated boxes.
        """
        assert isinstance(boxes_list, (list, tuple))
        if len(boxes_list) == 0:
            return cls(torch.empty(0))
        assert all(isinstance(box, cls) for box in boxes_list)

        # use torch.cat (v.s. layers.cat)
        # so the returned boxes never share storage with input
        cat_boxes = cls(torch.cat([b.tensor for b in boxes_list], dim=0),
                        box_dim=boxes_list[0].box_dim,
                        with_yaw=boxes_list[0].with_yaw)
        return cat_boxes

    def numpy(self) -> np.ndarray:
        """Reload ``numpy`` from self.tensor."""
        return self.tensor.numpy()

    def to(self, device: Union[str, torch.device], *args,
           **kwargs) -> 'BaseInstance3DBoxes':
        """Convert current boxes to a specific device.

        Args:
            device (str or :obj:`torch.device`): The name of the device.

        Returns:
            :obj:`BaseInstance3DBoxes`: A new boxes object on the specific
            device.
        """
        original_type = type(self)
        return original_type(self.tensor.to(device, *args, **kwargs),
                             box_dim=self.box_dim,
                             with_yaw=self.with_yaw)

    def cpu(self) -> 'BaseInstance3DBoxes':
        """Convert current boxes to cpu device.

        Returns:
            :obj:`BaseInstance3DBoxes`: A new boxes object on the cpu device.
        """
        original_type = type(self)
        return original_type(self.tensor.cpu(),
                             box_dim=self.box_dim,
                             with_yaw=self.with_yaw)

    def cuda(self, *args, **kwargs) -> 'BaseInstance3DBoxes':
        """Convert current boxes to cuda device.

        Returns:
            :obj:`BaseInstance3DBoxes`: A new boxes object on the cuda device.
        """
        original_type = type(self)
        return original_type(self.tensor.cuda(*args, **kwargs),
                             box_dim=self.box_dim,
                             with_yaw=self.with_yaw)

    def clone(self) -> 'BaseInstance3DBoxes':
        """Clone the boxes.

        Returns:
            :obj:`BaseInstance3DBoxes`: Box object with the same properties as
            self.
        """
        original_type = type(self)
        return original_type(self.tensor.clone(),
                             box_dim=self.box_dim,
                             with_yaw=self.with_yaw)

    def detach(self) -> 'BaseInstance3DBoxes':
        """Detach the boxes.

        Returns:
            :obj:`BaseInstance3DBoxes`: Box object with the same properties as
            self.
        """
        original_type = type(self)
        return original_type(self.tensor.detach(),
                             box_dim=self.box_dim,
                             with_yaw=self.with_yaw)

    @property
    def device(self) -> torch.device:
        """torch.device: The device of the boxes are on."""
        return self.tensor.device

    def __iter__(self) -> Iterator[Tensor]:
        """Yield a box as a Tensor at a time.

        Returns:
            Iterator[Tensor]: A box of shape (box_dim, ).
        """
        yield from self.tensor

    @classmethod
    def height_overlaps(cls, boxes1: 'BaseInstance3DBoxes',
                        boxes2: 'BaseInstance3DBoxes') -> Tensor:
        """Calculate height overlaps of two boxes.

        Note:
            This function calculates the height overlaps between ``boxes1`` and
            ``boxes2``, ``boxes1`` and ``boxes2`` should be in the same type.

        Args:
            boxes1 (:obj:`BaseInstance3DBoxes`): Boxes 1 contain N boxes.
            boxes2 (:obj:`BaseInstance3DBoxes`): Boxes 2 contain M boxes.

        Returns:
            Tensor: Calculated height overlap of the boxes.
        """
        assert isinstance(boxes1, BaseInstance3DBoxes)
        assert isinstance(boxes2, BaseInstance3DBoxes)
        assert type(boxes1) == type(boxes2), \
            '"boxes1" and "boxes2" should be in the same type, ' \
            f'but got {type(boxes1)} and {type(boxes2)}.'

        boxes1_top_height = boxes1.top_height.view(-1, 1)
        boxes1_bottom_height = boxes1.bottom_height.view(-1, 1)
        boxes2_top_height = boxes2.top_height.view(1, -1)
        boxes2_bottom_height = boxes2.bottom_height.view(1, -1)

        heighest_of_bottom = torch.max(boxes1_bottom_height,
                                       boxes2_bottom_height)
        lowest_of_top = torch.min(boxes1_top_height, boxes2_top_height)
        overlaps_h = torch.clamp(lowest_of_top - heighest_of_bottom, min=0)
        return overlaps_h

    def new_box(
        self, data: Union[Tensor, np.ndarray, Sequence[Sequence[float]]]
    ) -> 'BaseInstance3DBoxes':
        """Create a new box object with data.

        The new box and its tensor has the similar properties as self and
        self.tensor, respectively.

        Args:
            data (Tensor or np.ndarray or Sequence[Sequence[float]]): Data to
                be copied.

        Returns:
            :obj:`BaseInstance3DBoxes`: A new bbox object with ``data``, the
            object's other properties are similar to ``self``.
        """
        new_tensor = self.tensor.new_tensor(data) \
            if not isinstance(data, Tensor) else data.to(self.device)
        original_type = type(self)
        return original_type(new_tensor,
                             box_dim=self.box_dim,
                             with_yaw=self.with_yaw)


================================================
FILE: bip3d/structures/bbox_3d/box_3d_mode.py
================================================
# Copyright (c) OpenRobotLab. All rights reserved.
from enum import IntEnum, unique
from typing import Optional, Sequence, Union

import numpy as np
import torch
from torch import Tensor

from .base_box3d import BaseInstance3DBoxes
from .utils import limit_period


@unique
class Box3DMode(IntEnum):
    """Enum of different ways to represent a box.

    Coordinates in LiDAR:

    .. code-block:: none

                    up z
                       ^   x front
                       |  /
                       | /
        left y <------ 0

    The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0),
    and the yaw is around the z axis, thus the rotation axis=2.

    Coordinates in Camera:

    .. code-block:: none

                z front
               /
              /
             0 ------> x right
             |
             |
             v
        down y

    The relative coordinate of bottom center in a CAM box is (0.5, 1.0, 0.5),
    and the yaw is around the y axis, thus the rotation axis=1.

    Coordinates in Depth:

    .. code-block:: none

        up z
           ^   y front
           |  /
           | /
           0 ------> x right

    The relative coordinate of bottom center in a DEPTH box is (0.5, 0.5, 0),
    and the yaw is around the z axis, thus the rotation axis=2.
    """

    LIDAR = 0
    CAM = 1
    DEPTH = 2
    EULER_CAM = 3
    EULER_DEPTH = 4

    @staticmethod
    def convert(
        box: Union[Sequence[float], np.ndarray, Tensor, BaseInstance3DBoxes],
        src: 'Box3DMode',
        dst: 'Box3DMode',
        rt_mat: Optional[Union[np.ndarray, Tensor]] = None,
        with_yaw: bool = True,
        correct_yaw: bool = False
    ) -> Union[Sequence[float], np.ndarray, Tensor, BaseInstance3DBoxes]:
        """Convert boxes from ``src`` mode to ``dst`` mode.

        Args:
            box (Sequence[float] or np.ndarray or Tensor or
                :obj:`BaseInstance3DBoxes`): Can be a k-tuple, k-list or an Nxk
                array/tensor.
            src (:obj:`Box3DMode`): The source box mode.
            dst (:obj:`Box3DMode`): The target box mode.
            rt_mat (np.ndarray or Tensor, optional): The rotation and
                translation matrix between different coordinates.
                Defaults to None. The conversion from ``src`` coordinates to
                ``dst`` coordinates usually comes along the change of sensors,
                e.g., from camera to LiDAR. This requires a transformation
                matrix.
            with_yaw (bool): If ``box`` is an instance of
                :obj:`BaseInstance3DBoxes`, whether or not it has a yaw angle.
                Defaults to True.
            correct_yaw (bool): If the yaw is rotated by rt_mat.
                Defaults to False.

        Returns:
            Sequence[float] or np.ndarray or Tensor or
            :obj:`BaseInstance3DBoxes`: The converted box of the same type.
        """
        if src == dst:
            return box

        is_numpy = isinstance(box, np.ndarray)
        is_Instance3DBoxes = isinstance(box, BaseInstance3DBoxes)
        single_box = isinstance(box, (list, tuple))
        if single_box:
            assert len(box) >= 7, (
                'Box3DMode.convert takes either a k-tuple/list or '
                'an Nxk array/tensor, where k >= 7')
            arr = torch.tensor(box)[None, :]
        else:
            # avoid modifying the input box
            if is_numpy:
                arr = torch.from_numpy(np.asarray(box)).clone()
            elif is_Instance3DBoxes:
                arr = box.tensor.clone()
            else:
                arr = box.clone()

        if is_Instance3DBoxes:
            with_yaw = box.with_yaw

        # convert box from `src` mode to `dst` mode.
        x_size, y_size, z_size = arr[..., 3:4], arr[..., 4:5], arr[..., 5:6]
        if with_yaw:
            yaw = arr[..., 6:7]
        if src == Box3DMode.LIDAR and dst == Box3DMode.CAM:
            if rt_mat is None:
                rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]])
            xyz_size = torch.cat([x_size, z_size, y_size], dim=-1)
            if with_yaw:
                if correct_yaw:
                    yaw_vector = torch.cat([
                        torch.cos(yaw),
                        torch.sin(yaw),
                        torch.zeros_like(yaw)
                    ],
                                           dim=1)
                else:
                    yaw = -yaw - np.pi / 2
                    yaw = limit_period(yaw, period=np.pi * 2)
        elif src == Box3DMode.CAM and dst == Box3DMode.LIDAR:
            if rt_mat is None:
                rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]])
            xyz_size = torch.cat([x_size, z_size, y_size], dim=-1)
            if with_yaw:
                if correct_yaw:
                    yaw_vector = torch.cat([
                        torch.cos(-yaw),
                        torch.zeros_like(yaw),
                        torch.sin(-yaw)
                    ],
                                           dim=1)
                else:
                    yaw = -yaw - np.pi / 2
                    yaw = limit_period(yaw, period=np.pi * 2)
        elif src == Box3DMode.DEPTH and dst == Box3DMode.CAM:
            if rt_mat is None:
                rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]])
            xyz_size = torch.cat([x_size, z_size, y_size], dim=-1)
            if with_yaw:
                if correct_yaw:
                    yaw_vector = torch.cat([
                        torch.cos(yaw),
                        torch.sin(yaw),
                        torch.zeros_like(yaw)
                    ],
                                           dim=1)
                else:
                    yaw = -yaw
        elif src == Box3DMode.CAM and dst == Box3DMode.DEPTH:
            if rt_mat is None:
                rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]])
            xyz_size = torch.cat([x_size, z_size, y_size], dim=-1)
            if with_yaw:
                if correct_yaw:
                    yaw_vector = torch.cat([
                        torch.cos(-yaw),
                        torch.zeros_like(yaw),
                        torch.sin(-yaw)
                    ],
                                           dim=1)
                else:
                    yaw = -yaw
        elif src == Box3DMode.LIDAR and dst == Box3DMode.DEPTH:
            if rt_mat is None:
                rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]])
            xyz_size = torch.cat([x_size, y_size, z_size], dim=-1)
            if with_yaw:
                if correct_yaw:
                    yaw_vector = torch.cat([
                        torch.cos(yaw),
                        torch.sin(yaw),
                        torch.zeros_like(yaw)
                    ],
                                           dim=1)
                else:
                    yaw = yaw + np.pi / 2
                    yaw = limit_period(yaw, period=np.pi * 2)
        elif src == Box3DMode.DEPTH and dst == Box3DMode.LIDAR:
            if rt_mat is None:
                rt_mat = arr.new_tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]])
            xyz_size = torch.cat([x_size, y_size, z_size], dim=-1)
            if with_yaw:
                if correct_yaw:
                    yaw_vector = torch.cat([
                        torch.cos(yaw),
                        torch.sin(yaw),
                        torch.zeros_like(yaw)
                    ],
                                           dim=1)
                else:
                    yaw = yaw - np.pi / 2
                    yaw = limit_period(yaw, period=np.pi * 2)
        else:  # TODO: add transformation between euler boxes
            raise NotImplementedError(
                f'Conversion from Box3DMode {src} to {dst} '
                'is not supported yet')

        if not isinstance(rt_mat, Tensor):
            rt_mat = arr.new_tensor(rt_mat)
        if rt_mat.size(1) == 4:
            extended_xyz = torch.cat(
                [arr[..., :3], arr.new_ones(arr.size(0), 1)], dim=-1)
            xyz = extended_xyz @ rt_mat.t()
        else:
            xyz = arr[..., :3] @ rt_mat.t()

        # Note: we only use rotation in rt_mat
        # so don't need to extend yaw_vector
        if with_yaw and correct_yaw:
            rot_yaw_vector = yaw_vector @ rt_mat[:3, :3].t()
            if dst == Box3DMode.CAM:
                yaw = torch.atan2(-rot_yaw_vector[:, [2]], rot_yaw_vector[:,
                                                                          [0]])
            elif dst in [Box3DMode.LIDAR, Box3DMode.DEPTH]:
                yaw = torch.atan2(rot_yaw_vector[:, [1]], rot_yaw_vector[:,
                                                                         [0]])
            yaw = limit_period(yaw, period=np.pi * 2)

        if with_yaw:
            remains = arr[..., 7:]
            arr = torch.cat([xyz[..., :3], xyz_size, yaw, remains], dim=-1)
        else:
            remains = arr[..., 6:]
            arr = torch.cat([xyz[..., :3], xyz_size, remains], dim=-1)

        # convert arr to the original type
        original_type = type(box)
        if single_box:
            return original_type(arr.flatten().tolist())
        if is_numpy:
            return arr.numpy()
        elif is_Instance3DBoxes:
            raise NotImplementedError(
                f'Conversion to {dst} through {original_type} '
                'is not supported yet')
        else:
            return arr


================================================
FILE: bip3d/structures/bbox_3d/coord_3d_mode.py
================================================
# Copyright (c) OpenRobotLab. All rights reserved.
from enum import IntEnum, unique
from typing import Optional, Sequence, Union

import numpy as np
import torch
from torch import Tensor

from bip3d.structures.points import (BasePoints, CameraPoints,
                                            DepthPoints, LiDARPoints)

from .base_box3d import BaseInstance3DBoxes
from .box_3d_mode import Box3DMode


@unique
class Coord3DMode(IntEnum):
    """Enum of different ways to represent a box and point cloud.

    Coordinates in LiDAR:

    .. code-block:: none

                    up z
                       ^   x front
                       |  /
                       | /
        left y <------ 0

    The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0),
    and the yaw is around the z axis, thus the rotation axis=2.

    Coordinates in Camera:

    .. code-block:: none

                z front
               /
              /
             0 ------> x right
             |
             |
             v
        down y

    The relative coordinate of bottom center in a CAM box is (0.5, 1.0, 0.5),
    and the yaw is around the y axis, thus the rotation axis=1.

    Coordinates in Depth:

    .. code-block:: none

        up z
           ^   y front
           |  /
           | /
           0 ------> x right

    The relative coordinate of bottom center in a DEPTH box is (0.5, 0.5, 0),
    and the yaw is around the z axis, thus the rotation axis=2.
    """

    LIDAR = 0
    CAM = 1
    DEPTH = 2

    @staticmethod
    def convert(input: Union[Sequence[float], np.ndarray, Tensor,
                             BaseInstance3DBoxes, BasePoints],
                src: Union[Box3DMode, 'Coord3DMode'],
                dst: Union[Box3DMode, 'Coord3DMode'],
                rt_mat: Optional[Union[np.ndarray, Tensor]] = None,
                with_yaw: bool = True,
                correct_yaw: bool = False,
                is_point: bool = True):
        """Convert boxes or points from ``src`` mode to ``dst`` mode.

        Args:
            input (Sequence[float] or np.ndarray or Tensor or
                :obj:`BaseInstance3DBoxes` or :obj:`BasePoints`): Can be a
                k-tuple, k-list or an Nxk array/tensor.
            src (:obj:`Box3DMode` or :obj:`Coord3DMode`): The source mode.
            dst (:obj:`Box3DMode` or :obj:`Coord3DMode`): The target mode.
            rt_mat (np.ndarray or Tensor, optional): The rotation and
                translation matrix between different coordinates.
                Defaults to None. The conversion from ``src`` coordinates to
                ``dst`` coordinates usually comes along the change of sensors,
                e.g., from camera to LiDAR. This requires a transformation
                matrix.
            with_yaw (bool): If ``box`` is an instance of
                :obj:`BaseInstance3DBoxes`, whether or not it has a yaw angle.
                Defaults to True.
            correct_yaw (bool): If the yaw is rotated by rt_mat.
                Defaults to False.
            is_point (bool): If ``input`` is neither an instance of
                :obj:`BaseInstance3DBoxes` nor an instance of
                :obj:`BasePoints`, whether or not it is point data.
                Defaults to True.

        Returns:
            Sequence[float] or np.ndarray or Tensor or
            :obj:`BaseInstance3DBoxes` or :obj:`BasePoints`: The converted box
            or points of the same type.
        """
        if isinstance(input, BaseInstance3DBoxes):
            return Coord3DMode.convert_box(input,
                                           src,
                                           dst,
                                           rt_mat=rt_mat,
                                           with_yaw=with_yaw,
                                           correct_yaw=correct_yaw)
        elif isinstance(input, BasePoints):
            return Coord3DMode.convert_point(input, src, dst, rt_mat=rt_mat)
        elif isinstance(input, (tuple, list, np.ndarray, Tensor)):
            if is_point:
                return Coord3DMode.convert_point(input,
                                                 src,
                                                 dst,
                                                 rt_mat=rt_mat)
            else:
                return Coord3DMode.convert_box(input,
                                               src,
                                               dst,
                                               rt_mat=rt_mat,
                                               with_yaw=with_yaw,
                                               correct_yaw=correct_yaw)
        else:
            raise NotImplementedError

    @staticmethod
    def convert_box(
        box: Union[Sequence[float], np.ndarray, Tensor, BaseInstance3DBoxes],
        src: Box3DMode,
        dst: Box3DMode,
        rt_mat: Optional[Union[np.ndarray, Tensor]] = None,
        with_yaw: bool = True,
        correct_yaw: bool = False
    ) -> Union[Sequence[float], np.ndarray, Tensor, BaseInstance3DBoxes]:
        """Convert boxes from ``src`` mode to ``dst`` mode.

        Args:
            box (Sequence[float] or np.ndarray or Tensor or
                :obj:`BaseInstance3DBoxes`): Can be a k-tuple, k-list or an Nxk
                array/tensor.
            src (:obj:`Box3DMode`): The source box mode.
            dst (:obj:`Box3DMode`): The target box mode.
            rt_mat (np.ndarray or Tensor, optional): The rotation and
                translation matrix between different coordinates.
                Defaults to None. The conversion from ``src`` coordinates to
                ``dst`` coordinates usually comes along the change of sensors,
                e.g., from camera to LiDAR. This requires a transformation
                matrix.
            with_yaw (bool): If ``box`` is an instance of
                :obj:`BaseInstance3DBoxes`, whether or not it has a yaw angle.
                Defaults to True.
            correct_yaw (bool): If the yaw is rotated by rt_mat.
                Defaults to False.

        Returns:
            Sequence[float] or np.ndarray or Tensor or
            :obj:`BaseInstance3DBoxes`: The converted box of the same type.
        """
        return Box3DMode.convert(box,
                                 src,
                                 dst,
                                 rt_mat=rt_mat,
                                 with_yaw=with_yaw,
                                 correct_yaw=correct_yaw)

    @staticmethod
    def convert_point(
        point: Union[Sequence[float], np.ndarray, Tensor, BasePoints],
        src: 'Coord3DMode',
        dst: 'Coord3DMode',
        rt_mat: Optional[Union[np.ndarray, Tensor]] = None,
    ) -> Union[Sequence[float], np.ndarray, Tensor, BasePoints]:
        """Convert points from ``src`` mode to ``dst`` mode.

        Args:
            box (Sequence[float] or np.ndarray or Tensor or :obj:`BasePoints`):
                Can be a k-tuple, k-list or an Nxk array/tensor.
            src (:obj:`Coord3DMode`): The source point mode.
            dst (:obj:`Coord3DMode`): The target point mode.
            rt_mat (np.ndarray or Tensor, optional): The rotation and
                translation matrix between different coordinates.
                Defaults to None. The conversion from ``src`` coordinates to
                ``dst`` coordinates usually comes along the change of sensors,
                e.g., from camera to LiDAR. This requires a transformation
                matrix.

        Returns:
            Sequence[float] or np.ndarray or Tensor or :obj:`BasePoints`: The
            converted point of the same type.
        """
        if src == dst:
            return point

        is_numpy = isinstance(point, np.ndarray)
        is_InstancePoints = isinstance(point, BasePoints)
        single_point = isinstance(point, (list, tuple))
        if single_point:
            assert len(point) >= 3, (
                'Coord3DMode.convert takes either a k-tuple/list or '
                'an Nxk array/tensor, where k >= 3')
            arr = torch.tensor(point)[None, :]
        else:
            # avoid modifying the input point
            if is_numpy:
                arr = torch.from_numpy(np.asarray(point)).clone()
            elif is_InstancePoints:
                arr = point.tensor.clone()
            else:
                arr = point.clone()

        # convert point from `src` mode to `dst` mode.
        if src == Coord3DMode.LIDAR and dst == Coord3DMode.CAM:
            if rt_mat is None:
                rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]])
        elif src == Coord3DMode.CAM and dst == Coord3DMode.LIDAR:
            if rt_mat is None:
                rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]])
        elif src == Coord3DMode.DEPTH and dst == Coord3DMode.CAM:
            if rt_mat is None:
                rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]])
        elif src == Coord3DMode.CAM and dst == Coord3DMode.DEPTH:
            if rt_mat is None:
                rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]])
        elif src == Coord3DMode.LIDAR and dst == Coord3DMode.DEPTH:
            if rt_mat is None:
                rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]])
        elif src == Coord3DMode.DEPTH and dst == Coord3DMode.LIDAR:
            if rt_mat is None:
                rt_mat = arr.new_tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]])
        else:
            raise NotImplementedError(
                f'Conversion from Coord3DMode {src} to {dst} '
                'is not supported yet')

        if not isinstance(rt_mat, Tensor):
            rt_mat = arr.new_tensor(rt_mat)
        if rt_mat.size(1) == 4:
            extended_xyz = torch.cat(
                [arr[..., :3], arr.new_ones(arr.size(0), 1)], dim=-1)
            xyz = extended_xyz @ rt_mat.t()
        else:
            xyz = arr[..., :3] @ rt_mat.t()

        remains = arr[..., 3:]
        arr = torch.cat([xyz[..., :3], remains], dim=-1)

        # convert arr to the original type
        original_type = type(point)
        if single_point:
            return original_type(arr.flatten().tolist())
        if is_numpy:
            return arr.numpy()
        elif is_InstancePoints:
            if dst == Coord3DMode.CAM:
                target_type = CameraPoints
            elif dst == Coord3DMode.LIDAR:
                target_type = LiDARPoints
            elif dst == Coord3DMode.DEPTH:
                target_type = DepthPoints
            else:
                raise NotImplementedError(
                    f'Conversion to {dst} through {original_type} '
                    'is not supported yet')
            return target_type(arr,
                               points_dim=arr.size(-1),
                               attribute_dims=point.attribute_dims)
        else:
            return arr


================================================
FILE: bip3d/structures/bbox_3d/euler_box3d.py
================================================
# Copyright (c) OpenRobotLab. All rights reserved.
import numpy as np
import torch
from pytorch3d.ops import box3d_overlap
from pytorch3d.transforms import euler_angles_to_matrix, matrix_to_euler_angles

from ..points.base_points import BasePoints
from .base_box3d import BaseInstance3DBoxes
from .utils import rotation_3d_in_euler


class EulerInstance3DBoxes(BaseInstance3DBoxes):
    """3D boxes with 1-D orientation represented by three Euler angles.

    See https://en.wikipedia.org/wiki/Euler_angles for
        regarding the definition of Euler angles.

    Attributes:
        tensor (torch.Tensor): Float matrix of N x box_dim.
        box_dim (int): Integer indicates the dimension of a box
            Each row is (x, y, z, x_size, y_size, z_size, alpha, beta, gamma).
    """

    def __init__(self, tensor, box_dim=9, origin=(0.5, 0.5, 0.5)):
        if isinstance(tensor, torch.Tensor):
            device = tensor.device
        else:
            device = torch.device('cpu')
        tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
        if tensor.numel() == 0:
            # Use reshape, so we don't end up creating a new tensor that
            # does not depend on the inputs (and consequently confuses jit)
            tensor = tensor.reshape((0, box_dim)).to(dtype=torch.float32,
                                                     device=device)
        assert tensor.dim() == 2 and tensor.size(-1) == box_dim, tensor.size()

        if tensor.shape[-1] == 6:
            # If the dimension of boxes is 6, we expand box_dim by padding
            # (0, 0, 0) as a fake euler angle.
            assert box_dim == 6
            fake_rot = tensor.new_zeros(tensor.shape[0], 3)
            tensor = torch.cat((tensor, fake_rot), dim=-1)
            self.box_dim = box_dim + 3
        elif tensor.shape[-1] == 7:
            assert box_dim == 7
            fake_euler = tensor.new_zeros(tensor.shape[0], 2)
            tensor = torch.cat((tensor, fake_euler), dim=-1)
            self.box_dim = box_dim + 2
        else:
            assert tensor.shape[-1] == 9
            self.box_dim = box_dim
        self.tensor = tensor.clone()

        self.origin = origin
        if origin != (0.5, 0.5, 0.5):
            dst = self.tensor.new_tensor((0.5, 0.5, 0.5))
            src = self.tensor.new_tensor(origin)
            self.tensor[:, :3] += self.tensor[:, 3:6] * (dst - src)

    def get_corners(self, tensor1):
        """torch.Tensor: Coordinates of corners of all the boxes
        in shape (N, 8, 3).

        Convert the boxes to corners in clockwise order, in form of
        ``(x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0)``

        .. code-block:: none

                                           up z
                            front y           ^
                                 /            |
                                /             |
                  (x0, y1, z1) + -----------  + (x1, y1, z1)
                              /|            / |
                             / |           /  |
               (x0, y0, z1) + ----------- +   + (x1, y1, z0)
                            |  /      .   |  /
                            | / origin    | /
               (x0, y0, z0) + ----------- + --------> right x
                                          (x1, y0, z0)
        """
        if tensor1.numel() == 0:
            return torch.empty([0, 8, 3], device=tensor1.device)

        dims = tensor1[:, 3:6]
        corners_norm = torch.from_numpy(
            np.stack(np.unravel_index(np.arange(8), [2] * 3),
                     axis=1)).to(device=dims.device, dtype=dims.dtype)

        corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]]
        # use relative origin
        assert self.origin == (0.5, 0.5, 0.5), \
            'self.origin != (0.5, 0.5, 0.5) needs to be checked!'
        corners_norm = corners_norm - dims.new_tensor(self.origin)
        corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3])

        # rotate
        corners = rotation_3d_in_euler(corners, tensor1[:, 6:])

        corners += tensor1[:, :3].view(-1, 1, 3)
        return corners

    @classmethod
    def overlaps(cls, boxes1, boxes2, mode='iou', eps=1e-4):
        """Calculate 3D overlaps of two boxes.

        Note:
            This function calculates the overlaps between ``boxes1`` and
            ``boxes2``, ``boxes1`` and ``boxes2`` should be in the same type.

        Args:
            boxes1 (:obj:`EulerInstance3DBoxes`): Boxes 1 contain N boxes.
            boxes2 (:obj:`EulerInstance3DBoxes`): Boxes 2 contain M boxes.
            mode (str): Mode of iou calculation. Defaults to 'iou'.
            eps (bool): Epsilon. Defaults to 1e-4.

        Returns:
            torch.Tensor: Calculated 3D overlaps of the boxes.
        """
        assert isinstance(boxes1, EulerInstance3DBoxes)
        assert isinstance(boxes2, EulerInstance3DBoxes)
        assert type(boxes1) == type(boxes2), '"boxes1" and "boxes2" should' \
            f'be in the same type, got {type(boxes1)} and {type(boxes2)}.'

        assert mode in ['iou']

        rows = len(boxes1)
        cols = len(boxes2)
        if rows * cols == 0:
            return boxes1.tensor.new(rows, cols)

        corners1 = boxes1.corners
        corners2 = boxes2.corners
        _, iou3d = box3d_overlap(corners1, corners2, eps=eps)
        return iou3d

    @property
    def gravity_center(self):
        """torch.Tensor: A tensor with center of each box in shape (N, 3)."""
        return self.tensor[:, :3]

    @property
    def corners(self):
        """torch.Tensor: Coordinates of corners of all the boxes
        in shape (N, 8, 3).

        Convert the boxes to corners in clockwise order, in form of
        ``(x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0)``

        .. code-block:: none

                                           up z
                            front y           ^
                                 /            |
                                /             |
                  (x0, y1, z1) + -----------  + (x1, y1, z1)
                              /|            / |
                             / |           /  |
               (x0, y0, z1) + ----------- +   + (x1, y1, z0)
                            |  /      .   |  /
                            | / origin    | /
               (x0, y0, z0) + ----------- + --------> right x
                                          (x1, y0, z0)
        """
        if self.tensor.numel() == 0:
            return torch.empty([0, 8, 3], device=self.tensor.device)

        dims = self.dims
        corners_norm = torch.from_numpy(
            np.stack(np.unravel_index(np.arange(8), [2] * 3),
                     axis=1)).to(device=dims.device, dtype=dims.dtype)

        corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]]
        # use relative origin
        assert self.origin == (0.5, 0.5, 0.5), \
            'self.origin != (0.5, 0.5, 0.5) needs to be checked!'
        corners_norm = corners_norm - dims.new_tensor(self.origin)
        corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3])

        # rotate
        corners = rotation_3d_in_euler(corners, self.tensor[:, 6:])

        corners += self.tensor[:, :3].view(-1, 1, 3)
        return corners

    def transform(self, matrix):
        if self.tensor.shape[0] == 0:
            return
        if not isinstance(matrix, torch.Tensor):
            matrix = self.tensor.new_tensor(matrix)
        points = self.tensor[:, :3]
        constant = points.new_ones(points.shape[0], 1)
        points_extend = torch.concat([points, constant], dim=-1)
        points_trans = torch.matmul(points_extend, matrix.transpose(-2,
                                                                    -1))[:, :3]

        size = self.tensor[:, 3:6]

        # angle_delta = matrix_to_euler_angles(matrix[:3,:3], 'ZXY')
        # angle = self.tensor[:,6:] + angle_delta
        ori_matrix = euler_angles_to_matrix(self.tensor[:, 6:], 'ZXY')
        rot_matrix = matrix[:3, :3].expand_as(ori_matrix)
        final = torch.bmm(rot_matrix, ori_matrix)
        angle = matrix_to_euler_angles(final, 'ZXY')

        self.tensor = torch.cat([points_trans, size, angle], dim=-1)

    def scale(self, scale_factor: float) -> None:
        """Scale the box with horizontal and vertical scaling factors.

        Args:
            scale_factors (float): Scale factors to scale the boxes.
        """
        self.tensor[:, :6] *= scale_factor

    def rotate(self, angle, points=None):
        """Rotate boxes with points (optional) with the given angle or rotation
        matrix.

        Args:
            angle (float | torch.Tensor | np.ndarray):
                Rotation angle or rotation matrix.
            points (torch.Tensor | np.ndarray | :obj:`BasePoints`, optional):
                Points to rotate. Defaults to None.

        Returns:
            tuple or None: When ``points`` is None, the function returns
                None, otherwise it returns the rotated points and the
                rotation matrix ``rot_mat_T``.
        """
        if not isinstance(angle, torch.Tensor):
            angle = self.tensor.new_tensor(angle)

        if angle.numel() == 1:  # only given yaw angle for rotation
            angle = self.tensor.new_tensor([angle, 0., 0.])
            rot_matrix = euler_angles_to_matrix(angle, 'ZXY')
        elif angle.numel() == 3:
            rot_matrix = euler_angles_to_matrix(angle, 'ZXY')
        elif angle.shape == torch.Size([3, 3]):
            rot_matrix = angle
        else:
            raise NotImplementedError

        rot_mat_T = rot_matrix.T
        transform_matrix = torch.eye(4)
        transform_matrix[:3, :3] = rot_matrix
        self.transform(transform_matrix)

        if points is not None:
            if isinstance(points, torch.Tensor):
                points[:, :3] = points[:, :3] @ rot_mat_T
            elif isinstance(points, np.ndarray):
                rot_mat_T = rot_mat_T.cpu().numpy()
                points[:, :3] = np.dot(points[:, :3], rot_mat_T)
            elif isinstance(points, BasePoints):
                points.rotate(rot_mat_T)
            else:
                raise ValueError
            return points, rot_mat_T
        else:
            return rot_mat_T

    def flip(self, direction='X'):
        """Flip the boxes along the corresponding axis.

        Args:
            direction (str, optional): Flip axis. Defaults to 'X'.
        """
        assert direction in ['X', 'Y', 'Z']
        if direction == 'X':
            self.tensor[:, 0] = -self.tensor[:, 0]
            self.tensor[:, 6] = -self.tensor[:, 6] + np.pi
            self.tensor[:, 8] = -self.tensor[:, 8]
        elif direction == 'Y':
            self.tensor[:, 1] = -self.tensor[:, 1]
            self.tensor[:, 6] = -self.tensor[:, 6]
            self.tensor[:, 7] = -self.tensor[:, 7] + np.pi
        elif direction == 'Z':
            self.tensor[:, 2] = -self.tensor[:, 2]
            self.tensor[:, 7] = -self.tensor[:, 7]
            self.tensor[:, 8] = -self.tensor[:, 8] + np.pi


================================================
FILE: bip3d/structures/bbox_3d/euler_depth_box3d.py
================================================
# Copyright (c) OpenRobotLab. All rights reserved.
import numpy as np
import torch
from mmcv.ops import points_in_boxes_all, points_in_boxes_part

from ..points.base_points import BasePoints
from .euler_box3d import EulerInstance3DBoxes


class EulerDepthInstance3DBoxes(EulerInstance3DBoxes):
    """3D boxes of instances in Depth coordinates.

    We keep the "Depth" coordinate system definition in MMDet3D just for
    clarification of the points coordinates and the flipping augmentation.

    Coordinates in Depth:

    .. code-block:: none

                    up z    y front (alpha=0.5*pi)
                       ^   ^
                       |  /
                       | /
                       0 ------> x right (alpha=0)

    The relative coordinate of bottom center in a Depth box is (0.5, 0.5, 0),
    and the yaw is around the z axis, thus the rotation axis=2.
    The yaw is 0 at the positive direction of x axis, and decreases from
    the positive direction of x to the positive direction of y.
    Also note that rotation of DepthInstance3DBoxes is counterclockwise,
    which is reverse to the definition of the yaw angle (clockwise).

    Attributes:
        tensor (torch.Tensor): Float matrix of N x box_dim.
        box_dim (int): Integer indicates the dimension of a box
            Each row is (x, y, z, x_size, y_size, z_size, alpha, beta, gamma).
        with_yaw (bool): If True, the value of yaw will be set to 0 as minmax
            boxes.
    """

    def __init__(self,
                 tensor,
                 box_dim=9,
                 with_yaw=True,
                 origin=(0.5, 0.5, 0.5)):
        super().__init__(tensor, box_dim, origin)
        self.with_yaw = with_yaw

    def flip(self, bev_direction='horizontal', points=None):
        """Flip the boxes in BEV along given BEV direction.

        In Depth coordinates, it flips x (horizontal) or y (vertical) axis.

        Args:
            bev_direction (str, optional): Flip direction
                (horizontal or vertical). Defaults to 'horizontal'.
            points (torch.Tensor | np.ndarray | :obj:`BasePoints`, optional):
                Points to flip. Defaults to None.

        Returns:
            torch.Tensor, numpy.ndarray or None: Flipped points.
        """
        assert bev_direction in ('horizontal', 'vertical')
        if bev_direction == 'horizontal':
            super().flip(direction='X')
        elif bev_direction == 'vertical':
            super().flip(direction='Y')

        if points is not None:
            assert isinstance(points, (torch.Tensor, np.ndarray, BasePoints))
            if isinstance(points, (torch.Tensor, np.ndarray)):
                if bev_direction == 'horizontal':
                    points[:, 0] = -points[:, 0]
                elif bev_direction == 'vertical':
                    points[:, 1] = -points[:, 1]
            elif isinstance(points, BasePoints):
                points.flip(bev_direction)
            return points

    def convert_to(self, dst, rt_mat=None):
        """Convert self to ``dst`` mode.

        Args:
            dst (:obj:`Box3DMode`): The target Box mode.
            rt_mat (np.ndarray | torch.Tensor, optional): The rotation and
                translation matrix between different coordinates.
                Defaults to None.
                The conversion from ``src`` coordinates to ``dst`` coordinates
                usually comes along the change of sensors, e.g., from camera
                to LiDAR. This requires a transformation matrix.

        Returns:
            :obj:`DepthInstance3DBoxes`:
                The converted box of the same type in the ``dst`` mode.
        """
        from .box_3d_mode import Box3DMode
        assert dst == Box3DMode.EULER_DEPTH
        return self

    def points_in_boxes_part(self, points, boxes_override=None):
        """Find the box in which each point is.

        Args:
            points (torch.Tensor): Points in shape (1, M, 3) or (M, 3),
                3 dimensions are (x, y, z) in LiDAR or depth coordinate.
            boxes_override (torch.Tensor, optional): Boxes to override
                `self.tensor`. Defaults to None.

        Returns:
            torch.Tensor: The index of the first box that each point
                is in, in shape (M, ). Default value is -1
                (if the point is not enclosed by any box).

        Note:
            If a point is enclosed by multiple boxes, the index of the
            first box will be returned.
        """
        if boxes_override is not None:
            boxes = boxes_override
        else:
            boxes = self.tensor
        if points.dim() == 2:
            points = points.unsqueeze(0)
        # TODO: take euler angles into consideration
        aligned_boxes = boxes[..., :7].clone()
        aligned_boxes[..., 6] = 0
        box_idx = points_in_boxes_part(
            points,
            aligned_boxes.unsqueeze(0).to(points.device)).squeeze(0)
        return box_idx

    def points_in_boxes_all(self, points, boxes_override=None):
        """Find all boxes in which each point is.

        Args:
            points (torch.Tensor): Points in shape (1, M, 3) or (M, 3),
                3 dimensions are (x, y, z) in LiDAR or depth coordinate.
            boxes_override (torch.Tensor, optional): Boxes to override
                `self.tensor`. Defaults to None.

        Returns:
            torch.Tensor: A tensor indicating whether a point is in a box,
                in shape (M, T). T is the number of boxes. Denote this
                tensor as A, if the m^th point is in the t^th box, then
                `A[m, t] == 1`, elsewise `A[m, t] == 0`.
        """
        if boxes_override is not None:
            boxes = boxes_override
        else:
            boxes = self.tensor

        points_clone = points.clone()[..., :3]
        if points_clone.dim() == 2:
            points_clone = points_clone.unsqueeze(0)
        else:
            assert points_clone.dim() == 3 and points_clone.shape[0] == 1

        boxes = boxes.to(points_clone.device).unsqueeze(0)
        # TODO: take euler angles into consideration
        aligned_boxes = boxes[..., :7].clone()
        aligned_boxes[..., 6] = 0
        box_idxs_of_pts = points_in_boxes_all(points_clone, aligned_boxes)

        return box_idxs_of_pts.squeeze(0)


================================================
FILE: bip3d/structures/bbox_3d/utils.py
================================================
# Copyright (c) OpenRobotLab. All rights reserved.
from logging import warning
from typing import Tuple, Union

import numpy as np
import torch
from pytorch3d.transforms import euler_angles_to_matrix, quaternion_to_matrix
from torch import Tensor

from bip3d.utils.array_converter import array_converter


@array_converter(apply_to=('val', ))
def limit_period(val: Union[np.ndarray, Tensor],
                 offset: float = 0.5,
                 period: float = np.pi) -> Union[np.ndarray, Tensor]:
    """Limit the value into a period for periodic function.

    Args:
        val (np.ndarray or Tensor): The value to be converted.
        offset (float): Offset to set the value range. Defaults to 0.5.
        period (float): Period of the value. Defaults to np.pi.

    Returns:
        np.ndarray or Tensor: Value in the range of
        [-offset * period, (1-offset) * period].
    """
    limited_val = val - torch.floor(val / period + offset) * period
    return limited_val


@array_converter(apply_to=('points', 'angles'))
def rotation_3d_in_euler(points, angles, return_mat=False, clockwise=False):
    """Rotate points by angles according to axis.

    Args:
        points (np.ndarray | torch.Tensor | list | tuple ):
            Points of shape (N, M, 3).
        angles (np.ndarray | torch.Tensor | list | tuple):
            Vector of angles in shape (N, 3)
        return_mat: Whether or not return the rotation matrix (transposed).
            Defaults to False.
        clockwise: Whether the rotation is clockwise. Defaults to False.

    Raises:
        ValueError: when the axis is not in range [0, 1, 2], it will
            raise value error.

    Returns:
        (torch.Tensor | np.ndarray): Rotated points in shape (N, M, 3).
    """
    batch_free = len(points.shape) == 2
    if batch_free:
        points = points[None]

    if len(angles.shape) == 1:
        angles = angles.expand(points.shape[:1] + (3, ))
        # angles = torch.full(points.shape[:1], angles)

    assert len(points.shape) == 3 and len(angles.shape) == 2 \
        and points.shape[0] == angles.shape[0], f'Incorrect shape of points ' \
        f'angles: {points.shape}, {angles.shape}'

    assert points.shape[-1] in [2, 3], \
        f'Points size should be 2 or 3 instead of {points.shape[-1]}'

    if angles.shape[1] == 3:
        rot_mat_T = euler_angles_to_matrix(angles, 'ZXY')  # N, 3,3
    else:
        rot_mat_T = quaternion_to_matrix(angles)  # N, 3,3
    rot_mat_T = rot_mat_T.transpose(-2, -1)

    if clockwise:
        raise NotImplementedError('clockwise')

    if points.shape[0] == 0:
        points_new = points
    else:
        points_new = torch.bmm(points, rot_mat_T)

    if batch_free:
        points_new = points_new.squeeze(0)

    if return_mat:
        if batch_free:
            rot_mat_T = rot_mat_T.squeeze(0)
        return points_new, rot_mat_T
    else:
        return points_new


@array_converter(apply_to=('points', 'angles'))
def rotation_3d_in_axis(
    points: Union[np.ndarray, Tensor],
    angles: Union[np.ndarray, Tensor, float],
    axis: int = 0,
    return_mat: bool = False,
    clockwise: bool = False
) -> Union[Tuple[np.ndarray, np.ndarray], Tuple[Tensor, Tensor], np.ndarray,
           Tensor]:
    """Rotate points by angles according to axis.

    Args:
        points (np.ndarray or Tensor): Points with shape (N, M, 3).
        angles (np.ndarray or Tensor or float): Vector of angles with shape
            (N, ).
        axis (int): The axis to be rotated. Defaults to 0.
        return_mat (bool): Whether or not to return the rotation matrix
            (transposed). Defaults to False.
        clockwise (bool): Whether the rotation is clockwise. Defaults to False.

    Raises:
        ValueError: When the axis is not in range [-3, -2, -1, 0, 1, 2], it
            will raise ValueError.

    Returns:
        Tuple[np.ndarray, np.ndarray] or Tuple[Tensor, Tensor] or np.ndarray or
        Tensor: Rotated points with shape (N, M, 3) and rotation matrix with
        shape (N, 3, 3).
    """
    batch_free = len(points.shape) == 2
    if batch_free:
        points = points[None]

    if isinstance(angles, float) or len(angles.shape) == 0:
        angles = torch.full(points.shape[:1], angles)

    assert len(points.shape) == 3 and len(angles.shape) == 1 and \
        points.shape[0] == angles.shape[0], 'Incorrect shape of points ' \
        f'angles: {points.shape}, {angles.shape}'

    assert points.shape[-1] in [2, 3], \
        f'Points size should be 2 or 3 instead of {points.shape[-1]}'

    rot_sin = torch.sin(angles)
    rot_cos = torch.cos(angles)
    ones = torch.ones_like(rot_cos)
    zeros = torch.zeros_like(rot_cos)

    if points.shape[-1] == 3:
        if axis == 1 or axis == -2:
            rot_mat_T = torch.stack([
                torch.stack([rot_cos, zeros, -rot_sin]),
                torch.stack([zeros, ones, zeros]),
                torch.stack([rot_sin, zeros, rot_cos])
            ])
        elif axis == 2 or axis == -1:
            rot_mat_T = torch.stack([
                torch.stack([rot_cos, rot_sin, zeros]),
                torch.stack([-rot_sin, rot_cos, zeros]),
                torch.stack([zeros, zeros, ones])
            ])
        elif axis == 0 or axis == -3:
            rot_mat_T = torch.stack([
                torch.stack([ones, zeros, zeros]),
                torch.stack([zeros, rot_cos, rot_sin]),
                torch.stack([zeros, -rot_sin, rot_cos])
            ])
        else:
            raise ValueError(
                f'axis should in range [-3, -2, -1, 0, 1, 2], got {axis}')
    else:
        rot_mat_T = torch.stack([
            torch.stack([rot_cos, rot_sin]),
            torch.stack([-rot_sin, rot_cos])
        ])

    if clockwise:
        rot_mat_T = rot_mat_T.transpose(0, 1)

    if points.shape[0] == 0:
        points_new = points
    else:
        points_new = torch.einsum('aij,jka->aik', points, rot_mat_T)

    if batch_free:
        points_new = points_new.squeeze(0)

    if return_mat:
        rot_mat_T = torch.einsum('jka->ajk', rot_mat_T)
        if batch_free:
            rot_mat_T = rot_mat_T.squeeze(0)
        return points_new, rot_mat_T
    else:
        return points_new


@array_converter(apply_to=('boxes_xywhr', ))
def xywhr2xyxyr(
        boxes_xywhr: Union[Tensor, np.ndarray]) -> Union[Tensor, np.ndarray]:
    """Convert a rotated boxes in XYWHR format to XYXYR format.

    Args:
        boxes_xywhr (Tensor or np.ndarray): Rotated boxes in XYWHR format.

    Returns:
        Tensor or np.ndarray: Converted boxes in XYXYR format.
    """
    boxes = torch.zeros_like(boxes_xywhr)
    half_w = boxes_xywhr[..., 2] / 2
    half_h = boxes_xywhr[..., 3] / 2

    boxes[..., 0] = boxes_xywhr[..., 0] - half_w
    boxes[..., 1] = boxes_xywhr[..., 1] - half_h
    boxes[..., 2] = boxes_xywhr[..., 0] + half_w
    boxes[..., 3] = boxes_xywhr[..., 1] + half_h
    boxes[..., 4] = boxes_xywhr[..., 4]
    return boxes


def get_box_type(box_type: str) -> Tuple[type, int]:
    """Get the type and mode of box structure.

    We temporarily only support EulerDepthInstance3DBoxes to
    support 9-DoF box operations
    and will consider refactoring this class with further experience.

    Args:
        box_type (str): The type of box structure. The valid value are "LiDAR",
            "Camera" and "Depth".

    Raises:
        ValueError: A ValueError is raised when ``box_type`` does not belong to
            the three valid types.

    Returns:
        tuple: Box type and box mode.
    """
    from .box_3d_mode import Box3DMode
    from .euler_depth_box3d import EulerDepthInstance3DBoxes
    box_type_lower = box_type.lower()
    if box_type_lower == 'euler-depth':
        box_type_3d = EulerDepthInstance3DBoxes
        box_mode_3d = Box3DMode.EULER_DEPTH
    # elif box_type_lower == 'euler-camera':
    #     box_type_3d = EulerCameraInstance3DBoxes
    #     box_mode_3d = Box3DMode.EULER_CAM
    else:
        raise ValueError(
            'Only "box_type" of "camera", "lidar", "depth", "euler"'
            f' are supported, got {box_type}')

    return box_type_3d, box_mode_3d


@array_converter(apply_to=('points_3d', 'proj_mat'))
def points_cam2img(points_3d: Union[Tensor, np.ndarray],
                   proj_mat: Union[Tensor, np.ndarray],
                   with_depth: bool = False) -> Union[Tensor, np.ndarray]:
    """Project points in camera coordinates to image coordinates.

    Args:
        points_3d (Tensor or np.ndarray): Points in shape (N, 3).
        proj_mat (Tensor or np.ndarray): Transformation matrix between
            coordinates.
        with_depth (bool): Whether to keep depth in the output.
            Defaults to False.

    Returns:
        Tensor or np.ndarray: Points in image coordinates with shape [N, 2] if
        ``with_depth=False``, else [N, 3].
    """
    points_shape = list(points_3d.shape)
    points_shape[-1] = 1

    assert len(proj_mat.shape) == 2, \
        'The dimension of the projection matrix should be 2 ' \
        f'instead of {len(proj_mat.shape)}.'
    d1, d2 = proj_mat.shape[:2]
    assert (d1 == 3 and d2 == 3) or (d1 == 3 and d2 == 4) or \
        (d1 == 4 and d2 == 4), 'The shape of the projection matrix ' \
        f'({d1}*{d2}) is not supported.'
    if d1 == 3:
        proj_mat_expanded = torch.eye(4,
                                      device=proj_mat.device,
                                      dtype=proj_mat.dtype)
        proj_mat_expanded[:d1, :d2] = proj_mat
        proj_mat = proj_mat_expanded

    # previous implementation use new_zeros, new_one yields better results
    points_4 = torch.cat([points_3d, points_3d.new_ones(points_shape)], dim=-1)

    point_2d = points_4 @ proj_mat.T
    point_2d_res = point_2d[..., :2] / point_2d[..., 2:3]

    if with_depth:
        point_2d_res = torch.cat([point_2d_res, point_2d[..., 2:3]], dim=-1)

    return point_2d_res


@array_converter(apply_to=('points_3d', 'proj_mat'))
def batch_points_cam2img(points_3d, proj_mat, with_depth=False):
    """Project points in camera coordinates to image coordinates.

    Args:
        points_3d (torch.Tensor | np.ndarray): Points in shape (N, D, 3)
        proj_mat (torch.Tensor | np.ndarray):
            Transformation matrix between coordinates.
        with_depth (bool, optional): Whether to keep depth in the output.
            Defaults to False.

    Returns:
        (torch.Tensor | np.ndarray): Points in image coordinates,
            with shape [N, D, 2] if `with_depth=False`, else [N, D, 3].
    """
    points_shape = list(points_3d.shape)
    points_shape[-1] = 1

    assert len(proj_mat.shape) == 3, 'The dimension of the projection'\
        f' matrix should be 2 instead of {len(proj_mat.shape)}.'
    d0, d1, d2 = proj_mat.shape[:3]
    assert (d1 == 3 and d2 == 3) or (d1 == 3 and d2 == 4) or (
        d1 == 4 and d2 == 4), 'The shape of the projection matrix'\
        f' ({d1}*{d2}) is not supported.'
    if d1 == 3:
        proj_mat_expanded = torch.eye(4,
                                      device=proj_mat.device,
                                      dtype=proj_mat.dtype)
        proj_mat_expanded = proj_mat_expanded[None, :, :].expand(d0, -1, -1)
        proj_mat_expanded[:, :d1, :d2] = proj_mat
        proj_mat = proj_mat_expanded

    # previous implementation use new_zeros, new_one yields better results
    points_4 = torch.cat([points_3d, points_3d.new_ones(points_shape)], dim=-1)
    # do the batch wise operation
    point_2d = torch.bmm(points_4, proj_mat.permute(0, 2, 1))
    # point_2d = points_4 @ proj_mat.T

    point_2d_res = point_2d[..., :2] / point_2d[..., 2:3].clamp(min=1e-3)

    if with_depth:
        point_2d_res = torch.cat([point_2d_res, point_2d[..., 2:3]], dim=-1)

    return point_2d_res


@array_converter(apply_to=('points', 'cam2img'))
def points_img2cam(
        points: Union[Tensor, np.ndarray],
        cam2img: Union[Tensor, np.ndarray]) -> Union[Tensor, np.ndarray]:
    """Project points in image coordinates to camera coordinates.

    Args:
        points (Tensor or np.ndarray): 2.5D points in 2D images with shape
            [N, 3], 3 corresponds with x, y in the image and depth.
        cam2img (Tensor or np.ndarray): Camera intrinsic matrix. The shape can
            be [3, 3], [3, 4] or [4, 4].

    Returns:
        Tensor or np.ndarray: Points in 3D space with shape [N, 3], 3
        corresponds with x, y, z in 3D space.
    """
    assert cam2img.shape[0] <= 4
    assert cam2img.shape[1] <= 4
    assert points.shape[1] == 3

    xys = points[:, :2]
    depths = points[:, 2].view(-1, 1)
    unnormed_xys = torch.cat([xys * depths, depths], dim=1)

    pad_cam2img = torch.eye(4, dtype=xys.dtype, device=xys.device)
    pad_cam2img[:cam2img.shape[0], :cam2img.shape[1]] = cam2img
    inv_pad_cam2img = torch.inverse(pad_cam2img).transpose(0, 1)

    # Do operation in homogeneous coordinates.
    num_points = unnormed_xys.shape[0]
    homo_xys = torch.cat([unnormed_xys, xys.new_ones((num_points, 1))], dim=1)
    points3D = torch.mm(homo_xys, inv_pad_cam2img)[:, :3]

    return points3D


def mono_cam_box2vis(cam_box):
    """This is a post-processing function on the bboxes from Mono-3D task. If
    we want to perform projection visualization, we need to:

        1. rotate the box along x-axis for np.pi / 2 (roll)
        2. change orientation from local yaw to global yaw
        3. convert yaw by (np.pi / 2 - yaw)

    After applying this function, we can project and draw it on 2D images.

    Args:
        cam_box (:obj:`CameraInstance3DBoxes`): 3D bbox in camera coordinate
            system before conversion. Could be gt bbox loaded from dataset or
            network prediction output.

    Returns:
        :obj:`CameraInstance3DBoxes`: Box after conversion.
    """
    warning.warn('DeprecationWarning: The hack of yaw and dimension in the '
                 'monocular 3D detection on nuScenes has been removed. The '
                 'function mono_cam_box2vis will be deprecated.')
    from .cam_box3d import CameraInstance3DBoxes
    assert isinstance(cam_box, CameraInstance3DBoxes), \
        'input bbox should be CameraInstance3DBoxes!'
    loc = cam_box.gravity_center
    dim = cam_box.dims
    yaw = cam_box.yaw
    feats = cam_box.tensor[:, 7:]
    # rotate along x-axis for np.pi / 2
    # see also here: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/nuscenes_mono_dataset.py#L557  # noqa
    dim[:, [1, 2]] = dim[:, [2, 1]]
    # change local yaw to global yaw for visualization
    # refer to https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/nuscenes_mono_dataset.py#L164-L166  # noqa
    yaw += torch.atan2(loc[:, 0], loc[:, 2])
    # convert yaw by (-yaw - np.pi / 2)
    # this is because mono 3D box class such as `NuScenesBox` has different
    # definition of rotation with our `CameraInstance3DBoxes`
    yaw = -yaw - np.pi / 2
    cam_box = torch.cat([loc, dim, yaw[:, None], feats], dim=1)
    cam_box = CameraInstance3DBoxes(cam_box,
                                    box_dim=cam_box.shape[-1],
                                    origin=(0.5, 0.5, 0.5))

    return cam_box


def get_proj_mat_by_coord_type(img_meta: dict, coord_type: str) -> Tensor:
    """Obtain image features using points.

    Args:
        img_meta (dict): Meta information.
        coord_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'. Can be case-
            insensitive.

    Returns:
        Tensor: Transformation matrix.
    """
    coord_type = coord_type.upper()
    mapping = {'LIDAR': 'lidar2img', 'DEPTH': 'depth2img', 'CAMERA': 'cam2img'}
    assert coord_type in mapping.keys()
    return img_meta[mapping[coord_type]]


def yaw2local(yaw: Tensor, loc: Tensor) -> Tensor:
    """Transform global yaw to local yaw (alpha in kitti) in camera
    coordinates, ranges from -pi to pi.

    Args:
        yaw (Tensor): A vector with local yaw of each box in shape (N, ).
        loc (Tensor): Gravity center of each box in shape (N, 3).

    Returns:
        Tensor: Local yaw (alpha in kitti).
    """
    local_yaw = yaw - torch.atan2(loc[:, 0], loc[:, 2])
    larger_idx = (local_yaw > np.pi).nonzero(as_tuple=False)
    small_idx = (local_yaw < -np.pi).nonzero(as_tuple=False)
    if len(larger_idx) != 0:
        local_yaw[larger_idx] -= 2 * np.pi
    if len(small_idx) != 0:
        local_yaw[small_idx] += 2 * np.pi

    return local_yaw


def get_lidar2img(cam2img: Tensor, lidar2cam: Tensor) -> Tensor:
    """Get the projection matrix of lidar2img.

    Args:
        cam2img (torch.Tensor): A 3x3 or 4x4 projection matrix.
        lidar2cam (torch.Tensor): A 3x3 or 4x4 projection matrix.

    Returns:
        Tensor: Transformation matrix with shape 4x4.
    """
    if cam2img.shape == (3, 3):
        temp = cam2img.new_zeros(4, 4)
        temp[:3, :3] = cam2img
        temp[3, 3] = 1
        cam2img = temp

    if lidar2cam.shape == (3, 3):
        temp = lidar2cam.new_zeros(4, 4)
        temp[:3, :3] = lidar2cam
        temp[3, 3] = 1
        lidar2cam = temp
    return torch.matmul(cam2img, lidar2cam)


================================================
FILE: bip3d/structures/ops/__init__.py
================================================
# Copyright (c) OpenRobotLab. All rights reserved.
# yapf:disable
from .box_np_ops import (box2d_to_corner_jit, box3d_to_bbox,
                         box_camera_to_lidar, boxes3d_to_corners3d_lidar,
                         camera_to_lidar, center_to_corner_box2d,
                         center_to_corner_box3d, center_to_minmax_2d,
                         corner_to_standup_nd_jit, corner_to_surfaces_3d,
                         corner_to_surfaces_3d_jit, corners_nd,
                         create_anchors_3d_range, depth_to_lidar_points,
                         depth_to_points, get_frustum, iou_jit,
                         minmax_to_corner_2d, points_in_convex_polygon_3d_jit,
                         points_in_convex_polygon_jit, points_in_rbbox,
                         projection_matrix_to_CRT_kitti, rbbox2d_to_near_bbox,
                         remove_outside_points, rotation_points_single_angle,
                         surface_equ_3d)
# yapf:enable
from .iou3d_calculator import (AxisAlignedBboxOverlaps3D, BboxOverlaps3D,
                               BboxOverlapsNearest3D,
                               axis_aligned_bbox_overlaps_3d, bbox_overlaps_3d,
                               bbox_overlaps_nearest_3d)
from .transforms import bbox3d2result, bbox3d2roi, bbox3d_mapping_back

__all__ = [
    'box2d_to_corner_jit', 'box3d_to_bbox', 'box_camera_to_lidar',
    'boxes3d_to_corners3d_lidar', 'camera_to_lidar', 'center_to_corner_box2d',
    'center_to_corner_box3d', 'center_to_minmax_2d',
    'corner_to_standup_nd_jit', 'corner_to_surfaces_3d',
    'corner_to_surfaces_3d_jit', 'corners_nd', 'create_anchors_3d_range',
    'depth_to_lidar_points', 'depth_to_points', 'get_frustum', 'iou_jit',
    'minmax_to_corner_2d', 'points_in_convex_polygon_3d_jit',
    'points_in_convex_polygon_jit', 'points_in_rbbox',
    'projection_matrix_to_CRT_kitti', 'rbbox2d_to_near_bbox',
    'remove_outside_points', 'rotation_points_single_angle', 'surface_equ_3d',
    'BboxOverlapsNearest3D', 'BboxOverlaps3D', 'bbox_overlaps_nearest_3d',
    'bbox_overlaps_3d', 'AxisAlignedBboxOverlaps3D',
    'axis_aligned_bbox_overlaps_3d', 'bbox3d_mapping_back', 'bbox3d2roi',
    'bbox3d2result'
]


================================================
FILE: bip3d/structures/ops/box_np_ops.py
================================================
# Copyright (c) OpenRobotLab. All rights reserved.
# TODO: clean the functions in this file and move the APIs into box bbox_3d
# in the future
# NOTICE: All functions in this file are valid for LiDAR or depth boxes only
# if we use default parameters.

import numba
import numpy as np

from bip3d.structures.bbox_3d import (limit_period, points_cam2img,
                                             rotation_3d_in_axis)


def camera_to_lidar(points, r_rect, velo2cam):
    """Convert points in camera coordinate to lidar coordinate.

    Note:
        This function is for KITTI only.

    Args:
        points (np.ndarray, shape=[N, 3]): Points in camera coordinate.
        r_rect (np.ndarray, shape=[4, 4]): Matrix to project points in
            specific camera coordinate (e.g. CAM2) to CAM0.
        velo2cam (np.ndarray, shape=[4, 4]): Matrix to project points in
            camera coordinate to lidar coordinate.

    Returns:
        np.ndarray, shape=[N, 3]: Points in lidar coordinate.
    """
    points_shape = list(points.shape[0:-1])
    if points.shape[-1] == 3:
        points = np.concatenate([points, np.ones(points_shape + [1])], axis=-1)
    lidar_points = points @ np.linalg.inv((r_rect @ velo2cam).T)
    return lidar_points[..., :3]


def box_camera_to_lidar(data, r_rect, velo2cam):
    """Convert boxes in camera coordinate to lidar coordinate.

    Note:
        This function is for KITTI only.

    Args:
        data (np.ndarray, shape=[N, 7]): Boxes in camera coordinate.
        r_rect (np.ndarray, shape=[4, 4]): Matrix to project points in
            specific camera coordinate (e.g. CAM2) to CAM0.
        velo2cam (np.ndarray, shape=[4, 4]): Matrix to project points in
            camera coordinate to lidar coordinate.

    Returns:
        np.ndarray, shape=[N, 3]: Boxes in lidar coordinate.
    """
    xyz = data[:, 0:3]
    x_size, y_size, z_size = data[:, 3:4], data[:, 4:5], data[:, 5:6]
    r = data[:, 6:7]
    xyz_lidar = camera_to_lidar(xyz, r_rect, velo2cam)
    # yaw and dims also needs to be converted
    r_new = -r - np.pi / 2
    r_new = limit_period(r_new, period=np.pi * 2)
    return np.concatenate([xyz_lidar, x_size, z_size, y_size, r_new], axis=1)


def corners_nd(dims, origin=0.5):
    """Generate relative box corners based on length per dim and origin point.

    Args:
        dims (np.ndarray, shape=[N, ndim]): Array of length per dim
        origin (list or array or float, optional): origin point relate to
            smallest point. Defaults to 0.5

    Returns:
        np.ndarray, shape=[N, 2 ** ndim, ndim]: Returned corners.
        point layout example: (2d) x0y0, x0y1, x1y0, x1y1;
            (3d) x0y0z0, x0y0z1, x0y1z0, x0y1z1, x1y0z0, x1y0z1, x1y1z0, x1y1z1
            where x0 < x1, y0 < y1, z0 < z1.
    """
    ndim = int(dims.shape[1])
    corners_norm = np.stack(np.unravel_index(np.arange(2**ndim), [2] * ndim),
                            axis=1).astype(dims.dtype)
    # now corners_norm has format: (2d) x0y0, x0y1, x1y0, x1y1
    # (3d) x0y0z0, x0y0z1, x0y1z0, x0y1z1, x1y0z0, x1y0z1, x1y1z0, x1y1z1
    # so need to convert to a format which is convenient to do other computing.
    # for 2d boxes, format is clockwise start with minimum point
    # for 3d boxes, please draw lines by your hand.
    if ndim == 2:
        # generate clockwise box corners
        corners_norm = corners_norm[[0, 1, 3, 2]]
    elif ndim == 3:
        corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]]
    corners_norm = corners_norm - np.array(origin, dtype=dims.dtype)
    corners = dims.reshape([-1, 1, ndim]) * corners_norm.reshape(
        [1, 2**ndim, ndim])
    return corners


def center_to_corner_box2d(centers, dims, angles=None, origin=0.5):
    """Convert kitti locations, dimensions and angles to corners.
    format: center(xy), dims(xy), angles(counterclockwise when positive)

    Args:
        centers (np.ndarray): Locations in kitti label file with shape (N, 2).
        dims (np.ndarray): Dimensions in kitti label file with shape (N, 2).
        angles (np.ndarray, optional): Rotation_y in kitti label file with
            shape (N). Defaults to None.
        origin (list or array or float, optional): origin point relate to
            smallest point. Defaults to 0.5.

    Returns:
        np.ndarray: Corners with the shape of (N, 4, 2).
    """
    # 'length' in kitti format is in x axis.
    # xyz(hwl)(kitti label file)<->xyz(lhw)(camera)<->z(-x)(-y)(wlh)(lidar)
    # center in kitti format is [0.5, 1.0, 0.5] in xyz.
    corners = corners_nd(dims, origin=origin)
    # corners: [N, 4, 2]
    if angles is not None:
        corners = rotation_3d_in_axis(corners, angles)
    corners += centers.reshape([-1, 1, 2])
    return corners


@numba.jit(nopython=True)
def depth_to_points(depth, trunc_pixel):
    """Convert depth map to points.

    Args:
        depth (np.array, shape=[H, W]): Depth map which
            the row of [0~`trunc_pixel`] are truncated.
        trunc_pixel (int): The number of truncated row.

    Returns:
        np.ndarray: Points in camera coordinates.
    """
    num_pts = np.sum(depth[trunc_pixel:, ] > 0.1)
    points = np.zeros((num_pts, 3), dtype=depth.dtype)
    x = np.array([0, 0, 1], dtype=depth.dtype)
    k = 0
    for i in range(trunc_pixel, depth.shape[0]):
        for j in range(depth.shape[1]):
            if depth[i, j] > 0.1:
                x = np.array([j, i, 1], dtype=depth.dtype)
                points[k] = x * depth[i, j]
                k += 1
    return points


def depth_to_lidar_points(depth, trunc_pixel, P2, r_rect, velo2cam):
    """Convert depth map to points in lidar coordinate.

    Args:
        depth (np.array, shape=[H, W]): Depth map which
            the row of [0~`trunc_pixel`] are truncated.
        trunc_pixel (int): The number of truncated row.
        P2 (p.array, shape=[4, 4]): Intrinsics of Camera2.
        r_rect (np.ndarray, shape=[4, 4]): Matrix to project points in
            specific camera coordinate (e.g. CAM2) to CAM0.
        velo2cam (np.ndarray, shape=[4, 4]): Matrix to project points in
            camera coordinate to lidar coordinate.

    Returns:
        np.ndarray: Points in lidar coordinates.
    """
    pts = depth_to_points(depth, trunc_pixel)
    points_shape = list(pts.shape[0:-1])
    points = np.concatenate([pts, np.ones(points_shape + [1])], axis=-1)
    points = points @ np.linalg.inv(P2.T)
    lidar_points = camera_to_lidar(points, r_rect, velo2cam)
    return lidar_points


def center_to_corner_box3d(centers,
                           dims,
                           angles=None,
                           origin=(0.5, 1.0, 0.5),
                           axis=1):
    """Convert kitti locations, dimensions and angles to corners.

    Args:
        centers (np.ndarray): Locations in kitti label file with shape (N, 3).
        dims (np.ndarray): Dimensions in kitti label file with shape (N, 3).
        angles (np.ndarray, optional): Rotation_y in kitti label file with
            shape (N). Defaults to None.
        origin (list or array or float, optional): Origin point relate to
            smallest point. Use (0.5, 1.0, 0.5) in camera and (0.5, 0.5, 0)
            in lidar. Defaults to (0.5, 1.0, 0.5).
        axis (int, optional): Rotation axis. 1 for camera and 2 for lidar.
            Defaults to 1.

    Returns:
        np.ndarray: Corners with the shape of (N, 8, 3).
    """
    # 'length' in kitti format is in x axis.
    # yzx(hwl)(kitti label file)<->xyz(lhw)(camera)<->z(-x)(-y)(lwh)(lidar)
    # center in kitti format is [0.5, 1.0, 0.5] in xyz.
    corners = corners_nd(dims, origin=origin)
    # corners: [N, 8, 3]
    if angles is not None:
        corners = rotation_3d_in_axis(corners, angles, axis=axis)
    corners += centers.reshape([-1, 1, 3])
    return corners


@numba.jit(nopython=True)
def box2d_to_corner_jit(boxes):
    """Convert box2d to corner.

    Args:
        boxes (np.ndarray, shape=[N, 5]): Boxes2d with rotation.

    Returns:
        box_corners (np.ndarray, shape=[N, 4, 2]): Box corners.
    """
    num_box = boxes.shape[0]
    corners_norm = np.zeros((4, 2), dtype=boxes.dtype)
    corners_norm[1, 1] = 1.0
    corners_norm[2] = 1.0
    corners_norm[3, 0] = 1.0
    corners_norm -= np.array([0.5, 0.5], dtype=boxes.dtype)
    corners = boxes.reshape(num_box, 1, 5)[:, :, 2:4] * corners_norm.reshape(
        1, 4, 2)
    rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype)
    box_corners = np.zeros((num_box, 4, 2), dtype=boxes.dtype)
    for i in range(num_box):
        rot_sin = np.sin(boxes[i, -1])
        rot_cos = np.cos(boxes[i, -1])
        rot_mat_T[0, 0] = rot_cos
        rot_mat_T[0, 1] = rot_sin
        rot_mat_T[1, 0] = -rot_sin
        rot_mat_T[1, 1] = rot_cos
        box_corners[i] = corners[i] @ rot_mat_T + boxes[i, :2]
    return box_corners


@numba.njit
def corner_to_standup_nd_jit(boxes_corner):
    """Convert boxes_corner to aligned (min-max) boxes.

    Args:
        boxes_corner (np.ndarray, shape=[N, 2**dim, dim]): Boxes corners.

    Returns:
        np.ndarray, shape=[N, dim*2]: Aligned (min-max) boxes.
    """
    num_boxes = boxes_corner.shape[0]
    ndim = boxes_corner.shape[-1]
    result = np.zeros((num_boxes, ndim * 2), dtype=boxes_corner.dtype)
    for i in range(num_boxes):
        for j in range(ndim):
            result[i, j] = np.min(boxes_corner[i, :, j])
        for j in range(ndim):
            result[i, j + ndim] = np.max(boxes_corner[i, :, j])
    return result


@numba.jit(nopython=True)
def corner_to_surfaces_3d_jit(corners):
    """Convert 3d box corners from corner function above to surfaces that
    normal vectors all direct to internal.

    Args:
        corners (np.ndarray): 3d box corners with the shape of (N, 8, 3).

    Returns:
        np.ndarray: Surfaces with the shape of (N, 6, 4, 3).
    """
    # box_corners: [N, 8, 3], must from corner functions in this module
    num_boxes = corners.shape[0]
    surfaces = np.zeros((num_boxes, 6, 4, 3), dtype=corners.dtype)
    corner_idxes = np.array([
        0, 1, 2, 3, 7, 6, 5, 4, 0, 3, 7, 4, 1, 5, 6, 2, 0, 4, 5, 1, 3, 2, 6, 7
    ]).reshape(6, 4)
    for i in range(num_boxes):
        for j in range(6):
            for k in range(4):
                surfaces[i, j, k] = corners[i, corner_idxes[j, k]]
    return surfaces


def rotation_points_single_angle(points, angle, axis=0):
    """Rotate points with a single angle.

    Args:
        points (np.ndarray, shape=[N, 3]]):
        angle (np.ndarray, shape=[1]]):
        axis (int, optional): Axis to rotate at. Defaults to 0.

    Returns:
        np.ndarray: Rotated points.
    """
    # points: [N, 3]
    rot_sin = np.sin(angle)
    rot_cos = np.cos(angle)
    if axis == 1:
        rot_mat_T = np.array(
            [[rot_cos, 0, rot_sin], [0, 1, 0], [-rot_sin, 0, rot_cos]],
            dtype=points.dtype)
    elif axis == 2 or axis == -1:
        rot_mat_T = np.array(
            [[rot_cos, rot_sin, 0], [-rot_sin, rot_cos, 0], [0, 0, 1]],
            dtype=points.dtype)
    elif axis == 0:
        rot_mat_T = np.array(
            [[1, 0, 0], [0, rot_cos, rot_sin], [0, -rot_sin, rot_cos]],
            dtype=points.dtype)
    else:
        raise ValueError('axis should in range')

    return points @ rot_mat_T, rot_mat_T


def box3d_to_bbox(box3d, P2):
    """Convert box3d in camera coordinates to bbox in image coordinates.

    Args:
        box3d (np.ndarray, shape=[N, 7]): Boxes in camera coordinate.
        P2 (np.array, shape=[4, 4]): Intrinsics of Camera2.

    Returns:
        np.ndarray, shape=[N, 4]: Boxes 2d in image coordinates.
    """
    box_corners = center_to_corner_box3d(box3d[:, :3],
                                         box3d[:, 3:6],
                                         box3d[:, 6], [0.5, 1.0, 0.5],
                                         axis=1)
    box_corners_in_image = points_cam2img(box_corners, P2)
    # box_corners_in_image: [N, 8, 2]
    minxy = np.min(box_corners_in_image, axis=1)
    maxxy = np.max(box_corners_in_image, axis=1)
    bbox = np.concatenate([minxy, maxxy], axis=1)
    return bbox


def corner_to_surfaces_3d(corners):
    """convert 3d box corners from corner function above to surfaces that
    normal vectors all direct to internal.

    Args:
        corners (np.ndarray): 3D box corners with shape of (N, 8, 3).

    Returns:
        np.ndarray: Surfaces with the shape of (N, 6, 4, 3).
    """
    # box_corners: [N, 8, 3], must from corner functions in this module
    surfaces = np.array([
        [corners[:, 0], corners[:, 1], corners[:, 2], corners[:, 3]],
        [corners[:, 7], corners[:, 6], corners[:, 5], corners[:, 4]],
        [corners[:, 0], corners[:, 3], corners[:, 7], corners[:, 4]],
        [corners[:, 1], corners[:, 5], corners[:, 6], corners[:, 2]],
        [corners[:, 0], corners[:, 4], corners[:, 5], corners[:, 1]],
        [corners[:, 3], corners[:, 2], corners[:, 6], corners[:, 7]],
    ]).transpose([2, 0, 1, 3])
    return surfaces


def points_in_rbbox(points, rbbox, z_axis=2, origin=(0.5, 0.5, 0)):
    """Check points in rotated bbox and return indices.

    Note:
        This function is for counterclockwise boxes.

    Args:
        points (np.ndarray, shape=[N, 3+dim]): Points to query.
        rbbox (np.ndarray, shape=[M, 7]): Boxes3d with rotation.
        z_axis (int, optional): Indicate which axis is height.
            Defaults to 2.
        origin (tuple[int], optional): Indicate the position of
            box center. Defaults to (0.5, 0.5, 0).

    Returns:
        np.ndarray, shape=[N, M]: Indices of points in each box.
    """
    # TODO: this function is different from PointCloud3D, be careful
    # when start to use nuscene, check the input
    rbbox_corners = center_to_corner_box3d(rbbox[:, :3],
                                           rbbox[:, 3:6],
                                           rbbox[:, 6],
                                           origin=origin,
                                           axis=z_axis)
    surfaces = corner_to_surfaces_3d(rbbox_corners)
    indices = points_in_convex_polygon_3d_jit(points[:, :3], surfaces)
    return indices


def minmax_to_corner_2d(minmax_box):
    """Convert minmax box to corners2d.

    Args:
        minmax_box (np.ndarray, shape=[N, dims]): minmax boxes.

    Returns:
        np.ndarray: 2d corners of boxes
    """
    ndim = minmax_box.shape[-1] // 2
    center = minmax_box[..., :ndim]
    dims = minmax_box[..., ndim:] - center
    return center_to_corner_box2d(center, dims, origin=0.0)


def create_anchors_3d_range(feature_size,
                            anchor_range,
                            sizes=((3.9, 1.6, 1.56), ),
                            rotations=(0, np.pi / 2),
                            dtype=np.float32):
    """Create anchors 3d by range.

    Args:
        feature_size (list[float] | tuple[float]): Feature map size. It is
            either a list of a tuple of [D, H, W](in order of z, y, and x).
        anchor_range (torch.Tensor | list[float]): Range of anchors with
            shape [6]. The order is consistent with that of anchors, i.e.,
            (x_min, y_min, z_min, x_max, y_max, z_max).
        sizes (list[list] | np.ndarray | torch.Tensor, optional):
            Anchor size with shape [N, 3], in order of x, y, z.
            Defaults to ((3.9, 1.6, 1.56), ).
        rotations (list[float] | np.ndarray | torch.Tensor, optional):
            Rotations of anchors in a single feature grid.
            Defaults to (0, np.pi / 2).
        dtype (type, optional): Data type. Defaults to np.float32.

    Returns:
        np.ndarray: Range based anchors with shape of
            (*feature_size, num_sizes, num_rots, 7).
    """
    anchor_range = np.array(anchor_range, dtype)
    z_centers = np.linspace(anchor_range[2],
                            anchor_range[5],
                            feature_size[0],
                            dtype=dtype)
    y_centers = np.linspace(anchor_range[1],
                            anchor_range[4],
                            feature_size[1],
                            dtype=dtype)
    x_centers = np.linspace(anchor_range[0],
                            anchor_range[3],
                            feature_size[2],
                            dtype=dtype)
    sizes = np.reshape(np.array(sizes, dtype=dtype), [-1, 3])
    rotations = np.array(rotations, dtype=dtype)
    rets = np.meshgrid(x_centers,
                       y_centers,
                       z_centers,
                       rotations,
                       indexing='ij')
    tile_shape = [1] * 5
    tile_shape[-2] = int(sizes.shape[0])
    for i in range(len(rets)):
        rets[i] = np.tile(rets[i][..., np.newaxis, :], tile_shape)
        rets[i] = rets[i][..., np.newaxis]  # for concat
    sizes = np.reshape(sizes, [1, 1, 1, -1, 1, 3])
    tile_size_shape = list(rets[0].shape)
    tile_size_shape[3] = 1
    sizes = np.tile(sizes, tile_size_shape)
    rets.insert(3, sizes)
    ret = np.concatenate(rets, axis=-1)
    return np.transpose(ret, [2, 1, 0, 3, 4, 5])


def center_to_minmax_2d(centers, dims, origin=0.5):
    """Center to minmax.

    Args:
        centers (np.ndarray): Center points.
        dims (np.ndarray): Dimensions.
        origin (list or array or float, optional): Origin point relate
            to smallest point. Defaults to 0.5.

    Returns:
        np.ndarray: Minmax points.
    """
    if origin == 0.5:
        return np.concatenate([centers - dims / 2, centers + dims / 2],
                              axis=-1)
    corners = center_to_corner_box2d(centers, dims, origin=origin)
    return corners[:, [0, 2]].reshape([-1, 4])


def rbbox2d_to_near_bbox(rbboxes):
    """convert rotated bbox to nearest 'standing' or 'lying' bbox.

    Args:
        rbboxes (np.ndarray): Rotated bboxes with shape of
            (N, 5(x, y, xdim, ydim, rad)).

    Returns:
        np.ndarray: Bounding boxes with the shape of
            (N, 4(xmin, ymin, xmax, ymax)).
    """
    rots = rbboxes[..., -1]
    rots_0_pi_div_2 = np.abs(limit_period(rots, 0.5, np.pi))
    cond = (rots_0_pi_div_2 > np.pi / 4)[..., np.newaxis]
    bboxes_center = np.where(cond, rbboxes[:, [0, 1, 3, 2]], rbboxes[:, :4])
    bboxes = center_to_minmax_2d(bboxes_center[:, :2], bboxes_center[:, 2:])
    return bboxes


@numba.jit(nopython=True)
def iou_jit(boxes, query_boxes, mode='iou', eps=0.0):
    """Calculate box iou. Note that jit version runs ~10x faster than the
    box_overlaps function in mmdet3d.core.evaluation.

    Note:
        This function is for counterclockwise boxes.

    Args:
        boxes (np.ndarray): Input bounding boxes with shape of (N, 4).
        query_boxes (np.ndarray): Query boxes with shape of (K, 4).
        mode (str, optional): IoU mode. Defaults to 'iou'.
        eps (float, optional): Value added to denominator. Defaults to 0.

    Returns:
        np.ndarray: Overlap between boxes and query_boxes
            with the shape of [N, K].
    """
    N = boxes.shape[0]
    K = query_boxes.shape[0]
    overlaps = np.zeros((N, K), dtype=boxes.dtype)
    for k in range(K):
        box_area = ((query_boxes[k, 2] - query_boxes[k, 0] + eps) *
                    (query_boxes[k, 3] - query_boxes[k, 1] + eps))
        for n in range(N):
            iw = (min(boxes[n, 2], query_boxes[k, 2]) -
                  max(boxes[n, 0], query_boxes[k, 0]) + eps)
            if iw > 0:
                ih = (min(boxes[n, 3], query_boxes[k, 3]) -
                      max(boxes[n, 1], query_boxes[k, 1]) + eps)
                if ih > 0:
                    if mode == 'iou':
                        ua = ((boxes[n, 2] - boxes[n, 0] + eps) *
                              (boxes[n, 3] - boxes[n, 1] + eps) + box_area -
                              iw * ih)
                    else:
                        ua = ((boxes[n, 2] - boxes[n, 0] + eps) *
                              (boxes[n, 3] - boxes[n, 1] + eps))
                    overlaps[n, k] = iw * ih / ua
    return overlaps


def projection_matrix_to_CRT_kitti(proj):
    """Split projection matrix of KITTI.

    Note:
        This function is for KITTI only.

    P = C @ [R|T]
    C is upper triangular matrix, so we need to inverse CR and use QR
    stable for all kitti camera projection matrix.

    Args:
        proj (p.array, shape=[4, 4]): Intrinsics of camera.

    Returns:
        tuple[np.ndarray]: Splited matrix of C, R and T.
    """

    CR = proj[0:3, 0:3]
    CT = proj[0:3, 3]
    RinvCinv = np.linalg.inv(CR)
    Rinv, Cinv = np.linalg.qr(RinvCinv)
    C = np.linalg.inv(Cinv)
    R = np.linalg.inv(Rinv)
    T = Cinv @ CT
    return C, R, T


def remove_outside_points(points, rect, Trv2c, P2, image_shape):
    """Remove points which are outside of image.

    Note:
        This function is for KITTI only.

    Args:
        points (np.ndarray, shape=[N, 3+dims]): Total points.
        rect (np.ndarray, shape=[4, 4]): Matrix to project points in
            specific camera coordinate (e.g. CAM2) to CAM0.
        Trv2c (np.ndarray, shape=[4, 4]): Matrix to project points in
            camera coordinate to lidar coordinate.
        P2 (p.array, shape=[4, 4]): Intrinsics of Camera2.
        image_shape (list[int]): Shape of image.

    Returns:
        np.ndarray, shape=[N, 3+dims]: Filtered points.
    """
    # 5x faster than remove_outside_points_v1(2ms vs 10ms)
    C, R, T = projection_matrix_to_CRT_kitti(P2)
    image_bbox = [0, 0, image_shape[1], image_shape[0]]
    frustum = get_frustum(image_bbox, C)
    frustum -= T
    frustum = np.linalg.inv(R) @ frustum.T
    frustum = camera_to_lidar(frustum.T, rect, Trv2c)
    frustum_surfaces = corner_to_surfaces_3d_jit(frustum[np.newaxis, ...])
    indices = points_in_convex_polygon_3d_jit(points[:, :3], frustum_surfaces)
    points = points[indices.reshape([-1])]
    return points


def get_frustum(bbox_image, C, near_clip=0.001, far_clip=100):
    """Get frustum corners in camera coordinates.

    Args:
        bbox_image (list[int]): box in image coordinates.
        C (np.ndarray): Intrinsics.
        near_clip (float, optional): Nearest distance of frustum.
            Defaults to 0.001.
        far_clip (float, optional): Farthest distance of frustum.
            Defaults to 100.

    Returns:
        np.ndarray, shape=[8, 3]: coordinates of frustum corners.
    """
    fku = C[0, 0]
    fkv = -C[1, 1]
    u0v0 = C[0:2, 2]
    z_points = np.array([near_clip] * 4 + [far_clip] * 4,
                        dtype=C.dtype)[:, np.newaxis]
    b = bbox_image
    box_corners = np.array(
        [[b[0], b[1]], [b[0], b[3]], [b[2], b[3]], [b[2], b[1]]],
        dtype=C.dtype)
    near_box_corners = (box_corners - u0v0) / np.array(
        [fku / near_clip, -fkv / near_clip], dtype=C.dtype)
    far_box_corners = (box_corners - u0v0) / np.array(
        [fku / far_clip, -fkv / far_clip], dtype=C.dtype)
    ret_xy = np.concatenate([near_box_corners, far_box_corners],
                            axis=0)  # [8, 2]
    ret_xyz = np.concatenate([ret_xy, z_points], axis=1)
    return ret_xyz


def surface_equ_3d(polygon_surfaces):
    """

    Args:
        polygon_surfaces (np.ndarray): Polygon surfaces with shape of
            [num_polygon, max_num_surfaces, max_num_points_of_surface, 3].
            All surfaces' normal vector must direct to internal.
            Max_num_points_of_surface must at least 3.

    Returns:
        tuple: normal vector and its direction.
    """
    # return [a, b, c], d in ax+by+cz+d=0
    # polygon_surfaces: [num_polygon, num_surfaces, num_points_of_polygon, 3]
    surface_vec = polygon_surfaces[:, :, :2, :] - \
        polygon_surfaces[:, :, 1:3, :]
    # normal_vec: [..., 3]
    normal_vec = np.cross(surface_vec[:, :, 0, :], surface_vec[:, :, 1, :])
    # print(normal_vec.shape, points[..., 0, :].shape)
    # d = -np.inner(normal_vec, points[..., 0, :])
    d = np.einsum('aij, aij->ai', normal_vec, polygon_surfaces[:, :, 0, :])
    return normal_vec, -d


@numba.njit
def _points_in_convex_polygon_3d_jit(points, polygon_surfaces, normal_vec, d,
                                     num_surfaces):
    """
    Args:
        points (np.ndarray): Input points with shape of (num_points, 3).
        polygon_surfaces (np.ndarray): Polygon surfaces with shape of
            (num_polygon, max_num_surfaces, max_num_points_of_surface, 3).
            All surfaces' normal vector must direct to internal.
            Max_num_points_of_surface must at least 3.
        normal_vec (np.ndarray): Normal vector of polygon_surfaces.
        d (int): Directions of normal vector.
        num_surfaces (np.ndarray): Number of surfaces a polygon contains
            shape of (num_polygon).

    Returns:
        np.ndarray: Result matrix with the shape of [num_points, num_polygon].
    """
    max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3]
    num_points = points.shape[0]
    num_polygons = polygon_surfaces.shape[0]
    ret = np.ones((num_points, num_polygons), dtype=np.bool_)
    sign = 0.0
    for i in range(num_points):
        for j in range(num_polygons):
            for k in range(max_num_surfaces):
                if k > num_surfaces[j]:
                    break
                sign = (points[i, 0] * normal_vec[j, k, 0] +
                        points[i, 1] * normal_vec[j, k, 1] +
                        points[i, 2] * normal_vec[j, k, 2] + d[j, k])
                if sign >= 0:
                    ret[i, j] = False
                    break
    return ret


def points_in_convex_polygon_3d_jit(points,
                                    polygon_surfaces,
                                    num_surfaces=None):
    """Check points is in 3d convex polygons.

    Args:
        points (np.ndarray): Input points with shape of (num_points, 3).
        polygon_surfaces (np.ndarray): Polygon surfaces with shape of
            (num_polygon, max_num_surfaces, max_num_points_of_surface, 3).
            All surfaces' normal vector must direct to internal.
            Max_num_points_of_surface must at least 3.
        num_surfaces (np.ndarray, optional): Number of surfaces a polygon
            contains shape of (num_polygon). Defaults to None.

    Returns:
        np.ndarray: Result matrix with the shape of [num_points, num_polygon].
    """
    max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3]
    # num_points = points.shape[0]
    num_polygons = polygon_surfaces.shape[0]
    if num_surfaces is None:
        num_surfaces = np.full((num_polygons, ), 9999999, dtype=np.int64)
    normal_vec, d = surface_equ_3d(polygon_surfaces[:, :, :3, :])
    # normal_vec: [num_polygon, max_num_surfaces, 3]
    # d: [num_polygon, max_num_surfaces]
    return _points_in_convex_polygon_3d_jit(points, polygon_surfaces,
                                            normal_vec, d, num_surfaces)


@numba.njit
def points_in_convex_polygon_jit(points, polygon, clockwise=False):
    """Check points is in 2d convex polygons. True when point in polygon.

    Args:
        points (np.ndarray): Input points with the shape of [num_points, 2].
        polygon (np.ndarray): Input polygon with the shape of
            [num_polygon, num_points_of_polygon, 2].
        clockwise (bool, optional): Indicate polygon is clockwise. Defaults
            to True.

    Returns:
        np.ndarray: Result matrix with the shape of [num_points, num_polygon].
    """
    # first convert polygon to directed lines
    num_points_of_polygon = polygon.shape[1]
    num_points = points.shape[0]
    num_polygons = polygon.shape[0]
    # vec for all the polygons
    if clockwise:
        vec1 = polygon - polygon[:,
                                 np.array([num_points_of_polygon - 1] +
                                          list(range(num_points_of_polygon -
                                                     1))), :]
    else:
        vec1 = polygon[:,
                       np.array([num_points_of_polygon - 1] +
                                list(range(num_points_of_polygon -
                                           1))), :] - polygon
    ret = np.zeros((num_points, num_polygons), dtype=np.bool_)
    success = True
    cross = 0.0
    for i in range(num_points):
        for j in range(num_polygons):
            success = True
            for k in range(num_points_of_polygon):
                vec = vec1[j, k]
                cross = vec[1] * (polygon[j, k, 0] - points[i, 0])
                cross -= vec[0] * (polygon[j, k, 1] - points[i, 1])
                if cross >= 0:
                    success = False
                    break
            ret[i, j] = success
    return ret


def boxes3d_to_corners3d_lidar(boxes3d, bottom_center=True):
    """Convert kitti center boxes to corners.

        7 -------- 4
       /|         /|
      6 -------- 5 .
      | |        | |
      . 3 -------- 0
      |/         |/
      2 -------- 1

    Note:
        This function is for LiDAR boxes only.

    Args:
        boxes3d (np.ndarray): Boxes with shape of (N, 7)
            [x, y, z, x_size, y_size, z_size, ry] in LiDAR coords,
            see the definition of ry in KITTI dataset.
        bottom_center (bool, optional): Whether z is on the bottom center
            of object. Defaults to True.

    Returns:
        np.ndarray: Box corners with the shape of [N, 8, 3].
    """
    boxes_num = boxes3d.shape[0]
    x_size, y_size, z_size = boxes3d[:, 3], boxes3d[:, 4], boxes3d[:, 5]
    x_corners = np.array([
        x_size / 2., -x_size / 2., -x_size / 2., x_size / 2., x_size / 2.,
        -x_size / 2., -x_size / 2., x_size / 2.
    ],
                         dtype=np.float32).T
    y_corners = np.array([
        -y_size / 2., -y_size / 2., y_size / 2., y_size / 2., -y_size / 2.,
        -y_size / 2., y_size / 2., y_size / 2.
    ],
                         dtype=np.float32).T
    if bottom_center:
        z_corners = np.zeros((boxes_num, 8), dtype=np.float32)
        z_corners[:, 4:8] = z_size.reshape(boxes_num,
                                           1).repeat(4, axis=1)  # (N, 8)
    else:
        z_corners = np.array([
            -z_size / 2., -z_size / 2., -z_size / 2., -z_size / 2.,
            z_size / 2., z_size / 2., z_size / 2., z_size / 2.
        ],
                             dtype=np.float32).T

    ry = boxes3d[:, 6]
    zeros, ones = np.zeros(ry.size,
                           dtype=np.float32), np.ones(ry.size,
                                                      dtype=np.float32)
    rot_list = np.array([[np.cos(ry), np.sin(ry), zeros],
                         [-np.sin(ry), np.cos(ry), zeros],
                         [zeros, zeros, ones]])  # (3, 3, N)
    R_list = np.transpose(rot_list, (2, 0, 1))  # (N, 3, 3)

    temp_corners = np.concatenate((x_corners.reshape(
        -1, 8, 1), y_corners.reshape(-1, 8, 1), z_corners.reshape(-1, 8, 1)),
                                  axis=2)  # (N, 8, 3)
    rotated_corners = np.matmul(temp_corners, R_list)  # (N, 8, 3)
    x_corners = rotated_corners[:, :, 0]
    y_corners = rotated_corners[:, :, 1]
    z_corners = rotated_corners[:, :, 2]

    x_loc, y_loc, z_loc = boxes3d[:, 0], boxes3d[:, 1], boxes3d[:, 2]

    x = x_loc.reshape(-1, 1) + x_corners.reshape(-1, 8)
    y = y_loc.reshape(-1, 1) + y_corners.reshape(-1, 8)
    z = z_loc.reshape(-1, 1) + z_corners.reshape(-1, 8)

    corners = np.concatenate(
        (x.reshape(-1, 8, 1), y.reshape(-1, 8, 1), z.reshape(-1, 8, 1)),
        axis=2)

    return corners.astype(np.float32)


================================================
FILE: bip3d/structures/ops/iou3d_calculator.py
================================================
# Copyright (c) OpenRobotLab. All rights reserved.
import torch
from mmdet.structures.bbox import bbox_overlaps

from bip3d.registry import TASK_UTILS
from bip3d.structures.bbox_3d import get_box_type


@TASK_UTILS.register_module()
class BboxOverlapsNearest3D(object):
    """Nearest 3D IoU Calculator.

    Note:
        This IoU calculator first finds the nearest 2D boxes in bird eye view
        (BEV), and then calculates the 2D IoU using :meth:`bbox_overlaps`.

    Args:
        coordinate (str): 'camera', 'lidar', or 'depth' coordinate system.
    """

    def __init__(self, coordinate='lidar'):
        assert coordinate in ['camera', 'lidar', 'depth']
        self.coordinate = coordinate

    def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False):
        """Calculate nearest 3D IoU.

        Note:
            If ``is_aligned`` is ``False``, then it calculates the ious between
            each bbox of bboxes1 and bboxes2, otherwise it calculates the ious
            between each aligned pair of bboxes1 and bboxes2.

        Args:
            bboxes1 (torch.Tensor): shape (N, 7+N)
                [x, y, z, x_size, y_size, z_size, ry, v].
            bboxes2 (torch.Tensor): shape (M, 7+N)
                [x, y, z, x_size, y_size, z_size, ry, v].
            mode (str): "iou" (intersection over union) or iof
                (intersection over foreground).
            is_aligned (bool): Whether the calculation is aligned.

        Return:
            torch.Tensor: If ``is_aligned`` is ``True``, return ious between
                bboxes1 and bboxes2 with shape (M, N). If ``is_aligned`` is
                ``False``, return shape is M.
        """
        return bbox_overlaps_nearest_3d(bboxes1, bboxes2, mode, is_aligned,
                                        self.coordinate)

    def __repr__(self):
        """str: Return a string that describes the module."""
        repr_str = self.__class__.__name__
        repr_str += f'(coordinate={self.coordinate}'
        return repr_str


@TASK_UTILS.register_module()
class BboxOverlaps3D(object):
    """3D IoU Calculator.

    Args:
        coordinate (str): The coordinate system, valid options are
            'camera', 'lidar', and 'depth'.
    """

    def __init__(self, coordinate):
        assert coordinate in ['camera', 'lidar', 'depth']
        self.coordinate = coordinate

    def __call__(self, bboxes1, bboxes2, mode='iou'):
        """Calculate 3D IoU using cuda implementation.

        Note:
            This function calculate the IoU of 3D boxes based on their volumes.
            IoU calculator ``:class:BboxOverlaps3D`` uses this function to
            calculate the actual 3D IoUs of boxes.

        Args:
            bboxes1 (torch.Tensor): with shape (N, 7+C),
                (x, y, z, x_size, y_size, z_size, ry, v*).
            bboxes2 (torch.Tensor): with shape (M, 7+C),
                (x, y, z, x_size, y_size, z_size, ry, v*).
            mode (str): "iou" (intersection over union) or
                iof (intersection over foreground).

        Return:
            torch.Tensor: Bbox overlaps results of bboxes1 and bboxes2
                with shape (M, N) (aligned mode is not supported currently).
        """
        return bbox_overlaps_3d(bboxes1, bboxes2, mode, self.coordinate)

    def __repr__(self):
        """str: return a string that describes the module"""
        repr_str = self.__class__.__name__
        repr_str += f'(coordinate={self.coordinate}'
        return repr_str


def bbox_overlaps_nearest_3d(bboxes1,
                             bboxes2,
                             mode='iou',
                             is_aligned=False,
                             coordinate='lidar'):
    """Calculate nearest 3D IoU.

    Note:
        This function first finds the nearest 2D boxes in bird eye view
        (BEV), and then calculates the 2D IoU using :meth:`bbox_overlaps`.
        This IoU calculator :class:`BboxOverlapsNearest3D` uses this
        function to calculate IoUs of boxes.

        If ``is_aligned`` is ``False``, then it calculates the ious between
        each bbox of bboxes1 and bboxes2, otherwise the ious between each
        aligned pair of bboxes1 and bboxes2.

    Args:
        bboxes1 (torch.Tensor): with shape (N, 7+C),
            (x, y, z, x_size, y_size, z_size, ry, v*).
        bboxes2 (torch.Tensor): with shape (M, 7+C),
            (x, y, z, x_size, y_size, z_size, ry, v*).
        mode (str): "iou" (intersection over union) or iof
            (intersection over foreground).
        is_aligned (bool): Whether the calculation is aligned

    Return:
        torch.Tensor: If ``is_aligned`` is ``True``, return ious between
            bboxes1 and bboxes2 with shape (M, N). If ``is_aligned`` is
            ``False``, return shape is M.
    """
    assert bboxes1.size(-1) == bboxes2.size(-1) >= 7

    box_type, _ = get_box_type(coordinate)

    bboxes1 = box_type(bboxes1, box_dim=bboxes1.shape[-1])
    bboxes2 = box_type(bboxes2, box_dim=bboxes2.shape[-1])

    # Change the bboxes to bev
    # box conversion and iou calculation in torch version on CUDA
    # is 10x faster than that in numpy version
    bboxes1_bev = bboxes1.nearest_bev
    bboxes2_bev = bboxes2.nearest_bev

    ret = bbox_overlaps(bboxes1_bev,
                        bboxes2_bev,
                        mode=mode,
                        is_aligned=is_aligned)
    return ret


def bbox_overlaps_3d(bboxes1, bboxes2, mode='iou', coordinate='camera'):
    """Calculate 3D IoU using cuda implementation.

    Note:
        This function calculates the IoU of 3D boxes based on their volumes.
        IoU calculator :class:`BboxOverlaps3D` uses this function to
        calculate the actual IoUs of boxes.

    Args:
        bboxes1 (torch.Tensor): with shape (N, 7+C),
            (x, y, z, x_size, y_size, z_size, ry, v*).
        bboxes2 (torch.Tensor): with shape (M, 7+C),
            (x, y, z, x_size, y_size, z_size, ry, v*).
        mode (str): "iou" (intersection over union) or
            iof (intersection over foreground).
        coordinate (str): 'camera' or 'lidar' coordinate system.

    Return:
        torch.Tensor: Bbox overlaps results of bboxes1 and bboxes2
            with shape (M, N) (aligned mode is not supported currently).
    """
    assert bboxes1.size(-1) == bboxes2.size(-1) >= 7

    box_type, _ = get_box_type(coordinate)

    bboxes1 = box_type(bboxes1, box_dim=bboxes1.shape[-1])
    bboxes2 = box_type(bboxes2, box_dim=bboxes2.shape[-1])

    return bboxes1.overlaps(bboxes1, bboxes2, mode=mode)


@TASK_UTILS.register_module()
class AxisAlignedBboxOverlaps3D(object):
    """Axis-aligned 3D Overlaps (IoU) Calculator."""

    def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False):
        """Calculate IoU between 2D bboxes.

        Args:
            bboxes1 (Tensor): shape (B, m, 6) in <x1, y1, z1, x2, y2, z2>
                format or empty.
            bboxes2 (Tensor): shape (B, n, 6) in <x1, y1, z1, x2, y2, z2>
                format or empty.
                B indicates the batch dim, in shape (B1, B2, ..., Bn).
                If ``is_aligned`` is ``True``, then m and n must be equal.
            mode (str): "iou" (intersection over union) or "giou" (generalized
                intersection over union).
            is_aligned (bool, optional): If True, then m and n must be equal.
                Defaults to False.
        Returns:
            Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,)
        """
        assert bboxes1.size(-1) == bboxes2.size(-1) == 6
        return axis_aligned_bbox_overlaps_3d(bboxes1, bboxes2, mode,
                                             is_aligned)

    def __repr__(self):
        """str: a string describing the module"""
        repr_str = self.__class__.__name__ + '()'
        return repr_str


def axis_aligned_bbox_overlaps_3d(bboxes1,
                                  bboxes2,
                                  mode='iou',
                                  is_aligned=False,
                                  eps=1e-6):
    """Calculate overlap between two set of axis aligned 3D bboxes. If
    ``is_aligned`` is ``False``, then calculate the overlaps between each bbox
    of bboxes1 and bboxes2, otherwise the overlaps between each aligned pair of
    bboxes1 and bboxes2.

    Args:
        bboxes1 (Tensor): shape (B, m, 6) in <x1, y1, z1, x2, y2, z2>
            format or empty.
        bboxes2 (Tensor): shape (B, n, 6) in <x1, y1, z1, x2, y2, z2>
            format or empty.
            B indicates the batch dim, in shape (B1, B2, ..., Bn).
            If ``is_aligned`` is ``True``, then m and n must be equal.
        mode (str): "iou" (intersection over union) or "giou" (generalized
            intersection over union).
        is_aligned (bool, optional): If True, then m and n must be equal.
            Defaults to False.
        eps (float, optional): A value added to the denominator for numerical
            stability. Defaults to 1e-6.

    Returns:
        Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,)

    Example:
        >>> bboxes1 = torch.FloatTensor([
        >>>     [0, 0, 0, 10, 10, 10],
        >>>     [10, 10, 10, 20, 20, 20],
        >>>     [32, 32, 32, 38, 40, 42],
        >>> ])
        >>> bboxes2 = torch.FloatTensor([
        >>>     [0, 0, 0, 10, 20, 20],
        >>>     [0, 10, 10, 10, 19, 20],
        >>>     [10, 10, 10, 20, 20, 20],
        >>> ])
        >>> overlaps = axis_aligned_bbox_overlaps_3d(bboxes1, bboxes2)
        >>> assert overlaps.shape == (3, 3)
        >>> overlaps = bbox_overlaps(bboxes1, bboxes2, is_aligned=True)
        >>> assert overlaps.shape == (3, )
    Example:
        >>> empty = torch.empty(0, 6)
        >>> nonempty = torch.FloatTensor([[0, 0, 0, 10, 9, 10]])
        >>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1)
        >>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0)
        >>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0)
    """

    assert mode in ['iou', 'giou'], f'Unsupported mode {mode}'
    # Either the boxes are empty or the length of boxes's last dimension is 6
    assert (bboxes1.size(-1) == 6 or bboxes1.size(0) == 0)
    assert (bboxes2.size(-1) == 6 or bboxes2.size(0) == 0)

    # Batch dim must be the same
    # Batch dim: (B1, B2, ... Bn)
    assert bboxes1.shape[:-2] == bboxes2.shape[:-2]
    batch_shape = bboxes1.shape[:-2]

    rows = bboxes1.size(-2)
    cols = bboxes2.size(-2)
    if is_aligned:
        assert rows == cols

    if rows * cols == 0:
        if is_aligned:
            return bboxes1.new(batch_shape + (rows, ))
        else:
            return bboxes1.new(batch_shape + (rows, cols))

    area1 = (bboxes1[..., 3] - bboxes1[..., 0]) * (
        bboxes1[..., 4] - bboxes1[..., 1]) * (bboxes1[..., 5] -
                                              bboxes1[..., 2])
    area2 = (bboxes2[..., 3] - bboxes2[..., 0]) * (
        bboxes2[..., 4] - bboxes2[..., 1]) * (bboxes2[..., 5] -
                                              bboxes2[..., 2])

    if is_aligned:
        lt = torch.max(bboxes1[..., :3], bboxes2[..., :3])  # [B, rows, 3]
        rb = torch.min(bboxes1[..., 3:], bboxes2[..., 3:])  # [B, rows, 3]

        wh = (rb - lt).clamp(min=0)  # [B, rows, 2]
        overlap = wh[..., 0] * wh[..., 1] * wh[..., 2]

        if mode in ['iou', 'giou']:
            union = area1 + area2 - overlap
        else:
            union = area1
        if mode == 'giou':
            enclosed_lt = torch.min(bboxes1[..., :3], bboxes2[..., :3])
            enclosed_rb = torch.max(bboxes1[..., 3:], bboxes2[..., 3:])
    else:
        lt = torch.max(bboxes1[..., :, None, :3],
                       bboxes2[..., None, :, :3])  # [B, rows, cols, 3]
        rb = torch.min(bboxes1[..., :, None, 3:],
                       bboxes2[..., None, :, 3:])  # [B, rows, cols, 3]

        wh = (rb - lt).clamp(min=0)  # [B, rows, cols, 3]
        overlap = wh[..., 0] * wh[..., 1] * wh[..., 2]

        if mode in ['iou', 'giou']:
            union = area1[..., None] + area2[..., None, :] - overlap
        if mode == 'giou':
            enclosed_lt = torch.min(bboxes1[..., :, None, :3],
                                    bboxes2[..., None, :, :3])
            enclosed_rb = torch.max(bboxes1[..., :, None, 3:],
                                    bboxes2[..., None, :, 3:])

    eps = union.new_tensor([eps])
    union = torch.max(union, eps)
    ious = overlap / union
    if mode in ['iou']:
        return ious
    # calculate gious
    enclose_wh = (enclosed_rb - enclosed_lt).clamp(min=0)
    enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1] * enclose_wh[..., 2]
    enclose_area = torch.max(enclose_area, eps)
    gious = ious - (enclose_area - union) / enclose_area
    return gious


================================================
FILE: bip3d/structures/ops/transforms.py
================================================
# Copyright (c) OpenRobotLab. All rights reserved.
import torch


def bbox3d_mapping_back(bboxes, scale_factor, flip_horizontal, flip_vertical):
    """Map bboxes from testing scale to original image scale.

    Args:
        bboxes (:obj:`BaseInstance3DBoxes`): Boxes to be mapped back.
        scale_factor (float): Scale factor.
        flip_horizontal (bool): Whether to flip horizontally.
        flip_vertical (bool): Whether to flip vertically.

    Returns:
        :obj:`BaseInstance3DBoxes`: Boxes mapped back.
    """
    new_bboxes = bboxes.clone()
    if flip_horizontal:
        new_bboxes.flip('horizontal')
    if flip_vertical:
        new_bboxes.flip('vertical')
    new_bboxes.scale(1 / scale_factor)

    return new_bboxes


def bbox3d2roi(bbox_list):
    """Convert a list of bounding boxes to roi format.

    Args:
        bbox_list (list[torch.Tensor]): A list of bounding boxes
            corresponding to a batch of images.

    Returns:
        torch.Tensor: Region of interests in shape (n, c), where
            the channels are in order of [batch_ind, x, y ...].
    """
    rois_list = []
    for img_id, bboxes in enumerate(bbox_list):
        if bboxes.size(0) > 0:
            img_inds = bboxes.new_full((bboxes.size(0), 1), img_id)
            rois = torch.cat([img_inds, bboxes], dim=-1)
        else:
            rois = torch.zeros_like(bboxes)
        rois_list.append(rois)
    rois = torch.cat(rois_list, 0)
    return rois


# TODO delete this
def bbox3d2result(bboxes, scores, labels, attrs=None):
    """Convert detection results to a list of numpy arrays.

    Args:
        bboxes (torch.Tensor): Bounding boxes with shape (N, 5).
        labels (torch.Tensor): Labels with shape (N, ).
        scores (torch.Tensor): Scores with shape (N, ).
        attrs (torch.Tensor, optional): Attributes with shape (N, ).
            Defaults to None.

    Returns:
        dict[str, torch.Tensor]: Bounding box results in cpu mode.

            - boxes_3d (torch.Tensor): 3D boxes.
            - scores (torch.Tensor): Prediction scores.
            - labels_3d (torch.Tensor): Box labels.
            - attrs_3d (torch.Tensor, optional): Box attributes.
    """
    result_dict = dict(bboxes_3d=bboxes.to('cpu'),
                       scores_3d=scores.cpu(),
                       labels_3d=labels.cpu())

    if attrs is not None:
        result_dict['attr_labels'] = attrs.cpu()

    return result_dict


================================================
FILE: bip3d/structures/points/__init__.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from .base_points import BasePoints
from .cam_points import CameraPoints
from .depth_points import DepthPoints
from .lidar_points import LiDARPoints

__all__ = ['BasePoints', 'CameraPoints', 'DepthPoints', 'LiDARPoints']


def get_points_type(points_type: str) -> type:
    """Get the class of points according to coordinate type.

    Args:
        points_type (str): The type of points coordinate. The valid value are
            "CAMERA", "LIDAR" and "DEPTH".

    Returns:
        type: Points type.
    """
    points_type_upper = points_type.upper()
    if points_type_upper == 'CAMERA':
        points_cls = CameraPoints
    elif points_type_upper == 'LIDAR':
        points_cls = LiDARPoints
    elif points_type_upper == 'DEPTH':
        points_cls = DepthPoints
    else:
        raise ValueError('Only "points_type" of "CAMERA", "LIDAR" and "DEPTH" '
                         f'are supported, got {points_type}')

    return points_cls


================================================
FILE: bip3d/structures/points/base_points.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import warnings
from abc import abstractmethod
from typing import Iterator, Optional, Sequence, Union

import numpy as np
import torch
from torch import Tensor

from bip3d.structures.bbox_3d.utils import (rotation_3d_in_axis,
                                                   rotation_3d_in_euler)


class BasePoints:
    """Base class for Points.

    Args:
        tensor (Tensor or np.ndarray or Sequence[Sequence[float]]): The points
            data with shape (N, points_dim).
        points_dim (int): Integer indicating the dimension of a point. Each row
            is (x, y, z, ...). Defaults to 3.
        attribute_dims (dict, optional): Dictionary to indicate the meaning of
            extra dimension. Defaults to None.

    Attributes:
        tensor (Tensor): Float matrix with shape (N, points_dim).
        points_dim (int): Integer indicating the dimension of a point. Each row
            is (x, y, z, ...).
        attribute_dims (dict, optional): Dictionary to indicate the meaning of
            extra dimension. Defaults to None.
        rotation_axis (int): Default rotation axis for points rotation.
    """

    def __init__(self,
                 tensor: Union[Tensor, np.ndarray, Sequence[Sequence[float]]],
                 points_dim: int = 3,
                 attribute_dims: Optional[dict] = None) -> None:
        if isinstance(tensor, Tensor):
            device = tensor.device
        else:
            device = torch.device('cpu')
        tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
        if tensor.numel() == 0:
            # Use reshape, so we don't end up creating a new tensor that does
            # not depend on the inputs (and consequently confuses jit)
            tensor = tensor.reshape((-1, points_dim))
        assert tensor.dim() == 2 and tensor.size(-1) == points_dim, \
            ('The points dimension must be 2 and the length of the last '
             f'dimension must be {points_dim}, but got points with shape '
             f'{tensor.shape}.')

        self.tensor = tensor.clone()
        self.points_dim = points_dim
        self.attribute_dims = attribute_dims
        self.rotation_axis = 0

    @property
    def coord(self) -> Tensor:
        """Tensor: Coordinates of each point in shape (N, 3)."""
        return self.tensor[:, :3]

    @coord.setter
    def coord(self, tensor: Union[Tensor, np.ndarray]) -> None:
        """Set the coordinates of each point.

        Args:
            tensor (Tensor or np.ndarray): Coordinates of each point with shape
                (N, 3).
        """
        try:
            tensor = tensor.reshape(self.shape[0], 3)
        except (RuntimeError, ValueError):  # for torch.Tensor and np.ndarray
            raise ValueError(f'got unexpected shape {tensor.shape}')
        if not isinstance(tensor, Tensor):
            tensor = self.tensor.new_tensor(tensor)
        self.tensor[:, :3] = tensor

    @property
    def height(self) -> Union[Tensor, None]:
        """Tensor or None: Returns a vector with height of each point in shape
        (N, )."""
        if self.attribute_dims is not None and \
                'height' in self.attribute_dims.keys():
            return self.tensor[:, self.attribute_dims['height']]
        else:
            return None

    @height.setter
    def height(self, tensor: Union[Tensor, np.ndarray]) -> None:
        """Set the height of each point.

        Args:
            tensor (Tensor or np.ndarray): Height of each point with shape
                (N, ).
        """
        try:
            tensor = tensor.reshape(self.shape[0])
        except (RuntimeError, ValueError):  # for torch.Tensor and np.ndarray
            raise ValueError(f'got unexpected shape {tensor.shape}')
        if not isinstance(tensor, Tensor):
            tensor = self.tensor.new_tensor(tensor)
        if self.attribute_dims is not None and \
                'height' in self.attribute_dims.keys():
            self.tensor[:, self.attribute_dims['height']] = tensor
        else:
            # add height attribute
            if self.attribute_dims is None:
                self.attribute_dims = dict()
            attr_dim = self.shape[1]
            self.tensor = torch.cat([self.tensor, tensor.unsqueeze(1)], dim=1)
            self.attribute_dims.update(dict(height=attr_dim))
            self.points_dim += 1

    @property
    def color(self) -> Union[Tensor, None]:
        """Tensor or None: Returns a vector with color of each point in shape
        (N, 3)."""
        if self.attribute_dims is not None and \
                'color' in self.attribute_dims.keys():
            return self.tensor[:, self.attribute_dims['color']]
        else:
            return None

    @color.setter
    def color(self, tensor: Union[Tensor, np.ndarray]) -> None:
        """Set the color of each point.

        Args:
            tensor (Tensor or np.ndarray): Color of each point with shape
                (N, 3).
        """
        try:
            tensor = tensor.reshape(self.shape[0], 3)
        except (RuntimeError, ValueError):  # for torch.Tensor and np.ndarray
            raise ValueError(f'got unexpected shape {tensor.shape}')
        if tensor.max() >= 256 or tensor.min() < 0:
            warnings.warn('point got color value beyond [0, 255]')
        if not isinstance(tensor, Tensor):
            tensor = self.tensor.new_tensor(tensor)
        if self.attribute_dims is not None and \
                'color' in self.attribute_dims.keys():
            self.tensor[:, self.attribute_dims['color']] = tensor
        else:
            # add color attribute
            if self.attribute_dims is None:
                self.attribute_dims = dict()
            attr_dim = self.shape[1]
            self.tensor = torch.cat([self.tensor, tensor], dim=1)
            self.attribute_dims.update(
                dict(color=[attr_dim, attr_dim + 1, attr_dim + 2]))
            self.points_dim += 3

    @property
    def shape(self) -> torch.Size:
        """torch.Size: Shape of points."""
        return self.tensor.shape

    def shuffle(self) -> Tensor:
        """Shuffle the points.

        Returns:
            Tensor: The shuffled index.
        """
        idx = torch.randperm(self.__len__(), device=self.tensor.device)
        self.tensor = self.tensor[idx]
        return idx

    def rotate(self,
               rotation: Union[Tensor, np.ndarray, float],
               axis: Optional[int] = None) -> Tensor:
        """Rotate points with the given rotation matrix or angle.

        Args:
            rotation (Tensor or np.ndarray or float): Rotation matrix or angle.
            axis (int, optional): Axis to rotate at. Defaults to None.

        Returns:
            Tensor: Rotation matrix.
        """
        if not isinstance(rotation, Tensor):
            rotation = self.tensor.new_tensor(rotation)
        assert rotation.shape == torch.Size([3, 3]) or rotation.numel() == 1, \
            f'invalid rotation shape {rotation.shape}'

        if axis is None:
            axis = self.rotation_axis

        if rotation.numel() == 1:
            rotated_points, rot_mat_T = rotation_3d_in_axis(
                self.tensor[:, :3][None], rotation, axis=axis, return_mat=True)
            self.tensor[:, :3] = rotated_points.squeeze(0)
            rot_mat_T = rot_mat_T.squeeze(0)
        elif rotation.numel() == 3:
            rotated_points, rot_mat_T = rotation_3d_in_euler(
                self.tensor[:, :3][None], rotation, return_mat=True)
            self.tensor[:, :3] = rotated_points.squeeze(0)
            rot_mat_T = rot_mat_T.squeeze(0)
        else:
            # rotation.numel() == 9
            self.tensor[:, :3] = self.tensor[:, :3] @ rotation
            rot_mat_T = rotation

        return rot_mat_T

    @abstractmethod
    def flip(self, bev_direction: str = 'horizontal') -> None:
        """Flip the points along given BEV direction.

        Args:
            bev_direction (str): Flip direction (horizontal or vertical).
                Defaults to 'horizontal'.
        """
        pass

    def translate(self, trans_vector: Union[Tensor, np.ndarray]) -> None:
        """Translate points with the given translation vector.

        Args:
            trans_vector (Tensor or np.ndarray): Translation vector of size 3
                or nx3.
        """
        if not isinstance(trans_vector, Tensor):
            trans_vector = self.tensor.new_tensor(trans_vector)
        trans_vector = trans_vector.squeeze(0)
        if trans_vector.dim() == 1:
            assert trans_vector.shape[0] == 3
        elif trans_vector.dim() == 2:
            assert trans_vector.shape[0] == self.tensor.shape[0] and \
                trans_vector.shape[1] == 3
        else:
            raise NotImplementedError(
                f'Unsupported translation vector of shape {trans_vector.shape}'
            )
        self.tensor[:, :3] += trans_vector

    def in_range_3d(
            self, point_range: Union[Tensor, np.ndarray,
                                     Sequence[float]]) -> Tensor:
        """Check whether the points are in the given range.

        Args:
            point_range (Tensor or np.ndarray or Sequence[float]): The range of
                point (x_min, y_min, z_min, x_max, y_max, z_max).

        Note:
            In the original implementation of SECOND, checking whether a box in
            the range checks whether the points are in a convex polygon, we try
            to reduce the burden for simpler cases.

        Returns:
            Tensor: A binary vector indicating whether each point is inside the
            reference range.
        """
        in_range_flags = ((self.tensor[:, 0] > point_range[0])
                          & (self.tensor[:, 1] > point_range[1])
                          & (self.tensor[:, 2] > point_range[2])
                          & (self.tensor[:, 0] < point_range[3])
                          & (self.tensor[:, 1] < point_range[4])
                          & (self.tensor[:, 2] < point_range[5]))
        return in_range_flags

    @property
    def bev(self) -> Tensor:
        """Tensor: BEV of the points in shape (N, 2)."""
        return self.tensor[:, [0, 1]]

    def in_range_bev(
            self, point_range: Union[Tensor, np.ndarray,
                                     Sequence[float]]) -> Tensor:
        """Check whether the points are in the given range.

        Args:
            point_range (Tensor or np.ndarray or Sequence[float]): The range of
                point in order of (x_min, y_min, x_max, y_max).

        Returns:
            Tensor: A binary vector indicating whether each point is inside the
            reference range.
        """
        in_range_flags = ((self.bev[:, 0] > point_range[0])
                          & (self.bev[:, 1] > point_range[1])
                          & (self.bev[:, 0] < point_range[2])
                          & (self.bev[:, 1] < point_range[3]))
        return in_range_flags

    @abstractmethod
    def convert_to(self,
                   dst: int,
                   rt_mat: Optional[Union[Tensor,
                                          np.ndarray]] = None) -> 'BasePoints':
        """Convert self to ``dst`` mode.

        Args:
            dst (int): The target Point mode.
            rt_mat (Tensor or np.ndarray, optional): The rotation and
                translation matrix between different coordinates.
                Defaults to None. The conversion from ``src`` coordinates to
                ``dst`` coordinates usually comes along the change of sensors,
                e.g., from camera to LiDAR. This requires a transformation
                matrix.

        Returns:
            :obj:`BasePoints`: The converted point of the same type in the
            ``dst`` mode.
        """
        pass

    def scale(self, scale_factor: float) -> None:
        """Scale the points with horizontal and vertical scaling factors.

        Args:
            scale_factors (float): Scale factors to scale the points.
        """
        self.tensor[:, :3] *= scale_factor

    def __getitem__(
            self, item: Union[int, tuple, slice, np.ndarray,
                              Tensor]) -> 'BasePoints':
        """
        Args:
            item (int or tuple or slice or np.ndarray or Tensor): Index of
                points.

        Note:
            The following usage are allowed:

            1. `new_points = points[3]`: Return a `Points` that contains only
               one point.
            2. `new_points = points[2:10]`: Return a slice of points.
            3. `new_points = points[vector]`: Whether vector is a
               torch.BoolTensor with `length = len(points)`. Nonzero elements
               in the vector will be selected.
            4. `new_points = points[3:11, vector]`: Return a slice of points
               and attribute dims.
            5. `new_points = points[4:12, 2]`: Return a slice of points with
               single attribute.

            Note that the returned Points might share storage with this Points,
            subject to PyTorch's indexing semantics.

        Returns:
            :obj:`BasePoints`: A new object of :class:`BasePoints` after
            indexing.
        """
        original_type = type(self)
        if isinstance(item, int):
            return original_type(self.tensor[item].view(1, -1),
                                 points_dim=self.points_dim,
                                 attribute_dims=self.attribute_dims)
        elif isinstance(item, tuple) and len(item) == 2:
            if isinstance(item[1], slice):
                start = 0 if item[1].start is None else item[1].start
                stop = self.tensor.shape[1] \
                    if item[1].stop is None else item[1].stop
                step = 1 if item[1].step is None else item[1].step
                item = list(item)
                item[1] = list(range(start, stop, step))
                item = tuple(item)
            elif isinstance(item[1], int):
                item = list(item)
                item[1] = [item[1]]
                item = tuple(item)
            p = self.tensor[item[0], item[1]]

            keep_dims = list(
                set(item[1]).intersection(set(range(3, self.tensor.shape[1]))))
            if self.attribute_dims is not None:
                attribute_dims = self.attribute_dims.copy()
                for key in self.attribute_dims.keys():
                    cur_attribute_dims = attribute_dims[key]
                    if isinstance(cur_attribute_dims, int):
                        cur_attribute_dims = [cur_attribute_dims]
                    intersect_attr = list(
                        set(cur_attribute_dims).intersection(set(keep_dims)))
                    if len(intersect_attr) == 1:
                        attribute_dims[key] = intersect_attr[0]
                    elif len(intersect_attr) > 1:
                        attribute_dims[key] = intersect_attr
                    else:
                        attribute_dims.pop(key)
            else:
                attribute_dims = None
        elif isinstance(item, (slice, np.ndarray, Tensor)):
            p = self.tensor[item]
            attribute_dims = self.attribute_dims
        else:
            raise NotImplementedError(f'Invalid slice {item}!')

        assert p.dim() == 2, \
            f'Indexing on Points with {item} failed to return a matrix!'
        return original_type(p,
                             points_dim=p.shape[1],
                             attribute_dims=attribute_dims)

    def __len__(self) -> int:
        """int: Number of points in the current object."""
        return self.tensor.shape[0]

    def __repr__(self) -> str:
        """str: Return a string that describes the object."""
        return self.__class__.__name__ + '(\n    ' + str(self.tensor) + ')'

    @classmethod
    def cat(cls, points_list: Sequence['BasePoints']) -> 'BasePoints':
        """Concatenate a list of Points into a single Points.

        Args:
            points_list (Sequence[:obj:`BasePoints`]): List of points.

        Returns:
            :obj:`BasePoints`: The concatenated points.
        """
        assert isinstance(points_list, (list, tuple))
        if len(points_list) == 0:
            return cls(torch.empty(0))
        assert all(isinstance(points, cls) for points in points_list)

        # use torch.cat (v.s. layers.cat)
        # so the returned points never share storage with input
        cat_points = cls(torch.cat([p.tensor for p in points_list], dim=0),
                         points_dim=points_list[0].points_dim,
                         attribute_dims=points_list[0].attribute_dims)
        return cat_points

    def numpy(self) -> np.ndarray:
        """Reload ``numpy`` from self.tensor."""
        return self.tensor.numpy()

    def to(self, device: Union[str, torch.device], *args,
           **kwargs) -> 'BasePoints':
        """Convert current points to a specific device.

        Args:
            device (str or :obj:`torch.device`): The name of the device.

        Returns:
            :obj:`BasePoints`: A new points object on the specific device.
        """
        original_type = type(self)
        return original_type(self.tensor.to(device, *args, **kwargs),
                             points_dim=self.points_dim,
                             attribute_dims=self.attribute_dims)

    def cpu(self) -> 'BasePoints':
        """Convert current points to cpu device.

        Returns:
            :obj:`BasePoints`: A new points object on the cpu device.
        """
        original_type = type(self)
        return original_type(self.tensor.cpu(),
                             points_dim=self.points_dim,
                             attribute_dims=self.attribute_dims)

    def cuda(self, *args, **kwargs) -> 'BasePoints':
        """Convert current points to cuda device.

        Returns:
            :obj:`BasePoints`: A new points object on the cuda device.
        """
        original_type = type(self)
        return original_type(self.tensor.cuda(*args, **kwargs),
                             points_dim=self.points_dim,
                             attribute_dims=self.attribute_dims)

    def clone(self) -> 'BasePoints':
        """Clone the points.

        Returns:
            :obj:`BasePoints`: Point object with the same properties as self.
        """
        original_type = type(self)
        return original_type(self.tensor.clone(),
                             points_dim=self.points_dim,
                             attribute_dims=self.attribute_dims)

    def detach(self) -> 'BasePoints':
        """Detach the points.

        Returns:
            :obj:`BasePoints`: Point object with the same properties as self.
        """
        original_type = type(self)
        return original_type(self.tensor.detach(),
                             points_dim=self.points_dim,
                             attribute_dims=self.attribute_dims)

    @property
    def device(self) -> torch.device:
        """torch.device: The device of the points are on."""
        return self.tensor.device

    def __iter__(self) -> Iterator[Tensor]:
        """Yield a point as a Tensor at a time.

        Returns:
            Iterator[Tensor]: A point of shape (points_dim, ).
        """
        yield from self.tensor

    def new_point(
        self, data: Union[Tensor, np.ndarray, Sequence[Sequence[float]]]
    ) -> 'BasePoints':
        """Create a new point object with data.

        The new point and its tensor has the similar properties as self and
        self.tensor, respectively.

        Args:
            data (Tensor or np.ndarray or Sequence[Sequence[float]]): Data to
                be copied.

        Returns:
            :obj:`BasePoints`: A new point object with ``data``, the object's
            other properties are similar to ``self``.
        """
        new_tensor = self.tensor.new_tensor(data) \
            if not isinstance(data, Tensor) else data.to(self.device)
        original_type = type(self)
        return original_type(new_tensor,
                             points_dim=self.points_dim,
                             attribute_dims=self.attribute_dims)


================================================
FILE: bip3d/structures/points/cam_points.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Optional, Sequence, Union

import numpy as np
from torch import Tensor

from .base_points import BasePoints


class CameraPoints(BasePoints):
    """Points of instances in CAM coordinates.

    Args:
        tensor (Tensor or np.ndarray or Sequence[Sequence[float]]): The points
            data with shape (N, points_dim).
        points_dim (int): Integer indicating the dimension of a point. Each row
            is (x, y, z, ...). Defaults to 3.
        attribute_dims (dict, optional): Dictionary to indicate the meaning of
            extra dimension. Defaults to None.

    Attributes:
        tensor (Tensor): Float matrix with shape (N, points_dim).
        points_dim (int): Integer indicating the dimension of a point. Each row
            is (x, y, z, ...).
        attribute_dims (dict, optional): Dictionary to indicate the meaning of
            extra dimension. Defaults to None.
        rotation_axis (int): Default rotation axis for points rotation.
    """

    def __init__(self,
                 tensor: Union[Tensor, np.ndarray, Sequence[Sequence[float]]],
                 points_dim: int = 3,
                 attribute_dims: Optional[dict] = None) -> None:
        super(CameraPoints, self).__init__(tensor,
                                           points_dim=points_dim,
                                           attribute_dims=attribute_dims)
        self.rotation_axis = 1

    def flip(self, bev_direction: str = 'horizontal') -> None:
        """Flip the points along given BEV direction.

        Args:
            bev_direction (str): Flip direction (horizontal or vertical).
                Defaults to 'horizontal'.
        """
        assert bev_direction in ('horizontal', 'vertical')
        if bev_direction == 'horizontal':
            self.tensor[:, 0] = -self.tensor[:, 0]
        elif bev_direction == 'vertical':
            self.tensor[:, 2] = -self.tensor[:, 2]

    @property
    def bev(self) -> Tensor:
        """Tensor: BEV of the points in shape (N, 2)."""
        return self.tensor[:, [0, 2]]

    def convert_to(self,
                   dst: int,
                   rt_mat: Optional[Union[Tensor,
                                          np.ndarray]] = None) -> 'BasePoints':
        """Convert self to ``dst`` mode.

        Args:
            dst (int): The target Point mode.
            rt_mat (Tensor or np.ndarray, optional): The rotation and
                translation matrix between different coordinates.
                Defaults to None. The conversion from ``src`` coordinates to
                ``dst`` coordinates usually comes along the change of sensors,
                e.g., from camera to LiDAR. This requires a transformation
                matrix.

        Returns:
            :obj:`BasePoints`: The converted point of the same type in the
            ``dst`` mode.
        """
        from embodiedscan.structures.bbox_3d import Coord3DMode
        return Coord3DMode.convert_point(point=self,
                                         src=Coord3DMode.CAM,
                                         dst=dst,
                                         rt_mat=rt_mat)


================================================
FILE: bip3d/structures/points/depth_points.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Optional, Sequence, Union

import numpy as np
from torch import Tensor

from .base_points import BasePoints


class DepthPoints(BasePoints):
    """Points of instances in DEPTH coordinates.

    Args:
        tensor (Tensor or np.ndarray or Sequence[Sequence[float]]): The points
            data with shape (N, points_dim).
        points_dim (int): Integer indicating the dimension of a point. Each row
            is (x, y, z, ...). Defaults to 3.
        attribute_dims (dict, optional): Dictionary to indicate the meaning of
            extra dimension. Defaults to None.

    Attributes:
        tensor (Tensor): Float matrix with shape (N, points_dim).
        points_dim (int): Integer indicating the dimension of a point. Each row
            is (x, y, z, ...).
        attribute_dims (dict, optional): Dictionary to indicate the meaning of
            extra dimension. Defaults to None.
        rotation_axis (int): Default rotation axis for points rotation.
    """

    def __init__(self,
                 tensor: Union[Tensor, np.ndarray, Sequence[Sequence[float]]],
                 points_dim: int = 3,
                 attribute_dims: Optional[dict] = None) -> None:
        super(DepthPoints, self).__init__(tensor,
                                          points_dim=points_dim,
                                          attribute_dims=attribute_dims)
        self.rotation_axis = 2

    def flip(self, bev_direction: str = 'horizontal') -> None:
        """Flip the points along given BEV direction.

        Args:
            bev_direction (str): Flip direction (horizontal or vertical).
                Defaults to 'horizontal'.
        """
        assert bev_direction in ('horizontal', 'vertical')
        if bev_direction == 'horizontal':
            self.tensor[:, 0] = -self.tensor[:, 0]
        elif bev_direction == 'vertical':
            self.tensor[:, 1] = -self.tensor[:, 1]

    def convert_to(self,
                   dst: int,
                   rt_mat: Optional[Union[Tensor,
                                          np.ndarray]] = None) -> 'BasePoints':
        """Convert self to ``dst`` mode.

        Args:
            dst (int): The target Point mode.
            rt_mat (Tensor or np.ndarray, optional): The rotation and
                translation matrix between different coordinates.
                Defaults to None. The conversion from ``src`` coordinates to
                ``dst`` coordinates usually comes along the change of sensors,
                e.g., from camera to LiDAR. This requires a transformation
                matrix.

        Returns:
            :obj:`BasePoints`: The converted point of the same type in the
            ``dst`` mode.
        """
        from embodiedscan.structures.bbox_3d import Coord3DMode
        return Coord3DMode.convert_point(point=self,
                                         src=Coord3DMode.DEPTH,
                                         dst=dst,
                                         rt_mat=rt_mat)


================================================
FILE: bip3d/structures/points/lidar_points.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Optional, Sequence, Union

import numpy as np
from torch import Tensor

from .base_points import BasePoints


class LiDARPoints(BasePoints):
    """Points of instances in LIDAR coordinates.

    Args:
        tensor (Tensor or np.ndarray or Sequence[Sequence[float]]): The points
            data with shape (N, points_dim).
        points_dim (int): Integer indicating the dimension of a point. Each row
            is (x, y, z, ...). Defaults to 3.
        attribute_dims (dict, optional): Dictionary to indicate the meaning of
            extra dimension. Defaults to None.

    Attributes:
        tensor (Tensor): Float matrix with shape (N, points_dim).
        points_dim (int): Integer indicating the dimension of a point. Each row
            is (x, y, z, ...).
        attribute_dims (dict, optional): Dictionary to indicate the meaning of
            extra dimension. Defaults to None.
        rotation_axis (int): Default rotation axis for points rotation.
    """

    def __init__(self,
                 tensor: Union[Tensor, np.ndarray, Sequence[Sequence[float]]],
                 points_dim: int = 3,
                 attribute_dims: Optional[dict] = None) -> None:
        super(LiDARPoints, self).__init__(tensor,
                                          points_dim=points_dim,
                                          attribute_dims=attribute_dims)
        self.rotation_axis = 2

    def flip(self, bev_direction: str = 'horizontal') -> None:
        """Flip the points along given BEV direction.

        Args:
            bev_direction (str): Flip direction (horizontal or vertical).
                Defaults to 'horizontal'.
        """
        assert bev_direction in ('horizontal', 'vertical')
        if bev_direction == 'horizontal':
            self.tensor[:, 1] = -self.tensor[:, 1]
        elif bev_direction == 'vertical':
            self.tensor[:, 0] = -self.tensor[:, 0]

    def convert_to(self,
                   dst: int,
                   rt_mat: Optional[Union[Tensor,
                                          np.ndarray]] = None) -> 'BasePoints':
        """Convert self to ``dst`` mode.

        Args:
            dst (int): The target Point mode.
            rt_mat (Tensor or np.ndarray, optional): The rotation and
                translation matrix between different coordinates.
                Defaults to None. The conversion from ``src`` coordinates to
                ``dst`` coordinates usually comes along the change of sensors,
                e.g., from camera to LiDAR. This requires a transformation
                matrix.

        Returns:
            :obj:`BasePoints`: The converted point of the same type in the
            ``dst`` mode.
        """
        from embodiedscan.structures.bbox_3d import Coord3DMode
        return Coord3DMode.convert_point(point=self,
                                         src=Coord3DMode.LIDAR,
                                         dst=dst,
                                         rt_mat=rt_mat)


================================================
FILE: bip3d/utils/__init__.py
================================================
from .array_converter import ArrayConverter, array_converter
from .typing_config import ConfigType, OptConfigType, OptMultiConfig

__all__ = [
    'ConfigType', 'OptConfigType', 'OptMultiConfig', 'ArrayConverter',
    'array_converter'
]


================================================
FILE: bip3d/utils/array_converter.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import functools
from inspect import getfullargspec
from typing import Callable, Optional, Tuple, Type, Union

import numpy as np
import torch

TemplateArrayType = Union[np.ndarray, torch.Tensor, list, tuple, int, float]


def array_converter(to_torch: bool = True,
                    apply_to: Tuple[str, ...] = tuple(),
                    template_arg_name_: Optional[str] = None,
                    recover: bool = True) -> Callable:
    """Wrapper function for data-type agnostic processing.

    First converts input arrays to PyTorch tensors or NumPy arrays for middle
    calculation, then convert output to original data-type if `recover=True`.

    Args:
        to_torch (bool): Whether to convert to PyTorch tensors for middle
            calculation. Defaults to True.
        apply_to (Tuple[str]): The arguments to which we apply data-type
            conversion. Defaults to an empty tuple.
        template_arg_name_ (str, optional): Argument serving as the template
            (return arrays should have the same dtype and device as the
            template). Defaults to None. If None, we will use the first
            argument in `apply_to` as the template argument.
        recover (bool): Whether or not to recover the wrapped function outputs
            to the `template_arg_name_` type. Defaults to True.

    Raises:
        ValueError: When template_arg_name_ is not among all args, or when
            apply_to contains an arg which is not among all args, a ValueError
            will be raised. When the template argument or an argument to
            convert is a list or tuple, and cannot be converted to a NumPy
            array, a ValueError will be raised.
        TypeError: When the type of the template argument or an argument to
            convert does not belong to the above range, or the contents of such
            an list-or-tuple-type argument do not share the same data type, a
            TypeError will be raised.

    Returns:
        Callable: Wrapped function.

    Examples:
        >>> import torch
        >>> import numpy as np
        >>>
        >>> # Use torch addition for a + b,
        >>> # and convert return values to the type of a
        >>> @array_converter(apply_to=('a', 'b'))
        >>> def simple_add(a, b):
        >>>     return a + b
        >>>
        >>> a = np.array([1.1])
        >>> b = np.array([2.2])
        >>> simple_add(a, b)
        >>>
        >>> # Use numpy addition for a + b,
        >>> # and convert return values to the type of b
        >>> @array_converter(to_torch=False, apply_to=('a', 'b'),
        >>>                  template_arg_name_='b')
        >>> def simple_add(a, b):
        >>>     return a + b
        >>>
        >>> simple_add(a, b)
        >>>
        >>> # Use torch funcs for floor(a) if flag=True else ceil(a),
        >>> # and return the torch tensor
        >>> @array_converter(apply_to=('a',), recover=False)
        >>> def floor_or_ceil(a, flag=True):
        >>>     return torch.floor(a) if flag else torch.ceil(a)
        >>>
        >>> floor_or_ceil(a, flag=False)
    """

    def array_converter_wrapper(func):
        """Outer wrapper for the function."""

        @functools.wraps(func)
        def new_func(*args, **kwargs):
            """Inner wrapper for the arguments."""
            if len(apply_to) == 0:
                return func(*args, **kwargs)

            func_name = func.__name__

            arg_spec = getfullargspec(func)

            arg_names = arg_spec.args
            arg_num = len(arg_names)
            default_arg_values = arg_spec.defaults
            if default_arg_values is None:
                default_arg_values = []
            no_default_arg_num = len(arg_names) - len(default_arg_values)

            kwonly_arg_names = arg_spec.kwonlyargs
            kwonly_default_arg_values = arg_spec.kwonlydefaults
            if kwonly_default_arg_values is None:
                kwonly_default_arg_values = {}

            all_arg_names = arg_names + kwonly_arg_names

            # in case there are args in the form of *args
            if len(args) > arg_num:
                named_args = args[:arg_num]
                nameless_args = args[arg_num:]
            else:
                named_args = args
                nameless_args = []

            # template argument data type is used for all array-like arguments
            if template_arg_name_ is None:
                template_arg_name = apply_to[0]
            else:
                template_arg_name = template_arg_name_

            if template_arg_name not in all_arg_names:
                raise ValueError(f'{template_arg_name} is not among the '
                                 f'argument list of function {func_name}')

            # inspect apply_to
            for arg_to_apply in apply_to:
                if arg_to_apply not in all_arg_names:
                    raise ValueError(
                        f'{arg_to_apply} is not an argument of {func_name}')

            new_args = []
            new_kwargs = {}

            converter = ArrayConverter()
            target_type = torch.Tensor if to_torch else np.ndarray

            # non-keyword arguments
            for i, arg_value in enumerate(named_args):
                if arg_names[i] in apply_to:
                    new_args.append(
                        converter.convert(input_array=arg_value,
                                          target_type=target_type))
                else:
                    new_args.append(arg_value)

                if arg_names[i] == template_arg_name:
                    template_arg_value = arg_value

            kwonly_default_arg_values.update(kwargs)
            kwargs = kwonly_default_arg_values

            # keyword arguments and non-keyword arguments using default value
            for i in range(len(named_args), len(all_arg_names)):
                arg_name = all_arg_names[i]
                if arg_name in kwargs:
                    if arg_name in apply_to:
                        new_kwargs[arg_name] = converter.convert(
                            input_array=kwargs[arg_name],
                            target_type=target_type)
                    else:
                        new_kwargs[arg_name] = kwargs[arg_name]
                else:
                    default_value = default_arg_values[i - no_default_arg_num]
                    if arg_name in apply_to:
                        new_kwargs[arg_name] = converter.convert(
                            input_array=default_value, target_type=target_type)
                    else:
                        new_kwargs[arg_name] = default_value
                if arg_name == template_arg_name:
                    template_arg_value = kwargs[arg_name]

            # add nameless args provided by *args (if exists)
            new_args += nameless_args

            return_values = func(*new_args, **new_kwargs)
            converter.set_template(template_arg_value)

            def recursive_recover(input_data):
                if isinstance(input_data, (tuple, list)):
                    new_data = []
                    for item in input_data:
                        new_data.append(recursive_recover(item))
                    return tuple(new_data) if isinstance(input_data,
                                                         tuple) else new_data
                elif isinstance(input_data, dict):
                    new_data = {}
                    for k, v in input_data.items():
                        new_data[k] = recursive_recover(v)
                    return new_data
                elif isinstance(input_data, (torch.Tensor, np.ndarray)):
                    return converter.recover(input_data)
                else:
                    return input_data

            if recover:
                return recursive_recover(return_values)
            else:
                return return_values

        return new_func

    return array_converter_wrapper


class ArrayConverter:
    """Utility class for data-type agnostic processing.

    Args:
        template_array (np.ndarray or torch.Tensor or list or tuple or int or
            float, optional): Template array. Defaults to None.
    """
    SUPPORTED_NON_ARRAY_TYPES = (int, float, np.int8, np.int16, np.int32,
                                 np.int64, np.uint8, np.uint16, np.uint32,
                                 np.uint64, np.float16, np.float32, np.float64)

    def __init__(self,
                 template_array: Optional[TemplateArrayType] = None) -> None:
        if template_array is not None:
            self.set_template(template_array)

    def set_template(self, array: TemplateArrayType) -> None:
        """Set template array.

        Args:
            array (np.ndarray or torch.Tensor or list or tuple or int or
                float): Template array.

        Raises:
            ValueError: If input is list or tuple and cannot be converted to a
                NumPy array, a ValueError is raised.
            TypeError: If input type does not belong to the above range, or the
                contents of a list or tuple do not share the same data type, a
                TypeError is raised.
        """
        self.array_type = type(array)
        self.is_num = False
        self.device = 'cpu'

        if isinstance(array, np.ndarray):
            self.dtype = array.dtype
        elif isinstance(array, torch.Tensor):
            self.dtype = array.dtype
            self.device = array.device
        elif isinstance(array, (list, tuple)):
            try:
                array = np.array(array)
                if array.dtype not in self.SUPPORTED_NON_ARRAY_TYPES:
                    raise TypeError
                self.dtype = array.dtype
            except (ValueError, TypeError):
                print('The following list cannot be converted to a numpy '
                      f'array of supported dtype:\n{array}')
                raise
        elif isinstance(array, (int, float)):
            self.array_type = np.ndarray
            self.is_num = True
            self.dtype = np.dtype(type(array))
        else:
            raise TypeError(
                f'Template type {self.array_type} is not supported.')

    def convert(
        self,
        input_array: TemplateArrayType,
        target_type: Optional[Type] = None,
        target_array: Optional[Union[np.ndarray, torch.Tensor]] = None
    ) -> Union[np.ndarray, torch.Tensor]:
        """Convert input array to target data type.

        Args:
            input_array (np.ndarray or torch.Tensor or list or tuple or int or
                float): Input array.
            target_type (Type, optional): Type to which input array is
                converted. It should be `np.ndarray` or `torch.Tensor`.
                Defaults to None.
            target_array (np.ndarray or torch.Tensor, optional): Template array
                to which input array is converted. Defaults to None.

        Raises:
            ValueError: If input is list or tuple and cannot be converted to a
                NumPy array, a ValueError is raised.
            TypeError: If input type does not belong to the above range, or the
                contents of a list or tuple do not share the same data type, a
                TypeError is raised.

        Returns:
            np.ndarray or torch.Tensor: The converted array.
        """
        if isinstance(input_array, (list, tuple)):
            try:
                input_array = np.array(input_array)
                if input_array.dtype not in self.SUPPORTED_NON_ARRAY_TYPES:
                    raise TypeError
            except (ValueError, TypeError):
                print('The input cannot be converted to a single-type numpy '
                      f'array:\n{input_array}')
                raise
        elif isinstance(input_array, self.SUPPORTED_NON_ARRAY_TYPES):
            input_array = np.array(input_array)
        array_type = type(input_array)
        assert target_type is not None or target_array is not None, \
            'must specify a target'
        if target_type is not None:
            assert target_type in (np.ndarray, torch.Tensor), \
                'invalid target type'
            if target_type == array_type:
                return input_array
            elif target_type == np.ndarray:
                # default dtype is float32
                converted_array = input_array.cpu().numpy().astype(np.float32)
            else:
                # default dtype is float32, device is 'cpu'
                converted_array = torch.tensor(input_array,
                                               dtype=torch.float32)
        else:
            assert isinstance(target_array, (np.ndarray, torch.Tensor)), \
                'invalid target array type'
            if isinstance(target_array, array_type):
                return input_array
            elif isinstance(target_array, np.ndarray):
                converted_array = input_array.cpu().numpy().astype(
                    target_array.dtype)
            else:
                converted_array = target_array.new_tensor(input_array)
        return converted_array

    def recover(
        self, input_array: Union[np.ndarray, torch.Tensor]
    ) -> Union[np.ndarray, torch.Tensor, int, float]:
        """Recover input type to original array type.

        Args:
            input_array (np.ndarray or torch.Tensor): Input array.

        Returns:
            np.ndarray or torch.Tensor or int or float: Converted array.
        """
        assert isinstance(input_array, (np.ndarray, torch.Tensor)), \
            'invalid input array type'
        if isinstance(input_array, self.array_type):
            return input_array
        elif isinstance(input_array, torch.Tensor):
            converted_array = input_array.cpu().numpy().astype(self.dtype)
        else:
            converted_array = torch.tensor(input_array,
                                           dtype=self.dtype,
                                           device=self.device)
        if self.is_num:
            converted_array = converted_array.item()
        return converted_array


================================================
FILE: bip3d/utils/default_color_map.py
================================================
DEFAULT_COLOR_MAP = {
    'floor': [255, 193, 193],
    'wall': [137, 54, 74],
    'chair': [153, 69, 1],
    'cabinet': [134, 199, 156],
    'door': [92, 136, 89],
    'table': [255.0, 187.0, 120.0],
    'couch': [3, 95, 161],
    'shelf': [255, 160, 98],
    'window': [183, 121, 142],
    'bed': [123, 104, 238],
    'curtain': [210, 170, 100],
    'plant': [163, 255, 0],
    'stairs': [104, 84, 109],
    'pillow': [76, 91, 113],
    'counter': [146, 112, 198],
    'bench': [250, 0, 30],
    'rail': [230, 150, 140],
    'sink': [135, 206, 250],
    'mirror': [154, 208, 0],
    'toilet': [0, 165, 120],
    'refrigerator': [59, 105, 106],
    'book': [142, 108, 45],
    'tv': [183, 130, 88],
    'blanket': [147, 211, 203],
    'rack': [255, 208, 186],
    'towel': [225, 199, 255],
    'backpack': [255, 179, 240],
    'roof': [92, 82, 55],
    'bag': [209, 0, 151],
    'board': [133, 129, 255],
    'bicycle': [119, 11, 32],
    'oven': [178, 90, 62],
    'microwave': [79, 210, 114],
    'desk': [109, 63, 54],
    'doorframe': [199, 100, 0],
    'wardrobe': [7, 246, 231],
    'picture': [171, 134, 1],
    'bathtub': [92, 0, 73],
    'box': [188, 208, 182],
    'stand': [146, 139, 141],
    'clothes': [96, 96, 96],
    'lamp': [107, 255, 200],
    'dresser': [206, 186, 171],
    'stool': [73, 77, 174],
    'fireplace': [255, 77, 255],
    'commode': [102, 102, 156],
    'washing machine': [152, 251, 152],
    'monitor': [208, 195, 210],
    'window frame': [227, 255, 205],
    'radiator': [191, 162, 208],
    'mat': [250, 141, 255],
    'shower': [154, 255, 154],
    'ottoman': [95, 32, 0],
    'column': [60, 143, 255],
    'blinds': [134, 134, 103],
    'stove': [128, 64, 128],
    'bar': [72, 0, 118],
    'pillar': [220, 20, 60],
    'bin': [187, 255, 255],
    'heater': [209, 226, 140],
    'clothes dryer': [100, 170, 30],
    'blackboard': [0, 82, 0],
    'decoration': [107, 142, 35],
    'steps': [120, 166, 157],
    'windowsill': [9, 80, 61],
    'cushion': [0, 228, 0],
    'carpet': [175, 116, 175],
    'copier': [241, 129, 0],
    'countertop': [207, 138, 255],
    'basket': [0, 0, 70],
    'mailbox': [150, 100, 100],
    'kitchen island': [220, 220, 0],
    'washbasin': [0, 80, 100],
    'drawer': [0, 220, 176],
    'piano': [78, 180, 255],
    'exercise equipment': [151, 0, 95],
    'beam': [255, 255, 128],
    'partition': [168, 171, 172],
    'printer': [179, 0, 194],
    'frame': [255, 180, 195],
    'object': [0, 0, 0],
    'adhesive tape': [0, 220, 176],
    'air conditioner': [109, 63, 54],
    'alarm': [0, 114, 143],
    'album': [147, 186, 208],
    'arch': [135, 158, 223],
    'balcony': [70, 70, 70],
    'ball': [96, 96, 96],
    'banister': [196, 172, 0],
    'barricade': [45, 89, 255],
    'baseboard': [153, 69, 1],
    'basin': [255.0, 187.0, 120.0],
    'beanbag': [190, 153, 153],
    'bidet': [123, 104, 238],
    'body loofah': [196, 172, 0],
    'boots': [134, 199, 156],
    'bottle': [241, 129, 0],
    'bowl': [92, 136, 89],
    'bread': [119, 11, 32],
    'broom': [0, 226, 252],
    'brush': [255, 255, 128],
    'bucket': [255, 73, 97],
    'calendar': [76, 91, 113],
    'camera': [72, 0, 118],
    'can': [109, 63, 54],
    'candle': [78, 180, 255],
    'candlestick': [104, 84, 109],
    'cap': [128, 76, 255],
    'car': [107, 142, 35],
    'cart': [255, 255, 128],
    'case': [0, 0, 230],
    'chandelier': [169, 164, 131],
    'cleanser': [0, 165, 120],
    'clock': [190, 153, 153],
    'coat hanger': [179, 0, 194],
    'coffee maker': [0, 82, 0],
    'coil': [255, 179, 240],
    'computer': [225, 199, 255],
    'conducting wire': [150, 100, 100],
    'container': [0, 0, 70],
    'control': [255, 77, 255],
    'cosmetics': [142, 108, 45],
    'crate': [0, 226, 252],
    'crib': [169, 164, 131],
    'cube': [116, 112, 0],
    'cup': [175, 116, 175],
    'detergent': [255, 208, 186],
    'device': [146, 139, 141],
    'dish rack': [0, 0, 142],
    'dishwasher': [92, 82, 55],
    'dispenser': [95, 32, 0],
    'divider': [219, 142, 185],
    'door knob': [166, 74, 118],
    'doorway': [134, 134, 103],
    'dress': [0, 114, 143],
    'drum': [107, 142, 35],
    'duct': [0, 80, 100],
    'dumbbell': [0, 0, 192],
    'dustpan': [78, 180, 255],
    'dvd': [0, 143, 149],
    'eraser': [0, 82, 0],
    'fan': [0, 0, 70],
    'faucet': [84, 105, 51],
    'fence': [190, 153, 153],
    'file': [255, 228, 255],
    'fire extinguisher': [107, 255, 200],
    'flowerpot': [9, 80, 61],
    'flush': [227, 255, 205],
    'folder': [208, 229, 228],
    'food': [109, 63, 54],
    'footstool': [133, 129, 255],
    'fruit': [179, 0, 194],
    'furniture': [220, 20, 60],
    'garage door': [217, 17, 255],
    'garbage': [0, 82, 0],
    'glass': [255, 99, 164],
    'globe': [255, 77, 255],
    'glove': [166, 196, 102],
    'grab bar': [145, 148, 174],
    'grass': [0, 60, 100],
    'guitar': [73, 77, 174],
    'hair dryer': [169, 164, 131],
    'hamper': [241, 129, 0],
    'handle': [142, 108, 45],
    'hanger': [150, 100, 100],
    'hat': [154, 208, 0],
    'headboard': [171, 134, 1],
    'headphones': [124, 74, 181],
    'helmets': [209, 226, 140],
    'holder': [151, 0, 95],
    'hook': [92, 136, 89],
    'humidifier': [209, 99, 106],
    'ironware': [127, 167, 115],
    'jacket': [255, 73, 97],
    'jalousie': [255, 179, 240],
    'jar': [106, 154, 176],
    'kettle': [196, 172, 0],
    'keyboard': [0, 125, 92],
    'kitchenware': [74, 65, 105],
    'knife': [70, 130, 180],
    'label': [0, 228, 0],
    'ladder': [0, 114, 143],
    'laptop': [255, 180, 195],
    'ledge': [58, 41, 149],
    'letter': [0, 0, 192],
    'light': [78, 180, 255],
    'luggage': [0, 226, 252],
    'machine': [197, 226, 255],
    'magazine': [199, 100, 0],
    'map': [183, 121, 142],
    'mask': [74, 65, 105],
    'mattress': [255, 179, 240],
    'menu': [255, 255, 128],
    'molding': [104, 84, 109],
    'mop': [199, 100, 0],
    'mouse': [5, 121, 0],
    'napkins': [165, 42, 42],
    'notebook': [175, 116, 175],
    'pack': [0, 143, 149],
    'package': [166, 196, 102],
    'pad': [208, 229, 228],
    'pan': [209, 99, 106],
    'panel': [201, 57, 1],
    'paper': [255, 179, 240],
    'paper cutter': [207, 138, 255],
    'pedestal': [64, 170, 64],
    'pen': [193, 0, 92],
    'person': [7, 246, 231],
    'pipe': [255, 180, 195],
    'pitcher': [220, 20, 60],
    'plate': [142, 108, 45],
    'player': [0, 143, 149],
    'plug': [255, 77, 255],
    'plunger': [165, 42, 42],
    'pool': [153, 69, 1],
    'pool table': [0, 0, 230],
    'poster': [130, 114, 135],
    'pot': [96, 36, 108],
    'price tag': [255, 77, 255],
    'projector': [179, 0, 194],
    'purse': [0, 228, 0],
    'radio': [116, 112, 0],
    'range hood': [199, 100, 0],
    'remote control': [188, 208, 182],
    'ridge': [59, 105, 106],
    'rod': [207, 138, 255],
    'roll': [123, 104, 238],
    'rope': [110, 76, 0],
    'sack': [190, 153, 153],
    'salt': [250, 0, 30],
    'scale': [58, 41, 149],
    'scissors': [60, 143, 255],
    'screen': [0, 82, 0],
    'seasoning': [254, 212, 124],
    'shampoo': [70, 130, 180],
    'sheet': [151, 0, 95],
    'shirt': [190, 153, 153],
    'shoe': [199, 100, 0],
    'shovel': [241, 129, 0],
    'sign': [208, 195, 210],
    'soap': [109, 63, 54],
    'soap dish': [166, 74, 118],
    'soap dispenser': [95, 32, 0],
    'socket': [255, 255, 255],
    'speaker': [65, 70, 15],
    'sponge': [0, 220, 176],
    'spoon': [134, 134, 103],
    'stall': [0, 60, 100],
    'stapler': [246, 0, 122],
    'statue': [196, 172, 0],
    'stick': [0, 165, 120],
    'stopcock': [0, 60, 100],
    'structure': [220, 20, 60],
    'sunglasses': [142, 108, 45],
    'support': [209, 226, 140],
    'switch': [7, 246, 231],
    'tablet': [137, 54, 74],
    'teapot': [0, 80, 100],
    'telephone': [220, 220, 0],
    'thermostat': [128, 76, 255],
    'tissue': [73, 77, 174],
    'tissue box': [96, 96, 96],
    'toaster': [106, 0, 228],
    'toilet paper': [84, 105, 51],
    'toiletry': [128, 64, 128],
    'tool': [220, 20, 60],
    'toothbrush': [130, 114, 135],
    'toothpaste': [0, 143, 149],
    'toy': [255.0, 187.0, 120.0],
    'tray': [255, 179, 240],
    'treadmill': [166, 74, 118],
    'trophy': [0, 220, 176],
    'tube': [255, 255, 128],
    'umbrella': [250, 0, 30],
    'urn': [152, 251, 152],
    'utensil': [220, 220, 0],
    'vacuum cleaner': [96, 36, 108],
    'vanity': [5, 121, 0],
    'vase': [255, 193, 193],
    'vent': [209, 226, 140],
    'ventilation': [123, 104, 238],
    'water cooler': [255, 255, 128],
    'water heater': [145, 148, 174],
    'wine': [220, 220, 0],
    'wire': [96, 36, 108],
    'wood': [127, 167, 115],
    'wrap': [175, 116, 175],
}


================================================
FILE: bip3d/utils/dist_utils.py
================================================
import torch.distributed as dist


def reduce_mean(tensor):
    """"Obtain the mean of tensor on different GPUs."""
    if not (dist.is_available() and dist.is_initialized()):
        return tensor
    tensor = tensor.clone()
    dist.all_reduce(tensor.div_(dist.get_world_size()), op=dist.ReduceOp.SUM)
    return tensor


================================================
FILE: bip3d/utils/line_mesh.py
================================================
"""Adapted from https://github.com/isl-
org/Open3D/pull/738#issuecomment-564785941.

Module which creates mesh lines from a line set
Open3D relies upon using glLineWidth to set line width on a LineSet
However, this method is now deprecated and not fully supported in
newer OpenGL versions.
See:
    Open3D Github Pull Request:
        https://github.com/intel-isl/Open3D/pull/738
    Other Framework Issues:
        https://github.com/openframeworks/openFrameworks/issues/3460

This module aims to solve this by converting a line into a triangular mesh
(which has thickness). The basic idea is to create a cylinder for each line
segment, translate it, and then rotate it.

License: MIT
"""
import numpy as np
import open3d as o3d


def align_vector_to_another(a=np.array([0, 0, 1]), b=np.array([1, 0, 0])):
    """Aligns vector a to vector b with axis angle rotation."""
    if np.array_equal(a, b):
        return None, None
    axis_ = np.cross(a, b)
    axis_ = axis_ / np.linalg.norm(axis_)
    angle = np.arccos(np.dot(a, b))

    return axis_, angle


def normalized(a, axis=-1, order=2):
    """Normalizes a numpy array of points."""
    l2 = np.atleast_1d(np.linalg.norm(a, order, axis))
    l2[l2 == 0] = 1
    return a / np.expand_dims(l2, axis), l2


class LineMesh(object):

    def __init__(self, points, lines=None, colors=[0, 1, 0], radius=0.15):
        """Creates a line represented as sequence of cylinder triangular
        meshes.

        Arguments:
            points {ndarray} -- Numpy array of ponts Nx3.

        Keyword Arguments:
            lines {list[list] or None} -- List of point index pairs denoting
                line segments. If None, implicit lines from ordered pairwise
                points. (default: {None})
            colors {list} -- list of colors, or single color of the line
                (default: {[0, 1, 0]})
            radius {float} -- radius of cylinder (default: {0.15})
        """
        self.points = np.array(points)
        self.lines = np.array(
            lines) if lines is not None else self.lines_from_ordered_points(
                self.points)
        self.colors = np.array(colors)
        self.radius = radius
        self.cylinder_segments = []

        self.create_line_mesh()

    @staticmethod
    def lines_from_ordered_points(points):
        lines = [[i, i + 1] for i in range(0, points.shape[0] - 1, 1)]
        return np.array(lines)

    def create_line_mesh(self):
        first_points = self.points[self.lines[:, 0], :]
        second_points = self.points[self.lines[:, 1], :]
        line_segments = second_points - first_points
        line_segments_unit, line_lengths = normalized(line_segments)

        z_axis = np.array([0, 0, 1])
        # Create triangular mesh cylinder segments of line
        for i in range(line_segments_unit.shape[0]):
            line_segment = line_segments_unit[i, :]
            line_length = line_lengths[i]
            # get axis angle rotation to allign cylinder with line segment
            axis, angle = align_vector_to_another(z_axis, line_segment)
            # Get translation vector
            translation = first_points[i, :] + \
                line_segment * line_length * 0.5
            # create cylinder and apply transformations
            cylinder_segment = o3d.geometry.TriangleMesh.create_cylinder(
                self.radius, line_length)
            cylinder_segment = cylinder_segment.translate(translation,
                                                          relative=False)
            if axis is not None:
                axis_a = axis * angle
                cylinder_segment = cylinder_segment.rotate(
                    R=o3d.geometry.get_rotation_matrix_from_axis_angle(
                        axis_a))  # center=True)
                # cylinder_segment = cylinder_segment.rotate(
                #   axis_a, center=True,
                #   type=o3d.geometry.RotationType.AxisAngle)
            # color cylinder
            color = self.colors if self.colors.ndim == 1 \
                else self.colors[i, :]
            cylinder_segment.paint_uniform_color(color)

            self.cylinder_segments.append(cylinder_segment)

    def add_line(self, vis):
        """Adds this line to the visualizer."""
        for cylinder in self.cylinder_segments:
            vis.add_geometry(cylinder)

    def remove_line(self, vis):
        """Removes this line from the visualizer."""
        for cylinder in self.cylinder_segments:
            vis.remove_geometry(cylinder)


if __name__ == '__main__':

    def main():
        print('Demonstrating LineMesh vs LineSet')
        # Create Line Set
        points = [[0, 0, 0], [1, 0, 0], [0, 1, 0], [1, 1, 0], [0, 0, 1],
                  [1, 0, 1], [0, 1, 1], [1, 1, 1]]
        lines = [[0, 1], [0, 2], [1, 3], [2, 3], [4, 5], [4, 6], [5, 7],
                 [6, 7], [0, 4], [1, 5], [2, 6], [3, 7]]
        colors = [[1, 0, 0] for i in range(len(lines))]

        line_set = o3d.geometry.LineSet()
        line_set.points = o3d.utility.Vector3dVector(points)
        line_set.lines = o3d.utility.Vector2iVector(lines)
        line_set.colors = o3d.utility.Vector3dVector(colors)

        # Create Line Mesh 1
        points = np.array(points) + [0, 0, 2]
        line_mesh1 = LineMesh(points, lines, colors, radius=0.02)
        line_mesh1_geoms = line_mesh1.cylinder_segments

        # Create Line Mesh 1
        points = np.array(points) + [0, 2, 0]
        line_mesh2 = LineMesh(points, radius=0.03)
        line_mesh2_geoms = line_mesh2.cylinder_segments

        o3d.visualization.draw_geometries(
            [line_set, *line_mesh1_geoms, *line_mesh2_geoms])

    main()


================================================
FILE: bip3d/utils/typing_config.py
================================================
from collections.abc import Sized
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import torch
from mmdet.models.task_modules.samplers import SamplingResult
from mmengine.config import ConfigDict
from mmengine.structures import BaseDataElement, InstanceData


class Det3DDataElement(BaseDataElement):

    @property
    def gt_instances_3d(self) -> InstanceData:
        return self._gt_instances_3d

    @gt_instances_3d.setter
    def gt_instances_3d(self, value: InstanceData) -> None:
        self.set_field(value, '_gt_instances_3d', dtype=InstanceData)

    @gt_instances_3d.deleter
    def gt_instances_3d(self) -> None:
        del self._gt_instances_3d

    @property
    def pred_instances_3d(self) -> InstanceData:
        return self._pred_instances_3d

    @pred_instances_3d.setter
    def pred_instances_3d(self, value: InstanceData) -> None:
        self.set_field(value, '_pred_instances_3d', dtype=InstanceData)

    @pred_instances_3d.deleter
    def pred_instances_3d(self) -> None:
        del self._pred_instances_3d


IndexType = Union[str, slice, int, list, torch.LongTensor,
                  torch.cuda.LongTensor, torch.BoolTensor,
                  torch.cuda.BoolTensor, np.ndarray]


class PointData(BaseDataElement):
    """Data structure for point-level annotations or predictions.

    All data items in ``data_fields`` of ``PointData`` meet the following
    requirements:

    - They are all one dimension.
    - They should have the same length.

    `PointData` is used to save point-level semantic and instance mask,
    it also can save `instances_labels` and `instances_scores` temporarily.
    In the future, we would consider to move the instance-level info into
    `gt_instances_3d` and `pred_instances_3d`.

    Examples:
        >>> metainfo = dict(
        ...     sample_idx=random.randint(0, 100))
        >>> points = np.random.randint(0, 255, (100, 3))
        >>> point_data = PointData(metainfo=metainfo,
        ...                        points=points)
        >>> print(len(point_data))
        100

        >>> # slice
        >>> slice_data = point_data[10:60]
        >>> assert len(slice_data) == 50

        >>> # set
        >>> point_data.pts_semantic_mask = torch.randint(0, 255, (100,))
        >>> point_data.pts_instance_mask = torch.randint(0, 255, (100,))
        >>> assert tuple(point_data.pts_semantic_mask.shape) == (100,)
        >>> assert tuple(point_data.pts_instance_mask.shape) == (100,)
    """

    def __setattr__(self, name: str, value: Sized) -> None:
        """setattr is only used to set data.

        The value must have the attribute of `__len__` and have the same length
        of `PointData`.
        """
        if name in ('_metainfo_fields', '_data_fields'):
            if not hasattr(self, name):
                super().__setattr__(name, value)
            else:
                raise AttributeError(f'{name} has been used as a '
                                     'private attribute, which is immutable.')

        else:
            assert isinstance(value,
                              Sized), 'value must contain `__len__` attribute'
            # TODO: make sure the input value share the same length
            super().__setattr__(name, value)

    __setitem__ = __setattr__

    def __getitem__(self, item: IndexType) -> 'PointData':
        """
        Args:
            item (str, int, list, :obj:`slice`, :obj:`numpy.ndarray`,
                :obj:`torch.LongTensor`, :obj:`torch.BoolTensor`):
                Get the corresponding values according to item.

        Returns:
            :obj:`PointData`: Corresponding values.
        """
        if isinstance(item, list):
            item = np.array(item)
        if isinstance(item, np.ndarray):
            # The default int type of numpy is platform dependent, int32 for
            # windows and int64 for linux. `torch.Tensor` requires the index
            # should be int64, therefore we simply convert it to int64 here.
            # Mode details in https://github.com/numpy/numpy/issues/9464
            item = item.astype(np.int64) if item.dtype == np.int32 else item
            item = torch.from_numpy(item)
        assert isinstance(
            item, (str, slice, int, torch.LongTensor, torch.cuda.LongTensor,
                   torch.BoolTensor, torch.cuda.BoolTensor))

        if isinstance(item, str):
            return getattr(self, item)

        if isinstance(item, int):
            if item >= len(self) or item < -len(self):  # type: ignore
                raise IndexError(f'Index {item} out of range!')
            else:
                # keep the dimension
                item = slice(item, None, len(self))

        new_data = self.__class__(metainfo=self.metainfo)
        if isinstance(item, torch.Tensor):
            assert item.dim() == 1, 'Only support to get the' \
                                    ' values along the first dimension.'
            if isinstance(item, (torch.BoolTensor, torch.cuda.BoolTensor)):
                assert len(item) == len(self), 'The shape of the ' \
                                               'input(BoolTensor) ' \
                                               f'{len(item)} ' \
                                               'does not match the shape ' \
                                               'of the indexed tensor ' \
                                               'in results_field ' \
                                               f'{len(self)} at ' \
                                               'first dimension.'

            for k, v in self.items():
                if isinstance(v, torch.Tensor):
                    new_data[k] = v[item]
                elif isinstance(v, np.ndarray):
                    new_data[k] = v[item.cpu().numpy()]
                elif isinstance(
                        v, (str, list, tuple)) or (hasattr(v, '__getitem__')
                                                   and hasattr(v, 'cat')):
                    # convert to indexes from BoolTensor
                    if isinstance(item,
                                  (torch.BoolTensor, torch.cuda.BoolTensor)):
                        indexes = torch.nonzero(item).view(
                            -1).cpu().numpy().tolist()
                    else:
                        indexes = item.cpu().numpy().tolist()
                    slice_list = []
                    if indexes:
                        for index in indexes:
                            slice_list.append(slice(index, None, len(v)))
                    else:
                        slice_list.append(slice(None, 0, None))
                    r_list = [v[s] for s in slice_list]
                    if isinstance(v, (str, list, tuple)):
                        new_value = r_list[0]
                        for r in r_list[1:]:
                            new_value = new_value + r
                    else:
                        new_value = v.cat(r_list)
                    new_data[k] = new_value
                else:
                    raise ValueError(
                        f'The type of `{k}` is `{type(v)}`, which has no '
                        'attribute of `cat`, so it does not '
                        'support slice with `bool`')
        else:
            # item is a slice
            for k, v in self.items():
                new_data[k] = v[item]
        return new_data  # type: ignore

    def __len__(self) -> int:
        """int: The length of `PointData`."""
        if len(self._data_fields) > 0:
            return len(self.values()[0])
        else:
            return 0


# Type hint of config data
ConfigType = Union[ConfigDict, dict]
OptConfigType = Optional[ConfigType]

# Type hint of one or more config data
MultiConfig = Union[ConfigType, List[ConfigType]]
OptMultiConfig = Optional[MultiConfig]

InstanceList = List[InstanceData]
OptInstanceList = Optional[InstanceList]
ForwardResults = Union[Dict[str, torch.Tensor], List[Det3DDataElement],
                       Tuple[torch.Tensor], torch.Tensor]

SamplingResultList = List[SamplingResult]

OptSamplingResultList = Optional[SamplingResultList]
SampleList = List[Det3DDataElement]
OptSampleList = Optional[SampleList]


================================================
FILE: configs/__init__.py
================================================


================================================
FILE: configs/bip3d_det.py
================================================
_base_ = ["./default_runtime.py"]

import os
from bip3d.datasets.embodiedscan_det_grounding_dataset import (
    class_names, head_labels, common_labels, tail_labels
)

DEBUG = os.environ.get("CLUSTER") is None
del os

backend_args = None

metainfo = dict(classes="all")

depth = True
depth_loss = True

z_range=[-0.2, 3]
min_depth = 0.25
max_depth = 10
num_depth = 64

model = dict(
    type="BIP3D",
    input_3d="depth_img",
    use_depth_grid_mask=True,
    data_preprocessor=dict(
        type="CustomDet3DDataPreprocessor",
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        bgr_to_rgb=True,
        pad_size_divisor=32,
    ),
    backbone=dict(
        type="mmdet.SwinTransformer",
        embed_dims=96,
        depths=[2, 2, 6, 2],
        num_heads=[3, 6, 12, 24],
        window_size=7,
        mlp_ratio=4,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.2,
        patch_norm=True,
        out_indices=(1, 2, 3),
        with_cp=True,
        convert_weights=False,
    ),
    neck=dict(
        type="mmdet.ChannelMapper",
        in_channels=[192, 384, 768],
        kernel_size=1,
        out_channels=256,
        act_cfg=None,
        bias=True,
        norm_cfg=dict(type="GN", num_groups=32),
        num_outs=4,
    ),
    text_encoder=dict(
        type="BertModel",
        special_tokens_list=["[CLS]", "[SEP]"],
        name="./ckpt/bert-base-uncased",
        pad_to_max=False,
        use_sub_sentence_represent=True,
        add_pooling_layer=False,
        max_tokens=768,
        use_checkpoint=True,
        return_tokenized=True,
    ),
    backbone_3d=(
        dict(
            type="mmdet.ResNet",
            depth=34,
            in_channels=1,
            base_channels=4,
            num_stages=4,
            out_indices=(1, 2, 3),
            norm_cfg=dict(type="BN", requires_grad=True),
            norm_eval=True,
            with_cp=True,
            style="pytorch",
        )
        if depth
        else None
    ),
    neck_3d=(
        dict(
            type="mmdet.ChannelMapper",
            in_channels=[8, 16, 32],
            kernel_size=1,
            out_channels=32,
            act_cfg=None,
            bias=True,
            norm_cfg=dict(type="GN", num_groups=4),
            num_outs=4,
        )
        if depth
        else None
    ),
    feature_enhancer=dict(
        type="TextImageDeformable2DEnhancer",
        num_layers=6,
        text_img_attn_block=dict(
            v_dim=256, l_dim=256, embed_dim=1024, num_heads=4, init_values=1e-4
        ),
        img_attn_block=dict(
            self_attn_cfg=dict(
                embed_dims=256, num_levels=4, dropout=0.0, im2col_step=64
            ),
            ffn_cfg=dict(
                embed_dims=256, feedforward_channels=2048, ffn_drop=0.0
            ),
        ),
        text_attn_block=dict(
            self_attn_cfg=dict(num_heads=4, embed_dims=256, dropout=0.0),
            ffn_cfg=dict(
                embed_dims=256, feedforward_channels=1024, ffn_drop=0.0
            ),
        ),
        embed_dims=256,
        num_feature_levels=4,
        positional_encoding=dict(
            num_feats=128, normalize=True, offset=0.0, temperature=20
        ),
    ),
    spatial_enhancer=dict(
        type="DepthFusionSpatialEnhancer",
        embed_dims=256,
        feature_3d_dim=32,
        num_depth_layers=2,
        min_depth=min_depth,
        max_depth=max_depth,
        num_depth=num_depth,
        with_feature_3d=depth,
        loss_depth_weight=1.0 if depth_loss else -1,
    ),
    decoder=dict(
        type="BBox3DDecoder",
        look_forward_twice=True,
        instance_bank=dict(
            type="InstanceBank",
            num_anchor=50,
            anchor="anchor_files/embodiedscan_kmeans.npy",
            embed_dims=256,
            anchor_in_camera=True,
        ),
        anchor_encoder=dict(
            type="DoF9BoxEncoder",
            embed_dims=256,
            rot_dims=3,
        ),
        graph_model=dict(
            type="MultiheadAttention",
            embed_dims=256,
            num_heads=8,
            dropout=0.0,
            batch_first=True,
        ),
        ffn=dict(
            type="FFN",
            embed_dims=256,
            feedforward_channels=2048,
            ffn_drop=0.0,
        ),
        norm_layer=dict(type="LN", normalized_shape=256),
        deformable_model=dict(
            type="DeformableFeatureAggregation",
            embed_dims=256,
            num_groups=8,
            num_levels=4,
            use_camera_embed=True,
            use_deformable_func=True,
            with_depth=True,
            min_depth=min_depth,
            max_depth=max_depth,
            kps_generator=dict(
                type="SparseBox3DKeyPointsGenerator",
                fix_scale=[
                    [0, 0, 0],
                    [0.45, 0, 0],
                    [-0.45, 0, 0],
                    [0, 0.45, 0],
                    [0, -0.45, 0],
                    [0, 0, 0.45],
                    [0, 0, -0.45],
                ],
                num_learnable_pts=9,
            ),
            with_value_proj=True,
            filter_outlier=True,
        ),
        text_cross_attn=dict(
            type="MultiheadAttention",
            embed_dims=256,
            num_heads=8,
            dropout=0.0,
            batch_first=True,
        ),
        refine_layer=dict(
            type="GroundingRefineClsHead",
            embed_dims=256,
            output_dim=9,
            cls_bias=True,
            # cls_layers=True,
        ),
        loss_cls=dict(
            type="mmdet.FocalLoss",
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0,
        ),
        loss_reg=dict(
            type="DoF9BoxLoss",
            loss_weight_wd=1.0,
            loss_weight_pcd=0.0,
            loss_weight_cd=0.8,
        ),
        sampler=dict(
            type="Grounding3DTarget",
            cls_weight=1.0,
            box_weight=1.0,
            num_dn=100,
            cost_weight_wd=1.0,
            cost_weight_pcd=0.0,
            cost_weight_cd=0.8,
            with_dn_query=True,
            num_classes=284,
            embed_dims=256,
        ),
        gt_reg_key="gt_bboxes_3d",
        gt_cls_key="tokens_positive",
        post_processor=dict(
            type="GroundingBox3DPostProcess",
            num_output=1000,
        ),
        with_instance_id=False,
    ),
)

dataset_type = "EmbodiedScanDetGroundingDataset"
data_root = "data"
metainfo = dict(
    classes=class_names,
    classes_split=(head_labels, common_labels, tail_labels),
    box_type_3d="euler-depth",
)


image_wh = (512, 512)

rotate_3rscan = True
sep_token = "[SEP]"
cam_standardization = True
if cam_standardization:
    resize = dict(
        type="CamIntrisicStandardization",
        dst_intrinsic=[
            [432.57943431339237, 0.0, 256],
            [0.0, 539.8570854208559, 256],
            [0.0, 0.0, 1.0],
        ],
        dst_wh=image_wh,
    )
else:
    resize = dict(type="CustomResize", scale=image_wh, keep_ratio=False)

train_pipeline = [
    dict(type="LoadAnnotations3D"),
    dict(
        type="MultiViewPipeline",
        n_images=1,
        max_n_images=18,
        transforms=[
            dict(type="LoadImageFromFile", backend_args=backend_args),
            dict(type="LoadDepthFromFile", backend_args=backend_args),
            resize,
            dict(
                type="ResizeCropFlipImage",
                data_aug_conf={
                    "resize_lim": (1.0, 1.0),
                    "final_dim": image_wh,
                    "bot_pct_lim": (0.0, 0.05),
                    "rot_lim": (0, 0),
                    "H": image_wh[1],
                    "W": image_wh[0],
                    "rand_flip": False,
                },
            ),
        ],
        rotate_3rscan=rotate_3rscan,
        ordered=False,
    ),
    dict(
        type="CategoryGroundingDataPrepare",
        classes=class_names,
        filter_others=True,
        sep_token=sep_token,
        max_class=128,
        training=True,
        z_range=z_range,
    ),
    dict(
        type="Pack3DDetInputs",
        keys=["img", "depth_img", "gt_bboxes_3d", "gt_labels_3d"],
    ),
]
if depth_loss:
    train_pipeline.append(
        dict(
            type="DepthProbLabelGenerator",
            origin_stride=4,
            min_depth=min_depth,
            max_depth=max_depth,
            num_depth=num_depth,
        ),
    )

test_pipeline = [
    dict(type="LoadAnnotations3D"),
    dict(
        type="MultiViewPipeline",
        n_images=1,
        max_n_images=50,
        ordered=True,
        transforms=[
            dict(type="LoadImageFromFile", backend_args=backend_args),
            dict(type="LoadDepthFromFile", backend_args=backend_args),
            resize,
        ],
        rotate_3rscan=rotate_3rscan,
    ),
    dict(
        type="CategoryGroundingDataPrepare",
        classes=class_names,
        sep_token=sep_token,
        training=False,
        filter_others=False,
    ),
    dict(
        type="Pack3DDetInputs",
        keys=["img", "depth_img", "gt_bboxes_3d", "gt_labels_3d"],
    ),
]

trainval = False
data_version = "v1"

if data_version == "v1":
    train_ann_file = "embodiedscan/embodiedscan_infos_train.pkl"
    val_ann_file = "embodiedscan/embodiedscan_infos_val.pkl"
elif data_version == "v2":
    train_ann_file = "embodiedscan-v2/embodiedscan_infos_train.pkl"
    val_ann_file = "embodiedscan-v2/embodiedscan_infos_val.pkl"
else:
    assert False

train_dataset = dict(
    type=dataset_type,
    data_root=data_root,
    ann_file=train_ann_file,
    pipeline=train_pipeline,
    test_mode=False,
    filter_empty_gt=True,
    box_type_3d="Euler-Depth",
    metainfo=metainfo,
)


if trainval:
    train_dataset = dict(
        type="ConcatDataset",
        datasets=[
            train_dataset,
            dict(
                type=dataset_type,
                data_root=data_root,
                ann_file=val_ann_file,
                pipeline=train_pipeline,
                test_mode=False,
                filter_empty_gt=True,
                box_type_3d="Euler-Depth",
                metainfo=metainfo,
            )
        ]
    )


train_dataloader = dict(
    batch_size=1,
    num_workers=4 if not DEBUG else 0,
    persistent_workers=False,
    sampler=dict(type="DefaultSampler", shuffle=True),
    dataset=dict(
        type="RepeatDataset",
        times=10,
        dataset=train_dataset,
    ),
)

    
val_dataloader = dict(
    batch_size=1,
    num_workers=4,
    persistent_workers=False,
    drop_last=False,
    sampler=dict(type="DefaultSampler", shuffle=False),
    dataset=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=val_ann_file,
        pipeline=test_pipeline,
        test_mode=True,
        filter_empty_gt=True,
        box_type_3d="Euler-Depth",
        metainfo=metainfo,
    ),
)
test_dataloader = val_dataloader

val_evaluator = dict(
    type="IndoorDetMetric",
    collect_dir="/job_data/.dist_test" if not DEBUG else None,
    # collect_device="gpu"
)
test_evaluator = val_evaluator

max_epochs = 24
train_cfg = dict(
    type="EpochBasedTrainLoop", max_epochs=max_epochs, val_interval=1
)
val_cfg = dict(type="ValLoop")
test_cfg = dict(type="TestLoop")

lr = 2e-4
optim_wrapper = dict(
    type="OptimWrapper",
    optimizer=dict(type="AdamW", lr=lr, weight_decay=0.0005),
    paramwise_cfg=dict(
        custom_keys={
            "backbone.": dict(lr_mult=0.1),
            "text_encoder": dict(lr_mult=0.05),
            "absolute_pos_embed": dict(decay_mult=0.0),
        }
    ),
    clip_grad=dict(max_norm=10, norm_type=2),
)


# learning rate
param_scheduler = [
    dict(
        type="LinearLR", start_factor=0.001, by_epoch=False, begin=0, end=500
    ),
    dict(
        type="MultiStepLR",
        begin=0,
        end=max_epochs,
        by_epoch=True,
        milestones=[int(max_epochs / 12 * 8), int(max_epochs / 12 * 11)],
        gamma=0.1,
    ),
]

custom_hooks = [dict(type="EmptyCacheHook", after_iter=False)]
default_hooks = dict(
    checkpoint=dict(type="CheckpointHook", interval=1, max_keep_ckpts=3),
)

vis_backends = [
    dict(
        type="TensorboardVisBackend",
        save_dir="/job_tboard" if not DEBUG else "./work-dir",
    ),
]

visualizer = dict(
    type="Visualizer",
    vis_backends=vis_backends,
    name="visualizer",
)

load_from = "ckpt/groundingdino_swint_ogc_mmdet-822d7e9d-rename.pth"


================================================
FILE: configs/bip3d_det_grounding.py
================================================
_base_ = ["./bip3d_grounding.py"]
from mmengine import Config
import os
det_config = Config.fromfile("configs/bip3d_det.py")
det_train_dataset = det_config.train_dataset
del Config, det_config

train_dataloader = _base_.train_dataloader
DEBUG = os.environ.get("CLUSTER") is None

train_dataloader["dataset"] = dict(
    type="ConcatDataset",
    datasets=[
        dict(
            type="RepeatDataset",
            times=20,
            dataset=det_train_dataset,
        ),
        train_dataloader["dataset"],
    ]
)


max_iters = 50000
train_cfg = dict(
    type="IterBasedTrainLoop", max_iters=max_iters, val_interval=25000,
    _delete_=True,
)
val_cfg = dict(type="ValLoop")
test_cfg = dict(type="TestLoop")

lr = 2e-4
optim_wrapper = dict(
    type="OptimWrapper",
    optimizer=dict(type="AdamW", lr=lr, weight_decay=0.0005),
    paramwise_cfg=dict(
        custom_keys={
            "backbone.": dict(lr_mult=0.1),
            "text_encoder": dict(lr_mult=0.05),
            "absolute_pos_embed": dict(decay_mult=0.0),
        }
    ),
    clip_grad=dict(max_norm=10, norm_type=2),
)


# learning rate
param_scheduler = [
    dict(
        type="LinearLR", start_factor=0.001, by_epoch=False, begin=0, end=500
    ),
    dict(
        type="MultiStepLR",
        begin=0,
        end=max_iters,
        by_epoch=False,
        milestones=[int(max_iters*0.8), int(max_iters*0.95)],
        gamma=0.1,
    ),
]

custom_hooks = [dict(type="EmptyCacheHook", after_iter=False)]
default_hooks = dict(
    logger=dict(
        type="LoggerHook",
        interval=25,
        log_metric_by_epoch=False,
    ),
    checkpoint=dict(
        type="CheckpointHook", interval=25000, max_keep_ckpts=3, by_epoch=False
    ),
)

vis_backends = [
    dict(
        type="TensorboardVisBackend",
        save_dir="/job_tboard" if not DEBUG else "./work-dir",
    ),
]

visualizer = dict(
    type="Visualizer",
    vis_backends=vis_backends,
    name="visualizer",
)
log_processor = dict(type="LogProcessor", window_size=50, by_epoch=False)

load_from = "ckpt/bip3d_det.pth"


================================================
FILE: configs/bip3d_det_rgb.py
================================================
_base_ = ["./bip3d_det.py"]

model = dict(
    backbone_3d=None,
    neck_3d=None,
    spatial_enhancer=dict(with_feature_3d=False),
    decoder=dict(deformable_model=dict(with_depth=False)),
)


================================================
FILE: configs/bip3d_grounding.py
================================================
_base_ = ["./default_runtime.py"]

import os
from bip3d.datasets.embodiedscan_det_grounding_dataset import (
    class_names, head_labels, common_labels, tail_labels
)

DEBUG = os.environ.get("CLUSTER") is None
del os

backend_args = None

metainfo = dict(classes="all")

depth = True
depth_loss = True

z_range=[-0.2, 3]
min_depth = 0.25
max_depth = 10
num_depth = 64

model = dict(
    type="BIP3D",
    input_3d="depth_img",
    use_depth_grid_mask=True,
    data_preprocessor=dict(
        type="CustomDet3DDataPreprocessor",
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        bgr_to_rgb=True,
        pad_size_divisor=32,
    ),
    backbone=dict(
        type="mmdet.SwinTransformer",
        embed_dims=96,
        depths=[2, 2, 6, 2],
        num_heads=[3, 6, 12, 24],
        window_size=7,
        mlp_ratio=4,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.2,
        patch_norm=True,
        out_indices=(1, 2, 3),
        with_cp=True,
        convert_weights=False,
    ),
    neck=dict(
        type="mmdet.ChannelMapper",
        in_channels=[192, 384, 768],
        kernel_size=1,
        out_channels=256,
        act_cfg=None,
        bias=True,
        norm_cfg=dict(type="GN", num_groups=32),
        num_outs=4,
    ),
    text_encoder=dict(
        type="BertModel",
        special_tokens_list=["[CLS]", "[SEP]"],
        name="./ckpt/bert-base-uncased",
        pad_to_max=False,
        use_sub_sentence_represent=True,
        add_pooling_layer=False,
        max_tokens=768,
        use_checkpoint=True,
        return_tokenized=True,
    ),
    backbone_3d=(
        dict(
            type="mmdet.ResNet",
            depth=34,
            in_channels=1,
            base_channels=4,
            num_stages=4,
            out_indices=(1, 2, 3),
            norm_cfg=dict(type="BN", requires_grad=True),
            norm_eval=True,
            with_cp=True,
            style="pytorch",
        )
        if depth
        else None
    ),
    neck_3d=(
        dict(
            type="mmdet.ChannelMapper",
            in_channels=[8, 16, 32],
            kernel_size=1,
            out_channels=32,
            act_cfg=None,
            bias=True,
            norm_cfg=dict(type="GN", num_groups=4),
            num_outs=4,
        )
        if depth
        else None
    ),
    feature_enhancer=dict(
        type="TextImageDeformable2DEnhancer",
        num_layers=6,
        text_img_attn_block=dict(
            v_dim=256, l_dim=256, embed_dim=1024, num_heads=4, init_values=1e-4
        ),
        img_attn_block=dict(
            self_attn_cfg=dict(
                embed_dims=256, num_levels=4, dropout=0.0, im2col_step=64
            ),
            ffn_cfg=dict(
                embed_dims=256, feedforward_channels=2048, ffn_drop=0.0
            ),
        ),
        text_attn_block=dict(
            self_attn_cfg=dict(num_heads=4, embed_dims=256, dropout=0.0),
            ffn_cfg=dict(
                embed_dims=256, feedforward_channels=1024, ffn_drop=0.0
            ),
        ),
        embed_dims=256,
        num_feature_levels=4,
        positional_encoding=dict(
            num_feats=128, normalize=True, offset=0.0, temperature=20
        ),
    ),
    spatial_enhancer=dict(
        type="DepthFusionSpatialEnhancer",
        embed_dims=256,
        feature_3d_dim=32,
        num_depth_layers=2,
        min_depth=min_depth,
        max_depth=max_depth,
        num_depth=num_depth,
        with_feature_3d=depth,
        loss_depth_weight=1.0 if depth_loss else -1,
    ),
    decoder=dict(
        type="BBox3DDecoder",
        look_forward_twice=True,
        instance_bank=dict(
            type="InstanceBank",
            num_anchor=50,
            anchor="anchor_files/embodiedscan_kmeans.npy",
            embed_dims=256,
            anchor_in_camera=True,
        ),
        anchor_encoder=dict(
            type="DoF9BoxEncoder",
            embed_dims=256,
            rot_dims=3,
        ),
        graph_model=dict(
            type="MultiheadAttention",
            embed_dims=256,
            num_heads=8,
            dropout=0.0,
            batch_first=True,
        ),
        ffn=dict(
            type="FFN",
            embed_dims=256,
            feedforward_channels=2048,
            ffn_drop=0.0,
        ),
        norm_layer=dict(type="LN", normalized_shape=256),
        deformable_model=dict(
            type="DeformableFeatureAggregation",
            embed_dims=256,
            num_groups=8,
            num_levels=4,
            use_camera_embed=True,
            use_deformable_func=True,
            with_depth=True,
            min_depth=min_depth,
            max_depth=max_depth,
            kps_generator=dict(
                type="SparseBox3DKeyPointsGenerator",
                fix_scale=[
                    [0, 0, 0],
                    [0.45, 0, 0],
                    [-0.45, 0, 0],
                    [0, 0.45, 0],
                    [0, -0.45, 0],
                    [0, 0, 0.45],
                    [0, 0, -0.45],
                ],
                num_learnable_pts=9,
            ),
            with_value_proj=True,
            filter_outlier=True,
        ),
        text_cross_attn=dict(
            type="MultiheadAttention",
            embed_dims=256,
            num_heads=8,
            dropout=0.0,
            batch_first=True,
        ),
        refine_layer=dict(
            type="GroundingRefineClsHead",
            embed_dims=256,
            output_dim=9,
            cls_bias=True,
            # cls_layers=True,
        ),
        loss_cls=dict(
            type="mmdet.FocalLoss",
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0,
        ),
        loss_reg=dict(
            type="DoF9BoxLoss",
            loss_weight_wd=1.0,
            loss_weight_pcd=0.0,
            loss_weight_cd=0.8,
        ),
        sampler=dict(
            type="Grounding3DTarget",
            cls_weight=1.0,
            box_weight=1.0,
            num_dn=100,
            cost_weight_wd=1.0,
            cost_weight_pcd=0.0,
            cost_weight_cd=0.8,
            with_dn_query=True,
            num_classes=284,
            embed_dims=256,
        ),
        gt_reg_key="gt_bboxes_3d",
        gt_cls_key="tokens_positive",
        post_processor=dict(
            type="GroundingBox3DPostProcess",
            num_output=1000,
        ),
        with_instance_id=False,
    ),
)

dataset_type = "EmbodiedScanDetGroundingDataset"
data_root = "data"
metainfo = dict(
    classes=class_names,
    classes_split=(head_labels, common_labels, tail_labels),
    box_type_3d="euler-depth",
)


image_wh = (512, 512)

rotate_3rscan = True
sep_token = "[SEP]"
cam_standardization = True
if cam_standardization:
    resize = dict(
        type="CamIntrisicStandardization",
        dst_intrinsic=[
            [432.57943431339237, 0.0, 256],
            [0.0, 539.8570854208559, 256],
            [0.0, 0.0, 1.0],
        ],
        dst_wh=image_wh,
    )
else:
    resize = dict(type="CustomResize", scale=image_wh, keep_ratio=False)

train_pipeline = [
    dict(type="LoadAnnotations3D"),
    dict(
        type="MultiViewPipeline",
        n_images=1,
        max_n_images=18,
        transforms=[
            dict(type="LoadImageFromFile", backend_args=backend_args),
            dict(type="LoadDepthFromFile", backend_args=backend_args),
            resize,
            dict(
                type="ResizeCropFlipImage",
                data_aug_conf={
                    "resize_lim": (1.0, 1.0),
                    "final_dim": image_wh,
                    "bot_pct_lim": (0.0, 0.05),
                    "rot_lim": (0, 0),
                    "H": image_wh[1],
                    "W": image_wh[0],
                    "rand_flip": False,
                },
            ),
        ],
        rotate_3rscan=rotate_3rscan,
        ordered=True,
    ),
    dict(
        type="Pack3DDetInputs",
        keys=["img", "depth_img", "gt_bboxes_3d", "gt_labels_3d"],
    ),
]
if depth_loss:
    train_pipeline.append(
        dict(
            type="DepthProbLabelGenerator",
            origin_stride=4,
            min_depth=min_depth,
            max_depth=max_depth,
            num_depth=num_depth,
        ),
    )

test_pipeline = [
    dict(type="LoadAnnotations3D"),
    dict(
        type="MultiViewPipeline",
        n_images=1,
        max_n_images=50,
        ordered=True,
        transforms=[
            dict(type="LoadImageFromFile", backend_args=backend_args),
            dict(type="LoadDepthFromFile", backend_args=backend_args),
            resize,
        ],
        rotate_3rscan=rotate_3rscan,
    ),
    dict(
        type="Pack3DDetInputs",
        keys=["img", "depth_img", "gt_bboxes_3d", "gt_labels_3d"],
    ),
]

trainval = False
data_version = "v1"

if data_version == "v1":
    train_ann_file = "embodiedscan/embodiedscan_infos_train.pkl"
    val_ann_file = "embodiedscan/embodiedscan_infos_val.pkl"
    train_vg_file = "embodiedscan/embodiedscan_train_vg_all.json"
    val_vg_file = "embodiedscan/embodiedscan_val_vg_all.json"
elif data_version == "v1-mini":
    train_ann_file = "embodiedscan/embodiedscan_infos_train.pkl"
    val_ann_file = "embodiedscan/embodiedscan_infos_val.pkl"
    train_vg_file = "embodiedscan/embodiedscan_train_mini_vg.json"
    val_vg_file = "embodiedscan/embodiedscan_val_mini_vg.json"
elif data_version == "v2":
    train_ann_file = "embodiedscan-v2/embodiedscan_infos_train.pkl"
    val_ann_file = "embodiedscan-v2/embodiedscan_infos_val.pkl"
    train_vg_file = "embodiedscan-v2/embodiedscan_train_vg.json"
    val_vg_file = "embodiedscan-v2/embodiedscan_val_vg.json"

test_ann_file = "embodiedscan/embodiedscan_infos_test.pkl"
test_vg_file = "embodiedscan/embodiedscan_test_vg.json"

train_dataset = dict(
    type=dataset_type,
    data_root=data_root,
    ann_file=train_ann_file,
    pipeline=train_pipeline,
    test_mode=False,
    filter_empty_gt=True,
    box_type_3d="Euler-Depth",
    metainfo=metainfo,
    mode="grounding",
    vg_file=train_vg_file,
    num_text=10,
    sep_token=sep_token,
)

if trainval:
    train_dataset = dict(
        type="ConcatDataset",
        datasets=[
            train_dataset,
            dict(
                type=dataset_type,
                data_root=data_root,
                ann_file=val_ann_file,
                pipeline=train_pipeline,
                test_mode=False,
                filter_empty_gt=True,
                box_type_3d="Euler-Depth",
                metainfo=metainfo,
                mode="grounding",
                vg_file=val_vg_file,
                num_text=10,
                sep_token=sep_token,
            )
        ],
    )

train_dataloader = dict(
    batch_size=1,
    num_workers=4 if not DEBUG else 0,
    persistent_workers=False,
    sampler=dict(type="DefaultSampler", shuffle=True),
    dataset=train_dataset,
)

val_dataloader = dict(
    batch_size=1,
    num_workers=4 if not DEBUG else 0,
    persistent_workers=False,
    drop_last=False,
    sampler=dict(type="DefaultSampler", shuffle=False),
    dataset=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=val_ann_file,
        pipeline=test_pipeline,
        test_mode=True,
        filter_empty_gt=True,
        box_type_3d="Euler-Depth",
        metainfo=metainfo,
        mode="grounding",
        vg_file=val_vg_file,
    ),
)

test_dataloader = dict(
    batch_size=1,
    num_workers=4 if not DEBUG else 0,
    persistent_workers=False,
    drop_last=False,
    sampler=dict(type="DefaultSampler", shuffle=False),
    dataset=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=test_ann_file,
        pipeline=[
            dict(type="LoadAnnotations3D"),
            dict(
                type="MultiViewPipeline",
                n_images=1,
                max_n_images=50,
                ordered=True,
                transforms=[
                    dict(type="LoadImageFromFile", backend_args=backend_args),
                    dict(type="LoadDepthFromFile", backend_args=backend_args),
                    resize,
                ],
                rotate_3rscan=rotate_3rscan,
            ),
            dict(
                type="Pack3DDetInputs",
                keys=["img", "depth_img", "gt_bboxes_3d", "gt_labels_3d"],
            ),
        ],
        test_mode=True,
        filter_empty_gt=True,
        box_type_3d="Euler-Depth",
        metainfo=metainfo,
        mode="grounding",
        vg_file=test_vg_file,
    ),
)

val_evaluator = dict(
    type="GroundingMetric",
    collect_dir="/job_data/.dist_test" if not DEBUG else None,
)

test_evaluator = dict(
    type="GroundingMetric",
    collect_dir="/job_data/.dist_test" if not DEBUG else None,
    format_only=True,
    submit_info={
        'method': 'BIP3D',
        'team': 'robot-lab manipulation team',
        'authors': ['xuewu lin'],
        'e-mail': '878585984@qq.com',
        'institution': 'Horizon',
        'country': 'China',
    },
    result_dir="/job_data" if not DEBUG else "./",
)

max_epochs = 2
train_cfg = dict(
    type="EpochBasedTrainLoop", max_epochs=max_epochs, val_interval=1
)
val_cfg = dict(type="ValLoop")
test_cfg = dict(type="TestLoop")

lr = 2e-4
optim_wrapper = dict(
    type="OptimWrapper",
    optimizer=dict(type="AdamW", lr=lr, weight_decay=0.0005),
    paramwise_cfg=dict(
        custom_keys={
            "backbone.": dict(lr_mult=0.1),
            # "text_encoder": dict(lr_mult=0.05),
            "absolute_pos_embed": dict(decay_mult=0.0),
        }
    ),
    clip_grad=dict(max_norm=10, norm_type=2),
)


# learning rate
param_scheduler = [
    dict(
        type="LinearLR", start_factor=0.001, by_epoch=False, begin=0, end=500
    ),
    dict(
        type="MultiStepLR",
        begin=0,
        end=62000,
        by_epoch=False,
        milestones=[40000, 55000],
        gamma=0.1,
    ),
]

custom_hooks = [dict(type="EmptyCacheHook", after_iter=False)]
default_hooks = dict(
    checkpoint=dict(type="CheckpointHook", interval=1, max_keep_ckpts=3),
)

vis_backends = [
    dict(
        type="TensorboardVisBackend",
        save_dir="/job_tboard" if not DEBUG else "./work-dir",
    ),
]

visualizer = dict(
    type="Visualizer",
    vis_backends=vis_backends,
    name="visualizer",
)

load_from = "ckpt/bip3d_det.pth"


================================================
FILE: configs/bip3d_grounding_rgb.py
================================================
_base_ = ["./bip3d_grounding.py"]

model = dict(
    backbone_3d=None,
    neck_3d=None,
    spatial_enhancer=dict(with_feature_3d=False),
    decoder=dict(deformable_model=dict(with_depth=False)),
)

load_from = "ckpt/bip3d_det_rgb.pth"


================================================
FILE: configs/default_runtime.py
================================================
default_scope = "bip3d"

default_hooks = dict(
    timer=dict(type="IterTimerHook"),
    logger=dict(
        type="LoggerHook",
        interval=25,
    ),
    param_scheduler=dict(type="ParamSchedulerHook"),
    checkpoint=dict(type="CheckpointHook", interval=1, max_keep_ckpts=4),
    sampler_seed=dict(type="DistSamplerSeedHook"),
)
# visualization=dict(type='Det3DVisualizationHook'))

env_cfg = dict(
    cudnn_benchmark=False,
    mp_cfg=dict(mp_start_method="fork", opencv_num_threads=0),
    dist_cfg=dict(backend="nccl"),
)

log_processor = dict(type="LogProcessor", window_size=50, by_epoch=True)

log_level = "INFO"
load_from = None
resume = False

# TODO: support auto scaling lr
randomness = dict(seed=0)
find_unused_parameters = False


================================================
FILE: docs/quick_start.md
================================================
# Quick Start

### Set up python environment
```bash
virtualenv mm_bip3d --python=python3.8
source mm_bip3d/bin/activate
pip3 install --upgrade pip

bip3d_path="path/to/bip3d"
cd ${bip3d_path}
# MMCV recommends installing via a wheel package, url: https://download.openmmlab.com/mmcv/dist/cu{$cuda_version}/torch{$torch_version}/index.html
pip3 install -r requirement.txt
```

### Compile the deformable_aggregation CUDA op
```bash
cd bip3d/ops
python3 setup.py develop
cd ../../
```

### Prepare the data
Download the [EmbodiedScan dataset](https://github.com/OpenRobotLab/EmbodiedScan) and create symbolic links.
```bash
cd ${bip3d_path}
mkdir data
ln -s path/to/embodiedscan ./data/embodiedscan
```

Download datasets [ScanNet](https://github.com/ScanNet/ScanNet), [3RScan](https://github.com/WaldJohannaU/3RScan), [Matterport3D](https://github.com/niessner/Matterport), and optionally download [ARKitScenes](https://github.com/apple/ARKitScenes). Adjust the data directory structure as follows:
```bash
${bip3d_path}
└──data
    ├──embodiedscan
    │   ├──embodiedscan_infos_train.pkl
    │   ├──embodiedscan_infos_val.pkl
    │   ...
    ├──3rscan
    │   ├──00d42bed-778d-2ac6-86a7-0e0e5f5f5660
    │   ...
    ├──scannet
    │   └──posed_images
    ├──matterport3d
    │   ├──17DRP5sb8fy
    │   ...
    └──arkitscenes
        ├──Training
        └──Validation
```

### Prepare pre-trained weights
Download the required Grounding-DINO pre-trained weights: [Swin-Tiny](https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/groundingdino_swint_ogc_mmdet-822d7e9d.pth) and [Swin-Base](https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/groundingdino_swinb_cogcoor_mmdet-55949c9c.pth).
```bash
mkdir ckpt

# Swin-Tiny
wget https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/groundingdino_swint_ogc_mmdet-822d7e9d.pth -O ckpt/groundingdino_swint_ogc_mmdet-822d7e9d.pth
python tools/ckpt_rename.py ckpt/groundingdino_swint_ogc_mmdet-822d7e9d.pth

# Swin-Base
wget https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/groundingdino_swinb_cogcoor_mmdet-55949c9c.pth -O ckpt/groundingdino_swinb_cogcoor_mmdet-55949c9c.pth
python tools/ckpt_rename.py ckpt/groundingdino_swinb_cogcoor_mmdet-55949c9c.pth
```
Download bert config and pretrain weights from [huggingface](https://huggingface.co/google-bert/bert-base-uncased/tree/main).
```bash
${bip3d_path}
└──ckpt
    ├──groundingdino_swint_ogc_mmdet-822d7e9d.pth
    ├──groundingdino_swinb_cogcoor_mmdet-55949c9c.pth
    └──bert-base-uncased
        ├──config.json
        ├──tokenizer_config.json
        ├──tokenizer.json
        ├──pytorch_model.bin
        ...
```

### Generate anchors by K-means
```bash
mkdir anchor_files
python3 tools/anchor_bbox3d_kmeans.py \
    --ann_file data/embodiedscan/embodiedscan_infos_train.pkl \
    --output_file anchor_files/embodiedscan_kmeans.npy
```
You can also download the anchor file we provide.


### Modify config
According to personal needs, modify some config items, such as [`DEBUG`](../configs/bip3d_det.py#L8), [`collect_dir`](../configs/bip3d_det.py#L425), [`save_dir`](../configs/bip3d_det.py#L475), and [`load_from`](../configs/bip3d_det.py#L485).

If performing multi-machine training, ensure that [`collect_dir`](../configs/bip3d_det.py#L425) is a shared folder accessible by all machines.

### Run local training and testing
```bash
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# train
bash engine.sh  configs/xxx.py

# test
bash test.sh  configs/xxx.py  path/to/checkpoint
```


================================================
FILE: engine.sh
================================================
export PYTHONPATH=./:$PYTHONPATH
CONFIG=configs/bip3d_det.py
if ! [[ -z $1 ]]; then
    echo $1
    CONFIG=$1
fi
nvcc -V
which nvcc

gpus=(${CUDA_VISIBLE_DEVICES//,/ })
gpu_num=${#gpus[@]}
echo "number of gpus: "${gpu_num}
echo $CONFIG

if [ ${gpu_num} -gt 1 ]
then
    bash ./tools/dist_train.sh \
        ${CONFIG} \
        ${gpu_num} \
        --work-dir=work-dirs
else
    python ./tools/train.py \
        ${CONFIG} \
        --work-dir ./work-dir
fi


================================================
FILE: requirements.txt
================================================
einops==0.8.0
huggingface-hub==0.26.5
mmcv==2.1.0
mmdet==3.3.0
mmengine==0.10.4
ninja==1.11.1.3
numpy==1.26.4
opencv-python==4.10.0.84
pytorch3d==0.7.5
requests==2.32.3
scipy==1.14.1
tensorboard==2.18.0
timm==1.0.3
tokenizers==0.21.0
torch==2.1.0+cu118
torchaudio==2.1.0+cu118
torchmetrics==1.6.1
torchvision==0.16.0+cu118


================================================
FILE: test.sh
================================================
export PYTHONPATH=./:$PYTHONPATH
CONFIG=$1
CKPT=$2

nvcc -V
which nvcc

gpus=(${CUDA_VISIBLE_DEVICES//,/ })
gpu_num=${#gpus[@]}
echo "number of gpus: "${gpu_num}
echo "ckeckpoint: "$CKPT
echo $CONFIG


if [ ${gpu_num} -gt 1 ]
then
    bash ./tools/dist_test.sh \
        ${CONFIG} \
        $CKPT \
        $gpu_num \
        --work-dir ./work-dir
else
    python3 tools/test.py \
        $CONFIG \
        $CKPT \
        --work-dir ./work-dir
fi


================================================
FILE: tools/anchor_bbox3d_kmeans.py
================================================
import torch
import pickle
import tqdm
from sklearn.cluster import KMeans
import numpy as np

from bip3d.structures.bbox_3d import EulerDepthInstance3DBoxes


def sample(ids, n):
    if n == len(ids):
        return np.copy(ids)
    elif n > len(ids):
        return np.concatenate(
            [ids, sample(ids, n - len(ids))]
        )
    else:
        interval = len(ids) / n
        output = []
        for i in range(n):
            output.append(ids[int(interval*i)])
        return np.array(output)


def kmeans(
    ann_file,
    output_file,
    z_min=-0.2,
    z_max=3,
):
    ann = pickle.load(open(ann_file, "rb"))
    all_cam_bbox = []
    for x in tqdm.tqdm(ann["data_list"]):
        bbox = np.array([y["bbox_3d"] for y in x["instances"]])
        ids = np.arange(len(x["images"]))
        ids = sample(ids, 50)
        for idx in ids:
            image = x["images"][idx]
            global2cam = np.linalg.inv(x['axis_align_matrix'] @ image['cam2global'])
            _bbox = EulerDepthInstance3DBoxes(np.copy(bbox[image["visible_instance_ids"]]))
            mask = torch.logical_and(
                _bbox.tensor[:, 2] > z_min,
                _bbox.tensor[:, 2] < z_max
            )
            _bbox = _bbox[mask]
            _bbox.transform(global2cam)
            all_cam_bbox.append(_bbox.tensor.numpy())
    all_cam_bbox = np.concatenate(all_cam_bbox)
    print("start to kmeans, please wait")
    cluster_cam = KMeans(n_clusters=100).fit(all_cam_bbox).cluster_centers_
    cluster_cam[:, 3:6] = np.log(cluster_cam[:, 3:6])
    np.save(output_file, cluster_cam)


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(
        description='anchor bbox3d kmeans for embodiedscan dataset')
    parser.add_argument("ann_file")
    parser.add_argument("--output_file")
    parser.add_argument("--z_min", defaule=-0.2)
    parser.add_argument("--z_max", defaule=3)
    args = parser.parse_args()
    kmeans(args.ann_file, args.output_file, args.z_min, args.z_max)


================================================
FILE: tools/benchmark.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import os

from mmengine import MMLogger
from mmengine.config import Config, DictAction
from mmengine.dist import init_dist
from mmengine.registry import init_default_scope
from mmengine.utils import mkdir_or_exist

from mmdet.utils.benchmark import (DataLoaderBenchmark, DatasetBenchmark,
                                   InferenceBenchmark)

from embodiedscan import *

def parse_args():
    parser = argparse.ArgumentParser(description='MMDet benchmark')
    parser.add_argument('config', help='test config file path')
    parser.add_argument('--checkpoint', help='checkpoint file')
    parser.add_argument(
        '--task',
        choices=['inference', 'dataloader', 'dataset'],
        default='dataloader',
        help='Which task do you want to go to benchmark')
    parser.add_argument(
        '--repeat-num',
        type=int,
        default=1,
        help='number of repeat times of measurement for averaging the results')
    parser.add_argument(
        '--max-iter', type=int, default=2000, help='num of max iter')
    parser.add_argument(
        '--log-interval', type=int, default=50, help='interval of logging')
    parser.add_argument(
        '--num-warmup', type=int, default=5, help='Number of warmup')
    parser.add_argument(
        '--fuse-conv-bn',
        action='store_true',
        help='Whether to fuse conv and bn, this will slightly increase'
        'the inference speed')
    parser.add_argument(
        '--dataset-type',
        choices=['train', 'val', 'test'],
        default='test',
        help='Benchmark dataset type. only supports train, val and test')
    parser.add_argument(
        '--work-dir',
        help='the directory to save the file containing '
        'benchmark metrics')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.')
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()
    if 'LOCAL_RANK' not in os.environ:
        os.environ['LOCAL_RANK'] = str(args.local_rank)
    return args


def inference_benchmark(args, cfg, distributed, logger):
    benchmark = InferenceBenchmark(
        cfg,
        args.checkpoint,
        distributed,
        args.fuse_conv_bn,
        args.max_iter,
        args.log_interval,
        args.num_warmup,
        logger=logger)
    return benchmark


def dataloader_benchmark(args, cfg, distributed, logger):
    benchmark = DataLoaderBenchmark(
        cfg,
        distributed,
        args.dataset_type,
        args.max_iter,
        args.log_interval,
        args.num_warmup,
        logger=logger)
    return benchmark


def dataset_benchmark(args, cfg, distributed, logger):
    benchmark = DatasetBenchmark(
        cfg,
        args.dataset_type,
        args.max_iter,
        args.log_interval,
        args.num_warmup,
        logger=logger)
    return benchmark


def main():
    args = parse_args()
    cfg = Config.fromfile(args.config)
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)

    init_default_scope(cfg.get('default_scope', 'mmdet'))

    distributed = False
    if args.launcher != 'none':
        init_dist(args.launcher, **cfg.get('env_cfg', {}).get('dist_cfg', {}))
        distributed = True

    log_file = None
    if args.work_dir:
        log_file = os.path.join(args.work_dir, 'benchmark.log')
        mkdir_or_exist(args.work_dir)

    logger = MMLogger.get_instance(
        'mmdet', log_file=log_file, log_level='INFO')

    benchmark = eval(f'{args.task}_benchmark')(args, cfg, distributed, logger)
    benchmark.run(args.repeat_num)


if __name__ == '__main__':
    main()


================================================
FILE: tools/ckpt_rename.py
================================================
import os
import torch

def get_renamed_ckpt(file, output="./"):
    ckpt_rename = dict()
    ckpt = torch.load(file)
    if "state_dict" in ckpt:
        ckpt = ckpt["state_dict"]
    for key,value in ckpt.items():
        k = None
        if key.startswith("backbone") or key.startswith("neck"):
            k = key
        elif key.startswith("language_model."):
            k = key.replace("language_model.", "text_encoder.")
        elif key.startswith("encoder."):
            if key.startswith("encoder.layers."):
                k = key.replace("encoder.layers.", "feature_enhancer.img_attn_blocks.")
            elif key.startswith("encoder.fusion_layers."):
                k = key.replace("encoder.fusion_layers.", "feature_enhancer.text_img_attn_blocks.")
            elif key.startswith("encoder.text_layers."):
                k = key.replace("encoder.text_layers.", "feature_enhancer.text_attn_blocks.")
        elif key.startswith("decoder.layers."):
            ops = [
                "gnn",
                "norm",
                "text_cross_attn",
                "norm",
                "deformable",
                "norm",
                "ffn",
                "norm",
                "refine",
            ]
            layer_id = int(key.split(".")[2])
            name = key.split(".")[3]
            for op in ops:
                if name == "self_attn":
                    block_id = 0
                elif name == "cross_attn_text":
                    block_id = 2
                elif name == "cross_attn":
                    block_id = 4
                elif name == "ffn":
                    block_id = 6
                elif name == "norms":
                    norm_id = int(key.split(".")[4])
                    block_id = (norm_id + 1) * 2 - 1
            op_id = block_id + layer_id * len(ops)
            k = f"decoder.layers.{op_id}."
            if name == "norms":
                k += ".".join(key.split(".")[5:])
            else:
                k += ".".join(key.split(".")[4:])
        elif key.startswith("bbox_head."):
            layer_id = int(key.split(".")[2])
            op_id = 8 + layer_id * 9
            if "reg_branches" in key:
                k = f"decoder.layers.{op_id}." + ".".join(key.split(".")[3:])
            elif "cls_branches" in key:
                k = f"decoder.layers.{op_id}.bias"
        elif "pts_prob_fc" in key:
            k = "spatial_enhancer." + key
        elif key in [
            "pts_prob_pre_fc.weight",
            "pts_prob_pre_fc.bias",
            "pts_fc.weight",
            "pts_fc.bias",
            "fusion_fc.0.layers.0.0.weight",
            "fusion_fc.0.layers.0.0.bias",
            "fusion_fc.0.layers.1.weight",
            "fusion_fc.0.layers.1.bias",
            "fusion_fc.1.weight",
            "fusion_fc.1.bias",
            "fusion_norm.weight",
            "fusion_norm.bias",
        ]:
            k = "spatial_enhancer." + key
        elif key == "level_embed":
            k = "feature_enhancer.level_embed"
        elif key == "query_embedding.weight":
            k = "decoder.instance_bank.instance_feature"
        if k is None:
    #         print(key)
            k = key
        if k == "decoder.norm.weight":
            print(key)
        ckpt_rename[k] = value

    path, file_name = os.path.split(file)
    file_name = file_name[:-4]+"-rename.pth"
    output_file = os.path.join(output, file_name)
    torch.save(ckpt_rename, output_file)

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(
        description='mmdet Grounding-DINO checkpoint rename to BIP3D')
    parser.add_argument("file")
    parser.add_argument("--output", default=None)
    args = parser.parse_args()
    if args.output is None:
        output = "./"
    get_renamed_ckpt(args.file, output)


================================================
FILE: tools/dist_test.sh
================================================
#!/usr/bin/env bash

CONFIG=$1
CHECKPOINT=$2
GPUS=$3
PORT=${PORT:-11000}

PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
python3 -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
    $(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4}


================================================
FILE: tools/dist_train.sh
================================================
#!/usr/bin/env bash

CONFIG=$1
GPUS=$2
PORT=${PORT:-12050}

PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
python3 -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
    $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3}


================================================
FILE: tools/test.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import time
import os

def wait_before_import_config():
    t = int(os.environ.get('LOCAL_RANK', 0))
    time.sleep(t * 0.5)

def wait_after_import_config():
    t = int(os.environ.get('WORLD_SIZE', 0)) - int(os.environ.get('LOCAL_RANK', 0))
    time.sleep(t * 0.5)

wait_before_import_config()
from mmengine.config import Config
wait_after_import_config()

import argparse
import os
import os.path as osp

from mmengine.config import Config, ConfigDict, DictAction
from mmengine.registry import RUNNERS
from mmengine.runner import Runner
import torch
torch.multiprocessing.set_sharing_strategy('file_system')

# from embodiedscan.utils import replace_ceph_backend


# TODO: support fuse_conv_bn and format_only
def parse_args():
    parser = argparse.ArgumentParser(
        description='MMDet3D test (and eval) a model')
    parser.add_argument('config', help='test config file path')
    parser.add_argument('checkpoint', help='checkpoint file')
    parser.add_argument(
        '--work-dir',
        help='the directory to save the file containing evaluation metrics')
    parser.add_argument('--task-name', help='task names')
    parser.add_argument('--ceph',
                        action='store_true',
                        help='Use ceph as data storage backend')
    parser.add_argument('--show',
                        action='store_true',
                        help='show prediction results')
    parser.add_argument('--show-dir',
                        help='directory where painted images will be saved. '
                        'If specified, it will be automatically saved '
                        'to the work_dir/timestamp/show_dir')
    parser.add_argument('--score-thr',
                        type=float,
                        default=0.1,
                        help='bbox score threshold')
    parser.add_argument(
        '--task',
        type=str,
        choices=[
            'mono_det', 'multi-view_det', 'lidar_det', 'lidar_seg',
            'multi-modality_det'
        ],
        help='Determine the visualization method depending on the task.')
    parser.add_argument('--wait-time',
                        type=float,
                        default=2,
                        help='the interval of show (s)')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.')
    parser.add_argument('--launcher',
                        choices=['none', 'pytorch', 'slurm', 'mpi'],
                        default='none',
                        help='job launcher')
    parser.add_argument('--tta',
                        action='store_true',
                        help='Test time augmentation')
    # When using PyTorch version >= 2.0.0, the `torch.distributed.launch`
    # will pass the `--local-rank` parameter to `tools/test.py` instead
    # of `--local_rank`.
    parser.add_argument('--local_rank', '--local-rank', type=int, default=0)
    args = parser.parse_args()
    if 'LOCAL_RANK' not in os.environ:
        os.environ['LOCAL_RANK'] = str(args.local_rank)
    return args


def trigger_visualization_hook(cfg, args):
    default_hooks = cfg.default_hooks
    if 'visualization' in default_hooks:
        visualization_hook = default_hooks['visualization']
        # Turn on visualization
        visualization_hook['draw'] = True
        if args.show:
            visualization_hook['show'] = True
            visualization_hook['wait_time'] = args.wait_time
        if args.show_dir:
            visualization_hook['test_out_dir'] = args.show_dir
        all_task_choices = [
            'mono_det', 'multi-view_det', 'lidar_det', 'lidar_seg',
            'multi-modality_det'
        ]
        assert args.task in all_task_choices, 'You must set '\
            f"'--task' in {all_task_choices} in the command " \
            'if you want to use visualization hook'
        visualization_hook['vis_task'] = args.task
        visualization_hook['score_thr'] = args.score_thr
    else:
        raise RuntimeError(
            'VisualizationHook must be included in default_hooks.'
            'refer to usage '
            '"visualization=dict(type=\'VisualizationHook\')"')

    return cfg


def main():
    args = parse_args()

    # load config
    cfg = Config.fromfile(args.config)

    # TODO: We will unify the ceph support approach with other OpenMMLab repos
    # if args.ceph:
    #     cfg = replace_ceph_backend(cfg)

    cfg.launcher = args.launcher
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)

    # work_dir is determined in this priority: CLI > segment in file > filename
    if args.work_dir is not None:
        # update configs according to CLI args if args.work_dir is not None
        cfg.work_dir = args.work_dir
    elif args.task_name is not None:
        cfg.work_dir = osp.join('./work_dirs', args.task_name)
    elif cfg.get('work_dir', None) is None:
        # use config filename as default work_dir if cfg.work_dir is None
        cfg.work_dir = osp.join('./work_dirs',
                                osp.splitext(osp.basename(args.config))[0])

    cfg.load_from = args.checkpoint

    if args.show or args.show_dir:
        cfg = trigger_visualization_hook(cfg, args)

    if args.tta:
        # Currently, we only support tta for 3D segmentation
        # TODO: Support tta for 3D detection
        assert 'tta_model' in cfg, 'Cannot find ``tta_model`` in config.'
        assert 'tta_pipeline' in cfg, 'Cannot find ``tta_pipeline`` in config.'
        cfg.test_dataloader.dataset.pipeline = cfg.tta_pipeline
        cfg.model = ConfigDict(**cfg.tta_model, module=cfg.model)

    # build the runner from config
    if 'runner_type' not in cfg:
        # build the default runner
        runner = Runner.from_cfg(cfg)
    else:
        # build customized runner from the registry
        # if 'runner_type' is set in the cfg
        runner = RUNNERS.build(cfg)

    # start testing
    runner.test()


if __name__ == '__main__':
    main()


================================================
FILE: tools/train.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import time
import os

def wait_before_import_config():
    t = int(os.environ.get('LOCAL_RANK', 0))
    time.sleep(t * 0.5)

def wait_after_import_config():
    t = int(os.environ.get('WORLD_SIZE', 0)) - int(os.environ.get('LOCAL_RANK', 0))
    time.sleep(t * 0.5)

wait_before_import_config()
from mmengine.config import Config
wait_after_import_config()


import argparse
import logging
import os
import os.path as osp

from mmengine.config import Config, DictAction
from mmengine.logging import print_log
from mmengine.registry import RUNNERS
from mmengine.runner import Runner
from bip3d import *
import torch
# torch.autograd.set_detect_anomaly(True)
torch.multiprocessing.set_sharing_strategy('file_system')


def parse_args():
    parser = argparse.ArgumentParser(description='Train a 3D detector')
    parser.add_argument('config', help='train config file path')
    parser.add_argument('--work-dir', help='the dir to save logs and models')
    parser.add_argument('--task-name', help='task names')
    parser.add_argument('--amp',
                        action='store_true',
                        default=False,
                        help='enable automatic-mixed-precision training')
    parser.add_argument('--auto-scale-lr',
                        action='store_true',
                        help='enable automatically scaling LR.')
    parser.add_argument(
        '--resume',
        nargs='?',
        type=str,
        const='auto',
        help='If specify checkpoint path, resume from it, while if not '
        'specify, try to auto resume from the latest checkpoint '
        'in the work directory.')
    parser.add_argument('--ceph',
                        action='store_true',
                        help='Use ceph as data storage backend')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.')
    parser.add_argument('--launcher',
                        choices=['none', 'pytorch', 'slurm', 'mpi'],
                        default='none',
                        help='job launcher')
    # When using PyTorch version >= 2.0.0, the `torch.distributed.launch`
    # will pass the `--local-rank` parameter to `tools/train.py` instead
    # of `--local_rank`.
    parser.add_argument('--local_rank', '--local-rank', type=int, default=0)
    args = parser.parse_args()
    if 'LOCAL_RANK' not in os.environ:
        os.environ['LOCAL_RANK'] = str(args.local_rank)
    return args


def main():
    args = parse_args()

    # load config
    cfg = Config.fromfile(args.config)

    # TODO: We will unify the ceph support approach with other OpenMMLab repos
    # if args.ceph:
    #     cfg = replace_ceph_backend(cfg)

    cfg.launcher = args.launcher
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)

    # work_dir is determined in this priority: CLI > segment in file > filename
    if args.work_dir is not None:
        # update configs according to CLI args if args.work_dir is not None
        cfg.work_dir = args.work_dir
    elif args.task_name is not None:
        cfg.work_dir = osp.join('./work_dirs', args.task_name)
    elif cfg.get('work_dir', None) is None:
        # use config filename as default work_dir if cfg.work_dir is None
        cfg.work_dir = osp.join('./work_dirs',
                                osp.splitext(osp.basename(args.config))[0])

    # enable automatic-mixed-precision training
    if args.amp is True:
        optim_wrapper = cfg.optim_wrapper.type
        if optim_wrapper == 'AmpOptimWrapper':
            print_log('AMP training is already enabled in your config.',
                      logger='current',
                      level=logging.WARNING)
        else:
            assert optim_wrapper == 'OptimWrapper', (
                '`--amp` is only supported when the optimizer wrapper type is '
                f'`OptimWrapper` but got {optim_wrapper}.')
            cfg.optim_wrapper.type = 'AmpOptimWrapper'
            cfg.optim_wrapper.loss_scale = 'dynamic'

    # enable automatically scaling LR
    if args.auto_scale_lr:
        if 'auto_scale_lr' in cfg and \
                'enable' in cfg.auto_scale_lr and \
                'base_batch_size' in cfg.auto_scale_lr:
            cfg.auto_scale_lr.enable = True
        else:
            raise RuntimeError('Can not find "auto_scale_lr" or '
                               '"auto_scale_lr.enable" or '
                               '"auto_scale_lr.base_batch_size" in your'
                               ' configuration file.')

    # resume is determined in this priority: resume from > auto_resume
    if args.resume == 'auto':
        cfg.resume = True
        cfg.load_from = None
    elif args.resume is not None:
        cfg.resume = True
        cfg.load_from = args.resume

    # build the runner from config
    if 'runner_type' not in cfg:
        # build the default runner
        runner = Runner.from_cfg(cfg)
    else:
        # build customized runner from the registry
        # if 'runner_type' is set in the cfg
        runner = RUNNERS.build(cfg)

    # start training
    runner.train()


if __name__ == '__main__':
    main()