Repository: HorizonRobotics/BIP3D
Branch: main
Commit: b1ed6a84faaf
Files: 87
Total size: 582.4 KB
Directory structure:
gitextract_nz633hq0/
├── .gitignore
├── LICENSE
├── README.md
├── bip3d/
│ ├── __init__.py
│ ├── converter/
│ │ ├── extract_occupancy_ann.py
│ │ ├── generate_image_3rscan.py
│ │ ├── generate_image_matterport3d.py
│ │ └── generate_image_scannet.py
│ ├── datasets/
│ │ ├── __init__.py
│ │ ├── embodiedscan_det_grounding_dataset.py
│ │ ├── transforms/
│ │ │ ├── __init__.py
│ │ │ ├── augmentation.py
│ │ │ ├── formatting.py
│ │ │ ├── loading.py
│ │ │ ├── multiview.py
│ │ │ └── transform.py
│ │ └── utils.py
│ ├── eval/
│ │ ├── __init__.py
│ │ ├── indoor_eval.py
│ │ └── metrics/
│ │ ├── __init__.py
│ │ ├── det_metric.py
│ │ └── grounding_metric.py
│ ├── grid_mask.py
│ ├── models/
│ │ ├── __init__.py
│ │ ├── base_target.py
│ │ ├── bbox3d_decoder.py
│ │ ├── bert.py
│ │ ├── data_preprocessors/
│ │ │ ├── __init__.py
│ │ │ ├── custom_data_preprocessor.py
│ │ │ └── utils.py
│ │ ├── deformable_aggregation.py
│ │ ├── feature_enhancer.py
│ │ ├── instance_bank.py
│ │ ├── spatial_enhancer.py
│ │ ├── structure.py
│ │ ├── target.py
│ │ └── utils.py
│ ├── ops/
│ │ ├── __init__.py
│ │ ├── deformable_aggregation.py
│ │ ├── gcc.sh
│ │ ├── setup.py
│ │ └── src/
│ │ ├── deformable_aggregation.cpp
│ │ ├── deformable_aggregation_cuda.cu
│ │ ├── deformable_aggregation_with_depth.cpp
│ │ └── deformable_aggregation_with_depth_cuda.cu
│ ├── registry.py
│ ├── structures/
│ │ ├── __init__.py
│ │ ├── bbox_3d/
│ │ │ ├── __init__.py
│ │ │ ├── base_box3d.py
│ │ │ ├── box_3d_mode.py
│ │ │ ├── coord_3d_mode.py
│ │ │ ├── euler_box3d.py
│ │ │ ├── euler_depth_box3d.py
│ │ │ └── utils.py
│ │ ├── ops/
│ │ │ ├── __init__.py
│ │ │ ├── box_np_ops.py
│ │ │ ├── iou3d_calculator.py
│ │ │ └── transforms.py
│ │ └── points/
│ │ ├── __init__.py
│ │ ├── base_points.py
│ │ ├── cam_points.py
│ │ ├── depth_points.py
│ │ └── lidar_points.py
│ └── utils/
│ ├── __init__.py
│ ├── array_converter.py
│ ├── default_color_map.py
│ ├── dist_utils.py
│ ├── line_mesh.py
│ └── typing_config.py
├── configs/
│ ├── __init__.py
│ ├── bip3d_det.py
│ ├── bip3d_det_grounding.py
│ ├── bip3d_det_rgb.py
│ ├── bip3d_grounding.py
│ ├── bip3d_grounding_rgb.py
│ └── default_runtime.py
├── docs/
│ └── quick_start.md
├── engine.sh
├── requirements.txt
├── test.sh
└── tools/
├── anchor_bbox3d_kmeans.py
├── benchmark.py
├── ckpt_rename.py
├── dist_test.sh
├── dist_train.sh
├── test.py
└── train.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
*.so
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don’t work, or not
# install at all, so you might want to exclude Pipfile.lock from version control.
Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# VS Code
.vscode/
================================================
FILE: LICENSE
================================================
MIT License
Copyright (c) 2024 Horizon Robotics
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: README.md
================================================
# BIP3D: Bridging 2D Images and 3D Perception for Embodied Intelligence
## :rocket: News
**01/Jun/2025**: We have refactored and integrated the BIP3D code into [robo_orchard_lab](https://github.com/HorizonRobotics/robo_orchard_lab/tree/master/projects/bip3d_grounding), removing the dependency on the MM series. The environment is now easier to set up, and performance is improved. We welcome you to try it out!
**14/Mar/2025**: Our code has been released.
**27/Feb/2025**: Our paper has been accepted by CVPR 2025.
**22/Nov/2024**: We released our paper on [arXiv](https://arxiv.org/abs/2411.14869).
## :open_book: Quick Start
[Quick Start](docs/quick_start.md)
## :link: Framework
Architecture diagram of BIP3D: red stars indicate parts modified or added relative to the base model, GroundingDINO; dashed lines indicate optional elements.
## :trophy: Results on EmbodiedScan Benchmark
We made several improvements over the original paper, achieving better 3D perception results. The main improvements are the following two points:
1. **New Fusion Operation**: We enhanced the decoder by replacing deformable aggregation (DAG) with a 3D deformable attention mechanism (DAT). Specifically, we improved the feature sampling process by moving from bilinear to trilinear interpolation, which leverages the depth distribution for more accurate feature extraction (see the sketch below).
2. **Mixed Data Training**: To improve the grounding model's performance, we adopted a mixed-data training strategy, integrating detection data with grounding data during grounding fine-tuning.
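For intuition, here is a minimal PyTorch sketch of the depth-aware (trilinear) sampling idea. The function name, shapes, and the single-scale simplification are illustrative assumptions; the actual operator is the CUDA kernel in `bip3d/ops/src/deformable_aggregation_with_depth_cuda.cu`.
```python
import torch
import torch.nn.functional as F

def depth_aware_sample(feat, depth_prob, uv, d):
    """Illustrative trilinear sampling (a sketch, not the repo's CUDA op):
    bilinear sampling of 2D features, weighted by a depth probability
    linearly interpolated at the projected depth.

    feat:       (B, C, H, W) image feature map
    depth_prob: (B, D, H, W) per-pixel depth distribution
    uv:         (B, N, 2)    sampling locations, normalized to [-1, 1]
    d:          (B, N)       projected depths, in depth-bin units [0, D-1]
    """
    grid = uv.unsqueeze(2)                                   # (B, N, 1, 2)
    f = F.grid_sample(feat, grid, align_corners=False)       # (B, C, N, 1)
    f = f.squeeze(-1)                                        # (B, C, N)
    p = F.grid_sample(depth_prob, grid, align_corners=False).squeeze(-1)
    d0 = d.floor().long().clamp(0, p.shape[1] - 2)           # lower depth bin
    w = (d - d0.float()).clamp(0, 1).unsqueeze(1)            # (B, 1, N)
    p_d = (1 - w) * p.gather(1, d0.unsqueeze(1)) + \
        w * p.gather(1, d0.unsqueeze(1) + 1)                 # (B, 1, N)
    return f * p_d  # depth-weighted features, (B, C, N)
```
Features whose predicted depth disagrees with a query's projected depth are down-weighted; this is what moving from bilinear to trilinear interpolation buys.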
### 1. Results on Multi-view 3D Detection Validation Dataset
In the `Op` column, DAG denotes deformable aggregation and DAT denotes 3D deformable attention. Set [`with_depth=True`](configs/bip3d_det.py#L175) to activate DAT.
The metric in the table is `AP@0.25`. For more metrics, please refer to the logs.
|Model | Inputs | Op | Overall | Head | Common | Tail | Small | Medium | Large | ScanNet | 3RScan | MP3D | ckpt | log |
| :---- | :---: | :---: | :---: |:---: | :---: | :---: | :---:| :---:|:---:|:---: | :---: | :----: | :----: | :---: |
|[BIP3D](configs/bip3d_det_rgb.py) | RGB | DAG | 16.57|23.29|13.84|12.29|2.67|17.85|12.89|19.71|26.76|8.50 | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/det_rgb_dag/model_checkpoint.pth) | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/det_rgb_dag/job.log) |
|[BIP3D](configs/bip3d_det_rgb.py) | RGB | DAT | 16.67|22.41|14.19|13.18|3.32|17.25|14.89|20.80|24.18|9.91 | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/det_rgb_dat/model_checkpoint.pth) | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/det_rgb_dat/job.log) |
|[BIP3D](configs/bip3d_det.py) |RGB-D | DAG | 22.53|28.89|20.51|17.83|6.95|24.21|15.46|24.77|35.29|10.34 | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/det_rgbd_dag/model_checkpoint.pth) | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/det_rgbd_dag/job.log) |
|[BIP3D](configs/bip3d_det.py) |RGB-D | DAT | 23.24|31.51|20.20|17.62|7.31|24.09|15.82|26.35|36.29|11.44 | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/det_rgbd_dat/model_checkpoint.pth) | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/det_rgbd_dat/job.log) |
### 2. Results on Multi-view 3D Grounding Mini Dataset
To train and validate on the mini dataset, set [`data_version="v1-mini"`](configs/bip3d_grounding.py#L333).
|Model | Inputs | Op | Overall | Easy | Hard | View-dep | View-indep | ScanNet | 3RScan | MP3D | ckpt | log |
| :---- | :---: | :---: | :---: | :---: | :---:| :---:|:---:|:---: | :---: | :----: |:---: | :----: |
|[BIP3D](configs/bip3d_grounding_rgb.py) | RGB | DAG | 44.00|44.39|39.56|46.05|42.92|48.62|42.47|36.40 | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_mini_rgb_dag/model_checkpoint.pth) | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_mini_rgb_dag/job.log) |
|[BIP3D](configs/bip3d_grounding_rgb.py) | RGB | DAT | 44.43|44.74|41.02|45.17|44.04|49.70|41.81|37.28 | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_mini_rgb_dat/model_checkpoint.pth) | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_mini_rgb_dat/job.log) |
|[BIP3D](configs/bip3d_grounding.py) | RGB-D | DAG | 45.79|46.22|40.91|45.93|45.71|48.94|46.61|37.36 | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_mini_rgbd_dag/model_checkpoint.pth) | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_mini_rgbd_dag/job.log) |
|[BIP3D](configs/bip3d_grounding.py) | RGB-D | DAT | 58.47|59.02|52.23|60.20|57.56|66.63|54.79|46.72 | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_mini_rgbd_dat/model_checkpoint.pth) | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_mini_rgbd_dat/job.log) |
### 3. Results on Multi-view 3D Grounding Validation Dataset
|Model | Inputs | Op | Mixed Data | Overall | Easy | Hard | View-dep | View-indep | ScanNet | 3RScan | MP3D | ckpt | log |
| :---- | :---: | :---: | :---: |:---: | :---: | :---:| :---:|:---:|:---: | :---: | :----: |:---: | :----: |
|[BIP3D](configs/bip3d_grounding_rgb.py) | RGB | DAG |No| 45.81|46.21|41.34|47.07|45.09|50.40|47.53|32.97 | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_rgb_dag/model_checkpoint.pth) | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_rgb_dag/job.log) |
|[BIP3D](configs/bip3d_grounding_rgb.py) | RGB | DAT |No| 47.29|47.82|41.42|48.58|46.56|52.74|47.85|34.60 | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_rgb_dat/model_checkpoint.pth) | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_rgb_dat/job.log) |
|[BIP3D](configs/bip3d_grounding.py) | RGB-D | DAG |No| 53.75|53.87|52.43|55.21|52.93|60.05|54.92|38.20 | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_rgbd_dag/model_checkpoint.pth) | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_rgbd_dag/job.log) |
|[BIP3D](configs/bip3d_grounding.py) | RGB-D | DAT |No|61.36|61.88|55.58|62.43|60.76|66.96|62.75|46.92 | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_rgbd_dat/model_checkpoint.pth) | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_rgbd_dat/job.log) |
|[BIP3D](configs/bip3d_det_grounding.py) | RGB-D | DAT |Yes|66.58|66.99|62.07|67.95|65.81|72.43|68.26|51.14 | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_rgbd_dat_mixdata/model_checkpoint.pth) | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_rgbd_dat_mixdata/job.log) |
### 4. [Results on Multi-view 3D Grounding Test Dataset](https://huggingface.co/spaces/AGC2024/visual-grounding-2024)
|Model | Overall | Easy | Hard | View-dep | View-indep | ckpt | log |
| :---- | :---: | :---: | :---: | :---: | :---:| :---:|:---:|
|[EmbodiedScan](https://github.com/OpenRobotLab/EmbodiedScan) | 39.67 | 40.52 | 30.24 | 39.05 | 39.94 | - | - |
|[SAG3D*](https://opendrivelab.github.io/Challenge%202024/multiview_Mi-Robot.pdf) | 46.92 | 47.72 | 38.03 | 46.31 | 47.18 | - | - |
|[DenseG*](https://opendrivelab.github.io/Challenge%202024/multiview_THU-LenovoAI.pdf) | 59.59 | 60.39 | 50.81 | 60.50 | 59.20 | - | - |
|[BIP3D](configs/bip3d_det_grounding.py) | 67.38 | 68.12 | 59.08 | 67.88 | 67.16 | - | - |
|BIP3D-B | 70.53 | 71.22 | 62.91 | 70.69 | 70.47 | - | - |
`*` denotes model ensembling; note that BIP3D does not use the ensemble trick. These results differ from those reported in the paper and show significant improvements.
Our best model, BIP3D-B, is based on GroundingDINO-base and is additionally trained with the ARKitScenes dataset.
## :page_facing_up: Citation
```
@article{lin2024bip3d,
title={BIP3D: Bridging 2D Images and 3D Perception for Embodied Intelligence},
author={Lin, Xuewu and Lin, Tianwei and Huang, Lichao and Xie, Hongyu and Su, Zhizhong},
journal={arXiv preprint arXiv:2411.14869},
year={2024}
}
```
## :handshake: Acknowledgement
[EmbodiedScan](https://github.com/OpenRobotLab/EmbodiedScan)
[Sparse4D](https://github.com/HorizonRobotics/Sparse4D)
[3D-deformable-attention](https://github.com/IDEA-Research/3D-deformable-attention)
[mmdet-GroundingDINO](https://github.com/open-mmlab/mmdetection/tree/main/configs/grounding_dino)
================================================
FILE: bip3d/__init__.py
================================================
from .registry import *
================================================
FILE: bip3d/converter/extract_occupancy_ann.py
================================================
import os
import shutil
from argparse import ArgumentParser
from tqdm import tqdm
def extract_occupancy(dataset, src, dst):
"""Extract occupancy annotations of a single dataset to dataset root."""
print('Processing dataset', dataset)
scenes = os.listdir(os.path.join(src, dataset))
dst_dataset = os.path.join(dst, dataset)
if not os.path.exists(dst_dataset):
print('Missing dataset:', dataset)
return
for scene in tqdm(scenes):
if dataset == 'scannet':
dst_scene = os.path.join(dst_dataset, 'scans', scene)
else:
dst_scene = os.path.join(dst_dataset, scene)
if not os.path.exists(dst_scene):
print(f'Missing scene {scene} in dataset {dataset}')
continue
dst_occ = os.path.join(dst_scene, 'occupancy')
if not os.path.exists(dst_occ):
shutil.copytree(os.path.join(src, dataset, scene), dst_occ)
else:
files = os.listdir(os.path.join(src, dataset, scene))
for file in files:
if not os.path.exists(os.path.join(dst_occ, file)):
shutil.copyfile(os.path.join(src, dataset, scene, file),
os.path.join(dst_occ, file))
if __name__ == '__main__':
parser = ArgumentParser()
parser.add_argument('--src',
required=True,
help='folder of the occupancy annotations')
parser.add_argument('--dst',
required=True,
help='folder of the raw datasets')
args = parser.parse_args()
datasets = os.listdir(args.src)
for dataset in datasets:
extract_occupancy(dataset, args.src, args.dst)
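# Example usage (hypothetical paths):
#   python bip3d/converter/extract_occupancy_ann.py \
#       --src /path/to/occupancy_annotations --dst /path/to/raw_datasets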
================================================
FILE: bip3d/converter/generate_image_3rscan.py
================================================
import os
import zipfile
from argparse import ArgumentParser
from functools import partial
import mmengine
def process_scene(path, scene_name):
"""Process single 3Rscan scene."""
with zipfile.ZipFile(os.path.join(path, scene_name, 'sequence.zip'),
'r') as zip_ref:
zip_ref.extractall(os.path.join(path, scene_name, 'sequence'))
if __name__ == '__main__':
parser = ArgumentParser()
parser.add_argument('--dataset_folder',
required=True,
help='folder of the dataset.')
parser.add_argument('--nproc', type=int, default=8)
args = parser.parse_args()
mmengine.track_parallel_progress(func=partial(process_scene,
args.dataset_folder),
tasks=os.listdir(args.dataset_folder),
nproc=args.nproc)
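# Example usage (hypothetical path):
#   python bip3d/converter/generate_image_3rscan.py \
#       --dataset_folder /path/to/3rscan --nproc 8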
================================================
FILE: bip3d/converter/generate_image_matterport3d.py
================================================
import os
import zipfile
from argparse import ArgumentParser
from functools import partial
import mmengine
def process_scene(path, output_folder, scene_name):
"""Process single 3Rscan scene."""
files = list(os.listdir(os.path.join(path, scene_name)))
for file in files:
if not file.endswith(".zip"):
continue
if file != "sens.zip":
continue
with zipfile.ZipFile(os.path.join(path, scene_name, file),
'r') as zip_ref:
if file == "sens.zip":
zip_ref.extractall(os.path.join(output_folder, scene_name, file[:-4]))
else:
zip_ref.extractall(output_folder)
if __name__ == '__main__':
parser = ArgumentParser()
parser.add_argument('--dataset_folder',
required=True,
help='folder of the dataset.')
parser.add_argument('--output_folder',
required=True,
help='output folder of the dataset.')
parser.add_argument('--nproc', type=int, default=8)
args = parser.parse_args()
if not os.path.exists(args.output_folder):
os.makedirs(args.output_folder, exist_ok=True)
mmengine.track_parallel_progress(func=partial(process_scene,
args.dataset_folder, args.output_folder),
tasks=os.listdir(args.dataset_folder),
nproc=args.nproc)
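# Example usage (hypothetical paths):
#   python bip3d/converter/generate_image_matterport3d.py \
#       --dataset_folder /path/to/matterport3d --output_folder /path/to/out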
================================================
FILE: bip3d/converter/generate_image_scannet.py
================================================
# Modified from https://github.com/ScanNet/ScanNet/blob/master/SensReader/python/SensorData.py # noqa
import os
import struct
import zlib
from argparse import ArgumentParser
from functools import partial
import imageio
import mmengine
import numpy as np
COMPRESSION_TYPE_COLOR = {-1: 'unknown', 0: 'raw', 1: 'png', 2: 'jpeg'}
COMPRESSION_TYPE_DEPTH = {
-1: 'unknown',
0: 'raw_ushort',
1: 'zlib_ushort',
2: 'occi_ushort'
}
class RGBDFrame:
"""Class for single ScanNet RGB-D image processing."""
def load(self, file_handle):
"""Load basic information of a given RGBD frame."""
self.camera_to_world = np.asarray(struct.unpack(
'f' * 16, file_handle.read(16 * 4)),
dtype=np.float32).reshape(4, 4)
self.timestamp_color = struct.unpack('Q', file_handle.read(8))[0]
self.timestamp_depth = struct.unpack('Q', file_handle.read(8))[0]
self.color_size_bytes = struct.unpack('Q', file_handle.read(8))[0]
self.depth_size_bytes = struct.unpack('Q', file_handle.read(8))[0]
self.color_data = b''.join(
struct.unpack('c' * self.color_size_bytes,
file_handle.read(self.color_size_bytes)))
self.depth_data = b''.join(
struct.unpack('c' * self.depth_size_bytes,
file_handle.read(self.depth_size_bytes)))
def decompress_depth(self, compression_type):
"""Decompress the depth data."""
assert compression_type == 'zlib_ushort'
return zlib.decompress(self.depth_data)
def decompress_color(self, compression_type):
"""Decompress the RGB image data."""
assert compression_type == 'jpeg'
return imageio.imread(self.color_data)
class SensorData:
"""Class for single ScanNet scene processing.
Single scene file contains multiple RGB-D images.
"""
def __init__(self, filename, fast=False):
self.version = 4
self.load(filename, fast)
def load(self, filename, fast):
"""Load a single scene data with multiple RGBD frames."""
with open(filename, 'rb') as f:
version = struct.unpack('I', f.read(4))[0]
assert self.version == version
strlen = struct.unpack('Q', f.read(8))[0]
self.sensor_name = b''.join(
struct.unpack('c' * strlen, f.read(strlen)))
self.intrinsic_color = np.asarray(struct.unpack(
'f' * 16, f.read(16 * 4)),
dtype=np.float32).reshape(4, 4)
self.extrinsic_color = np.asarray(struct.unpack(
'f' * 16, f.read(16 * 4)),
dtype=np.float32).reshape(4, 4)
self.intrinsic_depth = np.asarray(struct.unpack(
'f' * 16, f.read(16 * 4)),
dtype=np.float32).reshape(4, 4)
self.extrinsic_depth = np.asarray(struct.unpack(
'f' * 16, f.read(16 * 4)),
dtype=np.float32).reshape(4, 4)
self.color_compression_type = COMPRESSION_TYPE_COLOR[struct.unpack(
'i', f.read(4))[0]]
self.depth_compression_type = COMPRESSION_TYPE_DEPTH[struct.unpack(
'i', f.read(4))[0]]
self.color_width = struct.unpack('I', f.read(4))[0]
self.color_height = struct.unpack('I', f.read(4))[0]
self.depth_width = struct.unpack('I', f.read(4))[0]
self.depth_height = struct.unpack('I', f.read(4))[0]
self.depth_shift = struct.unpack('f', f.read(4))[0]
num_frames = struct.unpack('Q', f.read(8))[0]
self.num_frames = num_frames
self.frames = []
if fast:
index = list(range(num_frames))[::10]
else:
index = list(range(num_frames))
self.index = index
for i in range(num_frames):
frame = RGBDFrame()
frame.load(f)
if i in index:
self.frames.append(frame)
def export_depth_images(self, output_path):
"""Export depth images to the output path."""
if not os.path.exists(output_path):
os.makedirs(output_path)
for f in range(len(self.frames)):
depth_data = self.frames[f].decompress_depth(
self.depth_compression_type)
            depth = np.frombuffer(depth_data, dtype=np.uint16).reshape(
                self.depth_height, self.depth_width)
imageio.imwrite(
os.path.join(output_path,
self.index_to_str(self.index[f]) + '.png'), depth)
def export_color_images(self, output_path):
"""Export RGB images to the output path."""
if not os.path.exists(output_path):
os.makedirs(output_path)
for f in range(len(self.frames)):
color = self.frames[f].decompress_color(
self.color_compression_type)
imageio.imwrite(
os.path.join(output_path,
self.index_to_str(self.index[f]) + '.jpg'), color)
@staticmethod
def index_to_str(index):
"""Convert the sample index to string."""
return str(index).zfill(5)
@staticmethod
def save_mat_to_file(matrix, filename):
"""Save a matrix to file."""
with open(filename, 'w') as f:
for line in matrix:
np.savetxt(f, line[np.newaxis], fmt='%f')
def export_poses(self, output_path):
"""Export camera poses to the output path."""
if not os.path.exists(output_path):
os.makedirs(output_path)
for f in range(len(self.frames)):
self.save_mat_to_file(
self.frames[f].camera_to_world,
os.path.join(output_path,
self.index_to_str(self.index[f]) + '.txt'))
def export_intrinsics(self, output_path):
"""Export the intrinsic matrix to the output path."""
if not os.path.exists(output_path):
os.makedirs(output_path)
self.save_mat_to_file(self.intrinsic_color,
os.path.join(output_path, 'intrinsic.txt'))
def export_depth_intrinsics(self, output_path):
"""Export the depth intrinsic matrix to the output path."""
if not os.path.exists(output_path):
os.makedirs(output_path)
self.save_mat_to_file(self.intrinsic_depth,
os.path.join(output_path, 'depth_intrinsic.txt'))
def process_scene(path, fast, idx):
"""Process single ScanNet scene.
Extract RGB images, poses and camera intrinsics.
"""
try:
        data = SensorData(os.path.join(path, idx, f'{idx}.sens'), fast)
        output_path = os.path.join('posed_images', idx)
data.export_color_images(output_path)
data.export_intrinsics(output_path)
data.export_poses(output_path)
data.export_depth_images(output_path)
data.export_depth_intrinsics(output_path)
except Exception as e:
print("bug info: ", path, fast, idx)
raise e
def process_directory(path, fast, nproc):
"""Process the files in a directory with parallel support."""
mmengine.track_parallel_progress(func=partial(process_scene, path, fast),
                                     tasks=os.listdir(path),
nproc=nproc)
if __name__ == '__main__':
parser = ArgumentParser()
parser.add_argument('--dataset_folder',
default=None,
help='folder of the dataset.')
parser.add_argument('--nproc', type=int, default=8)
parser.add_argument('--fast', action='store_true')
args = parser.parse_args()
if args.dataset_folder is not None:
os.chdir(args.dataset_folder)
# process train and val scenes
if os.path.exists('scans'):
process_directory('scans', args.fast, args.nproc)
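# Example usage (hypothetical path; --fast keeps every 10th frame):
#   python bip3d/converter/generate_image_scannet.py \
#       --dataset_folder /path/to/scannet --nproc 8 --fast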
================================================
FILE: bip3d/datasets/__init__.py
================================================
from .embodiedscan_det_grounding_dataset import EmbodiedScanDetGroundingDataset
from .transforms import * # noqa: F401,F403
__all__ = [
'EmbodiedScanDetGroundingDataset'
]
================================================
FILE: bip3d/datasets/embodiedscan_det_grounding_dataset.py
================================================
import math
import pickle
import copy
import tqdm
import os
import warnings
from typing import Callable, List, Optional, Union
import mmengine
import numpy as np
from mmengine.dataset import BaseDataset, force_full_init
from mmengine.fileio import load
from mmengine.logging import print_log
from bip3d.registry import DATASETS
from bip3d.structures import get_box_type
from .utils import sample
class_names = (
'adhesive tape', 'air conditioner', 'alarm', 'album', 'arch', 'backpack',
'bag', 'balcony', 'ball', 'banister', 'bar', 'barricade', 'baseboard',
'basin', 'basket', 'bathtub', 'beam', 'beanbag', 'bed', 'bench', 'bicycle',
'bidet', 'bin', 'blackboard', 'blanket', 'blinds', 'board', 'body loofah',
'book', 'boots', 'bottle', 'bowl', 'box', 'bread', 'broom', 'brush',
'bucket', 'cabinet', 'calendar', 'camera', 'can', 'candle', 'candlestick',
'cap', 'car', 'carpet', 'cart', 'case', 'chair', 'chandelier', 'cleanser',
'clock', 'clothes', 'clothes dryer', 'coat hanger', 'coffee maker', 'coil',
'column', 'commode', 'computer', 'conducting wire', 'container', 'control',
'copier', 'cosmetics', 'couch', 'counter', 'countertop', 'crate', 'crib',
'cube', 'cup', 'curtain', 'cushion', 'decoration', 'desk', 'detergent',
'device', 'dish rack', 'dishwasher', 'dispenser', 'divider', 'door',
'door knob', 'doorframe', 'doorway', 'drawer', 'dress', 'dresser', 'drum',
'duct', 'dumbbell', 'dustpan', 'dvd', 'eraser', 'excercise equipment',
'fan', 'faucet', 'fence', 'file', 'fire extinguisher', 'fireplace',
'flowerpot', 'flush', 'folder', 'food', 'footstool', 'frame', 'fruit',
'furniture', 'garage door', 'garbage', 'glass', 'globe', 'glove',
'grab bar', 'grass', 'guitar', 'hair dryer', 'hamper', 'handle', 'hanger',
'hat', 'headboard', 'headphones', 'heater', 'helmets', 'holder', 'hook',
'humidifier', 'ironware', 'jacket', 'jalousie', 'jar', 'kettle',
'keyboard', 'kitchen island', 'kitchenware', 'knife', 'label', 'ladder',
'lamp', 'laptop', 'ledge', 'letter', 'light', 'luggage', 'machine',
'magazine', 'mailbox', 'map', 'mask', 'mat', 'mattress', 'menu',
'microwave', 'mirror', 'molding', 'monitor', 'mop', 'mouse', 'napkins',
'notebook', 'ottoman', 'oven', 'pack', 'package', 'pad', 'pan', 'panel',
'paper', 'paper cutter', 'partition', 'pedestal', 'pen', 'person', 'piano',
'picture', 'pillar', 'pillow', 'pipe', 'pitcher', 'plant', 'plate',
'player', 'plug', 'plunger', 'pool', 'pool table', 'poster', 'pot',
'price tag', 'printer', 'projector', 'purse', 'rack', 'radiator', 'radio',
'rail', 'range hood', 'refrigerator', 'remote control', 'ridge', 'rod',
'roll', 'roof', 'rope', 'sack', 'salt', 'scale', 'scissors', 'screen',
'seasoning', 'shampoo', 'sheet', 'shelf', 'shirt', 'shoe', 'shovel',
'shower', 'sign', 'sink', 'soap', 'soap dish', 'soap dispenser', 'socket',
'speaker', 'sponge', 'spoon', 'stairs', 'stall', 'stand', 'stapler',
'statue', 'steps', 'stick', 'stool', 'stopcock', 'stove', 'structure',
'sunglasses', 'support', 'switch', 'table', 'tablet', 'teapot',
'telephone', 'thermostat', 'tissue', 'tissue box', 'toaster', 'toilet',
'toilet paper', 'toiletry', 'tool', 'toothbrush', 'toothpaste', 'towel',
'toy', 'tray', 'treadmill', 'trophy', 'tube', 'tv', 'umbrella', 'urn',
'utensil', 'vacuum cleaner', 'vanity', 'vase', 'vent', 'ventilation',
'wardrobe', 'washbasin', 'washing machine', 'water cooler', 'water heater',
'window', 'window frame', 'windowsill', 'wine', 'wire', 'wood', 'wrap')
head_labels = [
48, 177, 82, 179, 37, 243, 28, 277, 32, 84, 215, 145, 182, 170, 22, 72, 30,
141, 65, 257, 221, 225, 52, 75, 231, 158, 236, 156, 47, 74, 6, 18, 71, 242,
217, 251, 66, 263, 5, 45, 14, 73, 278, 198, 24, 23, 196, 252, 19, 135, 26,
229, 183, 200, 107, 272, 246, 269, 125, 59, 279, 15, 163, 258, 57, 195, 51,
88, 97, 58, 102, 36, 137, 31, 80, 160, 155, 61, 238, 96, 190, 25, 219, 152,
142, 201, 274, 249, 178, 192
]
common_labels = [
189, 164, 101, 205, 273, 233, 131, 180, 86, 220, 67, 268, 224, 270, 53,
203, 237, 226, 10, 133, 248, 41, 55, 16, 199, 134, 99, 185, 2, 20, 234,
194, 253, 35, 174, 8, 223, 13, 91, 262, 230, 121, 49, 63, 119, 162, 79,
168, 245, 267, 122, 104, 100, 1, 176, 280, 140, 209, 259, 143, 165, 147,
117, 85, 105, 95, 109, 207, 68, 175, 106, 60, 4, 46, 171, 204, 111, 211,
108, 120, 157, 222, 17, 264, 151, 98, 38, 261, 123, 78, 118, 127, 240, 124
]
tail_labels = [
76, 149, 173, 250, 275, 255, 34, 77, 266, 283, 112, 115, 186, 136, 256, 40,
254, 172, 9, 212, 213, 181, 154, 94, 191, 193, 3, 130, 146, 70, 128, 167,
126, 81, 7, 11, 148, 228, 239, 247, 21, 42, 89, 153, 161, 244, 110, 0, 29,
114, 132, 159, 218, 232, 260, 56, 92, 116, 282, 33, 113, 138, 12, 188, 44,
150, 197, 271, 169, 206, 90, 235, 103, 281, 184, 208, 216, 202, 214, 241,
129, 210, 276, 64, 27, 87, 139, 227, 187, 62, 43, 50, 69, 93, 144, 166,
265, 54, 83, 39
]
@DATASETS.register_module()
class EmbodiedScanDetGroundingDataset(BaseDataset):
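    """EmbodiedScan dataset supporting 3D detection and visual grounding.

    ``mode`` selects the sample unit:
    - "detection": one sample per scan (multi-view detection);
    - "continuous": each scan is split into per-image chunks
      (see ``convert_to_continuous``);
    - "grounding": one sample per language annotation loaded from
      ``vg_file`` (see ``load_language_data``).
    """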
def __init__(
self,
data_root: str,
ann_file: str,
vg_file=None,
metainfo: Optional[dict] = None,
pipeline: List[Union[dict, Callable]] = [],
test_mode: bool = False,
load_eval_anns: bool = True,
filter_empty_gt: bool = True,
remove_dontcare: bool = False,
box_type_3d: str = "Euler-Depth",
dataset_length=None,
mode="detection",
max_n_images=50,
n_images_per_sample=1,
drop_last_per_scene=False,
part=None,
temporal=False,
num_text=1,
tokens_positive_rebuild=True,
sep_token="[SEP]",
**kwargs,
):
self.box_type_3d, self.box_mode_3d = get_box_type(box_type_3d)
self.filter_empty_gt = filter_empty_gt
self.remove_dontcare = remove_dontcare
self.load_eval_anns = load_eval_anns
self.dataset_length = dataset_length
self.part = part
self.mode = mode
assert self.mode in ["detection", "continuous", "grounding"]
super().__init__(
ann_file=ann_file,
metainfo=metainfo,
data_root=data_root,
pipeline=pipeline,
test_mode=test_mode,
serialize_data=self.mode == "detection",
**kwargs,
)
if self.mode == "continuous":
self.max_n_images = max_n_images
self.n_images_per_sample = n_images_per_sample
self.drop_last_per_scene = drop_last_per_scene
self.convert_to_continuous()
elif self.mode == "grounding":
self.vg_file = vg_file
self.num_text = num_text
self.tokens_positive_rebuild = tokens_positive_rebuild
self.sep_token = sep_token
self.load_language_data()
self.data_bytes, self.data_address = self._serialize_data()
self.serialize_data = True
print_log(f"dataset length : {self.__len__()}")
def process_metainfo(self):
assert "categories" in self._metainfo
if "classes" not in self._metainfo:
self._metainfo.setdefault(
"classes", list(self._metainfo["categories"].keys())
)
self.label_mapping = np.full(
max(list(self._metainfo["categories"].values())) + 1, -1, dtype=int
)
for key, value in self._metainfo["categories"].items():
if key in self._metainfo["classes"]:
self.label_mapping[value] = self._metainfo["classes"].index(
key
)
def parse_data_info(self, info: dict):
info["box_type_3d"] = self.box_type_3d
info["axis_align_matrix"] = self._get_axis_align_matrix(info)
info["scan_id"] = info["sample_idx"]
ann_dataset = info["sample_idx"].split("/")[0]
if ann_dataset == "matterport3d":
info["depth_shift"] = 4000.0
else:
info["depth_shift"] = 1000.0
# Because multi-view settings are different from original designs
# we temporarily follow the ori design in ImVoxelNet
info["img_path"] = []
info["depth_img_path"] = []
if "cam2img" in info:
cam2img = info["cam2img"].astype(np.float32)
else:
cam2img = []
extrinsics = []
for i in range(len(info["images"])):
img_path = os.path.join(
self.data_prefix.get("img_path", ""),
info["images"][i]["img_path"],
)
depth_img_path = os.path.join(
self.data_prefix.get("img_path", ""),
info["images"][i]["depth_path"],
)
info["img_path"].append(img_path)
info["depth_img_path"].append(depth_img_path)
align_global2cam = np.linalg.inv(
info["axis_align_matrix"] @ info["images"][i]["cam2global"]
)
extrinsics.append(align_global2cam.astype(np.float32))
if "cam2img" not in info:
cam2img.append(info["images"][i]["cam2img"].astype(np.float32))
info["depth2img"] = dict(
extrinsic=extrinsics,
intrinsic=cam2img,
origin=np.array([0.0, 0.0, 0.5]).astype(np.float32),
)
if "depth_cam2img" not in info:
info["depth_cam2img"] = cam2img
if not self.test_mode:
info["ann_info"] = self.parse_ann_info(info)
if self.test_mode and self.load_eval_anns:
info["ann_info"] = self.parse_ann_info(info)
info["eval_ann_info"] = self._remove_dontcare(info["ann_info"])
return info
def parse_ann_info(self, info: dict):
"""Process the `instances` in data info to `ann_info`.
Args:
info (dict): Info dict.
Returns:
dict: Processed `ann_info`.
"""
ann_info = None
if "instances" in info and len(info["instances"]) > 0:
ann_info = dict(
gt_bboxes_3d=np.zeros(
(len(info["instances"]), 9), dtype=np.float32
),
gt_labels_3d=np.zeros(
(len(info["instances"]),), dtype=np.int64
),
gt_names=[],
bbox_id=np.zeros((len(info["instances"]),), dtype=np.int64) - 1,
)
for idx, instance in enumerate(info["instances"]):
ann_info["gt_bboxes_3d"][idx] = instance["bbox_3d"]
ann_info["gt_labels_3d"][idx] = self.label_mapping[
instance["bbox_label_3d"]
]
ann_info["gt_names"].append(
self._metainfo["classes"][ann_info["gt_labels_3d"][idx]]
if ann_info["gt_labels_3d"][idx] >= 0
else "others"
)
ann_info["bbox_id"][idx] = instance["bbox_id"]
# pack ann_info for return
if ann_info is None:
ann_info = dict()
ann_info["gt_bboxes_3d"] = np.zeros((0, 9), dtype=np.float32)
ann_info["gt_labels_3d"] = np.zeros((0,), dtype=np.int64)
ann_info["bbox_id"] = np.zeros((0,), dtype=np.int64) - 1
ann_info["gt_names"] = []
# post-processing/filtering ann_info if not empty gt
if "visible_instance_ids" in info["images"][0]:
ids = []
for i in range(len(info["images"])):
ids.append(info["images"][i]["visible_instance_ids"])
mask_length = ann_info["gt_labels_3d"].shape[0]
ann_info["visible_instance_masks"] = self._ids2masks(
ids, mask_length
)
if self.remove_dontcare:
ann_info = self._remove_dontcare(ann_info)
ann_dataset = info["sample_idx"].split("/")[0]
ann_info["gt_bboxes_3d"] = self.box_type_3d(
ann_info["gt_bboxes_3d"],
box_dim=ann_info["gt_bboxes_3d"].shape[-1],
with_yaw=True,
origin=(0.5, 0.5, 0.5),
)
return ann_info
@staticmethod
def _get_axis_align_matrix(info: dict):
"""Get axis_align_matrix from info. If not exist, return identity mat.
Args:
info (dict): Info of a single sample data.
Returns:
np.ndarray: 4x4 transformation matrix.
"""
if "axis_align_matrix" in info:
return np.array(info["axis_align_matrix"])
else:
warnings.warn(
"axis_align_matrix is not found in ScanNet data info, please "
"use new pre-process scripts to re-generate ScanNet data"
)
return np.eye(4).astype(np.float32)
def _ids2masks(self, ids, mask_length):
"""Change visible_instance_ids to visible_instance_masks."""
masks = []
for idx in range(len(ids)):
mask = np.zeros((mask_length,), dtype=bool)
mask[ids[idx]] = 1
masks.append(mask)
return masks
def _remove_dontcare(self, ann_info: dict):
"""Remove annotations that do not need to be cared.
-1 indicates dontcare in MMDet3d.
Args:
ann_info (dict): Dict of annotation infos. The
instance with label `-1` will be removed.
Returns:
dict: Annotations after filtering.
"""
img_filtered_annotations = {}
filter_mask = ann_info["gt_labels_3d"] > -1
for key in ann_info.keys():
if key == "instances":
img_filtered_annotations[key] = ann_info[key]
elif key == "visible_instance_masks":
img_filtered_annotations[key] = []
for idx in range(len(ann_info[key])):
img_filtered_annotations[key].append(
ann_info[key][idx][filter_mask]
)
elif key == "gt_names":
img_filtered_annotations[key] = [
x for i, x in enumerate(ann_info[key]) if filter_mask[i]
]
else:
img_filtered_annotations[key] = ann_info[key][filter_mask]
return img_filtered_annotations
def load_data_list(self):
annotations = load(self.ann_file)
if not isinstance(annotations, dict):
raise TypeError(
f"The annotations loaded from annotation file "
f"should be a dict, but got {type(annotations)}!"
)
if "data_list" not in annotations or "metainfo" not in annotations:
raise ValueError(
"Annotation must have data_list and metainfo " "keys"
)
metainfo = annotations["metainfo"]
raw_data_list = annotations["data_list"]
# Meta information load from annotation file will not influence the
# existed meta information load from `BaseDataset.METAINFO` and
# `metainfo` arguments defined in constructor.
for k, v in metainfo.items():
self._metainfo.setdefault(k, v)
self.process_metainfo()
# load and parse data_infos.
data_list = []
for raw_data_info in tqdm.tqdm(
raw_data_list,
mininterval=10,
desc=f"Loading {'Test' if self.test_mode else 'Train'} dataset",
):
if self.part is not None:
valid = False
for x in self.part:
if x in raw_data_info["sample_idx"]:
valid = True
break
if not valid:
continue
data_info = self.parse_data_info(raw_data_info)
if data_info is None:
continue
assert isinstance(data_info, dict)
data_list.append(data_info)
if (
self.dataset_length is not None
and len(data_list) >= self.dataset_length
):
break
return data_list
@staticmethod
def _is_view_dep(text):
"""Check whether to augment based on sr3d utterance."""
rels = [
'front', 'behind', 'back', 'left', 'right', 'facing', 'leftmost',
'rightmost', 'looking', 'across'
]
words = set(text.split())
return any(rel in words for rel in rels)
def convert_info_to_scan(self):
self.scans = dict()
for data in self.data_list:
scan_id = data['scan_id']
self.scans[scan_id] = data
self.scan_ids = list(self.scans.keys())
def load_language_data(self):
self.convert_info_to_scan()
if isinstance(self.vg_file, str):
language_annotations = load(os.path.join(self.data_root, self.vg_file))
else:
language_annotations = []
for x in self.vg_file:
language_annotations.extend(load(os.path.join(self.data_root, x)))
if self.dataset_length is not None:
interval = len(language_annotations) / self.dataset_length
output = []
for i in range(self.dataset_length):
                output.append(language_annotations[int(interval * i)])
language_annotations = output
self.data_list = language_annotations
self.scan_id_to_data_idx = {}
for scan_id in self.scan_ids:
self.scan_id_to_data_idx[scan_id] = []
for i, d in enumerate(self.data_list):
self.scan_id_to_data_idx[d["scan_id"]].append(i)
def convert_to_continuous(self):
self.convert_info_to_scan()
data_list = []
self.flag = []
self.image_id_dict = {}
for flag, scan_id in enumerate(self.scan_ids):
total_n = len(self.scans[scan_id]["images"])
sample_n = min(self.max_n_images, total_n)
ids = sample(total_n, sample_n, True).tolist()
self.image_id_dict[scan_id] = ids
if self.n_images_per_sample > 1:
if self.drop_last_per_scene:
sample_n = math.floor(sample_n / self.n_images_per_sample)
else:
sample_n = math.ceil(sample_n / self.n_images_per_sample)
ids = [
ids[i*self.n_images_per_sample : (i+1)*self.n_images_per_sample]
for i in range(sample_n)
]
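                # e.g. 50 images with n_images_per_sample=4 yield 13
                # chunks (the last one partial), or 12 full chunks when
                # drop_last_per_scene is True.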
data_list.extend(
[dict(scan_id=scan_id, image_id=i) for i in ids]
)
self.flag.extend([flag] * len(ids))
self.data_list = data_list
self.flag = np.array(self.flag)
def get_data_info_continuous(self, data_info):
scan_id = data_info["scan_id"]
data = copy.deepcopy(self.scans[scan_id])
img_idx = data_info["image_id"]
if isinstance(img_idx, int):
img_idx = [img_idx]
for k in ["images", "img_path", "depth_img_path"]:
data[k] = index(data[k], img_idx)
seen_image_id = self.image_id_dict[scan_id]
seen_image_id = seen_image_id[:seen_image_id.index(img_idx[0])]
if len(seen_image_id) != 0:
last_visible_mask = index(
data["ann_info"]["visible_instance_masks"], seen_image_id
)
last_visible_mask = np.stack(last_visible_mask).any(axis=0)
else:
last_visible_mask = data["ann_info"]["visible_instance_masks"][0] * False
visible_instance_masks = index(
data["ann_info"]["visible_instance_masks"], img_idx
)
current_visible_mask = np.stack(visible_instance_masks).any(axis=0)
ignore_mask = np.logical_and(
last_visible_mask, ~current_visible_mask
)
all_visible_mask = np.logical_or(
last_visible_mask, current_visible_mask,
)
data["visible_instance_masks"] = visible_instance_masks
data["all_visible_mask"] = all_visible_mask
data["ignore_mask"] = ignore_mask
data["depth2img"]["extrinsic"] = index(
data["depth2img"]["extrinsic"], img_idx
)
if isinstance(data["depth2img"]["intrinsic"], list):
data["depth2img"]["intrinsic"] = index(
data["depth2img"]["intrinsic"], img_idx
)
return data
def merge_grounding_data(self, data_infos):
output = dict(
text="",
)
for key in ["target_id", "distractor_ids", "target", "anchors", "anchor_ids", "tokens_positive"]:
if key in data_infos[0]:
output[key] = []
for data_info in data_infos:
if "target_id" in data_info and data_info["target_id"] in output["target_id"]:
continue
if self.tokens_positive_rebuild and "target" in data_info:
start_idx = data_info["text"].find(data_info["target"])
end_idx = start_idx + len(data_info["target"])
tokens_positive = [[start_idx, end_idx]]
elif "tokens_positive" in data_info:
tokens_positive = data_info["tokens_positive"]
else:
tokens_positive = None
if len(output["text"]) == 0:
output["text"] = data_info["text"]
else:
if tokens_positive is not None:
tokens_positive = np.array(tokens_positive)
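                    # Offset the character spans by the text accumulated
                    # so far plus the separator, e.g. with sep_token
                    # "[SEP]" (5 chars), a span [9, 14] in the appended
                    # sentence becomes [9 + len(prev) + 5, 14 + len(prev) + 5].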
tokens_positive += len(output["text"]) + len(self.sep_token)
tokens_positive = tokens_positive.tolist()
output["text"] += self.sep_token + data_info["text"]
if tokens_positive is not None:
output["tokens_positive"].append(tokens_positive)
for k in ["target_id", "distractor_ids", "target", "anchors", "anchor_ids"]:
if k not in data_info:
continue
output[k].append(data_info[k])
output["scan_id"] = data_infos[0]["scan_id"]
return output
def get_data_info_grounding(self, data_info):
flags = {}
if "distractor_ids" in data_info:
flags["is_unique"] = len(data_info["distractor_ids"]) == 0
flags["is_hard"] = len(data_info["distractor_ids"]) > 3
if "text" in data_info:
flags["is_view_dep"] = self._is_view_dep(data_info["text"])
scan_id = data_info["scan_id"]
scan_data = copy.deepcopy(self.scans[scan_id])
data_info = [data_info]
if self.num_text > 1:
data_idx = self.scan_id_to_data_idx[scan_id]
sample_idx = sample(
len(data_idx),
max(min(int(np.random.rand()*self.num_text), len(data_idx))-1, 1),
fix_interval=False
)
for i in sample_idx:
data_info.append(super().get_data_info(data_idx[i]))
data_info = self.merge_grounding_data(data_info)
scan_data["text"] = data_info["text"]
if "ann_info" in scan_data and "target" in data_info:
tokens_positive = []
obj_idx = []
for i, (target_name, id) in enumerate(
zip(data_info["target"], data_info["target_id"])
):
mask = np.logical_and(
scan_data["ann_info"]["bbox_id"] == id,
np.array(scan_data["ann_info"]["gt_names"]) == target_name
)
if np.sum(mask) != 1:
continue
obj_idx.append(np.where(mask)[0][0])
tokens_positive.append(data_info["tokens_positive"][i])
obj_idx = np.array(obj_idx, dtype=np.int32)
scan_data["ann_info"]["gt_bboxes_3d"] = scan_data["ann_info"]["gt_bboxes_3d"][obj_idx]
scan_data["ann_info"]["gt_labels_3d"] = scan_data["ann_info"]["gt_labels_3d"][obj_idx]
scan_data["ann_info"]["gt_names"] = [
scan_data["ann_info"]["gt_names"][i] for i in obj_idx
]
if "visible_instance_masks" in scan_data["ann_info"]:
scan_data["ann_info"]["visible_instance_masks"] = [
visible_instance_mask[obj_idx]
for visible_instance_mask in scan_data["ann_info"]["visible_instance_masks"]
]
scan_data["tokens_positive"] = tokens_positive
scan_data["eval_ann_info"] = scan_data["ann_info"]
scan_data["eval_ann_info"].update(flags)
elif "tokens_positive" in data_info:
scan_data["tokens_positive"] = data_info.get("tokens_positive")
return scan_data
@force_full_init
def get_data_info(self, idx):
data_info = super().get_data_info(idx)
if self.mode == "detection":
return data_info
elif self.mode == "continuous":
return self.get_data_info_continuous(data_info)
elif self.mode == "grounding":
return self.get_data_info_grounding(data_info)
def index(input, idx):
if isinstance(idx, int):
idx = [idx]
output = []
for i in idx:
output.append(input[i])
return output
================================================
FILE: bip3d/datasets/transforms/__init__.py
================================================
from .augmentation import (
GlobalRotScaleTrans,
RandomFlip3D,
ResizeCropFlipImage,
)
from .formatting import Pack3DDetInputs
from .loading import LoadAnnotations3D, LoadDepthFromFile
from .multiview import MultiViewPipeline
from .transform import (
CategoryGroundingDataPrepare,
CamIntrisicStandardization,
CustomResize,
DepthProbLabelGenerator,
)
================================================
FILE: bip3d/datasets/transforms/augmentation.py
================================================
from typing import List, Union
import cv2
import numpy as np
from mmcv.transforms import BaseTransform
from mmdet.datasets.transforms import RandomFlip
from bip3d.registry import TRANSFORMS
@TRANSFORMS.register_module()
class RandomFlip3D(RandomFlip):
"""Flip the points & bbox.
If the input dict contains the key "flip", then the flag will be used,
otherwise it will be randomly decided by a ratio specified in the init
method.
Required Keys:
- points (np.float32)
- gt_bboxes_3d (np.float32)
Modified Keys:
- points (np.float32)
- gt_bboxes_3d (np.float32)
    Added Keys:
    - flip (bool)
    - flip_direction (str)
    - pcd_horizontal_flip (bool)
    - pcd_vertical_flip (bool)
Args:
sync_2d (bool): Whether to apply flip according to the 2D
images. If True, it will apply the same flip as that to 2D images.
If False, it will decide whether to flip randomly and independently
to that of 2D images. Defaults to True.
        flip_2d (bool): Whether to apply flip to the image data.
            If True, flip augmentation is applied to the image. Set this
            to False for BEV augmentation in BEV-based image 3D
            detection. Defaults to True.
flip_3d (bool): Whether to apply flip for the 3d point cloud data.
If True, it will adopt the flip augmentation for the point cloud.
Defaults to True.
flip_ratio_bev_horizontal (float): The flipping probability
in horizontal direction. Defaults to 0.0.
flip_ratio_bev_vertical (float): The flipping probability
in vertical direction. Defaults to 0.0.
        flip_box3d (bool): Whether to flip the bounding boxes. In most
            cases the boxes should be flipped. In camera-based BEV
            detection this is set to False, since flipping the 2D images
            does not influence the 3D boxes. Defaults to True.
"""
def __init__(self,
sync_2d: bool = True,
flip_2d: bool = True,
flip_3d: bool = True,
flip_ratio_bev_horizontal: float = 0.0,
flip_ratio_bev_vertical: float = 0.0,
flip_box3d: bool = True,
update_lidar2cam: bool = False,
**kwargs) -> None:
        # `flip_ratio_bev_horizontal` is used as the flip probability
        # of the 2D image when `sync_2d` is True
super(RandomFlip3D, self).__init__(prob=flip_ratio_bev_horizontal,
direction='horizontal',
**kwargs)
self.sync_2d = sync_2d
self.flip_2d = flip_2d
self.flip_3d = flip_3d
self.flip_ratio_bev_horizontal = flip_ratio_bev_horizontal
self.flip_ratio_bev_vertical = flip_ratio_bev_vertical
self.flip_box3d = flip_box3d
self.update_lidar2cam = update_lidar2cam
if flip_ratio_bev_horizontal is not None:
assert isinstance(flip_ratio_bev_horizontal, (int, float)) \
and 0 <= flip_ratio_bev_horizontal <= 1
if flip_ratio_bev_vertical is not None:
assert isinstance(flip_ratio_bev_vertical, (int, float)) \
and 0 <= flip_ratio_bev_vertical <= 1
def transform(self, input_dict: dict) -> dict:
"""Call function to flip points, values in the ``bbox3d_fields`` and
also flip 2D image and its annotations.
Args:
input_dict (dict): Result dict from loading pipeline.
Returns:
dict: Flipped results, 'flip', 'flip_direction',
'pcd_horizontal_flip' and 'pcd_vertical_flip' keys are added
into result dict.
"""
# flip 2D image and its annotations
if self.flip_2d:
# only handle the 2D image
if 'img' in input_dict:
super(RandomFlip3D, self).transform(input_dict)
flip = input_dict.get('flip', False)
if flip:
input_dict = self.random_flip_data_2d(input_dict)
if self.flip_3d:
# only handle the 3D points
if self.sync_2d and 'img' in input_dict:
                # TODO check if this is necessary in FCOS3D
input_dict['pcd_horizontal_flip'] = input_dict['flip']
input_dict['pcd_vertical_flip'] = False
else:
if 'pcd_horizontal_flip' not in input_dict:
if np.random.rand() < self.flip_ratio_bev_horizontal:
flip_horizontal = True
else:
flip_horizontal = False
input_dict['pcd_horizontal_flip'] = flip_horizontal
if 'pcd_vertical_flip' not in input_dict:
if np.random.rand() < self.flip_ratio_bev_vertical:
flip_vertical = True
else:
flip_vertical = False
input_dict['pcd_vertical_flip'] = flip_vertical
if 'transformation_3d_flow' not in input_dict:
input_dict['transformation_3d_flow'] = []
if input_dict['pcd_horizontal_flip']:
self.random_flip_data_3d(input_dict, 'horizontal')
input_dict['transformation_3d_flow'].extend(['HF'])
if input_dict['pcd_vertical_flip']:
self.random_flip_data_3d(input_dict, 'vertical')
input_dict['transformation_3d_flow'].extend(['VF'])
if self.update_lidar2cam:
self._transform_lidar2cam(input_dict)
return input_dict
def random_flip_data_3d(self,
input_dict: dict,
direction: str = 'horizontal') -> None:
"""Flip 3D data randomly.
`random_flip_data_3d` should take these situations into consideration:
- 1. LIDAR-based 3d detection
- 2. LIDAR-based 3d segmentation
- 3. vision-only detection
- 4. multi-modality 3d detection.
Args:
input_dict (dict): Result dict from loading pipeline.
direction (str): Flip direction. Defaults to 'horizontal'.
Returns:
dict: Flipped results, 'points', 'bbox3d_fields' keys are
updated in the result dict.
"""
assert direction in ['horizontal', 'vertical']
if self.flip_box3d:
if 'gt_bboxes_3d' in input_dict:
if 'points' in input_dict:
input_dict['points'] = input_dict['gt_bboxes_3d'].flip(
direction, points=input_dict['points'])
else:
# vision-only detection
input_dict['gt_bboxes_3d'].flip(direction)
else:
input_dict['points'].flip(direction)
def random_flip_data_2d(self,
input_dict: dict,
direction: str = 'horizontal') -> dict:
if 'centers_2d' in input_dict:
assert self.sync_2d is True and direction == 'horizontal', \
'Only support sync_2d=True and horizontal flip with images'
w = input_dict['img_shape'][1]
input_dict['centers_2d'][..., 0] = \
w - input_dict['centers_2d'][..., 0]
# need to modify the horizontal position of camera center
# along u-axis in the image (flip like centers2d)
# ['cam2img'][0][2] = c_u
# see more details and examples at
# https://github.com/open-mmlab/mmdetection3d/pull/744
input_dict['cam2img'][0][2] = w - input_dict['cam2img'][0][2]
if 'fov_ori2aug' not in input_dict:
fov_ori2aug = np.eye(4, 4)
else:
fov_ori2aug = input_dict['fov_ori2aug']
# get the value of w
w = input_dict['img_shape'][1]
# flip_matrix[0,0] = -1
# flip_matrix[0,3] = w
# fov_ori2aug = np.matmul(fov_ori2aug, flip_matrix)
fov_ori2aug[0] *= -1
fov_ori2aug[0, 3] += w
input_dict['fov_ori2aug'] = fov_ori2aug
return input_dict
def _flip_on_direction(self, results: dict) -> None:
"""Function to flip images, bounding boxes, semantic segmentation map
and keypoints.
Add the override feature that if 'flip' is already in results, use it
to do the augmentation.
"""
if 'flip' not in results:
cur_dir = self._choose_direction()
else:
# `flip_direction` works only when `flip` is True.
# For example, in `MultiScaleFlipAug3D`, `flip_direction` is
# 'horizontal' but `flip` is False.
if results['flip']:
                assert 'flip_direction' in results, (
                    'flip and flip_direction must exist simultaneously')
cur_dir = results['flip_direction']
else:
cur_dir = None
if cur_dir is None:
results['flip'] = False
results['flip_direction'] = None
else:
results['flip'] = True
results['flip_direction'] = cur_dir
self._flip(results)
def _transform_lidar2cam(self, results: dict) -> None:
"""TODO."""
aug_matrix = np.eye(4)
if results.get('pcd_horizontal_flip', False):
aug_matrix[1, 1] *= -1
if results.get('pcd_vertical_flip', False):
aug_matrix[0, 0] *= -1
lidar2cam_list = []
for lidar2cam in results['lidar2cam']:
lidar2cam = np.array(lidar2cam)
lidar2cam = np.matmul(lidar2cam, aug_matrix)
lidar2cam_list.append(lidar2cam.tolist())
results['lidar2cam'] = lidar2cam_list
def __repr__(self) -> str:
"""str: Return a string that describes the module."""
repr_str = self.__class__.__name__
repr_str += f'(sync_2d={self.sync_2d},'
repr_str += f' flip_ratio_bev_vertical={self.flip_ratio_bev_vertical})'
return repr_str
@TRANSFORMS.register_module()
class GlobalRotScaleTrans(BaseTransform):
"""Apply global rotation, scaling and translation to a 3D scene.
Required Keys:
- points (np.float32)
- gt_bboxes_3d (np.float32)
Modified Keys:
- points (np.float32)
- gt_bboxes_3d (np.float32)
Added Keys:
- points (np.float32)
- pcd_trans (np.float32)
- pcd_rotation (np.float32)
- pcd_rotation_angle (np.float32)
- pcd_scale_factor (np.float32)
Args:
rot_range (list[float]): Range of rotation angle.
Defaults to [-0.78539816, 0.78539816] (close to [-pi/4, pi/4]).
rot_dof (int): DoF of rotation noise. Defaults to 1.
scale_ratio_range (list[float]): Range of scale ratio.
Defaults to [0.95, 1.05].
translation_std (list[float]): The standard deviation of
translation noise applied to a scene, which
is sampled from a gaussian distribution whose standard deviation
is set by ``translation_std``. Defaults to [0, 0, 0].
shift_height (bool): Whether to shift height.
(the fourth dimension of indoor points) when scaling.
Defaults to False.
"""
def __init__(self,
rot_range: Union[List[float], int,
float] = [-0.78539816, 0.78539816],
rot_dof: int = 1,
scale_ratio_range: List[float] = [0.95, 1.05],
translation_std: List[int] = [0, 0, 0],
shift_height: bool = False,
update_lidar2cam: bool = False) -> None:
seq_types = (list, tuple, np.ndarray)
if not isinstance(rot_range, seq_types):
assert isinstance(rot_range, (int, float)), \
f'unsupported rot_range type {type(rot_range)}'
rot_range = [-rot_range, rot_range]
self.rot_range = rot_range
self.rot_dof = rot_dof
self.update_lidar2cam = update_lidar2cam
assert isinstance(scale_ratio_range, seq_types), \
f'unsupported scale_ratio_range type {type(scale_ratio_range)}'
self.scale_ratio_range = scale_ratio_range
if not isinstance(translation_std, seq_types):
assert isinstance(translation_std, (int, float)), \
f'unsupported translation_std type {type(translation_std)}'
translation_std = [
translation_std, translation_std, translation_std
]
assert all([std >= 0 for std in translation_std]), \
'translation_std should be positive'
self.translation_std = translation_std
self.shift_height = shift_height
def transform(self, input_dict: dict) -> dict:
"""Private function to rotate, scale and translate bounding boxes and
points.
Args:
input_dict (dict): Result dict from loading pipeline.
Returns:
dict: Results after scaling, 'points', 'pcd_rotation',
'pcd_scale_factor', 'pcd_trans' and `gt_bboxes_3d` are updated
in the result dict.
"""
if 'transformation_3d_flow' not in input_dict:
input_dict['transformation_3d_flow'] = []
self._rot_bbox_points(input_dict)
if 'pcd_scale_factor' not in input_dict:
self._random_scale(input_dict)
self._scale_bbox_points(input_dict)
self._trans_bbox_points(input_dict)
input_dict['transformation_3d_flow'].extend(['R', 'S', 'T'])
if self.update_lidar2cam:
self._transform_lidar2cam(input_dict)
return input_dict
def _trans_bbox_points(self, input_dict: dict) -> None:
"""Private function to translate bounding boxes and points.
Args:
input_dict (dict): Result dict from loading pipeline.
Returns:
dict: Results after translation, 'points', 'pcd_trans'
and `gt_bboxes_3d` is updated in the result dict.
"""
translation_std = np.array(self.translation_std, dtype=np.float32)
trans_factor = np.random.normal(scale=translation_std, size=3).T
if 'points' in input_dict:
input_dict['points'].translate(trans_factor)
input_dict['pcd_trans'] = trans_factor
if 'gt_bboxes_3d' in input_dict:
input_dict['gt_bboxes_3d'].translate(trans_factor)
def _rot_bbox_points(self, input_dict: dict) -> None:
"""Private function to rotate bounding boxes and points.
Args:
input_dict (dict): Result dict from loading pipeline.
Returns:
            dict: Results after rotation, 'points', 'pcd_rotation'
            and ``gt_bboxes_3d`` are updated in the result dict.
"""
rotation = self.rot_range
if self.rot_dof == 1:
noise_rotation = np.random.uniform(rotation[0], rotation[1])
noise_rotation *= -1
elif self.rot_dof > 1:
noise_rotation = np.array([
-np.random.uniform(rotation[0], rotation[1]),
-np.random.uniform(rotation[0], rotation[1]),
-np.random.uniform(rotation[0], rotation[1])
])
else:
raise NotImplementedError
# TODO delete this. And -1 is to align the rotation with
# the version of 0.17.
# if 'gt_bboxes_3d' in input_dict and \
# len(input_dict['gt_bboxes_3d'].tensor) != 0:
if 'gt_bboxes_3d' in input_dict:
# rotate points with bboxes
if 'points' in input_dict:
points, rot_mat_T = input_dict['gt_bboxes_3d'].rotate(
noise_rotation, input_dict['points'])
input_dict['points'] = points
else:
rot_mat_T = input_dict['gt_bboxes_3d'].rotate(noise_rotation)
elif 'points' in input_dict:
# if no bbox in input_dict, only rotate points
rot_mat_T = input_dict['points'].rotate(noise_rotation)
input_dict['pcd_rotation'] = rot_mat_T
input_dict['pcd_rotation_angle'] = noise_rotation
def _scale_bbox_points(self, input_dict: dict) -> None:
"""Private function to scale bounding boxes and points.
Args:
input_dict (dict): Result dict from loading pipeline.
Returns:
            dict: Results after scaling, 'points' and
            ``gt_bboxes_3d`` are updated in the result dict.
"""
scale = input_dict['pcd_scale_factor']
if 'points' in input_dict:
points = input_dict['points']
points.scale(scale)
if self.shift_height:
                assert 'height' in points.attribute_dims.keys(), \
                    'setting shift_height=True but points ' \
                    'have no height attribute'
points.tensor[:, points.attribute_dims['height']] *= scale
input_dict['points'] = points
if 'gt_bboxes_3d' in input_dict and \
len(input_dict['gt_bboxes_3d'].tensor) != 0:
input_dict['gt_bboxes_3d'].scale(scale)
def _random_scale(self, input_dict: dict) -> None:
"""Private function to randomly set the scale factor.
Args:
input_dict (dict): Result dict from loading pipeline.
Returns:
            dict: Results after scaling, 'pcd_scale_factor'
            is updated in the result dict.
"""
scale_factor = np.random.uniform(self.scale_ratio_range[0],
self.scale_ratio_range[1])
input_dict['pcd_scale_factor'] = scale_factor
def _transform_lidar2cam(self, input_dict: dict) -> None:
aug_matrix = np.eye(4)
if 'pcd_rotation' in input_dict:
aug_matrix[:3, :3] = input_dict['pcd_rotation'].T.numpy(
) * input_dict['pcd_scale_factor']
        else:
            # `np.ndarray.view` does not reshape; plain elementwise scaling
            # of the identity is what is intended here.
            aug_matrix[:3, :3] = np.eye(3) * input_dict['pcd_scale_factor']
        aug_matrix[:3, -1] = np.asarray(input_dict['pcd_trans']).reshape(3)
aug_matrix[-1, -1] = 1.0
aug_matrix = np.linalg.inv(aug_matrix)
lidar2cam_list = []
for lidar2cam in input_dict["depth2img"]["extrinsic"]:
# for lidar2cam in input_dict['lidar2cam']:
lidar2cam = np.array(lidar2cam)
lidar2cam = np.matmul(lidar2cam, aug_matrix)
lidar2cam_list.append(lidar2cam.tolist())
input_dict["depth2img"]["extrinsic"] = lidar2cam_list
# input_dict['lidar2cam'] = lidar2cam_list
def __repr__(self) -> str:
"""str: Return a string that describes the module."""
repr_str = self.__class__.__name__
repr_str += f'(rot_range={self.rot_range},'
repr_str += f' scale_ratio_range={self.scale_ratio_range},'
repr_str += f' translation_std={self.translation_std},'
repr_str += f' shift_height={self.shift_height})'
return repr_str
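
# --- Illustrative sketch, not used by the pipeline ---
# A minimal, self-contained check of the matrix algebra in
# `_transform_lidar2cam` above: the point-cloud augmentation is packed into a
# 4x4 homogeneous matrix `aug`, and every lidar2cam extrinsic is
# right-multiplied by `inv(aug)`, so projecting an augmented point with the
# updated extrinsic matches projecting the original point with the original
# one. `_demo_lidar2cam_update` is a hypothetical helper for illustration.
def _demo_lidar2cam_update():
    rot = np.eye(3)  # stands in for input_dict['pcd_rotation'].T
    scale = 1.05
    trans = np.array([0.1, -0.2, 0.0])
    aug = np.eye(4)
    aug[:3, :3] = rot * scale
    aug[:3, -1] = trans
    lidar2cam = np.eye(4)  # one entry of depth2img['extrinsic']
    updated = lidar2cam @ np.linalg.inv(aug)
    p = np.array([1.0, 2.0, 3.0, 1.0])  # homogeneous 3D point
    assert np.allclose(updated @ (aug @ p), lidar2cam @ p)
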
@TRANSFORMS.register_module()
class ResizeCropFlipImage(object):
def __init__(self, data_aug_conf, test_mode=False):
self.data_aug_conf = data_aug_conf
self.test_mode = test_mode
def get_augmentation(self):
if self.data_aug_conf is None:
return None
H, W = self.data_aug_conf["H"], self.data_aug_conf["W"]
fH, fW = self.data_aug_conf["final_dim"]
if not self.test_mode:
resize = np.random.uniform(*self.data_aug_conf["resize_lim"])
resize_dims = (int(W * resize), int(H * resize))
newW, newH = resize_dims
crop_h = (
int(
(1 - np.random.uniform(*self.data_aug_conf["bot_pct_lim"]))
* newH
)
- fH
)
crop_w = int(np.random.uniform(0, max(0, newW - fW)))
crop = (crop_w, crop_h, crop_w + fW, crop_h + fH)
flip = False
if self.data_aug_conf["rand_flip"] and np.random.choice([0, 1]):
flip = True
rotate = np.random.uniform(*self.data_aug_conf["rot_lim"])
# rotate_3d = np.random.uniform(*self.data_aug_conf["rot3d_range"])
rotate_3d = 0
else:
resize = max(fH / H, fW / W)
resize_dims = (int(W * resize), int(H * resize))
newW, newH = resize_dims
crop_h = (
int((1 - np.mean(self.data_aug_conf["bot_pct_lim"])) * newH)
- fH
)
crop_w = int(max(0, newW - fW) / 2)
crop = (crop_w, crop_h, crop_w + fW, crop_h + fH)
flip = False
rotate = 0
rotate_3d = 0
aug_config = {
"resize": resize,
"resize_dims": resize_dims,
"crop": crop,
"flip": flip,
"rotate": rotate,
"rotate_3d": rotate_3d,
}
return aug_config
def __call__(self, results):
img = results["img"]
aug_configs = self.get_augmentation()
resize = aug_configs.get("resize", 1)
results["img"], extend_matrix = self._transform(img, aug_configs)
results["trans_mat"] = extend_matrix
if "depth_img" in results:
results["depth_img"], _ = self._transform(results["depth_img"], aug_configs)
return results
def _transform(self, img, aug_configs):
H, W = img.shape[:2]
resize = aug_configs.get("resize", 1)
resize_dims = (int(W * resize), int(H * resize))
crop = aug_configs.get("crop", [0, 0, *resize_dims])
flip = aug_configs.get("flip", False)
rotate = aug_configs.get("rotate", 0)
transform_matrix = np.eye(3)
transform_matrix[:2, :2] *= resize
transform_matrix[:2, 2] -= np.array(crop[:2])
if flip:
flip_matrix = np.array(
[[-1, 0, crop[2] - crop[0]], [0, 1, 0], [0, 0, 1]]
)
transform_matrix = flip_matrix @ transform_matrix
rotate = rotate / 180 * np.pi
rot_matrix = np.array(
[
[np.cos(rotate), np.sin(rotate), 0],
[-np.sin(rotate), np.cos(rotate), 0],
[0, 0, 1],
]
)
rot_center = np.array([crop[2] - crop[0], crop[3] - crop[1]]) / 2
rot_matrix[:2, 2] = -rot_matrix[:2, :2] @ rot_center + rot_center
transform_matrix = rot_matrix @ transform_matrix
extend_matrix = np.eye(4)
extend_matrix[:3, :3] = transform_matrix
img = cv2.warpAffine(img, extend_matrix[:2, :3], img.shape[:2][::-1])
return img, extend_matrix
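
# --- Illustrative sketch, not used by the pipeline ---
# A minimal check of the 2D augmentation matrix composed in `_transform`
# above (the resize-then-crop part; flip and rotation compose on top of it).
# `_demo_img_transform_matrix` is a hypothetical helper for illustration.
def _demo_img_transform_matrix():
    resize = 0.5
    crop = (10, 20, 110, 120)  # (x1, y1, x2, y2) in the resized image
    mat = np.eye(3)
    mat[:2, :2] *= resize
    mat[:2, 2] -= np.array(crop[:2])
    src = np.array([60.0, 80.0, 1.0])  # homogeneous source pixel
    dst = mat @ src
    # 60 * 0.5 - 10 = 20 and 80 * 0.5 - 20 = 20
    assert np.allclose(dst[:2], [20.0, 20.0])
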
================================================
FILE: bip3d/datasets/transforms/formatting.py
================================================
from typing import List, Sequence, Union
import mmengine
import numpy as np
import torch
from mmcv.transforms import BaseTransform
from mmengine.structures import InstanceData, PixelData
from bip3d.registry import TRANSFORMS
from bip3d.structures.bbox_3d import BaseInstance3DBoxes
from bip3d.structures.points import BasePoints
from bip3d.utils.typing_config import Det3DDataElement, PointData
def to_tensor(
data: Union[torch.Tensor, np.ndarray, Sequence, int,
float]) -> torch.Tensor:
"""Convert objects of various python types to :obj:`torch.Tensor`.
Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,
:class:`Sequence`, :class:`int` and :class:`float`.
Args:
data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to
be converted.
Returns:
torch.Tensor: the converted data.
"""
if isinstance(data, torch.Tensor):
return data
elif isinstance(data, np.ndarray):
if data.dtype is np.dtype('float64'):
data = data.astype(np.float32)
return torch.from_numpy(data)
elif isinstance(data, Sequence) and not mmengine.is_str(data):
return torch.tensor(data)
elif isinstance(data, int):
return torch.LongTensor([data])
elif isinstance(data, float):
return torch.FloatTensor([data])
else:
raise TypeError(f'type {type(data)} cannot be converted to tensor.')
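
# --- Illustrative sketch, not used by the pipeline ---
# A few sanity checks of the `to_tensor` conventions above (float64 arrays
# are downcast to float32, ints become int64, floats become float32).
# `_demo_to_tensor` is a hypothetical helper for illustration.
def _demo_to_tensor():
    assert to_tensor(np.zeros(3, dtype=np.float64)).dtype == torch.float32
    assert to_tensor(3).dtype == torch.int64
    assert to_tensor(3.0).dtype == torch.float32
    assert to_tensor([1, 2, 3]).shape == (3,)
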
@TRANSFORMS.register_module()
class Pack3DDetInputs(BaseTransform):
INPUTS_KEYS = ['points', 'img', "depth_img"]
# to be compatible with depths in bevdepth
INSTANCEDATA_3D_KEYS = [
'gt_bboxes_3d', 'gt_labels_3d', 'attr_labels', 'depths', 'centers_2d'
]
INSTANCEDATA_2D_KEYS = [
'gt_bboxes',
'gt_bboxes_labels',
]
SEG_KEYS = [
'gt_seg_map', 'pts_instance_mask', 'pts_semantic_mask',
'gt_semantic_seg'
]
def __init__(
self,
        keys: Sequence[str],
        meta_keys: Sequence[str] = (
'img_path', 'ori_shape', 'img_shape', 'lidar2img', 'depth2img',
'cam2img', 'pad_shape', 'depth_map_path', 'scale_factor', 'flip',
'pcd_horizontal_flip', 'pcd_vertical_flip', 'box_mode_3d',
'box_type_3d', 'img_norm_cfg', 'num_pts_feats', 'pcd_trans',
'sample_idx', 'pcd_scale_factor', 'pcd_rotation',
'pcd_rotation_angle', 'lidar_path', 'transformation_3d_flow',
'trans_mat', 'affine_aug', 'sweep_img_metas', 'ori_cam2img',
'cam2global', 'crop_offset', 'img_crop_offset', 'resize_img_shape',
'lidar2cam', 'ori_lidar2img', 'num_ref_frames', 'num_views',
'ego2global', 'fov_ori2aug', 'ego2cam', 'axis_align_matrix',
'text', 'tokens_positive', 'scan_id', "distractor_bboxes", "anchor_bboxes",
"trans_mat", "ignore_mask",
)):
self.keys = keys
self.meta_keys = meta_keys
def _remove_prefix(self, key: str) -> str:
if key.startswith('gt_'):
key = key[3:]
return key
def transform(self, results: Union[dict,
List[dict]]):
"""Method to pack the input data. when the value in this dict is a
list, it usually is in Augmentations Testing.
Args:
results (dict | list[dict]): Result dict from the data pipeline.
Returns:
dict | List[dict]:
- 'inputs' (dict): The forward data of models. It usually contains
following keys:
- points
- img
- 'data_samples' (:obj:`Det3DDataSample`): The annotation info of
the sample.
"""
# augtest
if isinstance(results, list):
if len(results) == 1:
# simple test
return self.pack_single_results(results[0])
pack_results = []
for single_result in results:
pack_results.append(self.pack_single_results(single_result))
return pack_results
# norm training and simple testing
elif isinstance(results, dict):
return self.pack_single_results(results)
else:
raise NotImplementedError
def pack_single_results(self, results: dict) -> dict:
"""Method to pack the single input data. when the value in this dict is
a list, it usually is in Augmentations Testing.
Args:
results (dict): Result dict from the data pipeline.
Returns:
dict: A dict contains
- 'inputs' (dict): The forward data of models. It usually contains
following keys:
- points
- img
- 'data_samples' (:obj:`Det3DDataSample`): The annotation info
of the sample.
"""
if 'points' in results:
if isinstance(results['points'], BasePoints):
results['points'] = results['points'].tensor
# multi-sweep points
elif isinstance(results['points'], list):
if isinstance(results['points'][0], BasePoints):
for idx in range(len(results['points'])):
results['points'][idx] = results['points'][idx].tensor
for key in ["img", "depth_img"]:
if key not in results:
continue
if isinstance(results[key], list):
# process multiple imgs in single frame
imgs = np.stack(results[key], axis=0)
if len(imgs.shape) <= 3:
imgs = imgs[..., None]
if imgs.flags.c_contiguous:
imgs = to_tensor(imgs).permute(0, 3, 1, 2).contiguous()
else:
imgs = to_tensor(
np.ascontiguousarray(imgs.transpose(0, 3, 1, 2)))
results[key] = imgs
else:
img = results[key]
if len(img.shape) < 3:
img = np.expand_dims(img, -1)
                # To improve the computational speed by 3-5 times, apply
                # `torch.permute()` rather than `np.transpose()`.
                # Refer to https://github.com/open-mmlab/mmdetection/pull/9533
                # for more details.
if img.flags.c_contiguous:
img = to_tensor(img).permute(2, 0, 1).contiguous()
else:
img = to_tensor(
np.ascontiguousarray(img.transpose(2, 0, 1)))
results[key] = img
for key in [
'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels',
'gt_bboxes_labels', 'attr_labels', 'pts_instance_mask',
'pts_semantic_mask', 'centers_2d', 'depths', 'gt_labels_3d'
]:
if key not in results:
continue
if isinstance(results[key], list):
results[key] = [to_tensor(res) for res in results[key]]
else:
results[key] = to_tensor(results[key])
if 'gt_bboxes_3d' in results:
# multi-sweep version
if isinstance(results['gt_bboxes_3d'], list):
if not isinstance(results['gt_bboxes_3d'][0],
BaseInstance3DBoxes):
for idx in range(len(results['gt_bboxes_3d'])):
results['gt_bboxes_3d'][idx] = to_tensor(
results['gt_bboxes_3d'][idx])
elif not isinstance(results['gt_bboxes_3d'], BaseInstance3DBoxes):
results['gt_bboxes_3d'] = to_tensor(results['gt_bboxes_3d'])
if 'gt_semantic_seg' in results:
results['gt_semantic_seg'] = to_tensor(
results['gt_semantic_seg'][None])
if 'gt_seg_map' in results:
results['gt_seg_map'] = results['gt_seg_map'][None, ...]
if 'depth_map' in results:
results['depth_map'] = to_tensor(results['depth_map'])
data_sample = Det3DDataElement()
gt_instances_3d = InstanceData()
gt_instances = InstanceData()
gt_pts_seg = PointData()
gt_depth_map = PixelData()
data_metas = {}
for key in self.meta_keys:
if key in results:
data_metas[key] = results[key]
# TODO: unify ScanNet multi-view info with nuScenes and Waymo
elif 'images' in results and isinstance(results['images'], dict):
if len(results['images'].keys()) == 1:
cam_type = list(results['images'].keys())[0]
# single-view image
if key in results['images'][cam_type]:
data_metas[key] = results['images'][cam_type][key]
else:
# multi-view image
img_metas = []
cam_types = list(results['images'].keys())
for cam_type in cam_types:
if key in results['images'][cam_type]:
img_metas.append(results['images'][cam_type][key])
if len(img_metas) > 0:
data_metas[key] = img_metas
elif 'lidar_points' in results and isinstance(
results['lidar_points'], dict):
if key in results['lidar_points']:
data_metas[key] = results['lidar_points'][key]
data_sample.set_metainfo(data_metas)
inputs = {}
for key in self.keys:
if key in results:
if key in self.INPUTS_KEYS:
inputs[key] = results[key]
elif key in self.INSTANCEDATA_3D_KEYS:
gt_instances_3d[self._remove_prefix(key)] = results[key]
elif key in self.INSTANCEDATA_2D_KEYS:
if key == 'gt_bboxes_labels':
gt_instances['labels'] = results[key]
else:
gt_instances[self._remove_prefix(key)] = results[key]
elif key in self.SEG_KEYS:
gt_pts_seg[self._remove_prefix(key)] = results[key]
elif key == 'depth_map':
gt_depth_map.set_data(dict(data=results[key]))
elif key == 'gt_occupancy':
data_sample.gt_occupancy = to_tensor(
results['gt_occupancy'])
if isinstance(results['gt_occupancy_masks'], list):
data_sample.gt_occupancy_masks = [
to_tensor(mask)
for mask in results['gt_occupancy_masks']
]
else:
data_sample.gt_occupancy_masks = to_tensor(
results['gt_occupancy_masks'])
else:
                    raise NotImplementedError(f'Please modify '
                                              f'`Pack3DDetInputs` '
                                              f'to put {key} into the '
                                              f'corresponding field')
data_sample.gt_instances_3d = gt_instances_3d
data_sample.gt_instances = gt_instances
data_sample.gt_pts_seg = gt_pts_seg
data_sample.gt_depth_map = gt_depth_map
if 'eval_ann_info' in results:
data_sample.eval_ann_info = results['eval_ann_info']
else:
data_sample.eval_ann_info = None
packed_results = dict()
packed_results['data_samples'] = data_sample
packed_results['inputs'] = inputs
return packed_results
def __repr__(self) -> str:
"""str: Return a string that describes the module."""
repr_str = self.__class__.__name__
        repr_str += f'(keys={self.keys}, '
        repr_str += f'meta_keys={self.meta_keys})'
return repr_str
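
# --- Illustrative sketch, not used by the pipeline ---
# `pack_single_results` stacks multi-view images from (N, H, W, C) numpy
# layout into a contiguous (N, C, H, W) tensor. A hypothetical round-trip
# check of that convention:
def _demo_img_packing():
    imgs = np.random.rand(2, 4, 5, 3).astype(np.float32)  # (N, H, W, C)
    packed = to_tensor(imgs).permute(0, 3, 1, 2).contiguous()
    assert packed.shape == (2, 3, 4, 5)
    assert torch.allclose(packed[0, :, 1, 2], to_tensor(imgs[0, 1, 2]))
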
================================================
FILE: bip3d/datasets/transforms/loading.py
================================================
from typing import Optional
import mmcv
import mmengine
import numpy as np
from mmcv.transforms import BaseTransform
from mmdet.datasets.transforms import LoadAnnotations
from bip3d.registry import TRANSFORMS
@TRANSFORMS.register_module()
class LoadDepthFromFile(BaseTransform):
"""Load a depth image from file.
Required Keys:
- depth_img_path
Modified Keys:
- depth_img
- depth_img_shape
Args:
imdecode_backend (str): The image decoding backend type. The backend
argument for :func:`mmcv.imfrombytes`.
See :func:`mmcv.imfrombytes` for details.
Defaults to 'cv2'.
ignore_empty (bool): Whether to allow loading empty image or file path
not existent. Defaults to False.
backend_args (dict, optional): Instantiates the corresponding file
backend. It may contain `backend` key to specify the file
backend. If it contains, the file backend corresponding to this
value will be used and initialized with the remaining values,
otherwise the corresponding file backend will be selected
based on the prefix of the file path. Defaults to None.
New in version 2.0.0rc4.
"""
def __init__(self,
imdecode_backend: str = 'cv2',
ignore_empty: bool = False,
*,
backend_args: Optional[dict] = None):
self.ignore_empty = ignore_empty
self.imdecode_backend = imdecode_backend
self.backend_args = None
if backend_args is not None:
self.backend_args = backend_args.copy()
def transform(self, results: dict):
"""Functions to load image.
Args:
results (dict): Result dict from
:class:`mmengine.dataset.BaseDataset`.
Returns:
dict: The dict contains loaded image and meta information.
"""
filename = results['depth_img_path']
depth_shift = results['depth_shift']
try:
depth_img_bytes = mmengine.fileio.get(
filename, backend_args=self.backend_args)
depth_img = mmcv.imfrombytes(depth_img_bytes,
flag='unchanged',
backend=self.imdecode_backend).astype(
np.float32) / depth_shift
except Exception as e:
if self.ignore_empty:
return None
else:
raise e
results['depth_img'] = depth_img
return results
    def __repr__(self):
        repr_str = (f'{self.__class__.__name__}('
                    f'ignore_empty={self.ignore_empty}, '
                    f"imdecode_backend='{self.imdecode_backend}', "
                    f'backend_args={self.backend_args})')
        return repr_str
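
# --- Illustrative sketch, not used by the pipeline ---
# Depth images are commonly stored as uint16 in millimetres; dividing by
# `depth_shift` (e.g. 1000) recovers metres, which is what the transform
# above does after decoding. `_demo_depth_shift` is a hypothetical helper
# that skips the actual file I/O.
def _demo_depth_shift():
    raw = np.array([[0, 1500], [3000, 65535]], dtype=np.uint16)
    depth_shift = 1000.0
    depth_m = raw.astype(np.float32) / depth_shift
    assert np.isclose(depth_m[0, 1], 1.5)
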
# TODO : refine
@TRANSFORMS.register_module()
class LoadAnnotations3D(LoadAnnotations):
"""Load Annotations3D.
Load instance mask and semantic mask of points and
encapsulate the items into related fields.
Required Keys:
- ann_info (dict)
- gt_bboxes_3d (:obj:`LiDARInstance3DBoxes` |
:obj:`DepthInstance3DBoxes` | :obj:`CameraInstance3DBoxes`):
3D ground truth bboxes. Only when `with_bbox_3d` is True
- gt_labels_3d (np.int64): Labels of ground truths.
Only when `with_label_3d` is True.
- gt_bboxes (np.float32): 2D ground truth bboxes.
Only when `with_bbox` is True.
- gt_labels (np.ndarray): Labels of ground truths.
Only when `with_label` is True.
- depths (np.ndarray): Only when
`with_bbox_depth` is True.
- centers_2d (np.ndarray): Only when
`with_bbox_depth` is True.
- attr_labels (np.ndarray): Attribute labels of instances.
Only when `with_attr_label` is True.
- pts_instance_mask_path (str): Path of instance mask file.
Only when `with_mask_3d` is True.
- pts_semantic_mask_path (str): Path of semantic mask file.
Only when `with_seg_3d` is True.
    - pts_panoptic_mask_path (str): Path of panoptic mask file.
        Only when `with_panoptic_3d` is True.
Added Keys:
- gt_bboxes_3d (:obj:`LiDARInstance3DBoxes` |
:obj:`DepthInstance3DBoxes` | :obj:`CameraInstance3DBoxes`):
3D ground truth bboxes. Only when `with_bbox_3d` is True
- gt_labels_3d (np.int64): Labels of ground truths.
Only when `with_label_3d` is True.
- gt_bboxes (np.float32): 2D ground truth bboxes.
Only when `with_bbox` is True.
- gt_labels (np.int64): Labels of ground truths.
Only when `with_label` is True.
- depths (np.float32): Only when
`with_bbox_depth` is True.
- centers_2d (np.ndarray): Only when
`with_bbox_depth` is True.
- attr_labels (np.int64): Attribute labels of instances.
Only when `with_attr_label` is True.
- pts_instance_mask (np.int64): Instance mask of each point.
Only when `with_mask_3d` is True.
- pts_semantic_mask (np.int64): Semantic mask of each point.
Only when `with_seg_3d` is True.
Args:
with_bbox_3d (bool): Whether to load 3D boxes. Defaults to True.
with_label_3d (bool): Whether to load 3D labels. Defaults to True.
with_attr_label (bool): Whether to load attribute label.
Defaults to False.
with_mask_3d (bool): Whether to load 3D instance masks for points.
Defaults to False.
with_seg_3d (bool): Whether to load 3D semantic masks for points.
Defaults to False.
with_bbox (bool): Whether to load 2D boxes. Defaults to False.
with_label (bool): Whether to load 2D labels. Defaults to False.
with_mask (bool): Whether to load 2D instance masks. Defaults to False.
with_seg (bool): Whether to load 2D semantic masks. Defaults to False.
with_bbox_depth (bool): Whether to load 2.5D boxes. Defaults to False.
with_panoptic_3d (bool): Whether to load 3D panoptic masks for points.
Defaults to False.
poly2mask (bool): Whether to convert polygon annotations to bitmasks.
Defaults to True.
seg_3d_dtype (str): String of dtype of 3D semantic masks.
Defaults to 'np.int64'.
seg_offset (int): The offset to split semantic and instance labels from
panoptic labels. Defaults to None.
dataset_type (str): Type of dataset used for splitting semantic and
instance labels. Defaults to None.
backend_args (dict, optional): Arguments to instantiate the
corresponding backend. Defaults to None.
"""
def __init__(self,
with_bbox_3d: bool = True,
with_label_3d: bool = True,
with_depth_map: bool = False,
with_attr_label: bool = False,
with_mask_3d: bool = False,
with_seg_3d: bool = False,
with_bbox: bool = False,
with_label: bool = False,
with_mask: bool = False,
with_seg: bool = False,
with_bbox_depth: bool = False,
with_panoptic_3d: bool = False,
with_visible_instance_masks: bool = False,
with_occupancy: bool = False,
with_visible_occupancy_masks: bool = False,
poly2mask: bool = True,
seg_3d_dtype: str = 'np.int64',
seg_offset: int = None,
dataset_type: str = None,
backend_args: Optional[dict] = None):
super().__init__(with_bbox=with_bbox,
with_label=with_label,
with_mask=with_mask,
with_seg=with_seg,
poly2mask=poly2mask,
backend_args=backend_args)
self.with_bbox_3d = with_bbox_3d
self.with_bbox_depth = with_bbox_depth
self.with_label_3d = with_label_3d
self.with_depth_map = with_depth_map
self.with_attr_label = with_attr_label
self.with_mask_3d = with_mask_3d
self.with_seg_3d = with_seg_3d
self.with_panoptic_3d = with_panoptic_3d
self.with_visible_instance_masks = with_visible_instance_masks
self.with_occupancy = with_occupancy
self.with_visible_occupancy_masks = with_visible_occupancy_masks
self.seg_3d_dtype = eval(seg_3d_dtype)
self.seg_offset = seg_offset
self.dataset_type = dataset_type
def _load_bboxes_3d(self, results: dict):
"""Private function to move the 3D bounding box annotation from
`ann_info` field to the root of `results`.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict containing loaded 3D bounding box annotations.
"""
results['gt_bboxes_3d'] = results['ann_info']['gt_bboxes_3d']
return results
def _load_bboxes_depth(self, results: dict):
"""Private function to load 2.5D bounding box annotations.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict containing loaded 2.5D bounding box annotations.
"""
results['depths'] = results['ann_info']['depths']
results['centers_2d'] = results['ann_info']['centers_2d']
return results
def _load_labels_3d(self, results: dict):
"""Private function to load label annotations.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict containing loaded label annotations.
"""
results['gt_labels_3d'] = results['ann_info']['gt_labels_3d']
return results
def _load_attr_labels(self, results: dict):
"""Private function to load label annotations.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict containing loaded label annotations.
"""
results['attr_labels'] = results['ann_info']['attr_labels']
return results
def _load_depth_map(self, results: dict):
img_filename = results['img_path']
pts_filename = img_filename.replace('samples', 'depth_points') + '.bin'
results['depth_map_path'] = pts_filename
        if getattr(self, 'file_client', None) is None:
            self.file_client = mmengine.FileClient(
                **(self.backend_args or {}))
try:
pts_bytes = self.file_client.get(pts_filename)
points = np.frombuffer(pts_bytes, dtype=np.float32)
except ConnectionError:
mmengine.check_file_exist(pts_filename)
if pts_filename.endswith('.npy'):
points = np.load(pts_filename)
else:
points = np.fromfile(pts_filename, dtype=np.float32)
pts_img = points.reshape(-1, 3)
img_shape = results['ori_shape']
depth_img = np.zeros(img_shape, dtype=np.float32)
iy = np.round(pts_img[:, 1]).astype(np.int64)
ix = np.round(pts_img[:, 0]).astype(np.int64)
depth_img[iy, ix] = pts_img[:, 2]
results['depth_map'] = depth_img
return results
def _load_masks_3d(self, results: dict):
"""Private function to load 3D mask annotations.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict containing loaded 3D mask annotations.
"""
pts_instance_mask_path = results['pts_instance_mask_path']
try:
mask_bytes = mmengine.fileio.get(pts_instance_mask_path,
backend_args=self.backend_args)
pts_instance_mask = np.frombuffer(mask_bytes, dtype=np.int64)
except ConnectionError:
mmengine.check_file_exist(pts_instance_mask_path)
pts_instance_mask = np.fromfile(pts_instance_mask_path,
dtype=np.int64)
results['pts_instance_mask'] = pts_instance_mask
# 'eval_ann_info' will be passed to evaluator
if 'eval_ann_info' in results:
results['eval_ann_info']['pts_instance_mask'] = pts_instance_mask
return results
def _load_semantic_seg_3d(self, results: dict):
"""Private function to load 3D semantic segmentation annotations.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict containing the semantic segmentation annotations.
"""
pts_semantic_mask_path = results['pts_semantic_mask_path']
try:
mask_bytes = mmengine.fileio.get(pts_semantic_mask_path,
backend_args=self.backend_args)
# add .copy() to fix read-only bug
pts_semantic_mask = np.frombuffer(mask_bytes,
dtype=self.seg_3d_dtype).copy()
except ConnectionError:
mmengine.check_file_exist(pts_semantic_mask_path)
pts_semantic_mask = np.fromfile(pts_semantic_mask_path,
dtype=np.int64)
if self.dataset_type == 'semantickitti':
pts_semantic_mask = pts_semantic_mask.astype(np.int64)
pts_semantic_mask = pts_semantic_mask % self.seg_offset
# nuScenes loads semantic and panoptic labels from different files.
results['pts_semantic_mask'] = pts_semantic_mask
# 'eval_ann_info' will be passed to evaluator
if 'eval_ann_info' in results:
results['eval_ann_info']['pts_semantic_mask'] = pts_semantic_mask
return results
def _load_panoptic_3d(self, results: dict):
"""Private function to load 3D panoptic segmentation annotations.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict containing the panoptic segmentation annotations.
"""
pts_panoptic_mask_path = results['pts_panoptic_mask_path']
try:
mask_bytes = mmengine.fileio.get(pts_panoptic_mask_path,
backend_args=self.backend_args)
# add .copy() to fix read-only bug
pts_panoptic_mask = np.frombuffer(mask_bytes,
dtype=self.seg_3d_dtype).copy()
except ConnectionError:
mmengine.check_file_exist(pts_panoptic_mask_path)
pts_panoptic_mask = np.fromfile(pts_panoptic_mask_path,
dtype=np.int64)
if self.dataset_type == 'semantickitti':
pts_semantic_mask = pts_panoptic_mask.astype(np.int64)
pts_semantic_mask = pts_semantic_mask % self.seg_offset
elif self.dataset_type == 'nuscenes':
            pts_semantic_mask = pts_panoptic_mask // self.seg_offset
results['pts_semantic_mask'] = pts_semantic_mask
# We can directly take panoptic labels as instance ids.
pts_instance_mask = pts_panoptic_mask.astype(np.int64)
results['pts_instance_mask'] = pts_instance_mask
# 'eval_ann_info' will be passed to evaluator
if 'eval_ann_info' in results:
results['eval_ann_info']['pts_semantic_mask'] = pts_semantic_mask
results['eval_ann_info']['pts_instance_mask'] = pts_instance_mask
return results
def _load_visible_instance_masks(self, results: dict):
"""Private function to move the 3D bounding box annotation from
`ann_info` field to the root of `results`.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict containing loaded 3D bounding box annotations.
"""
results['visible_instance_masks'] = results['ann_info'][
'visible_instance_masks']
return results
def _load_occupancy(self, results: dict):
"""Private function to move the 3D bounding box annotation from
`ann_info` field to the root of `results`.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict containing loaded 3D bounding box annotations.
"""
results['gt_occupancy'] = results['ann_info']['gt_occupancy']
return results
def _load_visible_occupancy_masks(self, results: dict):
"""Private function to move the 3D bounding box annotation from
`ann_info` field to the root of `results`.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict containing loaded 3D bounding box annotations.
"""
results['visible_occupancy_masks'] = results['ann_info'][
'visible_occupancy_masks']
return results
def _load_bboxes(self, results: dict):
"""Private function to load bounding box annotations.
The only difference is it remove the proceess for
`ignore_flag`
Args:
results (dict): Result dict from :obj:`mmcv.BaseDataset`.
Returns:
dict: The dict contains loaded bounding box annotations.
"""
results['gt_bboxes'] = results['ann_info']['gt_bboxes']
def _load_labels(self, results: dict):
"""Private function to load label annotations.
Args:
            results (dict): Result dict from :obj:`mmcv.BaseDataset`.
Returns:
dict: The dict contains loaded label annotations.
"""
results['gt_bboxes_labels'] = results['ann_info']['gt_bboxes_labels']
def transform(self, results: dict):
"""Function to load multiple types annotations.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict containing loaded 3D bounding box, label, mask and
semantic segmentation annotations.
"""
results = super().transform(results)
if self.with_bbox_3d:
results = self._load_bboxes_3d(results)
if self.with_bbox_depth:
results = self._load_bboxes_depth(results)
if self.with_label_3d:
results = self._load_labels_3d(results)
if self.with_depth_map:
results = self._load_depth_map(results)
if self.with_attr_label:
results = self._load_attr_labels(results)
if self.with_panoptic_3d:
results = self._load_panoptic_3d(results)
if self.with_mask_3d:
results = self._load_masks_3d(results)
if self.with_seg_3d:
results = self._load_semantic_seg_3d(results)
if self.with_visible_instance_masks:
results = self._load_visible_instance_masks(results)
if self.with_occupancy:
results = self._load_occupancy(results)
if self.with_visible_occupancy_masks:
results = self._load_visible_occupancy_masks(results)
return results
def __repr__(self):
"""str: Return a string that describes the module."""
indent_str = ' '
repr_str = self.__class__.__name__ + '(\n'
repr_str += f'{indent_str}with_bbox_3d={self.with_bbox_3d}, '
repr_str += f'{indent_str}with_label_3d={self.with_label_3d}, '
repr_str += f'{indent_str}with_attr_label={self.with_attr_label}, '
repr_str += f'{indent_str}with_mask_3d={self.with_mask_3d}, '
repr_str += f'{indent_str}with_seg_3d={self.with_seg_3d}, '
repr_str += f'{indent_str}with_panoptic_3d={self.with_panoptic_3d}, '
repr_str += f'{indent_str}with_bbox={self.with_bbox}, '
repr_str += f'{indent_str}with_label={self.with_label}, '
repr_str += f'{indent_str}with_mask={self.with_mask}, '
repr_str += f'{indent_str}with_seg={self.with_seg}, '
repr_str += f'{indent_str}with_bbox_depth={self.with_bbox_depth}, '
        repr_str += f'{indent_str}poly2mask={self.poly2mask}, '
        repr_str += f'{indent_str}seg_offset={self.seg_offset}, '
        repr_str += f'{indent_str}with_visible_instance_masks='
        repr_str += f'{self.with_visible_instance_masks}, '
        repr_str += f'{indent_str}with_occupancy={self.with_occupancy}, '
        repr_str += f'{indent_str}with_visible_occupancy_masks='
        repr_str += f'{self.with_visible_occupancy_masks})'
return repr_str
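
# --- Illustrative sketch, not used by the pipeline ---
# The `with_*` flags map one-to-one onto the `_load_*` hooks dispatched in
# `transform` above. A hypothetical pipeline entry in mmengine config style
# (keys chosen for illustration only):
_demo_load_annotations_cfg = dict(
    type='LoadAnnotations3D',
    with_bbox_3d=True,                 # -> _load_bboxes_3d
    with_label_3d=True,                # -> _load_labels_3d
    with_visible_instance_masks=True,  # -> _load_visible_instance_masks
)
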
================================================
FILE: bip3d/datasets/transforms/multiview.py
================================================
import copy
import random
import numpy as np
import torch
from mmcv.transforms import BaseTransform, Compose
from bip3d.registry import TRANSFORMS
from bip3d.structures.points import get_points_type
from ..utils import sample
@TRANSFORMS.register_module()
class MultiViewPipeline(BaseTransform):
"""HARD CODE"""
def __init__(
self,
transforms,
n_images,
max_n_images=None,
ordered=False,
rotate_3rscan=False,
):
super().__init__()
self.transforms = Compose(transforms)
self.n_images = n_images
self.max_n_images = (
max_n_images if max_n_images is not None else n_images
)
self.ordered = ordered
self.rotate_3rscan = rotate_3rscan
def transform(self, results: dict):
"""Transform function.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: output dict after transformation.
"""
imgs = []
img_paths = []
points = []
intrinsics = []
extrinsics = []
depth_imgs = []
trans_mat = []
total_n = len(results["img_path"])
sample_n = min(max(self.n_images, total_n), self.max_n_images)
ids = sample(total_n, sample_n, self.ordered)
for i in ids.tolist():
_results = dict()
_results["img_path"] = results["img_path"][i]
if "depth_img_path" in results:
_results["depth_img_path"] = results["depth_img_path"][i]
if isinstance(results["depth_cam2img"], list):
_results["depth_cam2img"] = results["depth_cam2img"][i]
_results["cam2img"] = results["depth2img"]["intrinsic"][i]
else:
_results["depth_cam2img"] = results["depth_cam2img"]
_results["cam2img"] = results["cam2img"]
_results["depth_shift"] = results["depth_shift"]
_results = self.transforms(_results)
if "depth_shift" in _results:
_results.pop("depth_shift")
if "img" in _results:
imgs.append(_results["img"])
img_paths.append(_results["img_path"])
if "depth_img" in _results:
depth_imgs.append(_results["depth_img"])
if "points" in _results:
points.append(_results["points"])
if _results.get("modify_cam2img"):
intrinsics.append(_results["cam2img"])
elif isinstance(results["depth2img"]["intrinsic"], list):
intrinsics.append(results["depth2img"]["intrinsic"][i])
else:
intrinsics.append(results["depth2img"]["intrinsic"])
extrinsics.append(results["depth2img"]["extrinsic"][i])
if "trans_mat" in _results:
trans_mat.append(_results["trans_mat"])
for key in _results.keys():
if key not in ["img", "points", "img_path"]:
results[key] = _results[key]
if len(imgs):
if self.rotate_3rscan and "3rscan" in img_paths[0]:
imgs = [np.transpose(x, (1, 0, 2)) for x in imgs]
rot_mat = np.array(
[
[0, 1, 0, 0],
[1, 0, 0, 0],
[0, 0, 1, 0],
[0, 0, 0, 1],
]
)
rot_mat = [np.linalg.inv(x) @ rot_mat @ x for x in intrinsics]
extrinsics = [x @ y for x, y in zip(rot_mat, extrinsics)]
results["scale_factor"] = results["scale_factor"][::-1]
results["ori_shape"] = results["ori_shape"][::-1]
results["img"] = imgs
results["img_path"] = img_paths
if len(depth_imgs):
if self.rotate_3rscan and "3rscan" in img_paths[0]:
depth_imgs = [np.transpose(x, (1, 0)) for x in depth_imgs]
results["depth_img"] = depth_imgs
if len(points):
results["points"] = points
if (
"ann_info" in results
and "visible_instance_masks" in results["ann_info"]
):
results["visible_instance_masks"] = [
results["ann_info"]["visible_instance_masks"][i] for i in ids
]
results["ann_info"]["visible_instance_masks"] = results[
"visible_instance_masks"
]
elif "visible_instance_masks" in results:
results["visible_instance_masks"] = [
results["visible_instance_masks"][i] for i in ids
]
results["depth2img"]["intrinsic"] = intrinsics
results["depth2img"]["extrinsic"] = extrinsics
if len(trans_mat) != 0:
results["depth2img"]["trans_mat"] = trans_mat
return results
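
# --- Illustrative sketch, not used by the pipeline ---
# The 3RScan branch above transposes each image (swapping the u and v pixel
# axes) and conjugates that pixel-space swap by the intrinsics,
# inv(K) @ swap @ K, to obtain the camera-space correction folded into the
# extrinsics. `_demo_3rscan_axis_swap` is a hypothetical helper checking the
# conjugation identity with made-up intrinsics.
def _demo_3rscan_axis_swap():
    K = np.eye(4)
    K[0, 0], K[1, 1], K[0, 2], K[1, 2] = 500.0, 500.0, 320.0, 240.0
    swap = np.eye(4)[[1, 0, 2, 3]]  # permutation that swaps u and v
    corr = np.linalg.inv(K) @ swap @ K
    p_cam = np.array([0.2, -0.1, 1.0, 1.0])  # point in camera coordinates
    # Projecting the corrected point equals swapping the original projection.
    assert np.allclose(K @ (corr @ p_cam), swap @ (K @ p_cam))
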
================================================
FILE: bip3d/datasets/transforms/transform.py
================================================
from typing import List, Optional, Tuple, Union
import cv2
import copy
import numpy as np
from numpy import random
import mmcv
import torch
from PIL import Image
from mmcv.image.geometric import _scale_size
from mmcv.transforms import BaseTransform, Resize
from bip3d.registry import TRANSFORMS
from bip3d.structures.bbox_3d import points_cam2img, points_img2cam
from bip3d.structures.points import BasePoints, get_points_type
@TRANSFORMS.register_module()
class CategoryGroundingDataPrepare(BaseTransform):
def __init__(
self,
classes,
training,
max_class=None,
sep_token="[SEP]",
filter_others=True,
z_range=None,
filter_invisible=False,
instance_mask_key="visible_instance_masks",
):
self.classes = list(classes)
self.training = training
self.max_class = max_class
self.sep_token = sep_token
self.filter_others = filter_others
self.z_range = z_range
self.filter_invisible = filter_invisible
self.instance_mask_key = instance_mask_key
def transform(self, results):
visible_instance_masks = results.get(self.instance_mask_key)
if isinstance(visible_instance_masks, (list, tuple)):
visible_instance_masks = np.stack(visible_instance_masks).any(axis=0)
ignore_mask = results.get("ignore_mask")
gt_names = results["ann_info"]["gt_names"]
if self.filter_others and "gt_labels_3d" in results:
mask = results["gt_labels_3d"] >= 0
results["gt_labels_3d"] = results["gt_labels_3d"][mask]
results["gt_bboxes_3d"] = results["gt_bboxes_3d"][mask]
visible_instance_masks = visible_instance_masks[mask]
gt_names = [x for i, x in enumerate(gt_names) if mask[i]]
if ignore_mask is not None:
ignore_mask = ignore_mask[mask]
if (
self.z_range is not None
and "gt_labels_3d" in results
and self.training
):
mask = torch.logical_and(
results["gt_bboxes_3d"].tensor[..., 2] >= self.z_range[0],
results["gt_bboxes_3d"].tensor[..., 2] <= self.z_range[1],
).numpy()
results["gt_labels_3d"] = results["gt_labels_3d"][mask]
results["gt_bboxes_3d"] = results["gt_bboxes_3d"][mask]
visible_instance_masks = visible_instance_masks[mask]
gt_names = [x for i, x in enumerate(gt_names) if mask[i]]
if ignore_mask is not None:
ignore_mask = ignore_mask[mask]
if self.training or self.filter_invisible:
results["gt_labels_3d"] = results["gt_labels_3d"][
visible_instance_masks
]
results["gt_bboxes_3d"] = results["gt_bboxes_3d"][
visible_instance_masks
]
gt_names = [
x for i, x in enumerate(gt_names) if visible_instance_masks[i]
]
if ignore_mask is not None:
ignore_mask = ignore_mask[visible_instance_masks]
results["ignore_mask"] = ignore_mask
if self.training:
if (
self.max_class is not None
and len(self.classes) > self.max_class
):
classes = copy.deepcopy(gt_names)
random.shuffle(self.classes)
for c in self.classes:
if c in classes:
continue
classes.append(c)
if len(classes) >= self.max_class:
break
random.shuffle(classes)
else:
classes = copy.deepcopy(self.classes)
else:
classes = copy.deepcopy(self.classes)
gt_names = classes
results["text"] = self.sep_token.join(classes)
tokens_positive = []
for name in gt_names:
start = results["text"].find(
self.sep_token + name + self.sep_token
)
if start == -1:
if results["text"].startswith(name + self.sep_token):
start = 0
else:
start = results["text"].find(self.sep_token + name) + len(
self.sep_token
)
else:
start += len(self.sep_token)
end = start + len(name)
tokens_positive.append([[start, end]])
results["tokens_positive"] = tokens_positive
return results
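
# --- Illustrative sketch, not used by the pipeline ---
# `tokens_positive` stores the character span of each ground-truth class
# inside the '[SEP]'-joined prompt built above. A hypothetical check of the
# span computation:
def _demo_tokens_positive():
    classes = ["chair", "table", "sofa"]
    text = "[SEP]".join(classes)  # 'chair[SEP]table[SEP]sofa'
    start = text.find("[SEP]" + "table") + len("[SEP]")
    end = start + len("table")
    assert [start, end] == [10, 15]
    assert text[start:end] == "table"
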
@TRANSFORMS.register_module()
class CamIntrisicStandardization(BaseTransform):
def __init__(self, dst_intrinsic, dst_wh):
if not isinstance(dst_intrinsic, np.ndarray):
dst_intrinsic = np.array(dst_intrinsic)
if dst_intrinsic.shape[0] == 3:
tmp = np.eye(4)
tmp[:3, :3] = dst_intrinsic
dst_intrinsic = tmp
self.dst_intrinsic = dst_intrinsic
self.dst_wh = dst_wh
u, v = np.arange(dst_wh[0]), np.arange(dst_wh[1])
u = np.repeat(u[None], dst_wh[1], 0)
v = np.repeat(v[:, None], dst_wh[0], 1)
uv = np.stack([u, v, np.ones_like(u)], axis=-1)
self.dst_pts = uv @ np.linalg.inv(self.dst_intrinsic[:3, :3]).T
def transform(self, results):
src_intrinsic = results["cam2img"][:3, :3]
src_uv = self.dst_pts @ src_intrinsic.T
src_uv = src_uv.astype(np.float32)
if "depth_img" in results and results["img"].shape[:2] != results["depth_img"].shape[:2]:
results["depth_img"] = cv2.resize(
results["depth_img"], results["img"].shape[:2][::-1],
interpolation=cv2.INTER_LINEAR,
)
for key in ["img", "depth_img"]:
if key not in results:
continue
results[key] = cv2.remap(
results[key],
src_uv[..., 0],
src_uv[..., 1],
cv2.INTER_NEAREST,
)
# warp_mat = self.dst_intrinsic[:3, :3] @ np.linalg.inv(src_intrinsic)
# cv2.warpAffine(results[key], warp_mat[:2, :3], self.dst_wh)
results["cam2img"] = copy.deepcopy(self.dst_intrinsic)
results["depth_cam2img"] = copy.deepcopy(self.dst_intrinsic)
results["scale"] = self.dst_wh
results['img_shape'] = results["img"].shape[:2]
results['scale_factor'] = (1, 1)
results['keep_ratio'] = False
results["modify_cam2img"] = True
return results
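
# --- Illustrative sketch, not used by the pipeline ---
# The remap above works per target pixel: back-project through the target
# intrinsics to a unit-depth ray, re-project through the source intrinsics,
# and sample the source image there. `_demo_intrinsic_remap` is a
# hypothetical helper with made-up intrinsics.
def _demo_intrinsic_remap():
    K_src = np.array([[600.0, 0.0, 330.0],
                      [0.0, 600.0, 250.0],
                      [0.0, 0.0, 1.0]])
    K_dst = np.array([[500.0, 0.0, 320.0],
                      [0.0, 500.0, 240.0],
                      [0.0, 0.0, 1.0]])
    uv_dst = np.array([320.0, 240.0, 1.0])  # principal point of the target
    ray = np.linalg.inv(K_dst) @ uv_dst     # unit-depth ray, here (0, 0, 1)
    uv_src = K_src @ ray                    # source pixel to sample
    assert np.allclose(uv_src, [330.0, 250.0, 1.0])
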
@TRANSFORMS.register_module()
class CustomResize(Resize):
def _resize_img(self, results: dict, key="img"):
"""Resize images with ``results['scale']``."""
if results.get(key, None) is not None:
if self.keep_ratio:
img, scale_factor = mmcv.imrescale(
results[key],
results['scale'],
interpolation=self.interpolation,
return_scale=True,
backend=self.backend)
                # the w_scale and h_scale have a minor difference;
                # a real fix should be done in mmcv.imrescale in the future
new_h, new_w = img.shape[:2]
h, w = results[key].shape[:2]
w_scale = new_w / w
h_scale = new_h / h
else:
img, w_scale, h_scale = mmcv.imresize(
results[key],
results['scale'],
interpolation=self.interpolation,
return_scale=True,
backend=self.backend)
results[key] = img
if key == "img":
results['img_shape'] = img.shape[:2]
results['scale_factor'] = (w_scale, h_scale)
results['keep_ratio'] = self.keep_ratio
def transform(self, results: dict):
if self.scale:
results['scale'] = self.scale
else:
img_shape = results['img'].shape[:2]
results['scale'] = _scale_size(img_shape[::-1],
self.scale_factor) # type: ignore
self._resize_img(results)
self._resize_bboxes(results)
self._resize_seg(results)
self._resize_keypoints(results)
if "depth_img" in results:
self._resize_img(results, key="depth_img")
return results
@TRANSFORMS.register_module()
class DepthProbLabelGenerator(BaseTransform):
def __init__(
self,
max_depth=10,
min_depth=0.25,
num_depth=64,
stride=[8, 16, 32, 64],
origin_stride=1,
):
self.max_depth = max_depth
self.min_depth = min_depth
self.num_depth = num_depth
self.stride = [x//origin_stride for x in stride]
self.origin_stride = origin_stride
def transform(self, input_dict):
depth = input_dict["inputs"]["depth_img"].cpu().numpy()
if self.origin_stride != 1:
H, W = depth.shape[-2:]
depth = np.transpose(depth, (0, 2, 3, 1))
depth = [mmcv.imresize(
x, (W//self.origin_stride, H//self.origin_stride),
interpolation="nearest",
) for x in depth]
depth = np.stack(depth)[:,None]
depth = np.clip(
depth,
a_min=self.min_depth,
a_max=self.max_depth,
)
depth_anchor = np.linspace(
self.min_depth, self.max_depth, self.num_depth)[:, None, None]
distance = np.abs(depth - depth_anchor)
mask = distance < (depth_anchor[1] - depth_anchor[0])
depth_gt = np.where(mask, depth_anchor, 0)
y = depth_gt.sum(axis=1, keepdims=True) - depth_gt
depth_valid_mask = depth > 0
depth_prob_gt = np.where(
(depth_gt != 0) & depth_valid_mask,
(depth - y) / (depth_gt - y),
0,
)
views, _, H, W = depth.shape
gt = []
gt_map = []
for s in self.stride:
gt_tmp = np.reshape(
depth_prob_gt, (views, self.num_depth, H//s, s, W//s, s))
gt_tmp = gt_tmp.sum(axis=-1).sum(axis=3)
mask_tmp = depth_valid_mask.reshape(views, 1, H//s, s, W//s, s)
mask_tmp = mask_tmp.sum(axis=-1).sum(axis=3)
gt_tmp /= np.clip(mask_tmp, a_min=1, a_max=None)
# gt_map.append(np.transpose(gt_tmp, (0, 2, 3, 1)))
gt_tmp = gt_tmp.reshape(views, self.num_depth, -1)
gt_tmp = np.transpose(gt_tmp, (0, 2, 1))
gt.append(gt_tmp)
gt = np.concatenate(gt, axis=1)
gt = np.clip(gt, a_min=0.0, a_max=1.0)
input_dict["inputs"]["depth_prob_gt"] = torch.from_numpy(gt).to(
input_dict["inputs"]["depth_img"])
return input_dict
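
# --- Illustrative sketch, not used by the pipeline ---
# For a depth value strictly between two adjacent anchors, the target built
# above reduces to linear interpolation: the two anchor weights sum to 1 and
# their convex combination recovers the depth. `_demo_depth_prob_targets` is
# a hypothetical helper mirroring the default anchor settings.
def _demo_depth_prob_targets():
    anchors = np.linspace(0.25, 10.0, 64)
    step = anchors[1] - anchors[0]
    d = 3.0  # a depth strictly between two anchors
    i = int((d - anchors[0]) // step)
    w_lo = (anchors[i + 1] - d) / step  # weight on the lower anchor
    w_hi = (d - anchors[i]) / step      # weight on the upper anchor
    assert np.isclose(w_lo + w_hi, 1.0)
    assert np.isclose(w_lo * anchors[i] + w_hi * anchors[i + 1], d)
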
================================================
FILE: bip3d/datasets/utils.py
================================================
import numpy as np
from scipy.spatial.transform import Rotation
def sample(total_n, sample_n, fix_interval=True):
ids = np.arange(total_n)
if sample_n == total_n:
return ids
elif sample_n > total_n:
return np.concatenate([
ids, sample(total_n, sample_n - total_n, fix_interval)
])
elif fix_interval:
interval = total_n / sample_n
output = []
for i in range(sample_n):
output.append(ids[int(interval * i)])
return np.array(output)
return np.random.choice(ids, sample_n, replace=False)
def xyzrpy_to_matrix(input):
output = np.eye(4)
output[:3, :3] = Rotation.from_euler("xyz", np.array(input[3:])).as_matrix()
output[:3, 3] = input[:3]
return output
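
# --- Illustrative sketch, not used by the pipeline ---
# `sample` picks evenly spaced indices (wrapping around when oversampling)
# and `xyzrpy_to_matrix` builds a 4x4 homogeneous pose. `_demo_utils` is a
# hypothetical helper exercising both.
def _demo_utils():
    assert sample(10, 5).tolist() == [0, 2, 4, 6, 8]
    assert sample(3, 5).tolist() == [0, 1, 2, 0, 1]  # wraps and reuses frames
    pose = xyzrpy_to_matrix([1.0, 2.0, 3.0, 0.0, 0.0, 0.0])
    assert np.allclose(pose[:3, 3], [1.0, 2.0, 3.0])
    assert np.allclose(pose[:3, :3], np.eye(3))  # zero roll/pitch/yaw
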
================================================
FILE: bip3d/eval/__init__.py
================================================
from .metrics import GroundingMetric, IndoorDetMetric
__all__ = ['IndoorDetMetric', 'GroundingMetric']
================================================
FILE: bip3d/eval/indoor_eval.py
================================================
# Copyright (c) OpenRobotLab. All rights reserved.
import numpy as np
import torch
from mmengine.logging import print_log
from terminaltables import AsciiTable
# from bip3d.utils.default_color_map import DEFAULT_COLOR_MAP
import matplotlib.pyplot as plt
def average_precision(recalls, precisions, mode='area'):
"""Calculate average precision (for single or multiple scales).
Args:
recalls (np.ndarray): Recalls with shape of (num_scales, num_dets)
or (num_dets, ).
precisions (np.ndarray): Precisions with shape of
(num_scales, num_dets) or (num_dets, ).
mode (str): 'area' or '11points', 'area' means calculating the area
under precision-recall curve, '11points' means calculating
the average precision of recalls at [0, 0.1, ..., 1]
Returns:
float or np.ndarray: Calculated average precision.
"""
if recalls.ndim == 1:
recalls = recalls[np.newaxis, :]
precisions = precisions[np.newaxis, :]
assert recalls.shape == precisions.shape
assert recalls.ndim == 2
num_scales = recalls.shape[0]
ap = np.zeros(num_scales, dtype=np.float32)
if mode == 'area':
zeros = np.zeros((num_scales, 1), dtype=recalls.dtype)
ones = np.ones((num_scales, 1), dtype=recalls.dtype)
mrec = np.hstack((zeros, recalls, ones))
mpre = np.hstack((zeros, precisions, zeros))
for i in range(mpre.shape[1] - 1, 0, -1):
mpre[:, i - 1] = np.maximum(mpre[:, i - 1], mpre[:, i])
for i in range(num_scales):
ind = np.where(mrec[i, 1:] != mrec[i, :-1])[0]
ap[i] = np.sum(
(mrec[i, ind + 1] - mrec[i, ind]) * mpre[i, ind + 1])
elif mode == '11points':
for i in range(num_scales):
for thr in np.arange(0, 1 + 1e-3, 0.1):
precs = precisions[i, recalls[i, :] >= thr]
prec = precs.max() if precs.size > 0 else 0
ap[i] += prec
ap /= 11
else:
raise ValueError(
'Unrecognized mode, only "area" and "11points" are supported')
return ap
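
# --- Illustrative sketch, not used by the evaluation ---
# A quick check of `average_precision` in 'area' mode: a detector whose
# precision stays at 1.0 over the whole recall range has AP = 1.
# `_demo_average_precision` is a hypothetical helper.
def _demo_average_precision():
    recalls = np.array([0.5, 1.0])
    precisions = np.array([1.0, 1.0])
    ap = average_precision(recalls, precisions, mode='area')
    assert np.isclose(ap[0], 1.0)
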
def eval_det_cls(pred, gt, iou_thr=None, area_range=None):
"""Generic functions to compute precision/recall for object detection for a
single class.
Args:
pred (dict): Predictions mapping from image id to bounding boxes
and scores.
gt (dict): Ground truths mapping from image id to bounding boxes.
        iou_thr (list[float]): A list of iou thresholds.
        area_range (list[float], optional): Box volume range ``[min, max]``;
            ground truths outside this range are ignored. Default: None.
Return:
tuple (np.ndarray, np.ndarray, float): Recalls, precisions and
average precision.
"""
# {img_id: {'bbox': box structure, 'det': matched list}}
class_recs = {}
npos = 0
# figure out the bbox code size first
gt_bbox_code_size = 9
pred_bbox_code_size = 9
for img_id in gt.keys():
if len(gt[img_id]) != 0:
gt_bbox_code_size = gt[img_id][0].tensor.shape[1]
break
for img_id in pred.keys():
if len(pred[img_id][0]) != 0:
pred_bbox_code_size = pred[img_id][0][0].tensor.shape[1]
break
assert gt_bbox_code_size == pred_bbox_code_size
for img_id in gt.keys():
cur_gt_num = len(gt[img_id])
if cur_gt_num != 0:
gt_cur = torch.zeros([cur_gt_num, gt_bbox_code_size],
dtype=torch.float32)
for i in range(cur_gt_num):
gt_cur[i] = gt[img_id][i].tensor
bbox = gt[img_id][0].new_box(gt_cur)
else:
bbox = gt[img_id]
det = [[False] * len(bbox) for i in iou_thr]
npos += len(bbox)
class_recs[img_id] = {'bbox': bbox, 'det': det}
if area_range is not None and len(bbox) > 0:
area = bbox.tensor[:, 3:6].cumprod(dim=-1)[..., -1]
ignore = torch.logical_or(area < area_range[0], area > area_range[1])
npos -= ignore.sum().item()
class_recs[img_id]["ignore"] = ignore
# construct dets
image_ids = []
confidence = []
ious = []
for img_id in pred.keys():
cur_num = len(pred[img_id])
if cur_num == 0:
continue
pred_cur = torch.zeros((cur_num, pred_bbox_code_size),
dtype=torch.float32)
box_idx = 0
for box, score in pred[img_id]:
image_ids.append(img_id)
confidence.append(score)
# handle outlier (too thin) predicted boxes
w, l, h = box.tensor[0, 3:6]
faces = [w * l, w * h, h * l]
if torch.any(box.tensor.new_tensor(faces) < 2e-4):
# print('Find small predicted boxes,',
# 'and clamp short edges to 2e-2 meters.')
box.tensor[:, 3:6] = torch.clamp(box.tensor[:, 3:6], min=2e-2)
pred_cur[box_idx] = box.tensor
box_idx += 1
pred_cur = box.new_box(pred_cur)
gt_cur = class_recs[img_id]['bbox']
if len(gt_cur) > 0:
# calculate iou in each image
iou_cur = pred_cur.overlaps(pred_cur, gt_cur)
for i in range(cur_num):
ious.append(iou_cur[i])
else:
for i in range(cur_num):
ious.append(np.zeros(1))
confidence = np.array(confidence)
# sort by confidence
sorted_ind = np.argsort(-confidence)
image_ids = [image_ids[x] for x in sorted_ind]
ious = [ious[x] for x in sorted_ind]
# go down dets and mark TPs and FPs
num_images = len(image_ids)
tp_thr = [np.zeros(num_images) for i in iou_thr]
fp_thr = [np.zeros(num_images) for i in iou_thr]
for d in range(num_images):
R = class_recs[image_ids[d]]
iou_max = -np.inf
BBGT = R['bbox']
cur_iou = ious[d]
if len(BBGT) > 0:
# compute overlaps
for j in range(len(BBGT)):
# iou = get_iou_main(get_iou_func, (bb, BBGT[j,...]))
iou = cur_iou[j]
if iou > iou_max:
iou_max = iou
jmax = j
for iou_idx, thresh in enumerate(iou_thr):
if iou_max > thresh:
if "ignore" in R and R["ignore"][jmax]:
continue
if not R['det'][iou_idx][jmax]:
tp_thr[iou_idx][d] = 1.
R['det'][iou_idx][jmax] = 1
else:
fp_thr[iou_idx][d] = 1.
else:
fp_thr[iou_idx][d] = 1.
ret = []
for iou_idx, thresh in enumerate(iou_thr):
# compute precision recall
fp = np.cumsum(fp_thr[iou_idx])
tp = np.cumsum(tp_thr[iou_idx])
recall = tp / float(npos)
# avoid divide by zero in case the first detection matches a difficult
# ground truth
precision = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
ap = average_precision(recall, precision)
ret.append((recall, precision, ap))
return ret
def eval_map_recall(pred, gt, ovthresh=None, area_range=None):
"""Evaluate mAP and recall.
Generic functions to compute precision/recall for object detection
for multiple classes.
Args:
pred (dict): Information of detection results,
which maps class_id and predictions.
gt (dict): Information of ground truths, which maps class_id and
ground truths.
        ovthresh (list[float], optional): iou threshold. Default: None.
        area_range (list[float], optional): Box volume range used to ignore
            ground truths outside the range. Default: None.
Return:
tuple[dict]: dict results of recall, AP, and precision for all classes.
"""
ret_values = {}
for classname in gt.keys():
if classname in pred:
ret_values[classname] = eval_det_cls(pred[classname],
gt[classname], ovthresh, area_range)
recall = [{} for i in ovthresh]
precision = [{} for i in ovthresh]
ap = [{} for i in ovthresh]
for label in gt.keys():
for iou_idx, thresh in enumerate(ovthresh):
if label in pred:
recall[iou_idx][label], precision[iou_idx][label], ap[iou_idx][
label] = ret_values[label][iou_idx]
else:
recall[iou_idx][label] = np.zeros(1)
precision[iou_idx][label] = np.zeros(1)
ap[iou_idx][label] = np.zeros(1)
return recall, precision, ap
def save_pr_curve(recs, precs, metrics):
for rec, prec, metric in zip(recs, precs, metrics):
plt.figure(figsize=(20, 18))
fontsize = 50
font = 'Times New Roman'
for k in rec.keys():
if np.isnan(rec[k]).any():
continue
plt.plot(rec[k], prec[k])
plt.xlabel('Recall', fontsize=fontsize)
plt.ylabel('Precision', fontsize=fontsize)
        plt.savefig(f'./pr_curve_{metric}.png', dpi=500.)
        plt.close()
def indoor_eval(gt_annos,
dt_annos,
metric,
label2cat,
logger=None,
box_mode_3d=None,
classes_split=None,
part="Overall"):
"""Indoor Evaluation.
Evaluate the result of the detection.
Args:
gt_annos (list[dict]): Ground truth annotations.
        dt_annos (list[dict]): Detection annotations. The dict
            includes the following keys:
- labels_3d (torch.Tensor): Labels of boxes.
- bboxes_3d (:obj:`BaseInstance3DBoxes`):
3D bounding boxes in Depth coordinate.
- scores_3d (torch.Tensor): Scores of boxes.
metric (list[float]): IoU thresholds for computing average precisions.
label2cat (tuple): Map from label to category.
logger (logging.Logger | str, optional): The way to print the mAP
summary. See `mmdet.utils.print_log()` for details. Default: None.
Return:
dict[str, float]: Dict of results.
"""
assert len(dt_annos) == len(gt_annos)
pred = {} # map {class_id: pred}
gt = {} # map {class_id: gt}
for img_id in range(len(dt_annos)):
# parse detected annotations
det_anno = dt_annos[img_id]
for i in range(len(det_anno['labels_3d'])):
label = det_anno['labels_3d'].numpy()[i]
bbox = det_anno['bboxes_3d'].convert_to(box_mode_3d)[i]
score = det_anno['scores_3d'].numpy()[i]
if label not in pred:
pred[int(label)] = {}
if img_id not in pred[label]:
pred[int(label)][img_id] = []
if label not in gt:
gt[int(label)] = {}
if img_id not in gt[label]:
gt[int(label)][img_id] = []
pred[int(label)][img_id].append((bbox, score))
# parse gt annotations
gt_anno = gt_annos[img_id]
gt_boxes = gt_anno['gt_bboxes_3d']
labels_3d = gt_anno['gt_labels_3d']
for i in range(len(labels_3d)):
label = labels_3d[i]
bbox = gt_boxes[i]
if label not in gt:
gt[label] = {}
if img_id not in gt[label]:
gt[label][img_id] = []
gt[label][img_id].append(bbox)
rec, prec, ap = eval_map_recall(pred, gt, metric)
# filter nan results
ori_keys = list(ap[0].keys())
for key in ori_keys:
if np.isnan(ap[0][key][0]):
for r in rec:
del r[key]
for p in prec:
del p[key]
for a in ap:
del a[key]
ret_dict = dict()
header = ['classes']
table_columns = [[label2cat[label]
for label in ap[0].keys()] + [part]]
for i, iou_thresh in enumerate(metric):
header.append(f'AP_{iou_thresh:.2f}')
header.append(f'AR_{iou_thresh:.2f}')
rec_list = []
for label in ap[i].keys():
ret_dict[f'{label2cat[label]}_AP_{iou_thresh:.2f}'] = float(
ap[i][label][0])
ret_dict[f'mAP_{iou_thresh:.2f}'] = float(np.mean(list(
ap[i].values())))
table_columns.append(list(map(float, list(ap[i].values()))))
table_columns[-1] += [ret_dict[f'mAP_{iou_thresh:.2f}']]
table_columns[-1] = [f'{x:.4f}' for x in table_columns[-1]]
for label in rec[i].keys():
ret_dict[f'{label2cat[label]}_rec_{iou_thresh:.2f}'] = float(
rec[i][label][-1])
rec_list.append(rec[i][label][-1])
ret_dict[f'mAR_{iou_thresh:.2f}'] = float(np.mean(rec_list))
table_columns.append(list(map(float, rec_list)))
table_columns[-1] += [ret_dict[f'mAR_{iou_thresh:.2f}']]
table_columns[-1] = [f'{x:.4f}' for x in table_columns[-1]]
table_data = [header]
table_rows = list(zip(*table_columns))
table_data += table_rows
table = AsciiTable(table_data)
table.inner_footing_row_border = True
print_log('\n' + table.table, logger=logger)
if classes_split is not None:
splits = ['head', 'common', 'tail']
for idx in range(len(splits)):
header = [f'{splits[idx]}_classes']
# init the category list/column
cat_list = []
for label in classes_split[idx]:
if label in ap[0]:
cat_list.append(label2cat[label])
table_columns = [cat_list + [part]]
for i, iou_thresh in enumerate(metric):
header.append(f'AP_{iou_thresh:.2f}')
header.append(f'AR_{iou_thresh:.2f}')
ap_list = []
for label in classes_split[idx]:
if label in ap[i]:
ap_list.append(float(ap[i][label][0]))
mean_ap = float(np.mean(ap_list))
table_columns.append(list(map(float, ap_list)))
table_columns[-1] += [mean_ap]
table_columns[-1] = [f'{x:.4f}' for x in table_columns[-1]]
rec_list = []
for label in classes_split[idx]:
if label in rec[i]:
rec_list.append(rec[i][label][-1])
mean_rec = float(np.mean(rec_list))
table_columns.append(list(map(float, rec_list)))
table_columns[-1] += [mean_rec]
table_columns[-1] = [f'{x:.4f}' for x in table_columns[-1]]
table_data = [header]
table_rows = list(zip(*table_columns))
table_data += table_rows
table = AsciiTable(table_data)
table.inner_footing_row_border = True
print_log('\n' + table.table, logger=logger)
area_split = {
"small": [0, 0.2**3],
"medium": [0.2**3, 1.0**3],
"large": [1.0**3, float("inf")],
}
if area_split is not None:
header = ["area"]
for i, iou_thresh in enumerate(metric):
header.append(f'AP_{iou_thresh:.2f}')
header.append(f'AR_{iou_thresh:.2f}')
table_rows = []
area_ret_dict = dict()
for idx, (area, area_range) in enumerate(area_split.items()):
# header = [area]
rec, prec, ap = eval_map_recall(pred, gt, metric, area_range)
ori_keys = list(ap[0].keys())
for key in ori_keys:
if np.isnan(ap[0][key][0]):
for r in rec:
del r[key]
for p in prec:
del p[key]
for a in ap:
del a[key]
# table_columns = [[f"{area}_{part}"]]
table_rows.append([f"{area}_{part}"])
for i, iou_thresh in enumerate(metric):
# header.append(f'AP_{iou_thresh:.2f}')
# header.append(f'AR_{iou_thresh:.2f}')
rec_list = []
for label in ap[i].keys():
area_ret_dict[f'{label2cat[label]}_AP_{iou_thresh:.2f}'] = float(
ap[i][label][0])
area_ret_dict[f'mAP_{iou_thresh:.2f}'] = float(np.mean(list(
ap[i].values())))
table_rows[-1].append(f"{area_ret_dict[f'mAP_{iou_thresh:.2f}']:.4f}")
# table_columns.append([f"{area_ret_dict[f'mAP_{iou_thresh:.2f}']:.4f}"])
for label in rec[i].keys():
area_ret_dict[f'{label2cat[label]}_rec_{iou_thresh:.2f}'] = float(
rec[i][label][-1])
rec_list.append(rec[i][label][-1])
area_ret_dict[f'mAR_{iou_thresh:.2f}'] = float(np.mean(rec_list))
table_rows[-1].append(f"{area_ret_dict[f'mAR_{iou_thresh:.2f}']:.4f}")
# table_columns.append([f"{area_ret_dict[f'mAR_{iou_thresh:.2f}']:.4f}"])
# table_data = [header]
# table_rows = list(zip(*table_columns))
# table_data += table_rows
# table = AsciiTable(table_data)
# table.inner_footing_row_border = True
# print_log('\n' + table.table, logger=logger)
table_data = [header]
table_data += table_rows
table = AsciiTable(table_data)
table.inner_footing_row_border = False
print_log('\n' + table.table, logger=logger)
return ret_dict
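# Illustrative helper, not part of the original API: shows how the
# ``area_split`` buckets above classify a box by volume; e.g. a 0.15 m cube
# (volume ~0.0034 m^3) falls into "small" because it is below 0.2**3.
def _volume_bucket(volume):
    for name, (low, high) in {
        "small": [0, 0.2**3],
        "medium": [0.2**3, 1.0**3],
        "large": [1.0**3, float("inf")],
    }.items():
        if low <= volume < high:
            return name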
================================================
FILE: bip3d/eval/metrics/__init__.py
================================================
from .det_metric import IndoorDetMetric
from .grounding_metric import GroundingMetric
# from .occupancy_metric import OccupancyMetric
__all__ = ['IndoorDetMetric', 'GroundingMetric']
================================================
FILE: bip3d/eval/metrics/det_metric.py
================================================
# Copyright (c) OpenRobotLab. All rights reserved.
import logging
import os
from collections import OrderedDict
from typing import Dict, List, Optional, Sequence, Union
import numpy as np
from terminaltables import AsciiTable
from mmdet.evaluation import eval_map
from mmengine.dist import (broadcast_object_list, collect_results,
is_main_process, get_rank)
from mmengine.evaluator import BaseMetric
from mmengine.evaluator.metric import _to_cpu
from mmengine.logging import MMLogger, print_log
from bip3d.registry import METRICS
from bip3d.structures import get_box_type
from ..indoor_eval import indoor_eval
@METRICS.register_module()
class IndoorDetMetric(BaseMetric):
"""Indoor scene evaluation metric.
Args:
iou_thr (float or List[float]): IoU threshold(s) used when
calculating the metric. Defaults to [0.25, 0.5].
collect_device (str): Device name used for collecting results from
different ranks during distributed training. Must be 'cpu' or
'gpu'. Defaults to 'cpu'.
prefix (str, optional): The prefix that will be added in the metric
names to disambiguate homonymous metrics of different evaluators.
If prefix is not provided in the argument, self.default_prefix will
be used instead. Defaults to None.
"""
def __init__(self,
iou_thr: List[float] = [0.25, 0.5],
collect_device: str = 'cpu',
prefix: Optional[str] = None,
batchwise_anns: bool = False,
save_result_path=None,
eval_part=None,
**kwargs):
super(IndoorDetMetric, self).__init__(prefix=prefix,
collect_device=collect_device,
**kwargs)
self.iou_thr = [iou_thr] if isinstance(iou_thr, float) else iou_thr
self.batchwise_anns = batchwise_anns
self.save_result_path = save_result_path
if eval_part is None:
eval_part = ["scannet", "3rscan", "matterport3d", "arkit"]
self.eval_part = eval_part
def process(self, data_batch: dict, data_samples: Sequence[dict]):
"""Process one batch of data samples and predictions.
The processed results should be stored in ``self.results``, which will
be used to compute the metrics when all batches have been processed.
Args:
data_batch (dict): A batch of data from the dataloader.
data_samples (Sequence[dict]): A batch of outputs from the model.
"""
for data_sample in data_samples:
pred_3d = data_sample['pred_instances_3d']
eval_ann_info = data_sample['eval_ann_info']
eval_ann_info["scan_id"] = data_sample["scan_id"]
cpu_pred_3d = dict()
for k, v in pred_3d.items():
if hasattr(v, 'to'):
cpu_pred_3d[k] = v.to('cpu')
else:
cpu_pred_3d[k] = v
self.results.append((eval_ann_info, cpu_pred_3d))
def compute_metrics(self, results: list, part="Overall", split=True):
"""Compute the metrics from processed results.
Args:
results (list): The processed results of each batch.
Returns:
Dict[str, float]: The computed metrics. The keys are the names of
the metrics, and the values are corresponding results.
"""
logger: MMLogger = MMLogger.get_current_instance()
ann_infos = []
pred_results = []
for eval_ann, single_pred_results in results:
ann_infos.append(eval_ann)
pred_results.append(single_pred_results)
# some checkpoints may not record the key "box_type_3d"
box_type_3d, box_mode_3d = get_box_type(
self.dataset_meta.get('box_type_3d', 'depth'))
print_log(f"eval : {len(ann_infos)}, {len(pred_results)}, rank: {get_rank()}")
ret_dict = indoor_eval(ann_infos,
pred_results,
self.iou_thr,
self.dataset_meta['classes'],
logger=logger,
box_mode_3d=box_mode_3d,
classes_split=self.dataset_meta.get(
'classes_split', None) if split else None,
part=part)
return ret_dict
def evaluate(self, size: int):
"""Evaluate the model performance of the whole dataset after processing
all batches.
Args:
size (int): Length of the entire validation dataset. When batch
size > 1, the dataloader may pad some data samples to make
sure all ranks have the same length of dataset slice. The
``collect_results`` function will drop the padded data based on
this size.
Returns:
dict: Evaluation metrics dict on the val dataset. The keys are the
names of the metrics, and the values are corresponding results.
"""
if len(self.results) == 0:
print_log(
f'{self.__class__.__name__} got empty `self.results`. Please '
'ensure that the processed results are properly added into '
'`self.results` in `process` method.',
logger='current',
level=logging.WARNING)
print_log(
f"number of results: "
f"{len(self.results)}, size: {size}, "
f"batchwise_anns: {self.batchwise_anns}, rank: {get_rank()}, "
f"collect_dir: {self.collect_dir}"
)
if self.batchwise_anns:
# the actual dataset length/size is the len(self.results)
if self.collect_device == 'cpu':
results = collect_results(self.results,
len(self.results),
self.collect_device,
tmpdir=self.collect_dir)
else:
results = collect_results(self.results, len(self.results),
self.collect_device)
else:
if self.collect_device == 'cpu':
results = collect_results(self.results,
size,
self.collect_device,
tmpdir=self.collect_dir)
else:
results = collect_results(self.results, size,
self.collect_device)
print_log(
f"number of collected results: "
f"{len(results) if results is not None else 'None'},"
f"rank: {get_rank()}"
)
if is_main_process():
# cast all tensors in results list to cpu
results = _to_cpu(results)
if self.save_result_path is not None:
import pickle
import copy
_ret = []
for x in results:
x = list(copy.deepcopy(x))
x[0]["gt_bboxes_3d"] = x[0]["gt_bboxes_3d"].cpu().numpy()
x[1]["bboxes_3d"] = x[1]["bboxes_3d"].cpu().numpy()
x[1]["scores_3d"] = x[1]["scores_3d"].cpu().numpy()
x[1]["labels_3d"] = x[1]["labels_3d"].cpu().numpy()
x[1]["target_scores_3d"] = x[1]["target_scores_3d"].cpu().numpy()
_ret.append(x)
pickle.dump(_ret, open(os.path.join(self.save_result_path, "results.pkl"), "wb"))
_metrics = self.compute_metrics(results) # type: ignore
for part in self.eval_part:
print(f"=================== {part} =========================")
part_ret = [x for x in results if part in x[0]["scan_id"]]
if len(part_ret) > 0:
_metrics_part = self.compute_metrics(part_ret, part=part)
_metrics_part = {f"{part}--{k}": v for k, v in _metrics_part.items()}
_metrics.update(_metrics_part)
# Add prefix to metric names
if self.prefix:
_metrics = {
'/'.join((self.prefix, k)): v
for k, v in _metrics.items()
}
metrics = [_metrics]
else:
metrics = [None] # type: ignore
broadcast_object_list(metrics)
if is_main_process():
table = []
header = ["Summary"]
for i, iou_thresh in enumerate(self.iou_thr):
header.append(f'AP_{iou_thresh:.2f}')
header.append(f'AR_{iou_thresh:.2f}')
table.append(header)
for part in self.eval_part + ["Overall"]:
table_data = [part]
for i, iou_thresh in enumerate(self.iou_thr):
key = f"{part+'--' if part!='Overall' else ''}mAP_{iou_thresh:.2f}"
if key in _metrics:
table_data.append(f"{_metrics[key]:.4f}")
else:
table_data.append(f"0.xxxx")
key = f"{part+'--' if part!='Overall' else ''}mAR_{iou_thresh:.2f}"
if key in _metrics:
table_data.append(f"{_metrics[key]:.4f}")
else:
table_data.append(f"0.xxxx")
table.append(table_data)
table = AsciiTable(table)
table.inner_footing_row_border = True
logger: MMLogger = MMLogger.get_current_instance()
print_log('\n' + table.table, logger=logger)
# reset the results list
self.results.clear()
return metrics[0]
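def _split_results_by_part(results,
                           parts=("scannet", "3rscan",
                                  "matterport3d", "arkit")):
    """Illustrative sketch, not part of the original file: mirrors how
    ``IndoorDetMetric.evaluate`` groups collected results per source dataset
    by substring-matching the scan id (e.g. "scannet/scene0000_00")."""
    return {part: [x for x in results if part in x[0]["scan_id"]]
            for part in parts}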
@METRICS.register_module()
class Indoor2DMetric(BaseMetric):
"""indoor 2d predictions evaluation metric.
Args:
iou_thr (float or List[float]): List of iou threshold when calculate
the metric. Defaults to [0.5].
collect_device (str): Device name used for collecting results from
different ranks during distributed training. Must be 'cpu' or
'gpu'. Defaults to 'cpu'.
prefix (str, optional): The prefix that will be added in the metric
names to disambiguate homonymous metrics of different evaluators.
If prefix is not provided in the argument, self.default_prefix will
be used instead. Defaults to None.
"""
def __init__(self,
iou_thr: Union[float, List[float]] = [0.5],
collect_device: str = 'cpu',
prefix: Optional[str] = None):
super(Indoor2DMetric, self).__init__(prefix=prefix,
collect_device=collect_device)
self.iou_thr = [iou_thr] if isinstance(iou_thr, float) else iou_thr
def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
"""Process one batch of data samples and predictions.
The processed results should be stored in ``self.results``, which will
be used to compute the metrics when all batches have been processed.
Args:
data_batch (dict): A batch of data from the dataloader.
data_samples (Sequence[dict]): A batch of outputs from the model.
"""
for data_sample in data_samples:
pred = data_sample['pred_instances']
eval_ann_info = data_sample['eval_ann_info']
ann = dict(labels=eval_ann_info['gt_bboxes_labels'],
bboxes=eval_ann_info['gt_bboxes'])
pred_bboxes = pred['bboxes'].cpu().numpy()
pred_scores = pred['scores'].cpu().numpy()
pred_labels = pred['labels'].cpu().numpy()
dets = []
for label in range(len(self.dataset_meta['classes'])):
index = np.where(pred_labels == label)[0]
pred_bbox_scores = np.hstack(
[pred_bboxes[index], pred_scores[index].reshape((-1, 1))])
dets.append(pred_bbox_scores)
self.results.append((ann, dets))
def compute_metrics(self, results: list) -> Dict[str, float]:
"""Compute the metrics from processed results.
Args:
results (list): The processed results of each batch.
Returns:
Dict[str, float]: The computed metrics. The keys are the names of
the metrics, and the values are corresponding results.
"""
logger: MMLogger = MMLogger.get_current_instance()
annotations, preds = zip(*results)
eval_results = OrderedDict()
for iou_thr_2d_single in self.iou_thr:
mean_ap, _ = eval_map(preds,
annotations,
scale_ranges=None,
iou_thr=iou_thr_2d_single,
dataset=self.dataset_meta['classes'],
logger=logger)
eval_results['mAP_' + str(iou_thr_2d_single)] = mean_ap
return eval_results
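def _pack_dets_demo():
    """Toy example, not part of the original file: ``Indoor2DMetric.process``
    packs predictions into the per-class list-of-(N, 5) arrays layout that
    ``mmdet.evaluation.eval_map`` consumes."""
    pred_bboxes = np.array([[0, 0, 10, 10], [5, 5, 20, 20]], dtype=np.float32)
    pred_scores = np.array([0.9, 0.4], dtype=np.float32)
    pred_labels = np.array([0, 1])
    dets = []
    for label in range(2):
        index = np.where(pred_labels == label)[0]
        dets.append(np.hstack(
            [pred_bboxes[index], pred_scores[index].reshape((-1, 1))]))
    return dets  # dets[0].shape == (1, 5) and dets[1].shape == (1, 5)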
================================================
FILE: bip3d/eval/metrics/grounding_metric.py
================================================
# Copyright (c) OpenRobotLab. All rights reserved.
import os
from typing import Dict, List, Optional, Sequence
import pickle
import mmengine
from mmengine.evaluator import BaseMetric
from mmengine.logging import MMLogger, print_log
from terminaltables import AsciiTable
from bip3d.registry import METRICS
from bip3d.structures import EulerDepthInstance3DBoxes
@METRICS.register_module()
class GroundingMetric(BaseMetric):
"""Lanuage grounding evaluation metric. We calculate the grounding
performance based on the alignment score of each bbox with the input
prompt.
Args:
iou_thr (float or List[float]): IoU threshold(s) used when
calculating the metric. Defaults to [0.25, 0.5].
collect_device (str): Device name used for collecting results from
different ranks during distributed training. Must be 'cpu' or
'gpu'. Defaults to 'cpu'.
prefix (str, optional): The prefix that will be added in the metric
names to disambiguate homonymous metrics of different evaluators.
If prefix is not provided in the argument, self.default_prefix will
be used instead. Defaults to None.
format_only (bool): Whether to only inference the predictions without
evaluation. Defaults to False.
result_dir (str): Dir to save results, e.g., if result_dir = './',
the result file will be './test_results.json'. Defaults to ''.
"""
def __init__(self,
iou_thr: List[float] = [0.25, 0.5],
collect_device: str = 'cpu',
prefix: Optional[str] = None,
format_only=False,
submit_info=None,
result_dir='', **kwargs) -> None:
super(GroundingMetric, self).__init__(prefix=prefix,
collect_device=collect_device,
**kwargs)
self.iou_thr = [iou_thr] if isinstance(iou_thr, float) else iou_thr
self.prefix = prefix
self.format_only = format_only
self.result_dir = result_dir
self.submit_info = submit_info if submit_info is not None else {}
def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
"""Process one batch of data samples and predictions.
The processed results should be stored in ``self.results``, which will
be used to compute the metrics when all batches have been processed.
Args:
data_batch (dict): A batch of data from the dataloader.
data_samples (Sequence[dict]): A batch of outputs from the model.
"""
for data_sample in data_samples:
pred_3d = data_sample['pred_instances_3d']
eval_ann_info = data_sample['eval_ann_info']
eval_ann_info["scan_id"] = data_sample["scan_id"]
cpu_pred_3d = dict()
for k, v in pred_3d.items():
if hasattr(v, 'to'):
cpu_pred_3d[k] = v.to('cpu')
else:
cpu_pred_3d[k] = v
self.results.append((eval_ann_info, cpu_pred_3d))
def ground_eval(self, gt_annos, det_annos, logger=None, part="Overall"):
assert len(det_annos) == len(gt_annos)
pred = {}
gt = {}
object_types = [
'Easy', 'Hard', 'View-Dep', 'View-Indep', 'Unique', 'Multi',
'Overall'
]
for t in self.iou_thr:
for object_type in object_types:
pred.update({object_type + '@' + str(t): 0})
gt.update({object_type + '@' + str(t): 1e-14})
for sample_id in range(len(det_annos)):
det_anno = det_annos[sample_id]
gt_anno = gt_annos[sample_id]
target_scores = det_anno['target_scores_3d'] # (num_query, )
bboxes = det_anno['bboxes_3d']
gt_bboxes = gt_anno['gt_bboxes_3d']
bboxes = EulerDepthInstance3DBoxes(bboxes.tensor,
origin=(0.5, 0.5, 0.5))
gt_bboxes = EulerDepthInstance3DBoxes(gt_bboxes.tensor,
origin=(0.5, 0.5, 0.5))
view_dep = gt_anno['is_view_dep']
hard = gt_anno['is_hard']
unique = gt_anno['is_unique']
box_index = target_scores.argsort(dim=-1, descending=True)[:10]
top_bbox = bboxes[box_index]
iou = top_bbox.overlaps(top_bbox, gt_bboxes)  # (topk, num_gt)
for t in self.iou_thr:
threshold = iou > t
found = int(threshold.any())
if view_dep:
gt['View-Dep@' + str(t)] += 1
pred['View-Dep@' + str(t)] += found
else:
gt['View-Indep@' + str(t)] += 1
pred['View-Indep@' + str(t)] += found
if hard:
gt['Hard@' + str(t)] += 1
pred['Hard@' + str(t)] += found
else:
gt['Easy@' + str(t)] += 1
pred['Easy@' + str(t)] += found
if unique:
gt['Unique@' + str(t)] += 1
pred['Unique@' + str(t)] += found
else:
gt['Multi@' + str(t)] += 1
pred['Multi@' + str(t)] += found
gt['Overall@' + str(t)] += 1
pred['Overall@' + str(t)] += found
header = ['Type']
header.extend(object_types)
ret_dict = {}
for t in self.iou_thr:
if part == "Overall":
table_columns = [[f'AP@{t:.2f}']]
else:
table_columns = [[f'{part}_AP@{t:.2f}']]
for object_type in object_types:
metric = object_type + '@' + str(t)
value = pred[metric] / max(gt[metric], 1)
ret_dict[metric] = value
table_columns.append([f'{value:.4f}'])
table_data = [header]
table_rows = list(zip(*table_columns))
table_data += table_rows
table = AsciiTable(table_data)
table.inner_footing_row_border = True
print_log('\n' + table.table, logger=logger)
return ret_dict
def compute_metrics(self, results: list) -> Dict[str, float]:
"""Compute the metrics from processed results after all batches have
been processed.
Args:
results (list): The processed results of each batch.
Returns:
Dict[str, float]: The computed metrics. The keys are the names of
the metrics, and the values are corresponding results.
"""
logger: MMLogger = MMLogger.get_current_instance() # noqa
annotations, preds = zip(*results)
ret_dict = {}
if self.format_only:
# preds is a list of dict
results = []
for pred in preds:
result = dict()
# convert the Euler boxes to the numpy array to save
bboxes_3d = pred['bboxes_3d'].tensor
scores_3d = pred['scores_3d']
# Note: hard-code save top-20 predictions
# eval top-10 predictions during the test phase by default
box_index = scores_3d.argsort(dim=-1, descending=True)[:20]
top_bboxes_3d = bboxes_3d[box_index]
top_scores_3d = scores_3d[box_index]
result['bboxes_3d'] = top_bboxes_3d.numpy()
result['scores_3d'] = top_scores_3d.numpy()
results.append(result)
mmengine.dump(results,
os.path.join(self.result_dir, 'test_results.json'))
for x in results:
x["bboxes_3d"] = x["bboxes_3d"].tolist()
x["scores_3d"] = x["scores_3d"].tolist()
submission = {
'method': 'xxx',
'team': 'xxx',
'authors': ['xxx'],
'e-mail': 'xxx',
'institution': 'xxx',
'country': 'xxx',
'results': results,
}
submission.update(self.submit_info)
pickle.dump(submission, open(os.path.join(self.result_dir, 'submission.pkl'), "wb"))
return ret_dict
ret_dict = self.ground_eval(annotations, preds)
for part in ["scannet", "3rscan", "matterport3d", "arkit"]:
ann = [x for x in annotations if part in x["scan_id"]]
pred = [y for x, y in zip(annotations, preds) if part in x["scan_id"]]
if len(pred) > 0:
part_ret_dict = self.ground_eval(ann, pred, part=part)
part_ret_dict = {f"{part}--{k}": v for k, v in part_ret_dict.items()}
ret_dict.update(part_ret_dict)
return ret_dict
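def _grounding_hit(target_scores, pred_boxes, gt_boxes, iou_thr=0.25, topk=10):
    """Illustrative sketch, not part of the original file: condenses the
    per-sample rule of ``GroundingMetric.ground_eval`` -- a sample counts as
    found when any of the top-k boxes (ranked by alignment score) overlaps a
    GT box above the IoU threshold. Boxes are assumed to be
    EulerDepthInstance3DBoxes so that ``overlaps`` is available."""
    index = target_scores.argsort(dim=-1, descending=True)[:topk]
    top_boxes = pred_boxes[index]
    iou = top_boxes.overlaps(top_boxes, gt_boxes)  # (topk, num_gt)
    return int((iou > iou_thr).any())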
================================================
FILE: bip3d/grid_mask.py
================================================
import torch
import torch.nn as nn
import numpy as np
from PIL import Image
class Grid(object):
def __init__(
self, use_h, use_w, rotate=1, offset=False, ratio=0.5, mode=0, prob=1.0
):
self.use_h = use_h
self.use_w = use_w
self.rotate = rotate
self.offset = offset
self.ratio = ratio
self.mode = mode
self.st_prob = prob
self.prob = prob
def set_prob(self, epoch, max_epoch):
self.prob = self.st_prob * epoch / max_epoch
def __call__(self, img, label):
if np.random.rand() > self.prob:
return img, label
h = img.size(1)
w = img.size(2)
self.d1 = 2
self.d2 = min(h, w)
hh = int(1.5 * h)
ww = int(1.5 * w)
d = np.random.randint(self.d1, self.d2)
if self.ratio == 1:
self.l = np.random.randint(1, d)
else:
self.l = min(max(int(d * self.ratio + 0.5), 1), d - 1)
mask = np.ones((hh, ww), np.float32)
st_h = np.random.randint(d)
st_w = np.random.randint(d)
if self.use_h:
for i in range(hh // d):
s = d * i + st_h
t = min(s + self.l, hh)
mask[s:t, :] *= 0
if self.use_w:
for i in range(ww // d):
s = d * i + st_w
t = min(s + self.l, ww)
mask[:, s:t] *= 0
r = np.random.randint(self.rotate)
mask = Image.fromarray(np.uint8(mask))
mask = mask.rotate(r)
mask = np.asarray(mask)
mask = mask[
(hh - h) // 2 : (hh - h) // 2 + h,
(ww - w) // 2 : (ww - w) // 2 + w,
]
mask = torch.from_numpy(mask).float()
if self.mode == 1:
mask = 1 - mask
mask = mask.expand_as(img)
if self.offset:
offset = torch.from_numpy(2 * (np.random.rand(h, w) - 0.5)).float()
offset = (1 - mask) * offset
img = img * mask + offset
else:
img = img * mask
return img, label
class GridMask(object):
def __init__(
self, use_h, use_w, rotate=1, offset=False, ratio=0.5, mode=0, prob=1.0
):
super(GridMask, self).__init__()
self.use_h = use_h
self.use_w = use_w
self.rotate = rotate
self.offset = offset
self.ratio = ratio
self.mode = mode
self.st_prob = prob
self.prob = prob
def set_prob(self, epoch, max_epoch):
self.prob = self.st_prob * epoch / max_epoch
def __call__(self, x, offset=None):
if np.random.rand() > self.prob:
return x
n, c, h, w = x.size()
x = x.view(-1, h, w)
hh = int(1.5 * h)
ww = int(1.5 * w)
d = np.random.randint(2, h)
self.l = min(max(int(d * self.ratio + 0.5), 1), d - 1)
mask = np.ones((hh, ww), np.float32)
st_h = np.random.randint(d)
st_w = np.random.randint(d)
if self.use_h:
for i in range(hh // d):
s = d * i + st_h
t = min(s + self.l, hh)
mask[s:t, :] *= 0
if self.use_w:
for i in range(ww // d):
s = d * i + st_w
t = min(s + self.l, ww)
mask[:, s:t] *= 0
r = np.random.randint(self.rotate)
mask = Image.fromarray(np.uint8(mask))
mask = mask.rotate(r)
mask = np.asarray(mask)
mask = mask[
(hh - h) // 2 : (hh - h) // 2 + h,
(ww - w) // 2 : (ww - w) // 2 + w,
]
mask = torch.tensor(mask).float().cuda()
if self.mode == 1:
mask = 1 - mask
mask = mask.expand_as(x)
if offset is not None:
x = x.view(n, c, h, w)
mask = mask.view(n, c, h, w)
x = x * mask + offset * (1 - mask)
return x
elif self.offset:
offset = (
torch.from_numpy(2 * (np.random.rand(h, w) - 0.5))
.float()
.cuda()
)
x = x * mask + offset * (1 - mask)
else:
x = x * mask
return x.view(n, c, h, w)
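def _gridmask_demo():
    """Illustrative usage, not part of the original file; requires a CUDA
    device because ``GridMask.__call__`` moves the mask with ``.cuda()``."""
    gm = GridMask(use_h=True, use_w=True, rotate=15, ratio=0.5, prob=1.0)
    gm.set_prob(epoch=10, max_epoch=20)  # linear warm-up of the mask probability
    x = torch.zeros(2, 3, 224, 224).cuda()
    return gm(x)  # same shape as x, with grid stripes zeroed out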
================================================
FILE: bip3d/models/__init__.py
================================================
from .structure import *
from .feature_enhancer import *
from .spatial_enhancer import *
from .bbox3d_decoder import *
from .instance_bank import *
from .target import *
from .data_preprocessors import *
from .deformable_aggregation import *
from .bert import *
================================================
FILE: bip3d/models/base_target.py
================================================
from torch import nn
from abc import ABC, abstractmethod
__all__ = ["BaseTargetWithDenoising"]
class BaseTargetWithDenoising(ABC, nn.Module):
def __init__(self, num_dn=0, num_temp_dn_groups=0):
super(BaseTargetWithDenoising, self).__init__()
self.num_dn = num_dn
self.num_temp_dn_groups = num_temp_dn_groups
self.dn_metas = None
@abstractmethod
def sample(self, cls_pred, box_pred, cls_target, box_target):
"""
Perform Hungarian matching between predictions and ground truth,
returning the matched ground truth corresponding to the predictions
along with the corresponding regression weights.
"""
def get_dn_anchors(self, cls_target, box_target, *args, **kwargs):
"""
Generate noisy instances for the current frame, with a total of
'self.num_dn_groups' groups.
"""
return None
def update_dn(self, instance_feature, anchor, *args, **kwargs):
"""
Insert the previously saved 'self.dn_metas' into the noisy instances
of the current frame.
"""
def cache_dn(
self,
dn_instance_feature,
dn_anchor,
dn_cls_target,
valid_mask,
dn_id_target,
):
"""
Randomly save information for 'self.num_temp_dn_groups' groups of
temporal noisy instances to 'self.dn_metas'.
"""
if self.num_temp_dn_groups < 0:
return
self.dn_metas = dict(dn_anchor=dn_anchor[:, : self.num_temp_dn_groups])
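class _DummyTarget(BaseTargetWithDenoising):
    """Illustrative sketch, not part of the original file: a concrete target
    only needs to implement ``sample``; the denoising hooks above are
    optional overrides."""

    def sample(self, cls_pred, box_pred, cls_target, box_target):
        # Trivial pass-through "matching", for illustration only.
        return cls_target, box_target, None, None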
================================================
FILE: bip3d/models/bbox3d_decoder.py
================================================
import math
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
from torch import nn
from torch.cuda.amp.autocast_mode import autocast
from mmcv.cnn import Linear, Scale
from mmcv.ops import nms3d_normal, nms3d
from mmengine.model import BaseModel
from mmengine.structures import InstanceData
from mmdet.models.layers import SinePositionalEncoding
from mmdet.models.layers.transformer.deformable_detr_layers import (
DeformableDetrTransformerEncoder as DDTE,
)
from mmdet.models.layers.transformer.utils import MLP
from mmdet.utils import reduce_mean
from mmdet.models.detectors.glip import create_positive_map_label_to_token
from mmdet.models.dense_heads.atss_vlfusion_head import (
convert_grounding_to_cls_scores,
)
from bip3d.registry import MODELS, TASK_UTILS
from bip3d.structures.bbox_3d.utils import rotation_3d_in_euler
from .utils import (
deformable_format,
wasserstein_distance,
permutation_corner_distance,
center_distance,
get_positive_map,
get_entities,
linear_act_ln,
)
__all__ = ["BBox3DDecoder"]
X, Y, Z, W, L, H, ALPHA, BETA, GAMMA = range(9)
CNS = 0  # assumed index of the centerness channel in the quality tensor (as in Sparse4D)
def decode_box(box, min_size=None, max_size=None):
size = box[..., 3:6].exp()
# size = box[..., 3:6]
if min_size is not None or max_size is not None:
size = size.clamp(min=min_size, max=max_size)
box = torch.cat(
[box[..., :3], size, box[..., 6:]],
dim=-1,
)
return box
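def _decode_box_demo():
    """Worked example, not part of the original file: the decoder regresses
    sizes in log space, so ``decode_box`` exponentiates dims 3:6 and
    optionally clamps them; a raw size of 0.0 decodes to exp(0) = 1.0 m."""
    raw = torch.tensor([[1.0, 2.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]])
    return decode_box(raw, min_size=0.1, max_size=20)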
@MODELS.register_module()
class BBox3DDecoder(BaseModel):
def __init__(
self,
instance_bank: dict,
anchor_encoder: dict,
graph_model: dict,
norm_layer: dict,
ffn: dict,
deformable_model: dict,
refine_layer: dict,
num_decoder: int = 6,
num_single_frame_decoder: int = -1,
temp_graph_model: dict = None,
text_cross_attn: dict = None,
loss_cls: dict = None,
loss_reg: dict = None,
post_processor: dict = None,
sampler: dict = None,
gt_cls_key: str = "gt_labels_3d",
gt_reg_key: str = "gt_bboxes_3d",
gt_id_key: str = "instance_id",
with_instance_id: bool = True,
task_prefix: str = "det",
reg_weights: List = None,
operation_order: Optional[List[str]] = None,
cls_threshold_to_reg: float = -1,
look_forward_twice: bool = False,
init_cfg: dict = None,
**kwargs,
):
super().__init__(init_cfg)
self.num_decoder = num_decoder
self.num_single_frame_decoder = num_single_frame_decoder
self.gt_cls_key = gt_cls_key
self.gt_reg_key = gt_reg_key
self.gt_id_key = gt_id_key
self.with_instance_id = with_instance_id
self.task_prefix = task_prefix
self.cls_threshold_to_reg = cls_threshold_to_reg
self.look_forward_twice = look_forward_twice
if reg_weights is None:
self.reg_weights = [1.0] * 9
else:
self.reg_weights = reg_weights
if operation_order is None:
operation_order = [
"gnn",
"norm",
"text_cross_attn",
"norm",
"deformable",
"norm",
"ffn",
"norm",
"refine",
] * num_decoder
self.operation_order = operation_order
# =========== build modules ===========
def build(cfg, registry=MODELS):
if cfg is None:
return None
return registry.build(cfg)
self.instance_bank = build(instance_bank)
self.anchor_encoder = build(anchor_encoder)
self.sampler = build(sampler, TASK_UTILS)
self.post_processor = build(post_processor, TASK_UTILS)
self.loss_cls = build(loss_cls)
self.loss_reg = build(loss_reg)
self.op_config_map = {
"temp_gnn": temp_graph_model,
"gnn": graph_model,
"norm": norm_layer,
"ffn": ffn,
"deformable": deformable_model,
"text_cross_attn": text_cross_attn,
"refine": refine_layer,
}
self.layers = nn.ModuleList(
[
build(self.op_config_map.get(op, None))
for op in self.operation_order
]
)
self.embed_dims = self.instance_bank.embed_dims
self.norm = nn.LayerNorm(self.embed_dims)
def init_weights(self):
from mmengine.model import constant_init
for p in self.parameters():
if p.dim() > 1:
nn.init.xavier_uniform_(p)
for i, op in enumerate(self.operation_order):
if op == "refine":
m = self.layers[i]
constant_init(m.layers[-2], 0, bias=0)
constant_init(m.layers[-1], 1)
nn.init.constant_(m.layers[-2].bias.data[2:], 0.0)
def forward(
self,
feature_maps,
text_dict=None,
batch_inputs=None,
depth_prob=None,
**kwargs,
):
batch_size = feature_maps[0].shape[0]
feature_maps = list(deformable_format(feature_maps))
# ========= get instance info ============
if (
self.sampler.dn_metas is not None
and self.sampler.dn_metas["dn_anchor"].shape[0] != batch_size
):
self.sampler.dn_metas = None
(
instance_feature,
anchor,
temp_instance_feature,
temp_anchor,
time_interval,
) = self.instance_bank.get(
batch_size, batch_inputs, dn_metas=self.sampler.dn_metas
)
# ========= prepare for denoising training ============
# 1. get dn metas: noisy-anchors and corresponding GT
# 2. concat learnable instances and noisy instances
# 3. get attention mask
attn_mask = None
dn_metas = None
temp_dn_reg_target = None
if self.training and hasattr(self.sampler, "get_dn_anchors"):
dn_metas = self.sampler.get_dn_anchors(
batch_inputs[self.gt_cls_key],
batch_inputs[self.gt_reg_key],
text_dict=text_dict,
label=batch_inputs["gt_labels_3d"],
)
if dn_metas is not None:
(
dn_anchor,
dn_reg_target,
dn_cls_target,
dn_attn_mask,
valid_mask,
dn_query,
) = dn_metas
num_dn_anchor = dn_anchor.shape[1]
if dn_anchor.shape[-1] != anchor.shape[-1]:
remain_state_dims = anchor.shape[-1] - dn_anchor.shape[-1]
dn_anchor = torch.cat(
[
dn_anchor,
dn_anchor.new_zeros(
batch_size, num_dn_anchor, remain_state_dims
),
],
dim=-1,
)
anchor = torch.cat([anchor, dn_anchor], dim=1)
if dn_query is None:
dn_query = instance_feature.new_zeros(
batch_size, num_dn_anchor, instance_feature.shape[-1]
)  # no trailing comma: a comma here would wrap dn_query in a tuple
instance_feature = torch.cat(
[instance_feature, dn_query], dim=1,
)
num_instance = instance_feature.shape[1]
num_free_instance = num_instance - num_dn_anchor
attn_mask = anchor.new_ones(
(num_instance, num_instance), dtype=torch.bool
)
attn_mask[:num_free_instance, :num_free_instance] = False
attn_mask[num_free_instance:, num_free_instance:] = dn_attn_mask
else:
num_dn_anchor = None
num_free_instance = None
anchor_embed = self.anchor_encoder(anchor)
if temp_anchor is not None:
temp_anchor_embed = self.anchor_encoder(temp_anchor)
else:
temp_anchor_embed = None
# =================== forward the layers ====================
prediction = []
classification = []
quality = []
_anchor = None
for i, (op, layer) in enumerate(
zip(self.operation_order, self.layers)
):
if self.layers[i] is None:
continue
elif op == "temp_gnn":
instance_feature = layer(
query=instance_feature,
key=temp_instance_feature,
value=temp_instance_feature,
query_pos=anchor_embed,
key_pos=temp_anchor_embed,
attn_mask=(
attn_mask if temp_instance_feature is None else None
),
)
elif op == "gnn":
instance_feature = layer(
query=instance_feature,
key=instance_feature,
value=instance_feature,
query_pos=anchor_embed,
key_pos=anchor_embed,
attn_mask=attn_mask,
)
elif op == "norm" or op == "ffn":
instance_feature = layer(instance_feature)
elif op == "deformable":
instance_feature = layer(
instance_feature,
anchor,
anchor_embed,
feature_maps,
batch_inputs,
depth_prob=depth_prob,
)
elif op == "text_cross_attn":
text_feature = text_dict["embedded"]
instance_feature = layer(
query=instance_feature,
key=text_feature,
value=text_feature,
query_pos=anchor_embed,
key_padding_mask=~text_dict["text_token_mask"],
key_pos=0,
)
elif op == "refine":
_instance_feature = self.norm(instance_feature)
if self.look_forward_twice:
if _anchor is None:
_anchor = anchor.clone()
_anchor, cls, qt = layer(
_instance_feature,
_anchor,
anchor_embed,
time_interval=time_interval,
text_feature=text_feature,
text_token_mask=text_dict["text_token_mask"],
)
prediction.append(_anchor)
anchor = layer(
instance_feature,
anchor,
anchor_embed,
time_interval=time_interval,
)[0]
anchor_embed = self.anchor_encoder(anchor)
_anchor = anchor
anchor = anchor.detach()
else:
anchor, cls, qt = layer(
_instance_feature,
anchor,
anchor_embed,
time_interval=time_interval,
text_feature=text_feature,
text_token_mask=text_dict["text_token_mask"],
)
anchor_embed = self.anchor_encoder(anchor)
prediction.append(anchor)
classification.append(cls)
quality.append(qt)
if len(prediction) == self.num_single_frame_decoder:
instance_feature, anchor = self.instance_bank.update(
instance_feature, anchor if _anchor is None else _anchor,
cls, num_dn_anchor
)
anchor_embed = self.anchor_encoder(anchor)
if self.look_forward_twice:
_anchor = anchor
anchor = anchor.detach()
if dn_metas is not None:
num_instance = instance_feature.shape[1]
attn_mask = anchor.new_ones(
(num_instance, num_instance), dtype=torch.bool
)
attn_mask[:-num_dn_anchor, :-num_dn_anchor] = False
attn_mask[-num_dn_anchor:, -num_dn_anchor:] = dn_attn_mask
if (
len(prediction) > self.num_single_frame_decoder
and temp_anchor_embed is not None
):
temp_anchor_embed = anchor_embed[
:, : self.instance_bank.num_temp_instances
]
else:
raise NotImplementedError(f"{op} is not supported.")
output = {}
# split predictions of learnable instances and noisy instances
if dn_metas is not None:
dn_classification = [
x[:, -num_dn_anchor:] for x in classification
]
classification = [x[:, :-num_dn_anchor] for x in classification]
dn_prediction = [x[:, -num_dn_anchor:] for x in prediction]
prediction = [x[:, :-num_dn_anchor] for x in prediction]
quality = [
x[:, :-num_dn_anchor] if x is not None else None
for x in quality
]
output.update(
{
"dn_prediction": dn_prediction,
"dn_classification": dn_classification,
"dn_reg_target": dn_reg_target,
"dn_cls_target": dn_cls_target,
"dn_valid_mask": valid_mask,
}
)
if temp_dn_reg_target is not None:  # always None in this forward; kept for temporal-DN variants
output.update(
{
"temp_dn_reg_target": temp_dn_reg_target,
"temp_dn_cls_target": temp_dn_cls_target,
"temp_dn_valid_mask": temp_valid_mask,
# "dn_id_target": dn_id_target,
}
)
dn_cls_target = temp_dn_cls_target
valid_mask = temp_valid_mask
dn_instance_feature = instance_feature[:, -num_dn_anchor:]
dn_anchor = anchor[:, -num_dn_anchor:]
instance_feature = instance_feature[:, :-num_dn_anchor]
anchor_embed = anchor_embed[:, :-num_dn_anchor]
anchor = anchor[:, :-num_dn_anchor]
cls = cls[:, :-num_dn_anchor]
output.update(
{
"classification": classification,
"prediction": prediction,
"quality": quality,
"instance_feature": instance_feature,
"anchor_embed": anchor_embed,
}
)
# cache current instances for temporal modeling
self.instance_bank.cache(
instance_feature, anchor, cls, batch_inputs, feature_maps
)
if self.with_instance_id:
# note: the score threshold lives on the post-processor (no self.decoder exists)
instance_id = self.instance_bank.get_instance_id(
cls, anchor, self.post_processor.score_threshold
)
output["instance_id"] = instance_id
return output
def loss(self, model_outs, data, text_dict=None):
# ===================== prediction losses ======================
cls_scores = model_outs["classification"]
reg_preds = model_outs["prediction"]
quality = model_outs["quality"]
output = {}
for decoder_idx, (cls, reg, qt) in enumerate(
zip(cls_scores, reg_preds, quality)
):
reg = reg[..., : len(self.reg_weights)]
reg = decode_box(reg)
cls_target, reg_target, reg_weights, ignore_mask = self.sampler.sample(
cls,
reg,
data[self.gt_cls_key],
data[self.gt_reg_key],
text_dict=text_dict,
ignore_mask=data.get("ignore_mask"),
)
reg_target = reg_target[..., : len(self.reg_weights)]
reg_target_full = reg_target.clone()
mask = torch.logical_not(torch.all(reg_target == 0, dim=-1))
mask = mask.reshape(-1)
if ignore_mask is not None:
ignore_mask = ~ignore_mask.reshape(-1)
mask = torch.logical_and(mask, ignore_mask)
ignore_mask = ignore_mask.tile(1, cls.shape[-1])
mask_valid = mask.clone()
num_pos = max(
reduce_mean(torch.sum(mask).to(dtype=reg.dtype)), 1.0
)
cls = cls.flatten(end_dim=1)
cls_target = cls_target.flatten(end_dim=1)
token_mask = torch.logical_not(cls.isinf())
cls = cls[token_mask]
cls_target = cls_target[token_mask]
if ignore_mask is None:
cls_loss = self.loss_cls(cls, cls_target, avg_factor=num_pos)
else:
ignore_mask = ignore_mask[token_mask]
# entries to ignore are already filtered out via ignore_mask
cls_loss = self.loss_cls(
cls[ignore_mask],
cls_target[ignore_mask],
avg_factor=num_pos,
)
reg_weights = reg_weights * reg.new_tensor(self.reg_weights)
reg_target = reg_target.flatten(end_dim=1)[mask]
reg = reg.flatten(end_dim=1)[mask]
reg_weights = reg_weights.flatten(end_dim=1)[mask]
reg_target = torch.where(
reg_target.isnan(), reg.new_tensor(0.0), reg_target
)
# cls_target = cls_target[mask]
if qt is not None:
qt = qt.flatten(end_dim=1)[mask]
reg_loss = self.loss_reg(
reg,
reg_target,
weight=reg_weights,
avg_factor=num_pos,
prefix=f"{self.task_prefix}_",
suffix=f"_{decoder_idx}",
quality=qt,
# cls_target=cls_target,
)
output[f"{self.task_prefix}_loss_cls_{decoder_idx}"] = cls_loss
output.update(reg_loss)
if "dn_prediction" not in model_outs:
return output
# ===================== denoising losses ======================
dn_cls_scores = model_outs["dn_classification"]
dn_reg_preds = model_outs["dn_prediction"]
(
dn_valid_mask,
dn_cls_target,
dn_reg_target,
dn_pos_mask,
reg_weights,
num_dn_pos,
) = self.prepare_for_dn_loss(model_outs)
for decoder_idx, (cls, reg) in enumerate(
zip(dn_cls_scores, dn_reg_preds)
):
if (
"temp_dn_valid_mask" in model_outs
and decoder_idx == self.num_single_frame_decoder
):
(
dn_valid_mask,
dn_cls_target,
dn_reg_target,
dn_pos_mask,
reg_weights,
num_dn_pos,
) = self.prepare_for_dn_loss(model_outs, prefix="temp_")
cls = cls.flatten(end_dim=1)[dn_valid_mask]
mask = torch.logical_not(cls.isinf())
cls_loss = self.loss_cls(
cls[mask],
dn_cls_target[mask],
avg_factor=num_dn_pos,
)
reg = reg.flatten(end_dim=1)[dn_valid_mask][dn_pos_mask][
..., : len(self.reg_weights)
]
reg = decode_box(reg)
reg_loss = self.loss_reg(
reg,
dn_reg_target,
avg_factor=num_dn_pos,
weight=reg_weights,
prefix=f"{self.task_prefix}_",
suffix=f"_dn_{decoder_idx}",
)
output[f"{self.task_prefix}_loss_cls_dn_{decoder_idx}"] = cls_loss
output.update(reg_loss)
return output
def prepare_for_dn_loss(self, model_outs, prefix=""):
dn_valid_mask = model_outs[f"{prefix}dn_valid_mask"].flatten(end_dim=1)
dn_cls_target = model_outs[f"{prefix}dn_cls_target"].flatten(
end_dim=1
)[dn_valid_mask]
dn_reg_target = model_outs[f"{prefix}dn_reg_target"].flatten(
end_dim=1
)[dn_valid_mask][..., : len(self.reg_weights)]
dn_pos_mask = dn_cls_target.sum(dim=-1) > 0
dn_reg_target = dn_reg_target[dn_pos_mask]
reg_weights = dn_reg_target.new_tensor(self.reg_weights)[None].tile(
dn_reg_target.shape[0], 1
)
num_dn_pos = max(
reduce_mean(torch.sum(dn_valid_mask).to(dtype=reg_weights.dtype)),
1.0,
)
return (
dn_valid_mask,
dn_cls_target,
dn_reg_target,
dn_pos_mask,
reg_weights,
num_dn_pos,
)
def post_process(
self,
model_outs,
text_dict,
batch_inputs,
batch_data_samples,
results_in_data_samples=True,
):
results = self.post_processor(
model_outs["classification"],
model_outs["prediction"],
model_outs.get("instance_id"),
model_outs.get("quality"),
text_dict=text_dict,
batch_inputs=batch_inputs,
)
if results_in_data_samples:
for i, ret in enumerate(results):
instances = InstanceData()
for k, v in ret.items():
if k == "bboxes_3d":
type = batch_data_samples[i].metainfo["box_type_3d"]
v = type(
v,
box_dim=v.shape[1],
origin=(0.5, 0.5, 0.5),
)
instances.__setattr__(k, v)
batch_data_samples[i].pred_instances_3d = instances
return batch_data_samples
return results
@MODELS.register_module()
class GroundingRefineClsHead(BaseModel):
def __init__(
self,
embed_dims=256,
output_dim=9,
scale=None,
cls_layers=False,
cls_bias=True,
):
super().__init__()
self.embed_dims = embed_dims
self.output_dim = output_dim
self.refine_state = list(range(output_dim))
self.scale = scale
self.layers = nn.Sequential(
*linear_act_ln(embed_dims, 2, 2),
# MLP(embed_dims, embed_dims, embed_dims, 2),
# nn.LayerNorm(self.embed_dims),
# MLP(embed_dims, embed_dims, embed_dims, 2),
Linear(self.embed_dims, self.output_dim),
Scale([1.0] * self.output_dim),
)
if cls_layers:
self.cls_layers = nn.Sequential(
MLP(embed_dims, embed_dims, embed_dims, 2),
nn.LayerNorm(self.embed_dims),
)
else:
self.cls_layers = nn.Identity()
if cls_bias:
bias_value = -math.log((1 - 0.01) / 0.01)
self.bias = nn.Parameter(
torch.Tensor([bias_value]), requires_grad=True
)
else:
self.bias = None
def forward(
self,
instance_feature: torch.Tensor,
anchor: torch.Tensor = None,
anchor_embed: torch.Tensor = None,
time_interval: torch.Tensor = 1.0,
text_feature=None,
text_token_mask=None,
**kwargs,
):
if anchor_embed is not None:
feature = instance_feature + anchor_embed
else:
feature = instance_feature
output = self.layers(feature)
if self.scale is not None:
output = output * output.new_tensor(self.scale)
if anchor is not None:
output = output + anchor
if text_feature is not None:
cls = self.cls_layers(
instance_feature) @ text_feature.transpose(-1, -2)
cls = cls / math.sqrt(instance_feature.shape[-1])
if self.bias is not None:
cls = cls + self.bias
if text_token_mask is not None:
cls.masked_fill_(~text_token_mask[:, None, :], float("-inf"))
else:
cls = None
return output, cls, None
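def _text_alignment_logits_demo():
    """Illustrative sketch, not part of the original file: the class logits
    in ``GroundingRefineClsHead.forward`` are scaled dot products between
    instance features and text-token features, masked to -inf on padded
    tokens (GLIP-style grounding scores)."""
    inst = torch.randn(2, 4, 256)   # (bs, num_query, embed_dims)
    text = torch.randn(2, 8, 256)   # (bs, num_token, embed_dims)
    mask = torch.ones(2, 8, dtype=torch.bool)
    cls = inst @ text.transpose(-1, -2) / math.sqrt(inst.shape[-1])
    return cls.masked_fill(~mask[:, None, :], float("-inf"))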
@MODELS.register_module()
class DoF9BoxLoss(nn.Module):
def __init__(
self,
loss_weight_wd=1.0,
loss_weight_pcd=0.0,
loss_weight_cd=0.8,
decode_pred=False,
):
super().__init__()
self.loss_weight_wd = loss_weight_wd
self.loss_weight_pcd = loss_weight_pcd
self.loss_weight_cd = loss_weight_cd
self.decode_pred = decode_pred
def forward(
self,
box,
box_target,
weight=None,
avg_factor=None,
prefix="",
suffix="",
**kwargs,
):
if box_target.shape[0] == 0:
loss = box.sum() * 0
return {f"{prefix}loss_box{suffix}": loss}
if self.decode_pred:
box = decode_box(box)
loss = 0
if self.loss_weight_wd > 0:
loss += self.loss_weight_wd * wasserstein_distance(box, box_target)
if self.loss_weight_pcd > 0:
loss += self.loss_weight_pcd * permutation_corner_distance(
box, box_target
)
if self.loss_weight_cd > 0:
loss += self.loss_weight_cd * center_distance(box, box_target)
if avg_factor is None:
loss = loss.mean()
else:
loss = loss.sum() / avg_factor
output = {f"{prefix}loss_box{suffix}": loss}
return output
@TASK_UTILS.register_module()
class GroundingBox3DPostProcess:
def __init__(
self,
num_output: int = 300,
score_threshold: Optional[float] = None,
sorted: bool = True,
):
super(GroundingBox3DPostProcess, self).__init__()
self.num_output = num_output
self.score_threshold = score_threshold
self.sorted = sorted
def __call__(
self,
cls_scores,
box_preds,
instance_id=None,
quality=None,
output_idx=-1,
text_dict=None,
batch_inputs=None,
):
cls_scores = cls_scores[output_idx].sigmoid()
if "tokens_positive" in batch_inputs:
tokens_positive_maps = get_positive_map(
batch_inputs["tokens_positive"],
text_dict,
)
label_to_token = [
create_positive_map_label_to_token(x, plus=1)
for x in tokens_positive_maps
]
cls_scores = convert_grounding_to_cls_scores(
cls_scores, label_to_token
)
entities = get_entities(
batch_inputs["text"],
batch_inputs["tokens_positive"],
)
else:
cls_scores, _ = cls_scores.max(dim=-1, keepdim=True)
entities = batch_inputs["text"]
# if squeeze_cls:
# cls_scores, cls_ids = cls_scores.max(dim=-1)
# cls_scores = cls_scores.unsqueeze(dim=-1)
box_preds = box_preds[output_idx]
bs, num_pred, num_cls = cls_scores.shape
num_output = min(self.num_output, num_pred * num_cls)
cls_scores, indices = cls_scores.flatten(start_dim=1).topk(
num_output, dim=1, sorted=self.sorted
)
# if not squeeze_cls:
cls_ids = indices % num_cls
if self.score_threshold is not None:
mask = cls_scores >= self.score_threshold
if quality[output_idx] is None:
quality = None
if quality is not None:
centerness = quality[output_idx][..., CNS]
centerness = torch.gather(centerness, 1, indices // num_cls)
cls_scores_origin = cls_scores.clone()
cls_scores *= centerness.sigmoid()
cls_scores, idx = torch.sort(cls_scores, dim=1, descending=True)
# if not squeeze_cls:
cls_ids = torch.gather(cls_ids, 1, idx)
if self.score_threshold is not None:
mask = torch.gather(mask, 1, idx)
indices = torch.gather(indices, 1, idx)
output = []
for i in range(bs):
category_ids = cls_ids[i]
# if squeeze_cls:
# category_ids = category_ids[indices[i]]
scores = cls_scores[i]
box = box_preds[i, indices[i] // num_cls]
if self.score_threshold is not None:
category_ids = category_ids[mask[i]]
scores = scores[mask[i]]
box = box[mask[i]]
# nms_idx = nms3d(
# box[..., :7],
# scores,
# iou_threshold=0.4
# )
# box = box[nms_idx]
# scores = scores[nms_idx]
# category_ids = category_ids[nms_idx]
if quality is not None:
scores_origin = cls_scores_origin[i]
if self.score_threshold is not None:
scores_origin = scores_origin[mask[i]]
box = decode_box(box, 0.1, 20)
category_ids = category_ids.cpu()
label_names = []
for id in category_ids.tolist():
if isinstance(entities[i], (tuple, list)):
label_names.append(entities[i][id])
else:
label_names.append(entities[i])
output.append(
{
"bboxes_3d": box.cpu(),
"scores_3d": scores.cpu(),
"labels_3d": category_ids,
"target_scores_3d": scores.cpu(),
"label_names": label_names,
}
)
if quality is not None:
output[-1]["cls_scores"] = scores_origin.cpu()
if instance_id is not None:
ids = instance_id[i, indices[i]]
if self.score_threshold is not None:
ids = ids[mask[i]]
output[-1]["instance_ids"] = ids
return output
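def _flat_topk_demo():
    """Illustrative sketch, not part of the original file: the post-processor
    ranks all (query, class) pairs jointly by flattening the score map; a
    flat index decodes as query = idx // num_cls and class = idx % num_cls."""
    scores = torch.rand(1, 5, 3)  # (bs, num_pred, num_cls)
    top_scores, indices = scores.flatten(start_dim=1).topk(4, dim=1)
    cls_ids = indices % 3
    query_ids = indices // 3
    return top_scores, cls_ids, query_ids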
@MODELS.register_module()
class DoF9BoxEncoder(nn.Module):
def __init__(
self,
embed_dims,
rot_dims=3,
output_fc=True,
in_loops=1,
out_loops=2,
):
super().__init__()
self.embed_dims = embed_dims
def embedding_layer(input_dims, output_dims):
return nn.Sequential(
*linear_act_ln(output_dims, in_loops, out_loops, input_dims)
)
if not isinstance(embed_dims, (list, tuple)):
embed_dims = [embed_dims] * 5
self.pos_fc = embedding_layer(3, embed_dims[0])
self.size_fc = embedding_layer(3, embed_dims[1])
self.yaw_fc = embedding_layer(rot_dims, embed_dims[2])
self.rot_dims = rot_dims
if output_fc:
self.output_fc = embedding_layer(embed_dims[-1], embed_dims[-1])
else:
self.output_fc = None
def forward(self, box_3d: torch.Tensor):
pos_feat = self.pos_fc(box_3d[..., [X, Y, Z]])
if box_3d.shape[-1] == 3:
return pos_feat
size_feat = self.size_fc(box_3d[..., [W, L, H]])
yaw_feat = self.yaw_fc(box_3d[..., ALPHA : ALPHA + self.rot_dims])
output = pos_feat + size_feat + yaw_feat
if self.output_fc is not None:
output = self.output_fc(output)
return output
@MODELS.register_module()
class SparseBox3DKeyPointsGenerator(nn.Module):
def __init__(
self,
embed_dims=256,
num_learnable_pts=0,
fix_scale=None,
):
super(SparseBox3DKeyPointsGenerator, self).__init__()
self.embed_dims = embed_dims
self.num_learnable_pts = num_learnable_pts
if fix_scale is None:
fix_scale = ((0.0, 0.0, 0.0),)
self.fix_scale = torch.tensor(fix_scale)
self.num_pts = len(self.fix_scale) + num_learnable_pts
if num_learnable_pts > 0:
self.learnable_fc = Linear(self.embed_dims, num_learnable_pts * 3)
def forward(
self,
anchor,
instance_feature=None,
T_cur2temp_list=None,
cur_timestamp=None,
temp_timestamps=None,
):
bs, num_anchor = anchor.shape[:2]
size = anchor[..., None, [W, L, H]].exp()
key_points = self.fix_scale.to(anchor) * size
if self.num_learnable_pts > 0 and instance_feature is not None:
learnable_scale = (
self.learnable_fc(instance_feature)
.reshape(bs, num_anchor, self.num_learnable_pts, 3)
.sigmoid()
- 0.5
)
key_points = torch.cat(
[key_points, learnable_scale * size], dim=-2
)
key_points = rotation_3d_in_euler(
key_points.flatten(0, 1),
anchor[..., [ALPHA, BETA, GAMMA]].flatten(0, 1),
).unflatten(0, (bs, num_anchor))
key_points = key_points + anchor[..., None, [X, Y, Z]]
if (
cur_timestamp is None
or temp_timestamps is None
or len(temp_timestamps) == 0
) and T_cur2temp_list is None:
return key_points
temp_key_points_list = []
# note: VX and the COS_YAW/SIN_YAW indices below assume the
# velocity-augmented anchor layout inherited from Sparse4D; this temporal
# branch is not exercised with the 9-DoF Euler anchors used in this repo.
velocity = anchor[..., VX:]
for i, t_time in enumerate(temp_timestamps):
time_interval = cur_timestamp - t_time
translation = (
velocity
* time_interval.to(dtype=velocity.dtype)[:, None, None]
)
temp_key_points = key_points - translation[:, :, None]
if T_cur2temp_list is not None:
T_cur2temp = T_cur2temp_list[i].to(dtype=key_points.dtype)
temp_key_points = T_cur2temp[:, None, None, :3] @ torch.cat(
[
temp_key_points,
torch.ones_like(temp_key_points[..., :1]),
],
dim=-1,
).unsqueeze(-1)
temp_key_points = temp_key_points.squeeze(-1)
temp_key_points_list.append(temp_key_points)
return key_points, temp_key_points_list
@staticmethod
def anchor_projection(
anchor,
T_src2dst_list,
src_timestamp=None,
dst_timestamps=None,
time_intervals=None,
):
dst_anchors = []
for i in range(len(T_src2dst_list)):
vel = anchor[..., VX:]
vel_dim = vel.shape[-1]
T_src2dst = torch.unsqueeze(
T_src2dst_list[i].to(dtype=anchor.dtype), dim=1
)
center = anchor[..., [X, Y, Z]]
if time_intervals is not None:
time_interval = time_intervals[i]
elif src_timestamp is not None and dst_timestamps is not None:
time_interval = (src_timestamp - dst_timestamps[i]).to(
dtype=vel.dtype
)
else:
time_interval = None
if time_interval is not None:
translation = vel.transpose(0, -1) * time_interval
translation = translation.transpose(0, -1)
center = center - translation
center = (
torch.matmul(
T_src2dst[..., :3, :3], center[..., None]
).squeeze(dim=-1)
+ T_src2dst[..., :3, 3]
)
size = anchor[..., [W, L, H]]
yaw = torch.matmul(
T_src2dst[..., :2, :2],
anchor[..., [COS_YAW, SIN_YAW], None],
).squeeze(-1)
yaw = yaw[..., [1, 0]]
vel = torch.matmul(
T_src2dst[..., :vel_dim, :vel_dim], vel[..., None]
).squeeze(-1)
dst_anchor = torch.cat([center, size, yaw, vel], dim=-1)
dst_anchors.append(dst_anchor)
return dst_anchors
@staticmethod
def distance(anchor):
return torch.norm(anchor[..., :2], p=2, dim=-1)
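def _keypoints_demo():
    """Illustrative sketch, not part of the original file: for a zeroed
    anchor (unit size after exp, identity rotation, origin center) the
    generator returns one key point per ``fix_scale`` offset."""
    gen = SparseBox3DKeyPointsGenerator(
        fix_scale=((0.0, 0.0, 0.0), (0.5, 0.0, 0.0), (-0.5, 0.0, 0.0))
    )
    anchor = torch.zeros(1, 2, 9)  # (bs, num_anchor, 9): xyz, log-whl, euler
    return gen(anchor).shape       # torch.Size([1, 2, 3, 3])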
================================================
FILE: bip3d/models/bert.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from collections import OrderedDict
from typing import Sequence
import torch
from mmengine.model import BaseModel
from torch import nn
try:
from transformers import AutoTokenizer, BertConfig
from transformers import BertModel as HFBertModel
except ImportError:
AutoTokenizer = None
HFBertModel = None
from bip3d.registry import MODELS
def generate_masks_with_special_tokens_and_transfer_map(
tokenized, special_tokens_list
):
"""Generate attention mask between each pair of special tokens.
Only token pairs in between two special tokens are attended to
and thus the attention mask for these pairs is positive.
Args:
tokenized (dict): tokenizer output containing "input_ids" of shape
[bs, num_token].
special_tokens_list (list): ids of the special tokens that delimit
sub-sentences.
Returns:
Tuple(Tensor, Tensor):
- attention_mask is the attention mask between tokens. Only token
pairs in between two special tokens are positive.
Shape: [bs, num_token, num_token].
- position_ids is the position id of tokens within each valid sentence.
The id restarts from 0 whenever a special token is encountered.
Shape: [bs, num_token]
"""
input_ids = tokenized["input_ids"]
bs, num_token = input_ids.shape
# special_tokens_mask:
# bs, num_token. 1 for special tokens. 0 for normal tokens
special_tokens_mask = torch.zeros(
(bs, num_token), device=input_ids.device
).bool()
for special_token in special_tokens_list:
special_tokens_mask |= input_ids == special_token
# idxs: each row is a list of indices of special tokens
idxs = torch.nonzero(special_tokens_mask)
# generate attention mask and positional ids
attention_mask = (
torch.eye(num_token, device=input_ids.device)
.bool()
.unsqueeze(0)
.repeat(bs, 1, 1)
)
position_ids = torch.zeros((bs, num_token), device=input_ids.device)
previous_col = 0
for i in range(idxs.shape[0]):
row, col = idxs[i]
if col == 0:
# if (col == 0) or (col == num_token - 1):
attention_mask[row, col, col] = True
position_ids[row, col] = 0
else:
# attention_mask[row, previous_col + 1:col,
# previous_col + 1:col] = True
# position_ids[row, previous_col + 1:col] = torch.arange(
# 0, col - previous_col - 1, device=input_ids.device)
attention_mask[
row, previous_col + 1 : col + 1, previous_col + 1 : col + 1
] = True
position_ids[row, previous_col + 1 : col + 1] = torch.arange(
0, col - previous_col, device=input_ids.device
)
previous_col = col
return attention_mask, position_ids.to(torch.long)
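def _subsentence_mask_demo():
    """Toy example, not part of the original file. With bert-base-uncased
    ids (101=[CLS], 1012='.', 102=[SEP], assumed here), tokens between two
    '.' attend only to each other, and position ids restart at 0 after each
    special token."""
    tokenized = {"input_ids": torch.tensor([[101, 7592, 1012, 2088, 1012, 102]])}
    attention_mask, position_ids = (
        generate_masks_with_special_tokens_and_transfer_map(
            tokenized, [101, 1012, 102]
        )
    )
    return attention_mask.shape, position_ids  # (1, 6, 6), ids reset per sub-sentence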
@MODELS.register_module()
class BertModel(BaseModel):
"""BERT model for language embedding only encoder.
Args:
name (str, optional): name of the pretrained BERT model from
HuggingFace. Defaults to bert-base-uncased.
max_tokens (int, optional): maximum number of tokens to be
used for BERT. Defaults to 256.
pad_to_max (bool, optional): whether to pad the tokens to max_tokens.
Defaults to True.
use_sub_sentence_represent (bool, optional): whether to use the
sub-sentence representation introduced in Grounding DINO.
Defaults to False.
special_tokens_list (list, optional): special tokens used to split
subsentence. It cannot be None when `use_sub_sentence_represent`
is True. Defaults to None.
add_pooling_layer (bool, optional): whether to add a pooling
layer in the BERT encoder. Defaults to False.
num_layers_of_embedded (int, optional): number of layers of
the embedded model. Defaults to 1.
use_checkpoint (bool, optional): whether to use gradient checkpointing.
Defaults to False.
"""
def __init__(
self,
name: str = "bert-base-uncased",
max_tokens: int = 256,
pad_to_max: bool = True,
use_sub_sentence_represent: bool = False,
special_tokens_list: list = None,
add_pooling_layer: bool = False,
num_layers_of_embedded: int = 1,
use_checkpoint: bool = False,
return_tokenized: bool = False,
**kwargs
) -> None:
super().__init__(**kwargs)
self.max_tokens = max_tokens
self.pad_to_max = pad_to_max
self.return_tokenized = return_tokenized
if AutoTokenizer is None:
raise RuntimeError(
"transformers is not installed, please install it by: "
"pip install transformers."
)
self.tokenizer = AutoTokenizer.from_pretrained(name)
self.language_backbone = nn.Sequential(
OrderedDict(
[
(
"body",
BertEncoder(
name,
add_pooling_layer=add_pooling_layer,
num_layers_of_embedded=num_layers_of_embedded,
use_checkpoint=use_checkpoint,
),
)
]
)
)
self.use_sub_sentence_represent = use_sub_sentence_represent
if self.use_sub_sentence_represent:
assert (
special_tokens_list is not None
), "special_tokens should not be None \
if use_sub_sentence_represent is True"
self.special_tokens = self.tokenizer.convert_tokens_to_ids(
special_tokens_list
)
def forward(self, captions: Sequence[str], **kwargs) -> dict:
"""Forward function."""
device = next(self.language_backbone.parameters()).device
tokenized = self.tokenizer.batch_encode_plus(
captions,
max_length=self.max_tokens,
padding="max_length" if self.pad_to_max else "longest",
return_special_tokens_mask=True,
return_tensors="pt",
truncation=True,
).to(device)
input_ids = tokenized.input_ids
if self.use_sub_sentence_represent:
attention_mask, position_ids = (
generate_masks_with_special_tokens_and_transfer_map(
tokenized, self.special_tokens
)
)
token_type_ids = tokenized["token_type_ids"]
else:
attention_mask = tokenized.attention_mask
position_ids = None
token_type_ids = None
tokenizer_input = {
"input_ids": input_ids,
"attention_mask": attention_mask,
"position_ids": position_ids,
"token_type_ids": token_type_ids,
}
language_dict_features = self.language_backbone(tokenizer_input)
if self.use_sub_sentence_represent:
language_dict_features["position_ids"] = position_ids
language_dict_features["text_token_mask"] = (
tokenized.attention_mask.bool()
)
if self.return_tokenized:
language_dict_features["tokenized"] = tokenized
return language_dict_features
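def _bert_embedding_demo():
    """Minimal usage sketch, not part of the original file; downloads
    bert-base-uncased from HuggingFace on first call."""
    text_encoder = BertModel(name="bert-base-uncased", max_tokens=256)
    out = text_encoder(["a red chair. the table near the window."])
    # With pad_to_max=True the embedding is (1, 256, 768) for bert-base.
    return out["embedded"].shape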
class BertEncoder(nn.Module):
"""BERT encoder for language embedding.
Args:
name (str): name of the pretrained BERT model from HuggingFace.
Defaults to bert-base-uncased.
add_pooling_layer (bool): whether to add a pooling layer.
num_layers_of_embedded (int): number of layers of the embedded model.
Defaults to 1.
use_checkpoint (bool): whether to use gradient checkpointing.
Defaults to False.
"""
def __init__(
self,
name: str,
add_pooling_layer: bool = False,
num_layers_of_embedded: int = 1,
use_checkpoint: bool = False,
):
super().__init__()
if BertConfig is None:
raise RuntimeError(
"transformers is not installed, please install it by: "
"pip install transformers."
)
config = BertConfig.from_pretrained(name)
config.gradient_checkpointing = use_checkpoint
# only encoder
self.model = HFBertModel.from_pretrained(
name, add_pooling_layer=add_pooling_layer, config=config
)
self.language_dim = config.hidden_size
self.num_layers_of_embedded = num_layers_of_embedded
def forward(self, x) -> dict:
mask = x["attention_mask"]
outputs = self.model(
input_ids=x["input_ids"],
attention_mask=mask,
position_ids=x["position_ids"],
token_type_ids=x["token_type_ids"],
output_hidden_states=True,
)
# for bert-base, outputs.hidden_states has 13 entries: the input embeddings plus 12 encoder layers
encoded_layers = outputs.hidden_states[1:]
features = torch.stack(
encoded_layers[-self.num_layers_of_embedded :], 1
).mean(1)
# language embedding has shape [len(phrase), seq_len, language_dim]
features = features / self.num_layers_of_embedded
if mask.dim() == 2:
embedded = features * mask.unsqueeze(-1).float()
else:
embedded = features
results = {
"embedded": embedded,
"masks": mask,
"hidden": encoded_layers[-1],
}
return results
================================================
FILE: bip3d/models/data_preprocessors/__init__.py
================================================
from .custom_data_preprocessor import CustomDet3DDataPreprocessor
__all__ = ["CustomDet3DDataPreprocessor"]
================================================
FILE: bip3d/models/data_preprocessors/custom_data_preprocessor.py
================================================
import math
from numbers import Number
from typing import Dict, List, Optional, Sequence, Tuple, Union
import numpy as np
import torch
from mmdet.models import DetDataPreprocessor
from mmdet.models.utils.misc import samplelist_boxtype2tensor
from mmengine.model import stack_batch
from mmengine.structures import InstanceData
from mmengine.utils import is_seq_of
from torch import Tensor
from torch.nn import functional as F
from bip3d.registry import MODELS
from bip3d.utils.typing_config import ConfigType, SampleList
from bip3d.structures.bbox_3d import get_proj_mat_by_coord_type
from .utils import multiview_img_stack_batch
@MODELS.register_module()
class CustomDet3DDataPreprocessor(DetDataPreprocessor):
"""Points / Image pre-processor for point clouds / vision-only / multi-
modality 3D detection tasks.
It provides the data pre-processing as follows
- Collate and move image and point cloud data to the target device.
- 1) For image data:
- Pad images in inputs to the maximum size of current batch with defined
``pad_value``. The padding size can be divisible by a defined
``pad_size_divisor``.
- Stack images in inputs to batch_imgs.
- Convert images in inputs from bgr to rgb if the shape of input is
(3, H, W).
- Normalize images in inputs with defined std and mean.
- Do batch augmentations during training.
- 2) For point cloud data:
- If no voxelization, directly return list of point cloud data.
- If voxelization is applied, voxelize point cloud according to
``voxel_type`` and obtain ``voxels``.
Args:
voxel (bool): Whether to apply voxelization to point cloud.
Defaults to False.
voxel_type (str): Voxelization type. Two voxelization types are
provided: 'hard' and 'dynamic', respectively for hard voxelization
and dynamic voxelization. Defaults to 'hard'.
voxel_layer (dict or :obj:`ConfigDict`, optional): Voxelization layer
config. Defaults to None.
batch_first (bool): Whether to put the batch dimension to the first
dimension when getting voxel coordinates. Defaults to True.
max_voxels (int, optional): Maximum number of voxels in each voxel
grid. Defaults to None.
mean (Sequence[Number], optional): The pixel mean of R, G, B channels.
Defaults to None.
std (Sequence[Number], optional): The pixel standard deviation of
R, G, B channels. Defaults to None.
pad_size_divisor (int): The size of padded image should be divisible by
``pad_size_divisor``. Defaults to 1.
pad_value (float or int): The padded pixel value. Defaults to 0.
pad_mask (bool): Whether to pad instance masks. Defaults to False.
mask_pad_value (int): The padded pixel value for instance masks.
Defaults to 0.
pad_seg (bool): Whether to pad semantic segmentation maps.
Defaults to False.
seg_pad_value (int): The padded pixel value for semantic segmentation
maps. Defaults to 255.
bgr_to_rgb (bool): Whether to convert image from BGR to RGB.
Defaults to False.
rgb_to_bgr (bool): Whether to convert image from RGB to BGR.
Defaults to False.
boxtype2tensor (bool): Whether to convert the ``BaseBoxes`` type of
bboxes data to ``Tensor`` type. Defaults to True.
non_blocking (bool): Whether to block current process when transferring
data to device. Defaults to False.
batch_augments (List[dict], optional): Batch-level augmentations.
Defaults to None.
batchwise_inputs (bool): Pack the input as a batch of samples
with 1-N frames for the continuous 3D perception setting.
Defaults to False.
"""
def __init__(
self,
voxel: bool = False,
voxel_type: str = "hard",
voxel_layer: Optional[ConfigType] = None,
batch_first: bool = True,
max_voxels: Optional[int] = None,
mean: Sequence[Number] = None,
std: Sequence[Number] = None,
pad_size_divisor: int = 1,
pad_value: Union[float, int] = 0,
pad_mask: bool = False,
mask_pad_value: int = 0,
pad_seg: bool = False,
seg_pad_value: int = 255,
bgr_to_rgb: bool = False,
rgb_to_bgr: bool = False,
boxtype2tensor: bool = True,
non_blocking: bool = False,
batch_augments: Optional[List[dict]] = None,
batchwise_inputs: bool = False,
):
super().__init__(
mean=mean,
std=std,
pad_size_divisor=pad_size_divisor,
pad_value=pad_value,
pad_mask=pad_mask,
mask_pad_value=mask_pad_value,
pad_seg=pad_seg,
seg_pad_value=seg_pad_value,
bgr_to_rgb=bgr_to_rgb,
rgb_to_bgr=rgb_to_bgr,
boxtype2tensor=boxtype2tensor,
non_blocking=non_blocking,
batch_augments=batch_augments,
)
self.voxel = voxel
self.voxel_type = voxel_type
self.batch_first = batch_first
self.max_voxels = max_voxels
self.batchwise_inputs = batchwise_inputs
if voxel:
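            # NOTE: `VoxelizationByGridShape` is not imported in this file;
            # when `voxel=True` it must be supplied by mmdet3d
            # (mmdet3d.models.data_preprocessors), otherwise this line
            # raises a NameError.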
self.voxel_layer = VoxelizationByGridShape(**voxel_layer)
def forward(self, data: Union[dict, List[dict]], training: bool = False):
"""Perform normalization, padding and bgr2rgb conversion based on
``BaseDataPreprocessor``.
Args:
            data (dict or List[dict]): Data from the dataloader. A dict
                contains the whole batch; a list[dict] holds one dict per
                test-time augmentation.
training (bool): Whether to enable training time augmentation.
Defaults to False.
Returns:
dict or List[dict]: Data in the same format as the model input.
"""
if isinstance(data, list):
num_augs = len(data)
aug_batch_data = []
for aug_id in range(num_augs):
single_aug_batch_data = self.simple_process(
data[aug_id], training
)
aug_batch_data.append(single_aug_batch_data)
return aug_batch_data
else:
return self.simple_process(data, training)
def simple_process(self, data: dict, training: bool = False):
"""Perform normalization, padding and bgr2rgb conversion for img data
based on ``BaseDataPreprocessor``, and voxelize point cloud if `voxel`
is set to be True.
Args:
data (dict): Data sampled from dataloader.
training (bool): Whether to enable training time augmentation.
Defaults to False.
Returns:
dict: Data in the same format as the model input.
"""
if "img" in data["inputs"]:
batch_pad_shape = self._get_pad_shape(data)
if self.batchwise_inputs:
data_samples = data["data_samples"]
batchwise_data_samples = []
if "bboxes_3d" in data_samples[0].gt_instances_3d:
assert isinstance(
data_samples[0].gt_instances_3d.labels_3d, list
)
bboxes_3d = data_samples[0].gt_instances_3d.bboxes_3d
labels_3d = data_samples[0].gt_instances_3d.labels_3d
if "gt_occupancy_masks" in data_samples[0]:
gt_occupancy_masks = [
mask.clone() for mask in data_samples[0].gt_occupancy_masks
]
if (
"eval_ann_info" in data_samples[0]
and data_samples[0].eval_ann_info is not None
):
eval_ann_info = data_samples[0].eval_ann_info
for idx in range(len(labels_3d)):
data_sample = data_samples[0].clone()
if "bboxes_3d" in data_sample.gt_instances_3d:
data_sample.gt_instances_3d = InstanceData()
data_sample.gt_instances_3d.bboxes_3d = bboxes_3d[idx]
data_sample.gt_instances_3d.labels_3d = labels_3d[idx]
if "gt_occupancy_masks" in data_sample:
data_sample.gt_occupancy_masks = gt_occupancy_masks[idx]
if "eval_ann_info" in data_sample:
if data_sample.eval_ann_info is not None:
data_sample.eval_ann_info = dict()
data_sample.eval_ann_info["gt_bboxes_3d"] = (
eval_ann_info["gt_bboxes_3d"][idx]
)
data_sample.eval_ann_info["gt_labels_3d"] = (
eval_ann_info["gt_labels_3d"][idx]
)
batchwise_data_samples.append(data_sample)
data["data_samples"] = batchwise_data_samples
data = self.collate_data(data)
inputs, data_samples = data["inputs"], data["data_samples"]
batch_inputs = dict()
batch_inputs.update(self.process_camera_params(data_samples))
for key in ["depth_img", "depth_prob_gt"]:
if key not in inputs:
continue
batch_inputs[key] = torch.stack(inputs[key])
for key in ["text", "scan_id", "tokens_positive", "ignore_mask"]:
if not hasattr(data_samples[0], key):
continue
batch_inputs[key] = [getattr(x, key) for x in data_samples]
if hasattr(data_samples[0], "gt_instances_3d"):
batch_inputs["gt_bboxes_3d"] = [
x.gt_instances_3d.bboxes_3d for x in data_samples
]
batch_inputs["gt_labels_3d"] = [
x.gt_instances_3d.labels_3d for x in data_samples
]
if "imgs" in inputs:
imgs = inputs["imgs"]
if data_samples is not None:
# NOTE the batched image size information may be useful, e.g.
# in DETR, this is needed for the construction of masks, which
# is then used for the transformer_head.
batch_input_shape = tuple(imgs[0].size()[-2:])
for data_sample, pad_shape in zip(
data_samples, batch_pad_shape
):
data_sample.set_metainfo(
{
"batch_input_shape": batch_input_shape,
"pad_shape": pad_shape,
}
)
if self.boxtype2tensor:
samplelist_boxtype2tensor(data_samples)
if self.pad_mask:
self.pad_gt_masks(data_samples)
if self.pad_seg:
self.pad_gt_sem_seg(data_samples)
if training and self.batch_augments is not None:
for batch_aug in self.batch_augments:
imgs, data_samples = batch_aug(imgs, data_samples)
batch_inputs["imgs"] = imgs
return {"inputs": batch_inputs, "data_samples": data_samples}
def process_camera_params(self, data_samples):
projection_mat = []
extrinsic_list = []
intrinsic_list = []
image_wh = []
for data_sample in data_samples:
proj_mat = get_proj_mat_by_coord_type(
data_sample.metainfo, "DEPTH"
)
img_scale_factor = data_sample.metainfo.get("scale_factor", [1, 1])
img_flip = data_sample.metainfo.get("flip", False)
img_crop_offset = data_sample.metainfo.get(
"img_crop_offset", [0, 0]
)
trans_mat = np.eye(4)
trans_mat[0, 0] = img_scale_factor[0]
trans_mat[1, 1] = img_scale_factor[1]
trans_mat[0, 2] = -img_crop_offset[0]
trans_mat[1, 2] = -img_crop_offset[1]
if img_flip:
assert False
if "trans_mat" in proj_mat:
trans_mat = np.stack(proj_mat["trans_mat"]) @ trans_mat
if isinstance(proj_mat, dict):
extrinsic = np.stack(proj_mat["extrinsic"])
intrinsic = np.stack(proj_mat["intrinsic"])
proj_mat = intrinsic @ extrinsic
extrinsic_list.append(extrinsic)
intrinsic_list.append(trans_mat @ intrinsic)
else:
extrinsic_list.append(
np.tile(np.eye(4)[None], (proj_mat.shape[0], 1, 1))
)
intrinsic_list.append(
np.tile(np.eye(4)[None], (proj_mat.shape[0], 1, 1))
)
proj_mat = trans_mat @ proj_mat
projection_mat.append(proj_mat)
image_wh.append(data_sample.metainfo["img_shape"][:2])
        # NOTE: assumes CUDA is available; camera parameters are moved to
        # the GPU here.
        def to_tensor(x):
            return torch.from_numpy(x).cuda().to(torch.float32)
projection_mat = to_tensor(np.stack(projection_mat))
image_wh = to_tensor(np.array(image_wh))
image_wh = image_wh[:, None].tile(1, projection_mat.shape[1], 1)
extrinsic = to_tensor(np.stack(extrinsic_list))
intrinsic = to_tensor(np.stack(intrinsic_list))
return {
"projection_mat": projection_mat,
"image_wh": image_wh,
"extrinsic": extrinsic,
"intrinsic": intrinsic,
}
def preprocess_img(self, _batch_img: Tensor):
# channel transform
if self._channel_conversion:
_batch_img = _batch_img[[2, 1, 0], ...]
# Convert to float after channel conversion to ensure
# efficiency
_batch_img = _batch_img.float()
# Normalization.
if self._enable_normalize:
if self.mean.shape[0] == 3:
assert _batch_img.dim() == 3 and _batch_img.shape[0] == 3, (
"If the mean has 3 values, the input tensor "
"should in shape of (3, H, W), but got the "
f"tensor with shape {_batch_img.shape}"
)
_batch_img = (_batch_img - self.mean) / self.std
return _batch_img
def collate_data(self, data: dict):
"""Copy data to the target device and perform normalization, padding
and bgr2rgb conversion and stack based on ``BaseDataPreprocessor``.
Collates the data sampled from dataloader into a list of dict and list
of labels, and then copies tensor to the target device.
Args:
data (dict): Data sampled from dataloader.
Returns:
dict: Data in the same format as the model input.
"""
data = self.cast_data(data) # type: ignore
if "img" in data["inputs"]:
_batch_imgs = data["inputs"]["img"]
# Process data with `pseudo_collate`.
if is_seq_of(_batch_imgs, torch.Tensor):
batch_imgs = []
img_dim = _batch_imgs[0].dim()
for _batch_img in _batch_imgs:
if img_dim == 3: # standard img
_batch_img = self.preprocess_img(_batch_img)
elif img_dim == 4:
_batch_img = [
self.preprocess_img(_img) for _img in _batch_img
]
_batch_img = torch.stack(_batch_img, dim=0)
batch_imgs.append(_batch_img)
# Pad and stack Tensor.
if img_dim == 3:
batch_imgs = stack_batch(
batch_imgs, self.pad_size_divisor, self.pad_value
)
elif img_dim == 4:
batch_imgs = multiview_img_stack_batch(
batch_imgs, self.pad_size_divisor, self.pad_value
)
# Process data with `default_collate`.
elif isinstance(_batch_imgs, torch.Tensor):
assert _batch_imgs.dim() == 4, (
"The input of `ImgDataPreprocessor` should be a NCHW "
"tensor or a list of tensor, but got a tensor with "
f"shape: {_batch_imgs.shape}"
)
if self._channel_conversion:
_batch_imgs = _batch_imgs[:, [2, 1, 0], ...]
# Convert to float after channel conversion to ensure
# efficiency
_batch_imgs = _batch_imgs.float()
if self._enable_normalize:
_batch_imgs = (_batch_imgs - self.mean) / self.std
h, w = _batch_imgs.shape[2:]
target_h = (
math.ceil(h / self.pad_size_divisor)
* self.pad_size_divisor
)
target_w = (
math.ceil(w / self.pad_size_divisor)
* self.pad_size_divisor
)
pad_h = target_h - h
pad_w = target_w - w
batch_imgs = F.pad(
_batch_imgs,
(0, pad_w, 0, pad_h),
"constant",
self.pad_value,
)
else:
raise TypeError(
"Output of `cast_data` should be a list of dict "
"or a tuple with inputs and data_samples, but got "
f"{type(data)}: {data}"
)
data["inputs"]["imgs"] = batch_imgs
data.setdefault("data_samples", None)
return data
def _get_pad_shape(self, data: dict):
"""Get the pad_shape of each image based on data and
pad_size_divisor."""
# rewrite `_get_pad_shape` for obtaining image inputs.
_batch_inputs = data["inputs"]["img"]
# Process data with `pseudo_collate`.
if is_seq_of(_batch_inputs, torch.Tensor):
batch_pad_shape = []
for ori_input in _batch_inputs:
if ori_input.dim() == 4:
                    # multi-view input: use one of the views to
                    # calculate the pad shape
ori_input = ori_input[0]
pad_h = (
int(np.ceil(ori_input.shape[1] / self.pad_size_divisor))
* self.pad_size_divisor
)
pad_w = (
int(np.ceil(ori_input.shape[2] / self.pad_size_divisor))
* self.pad_size_divisor
)
batch_pad_shape.append((pad_h, pad_w))
# Process data with `default_collate`.
elif isinstance(_batch_inputs, torch.Tensor):
assert _batch_inputs.dim() == 4, (
"The input of `ImgDataPreprocessor` should be a NCHW tensor "
"or a list of tensor, but got a tensor with shape: "
f"{_batch_inputs.shape}"
)
pad_h = (
int(np.ceil(_batch_inputs.shape[1] / self.pad_size_divisor))
* self.pad_size_divisor
)
pad_w = (
int(np.ceil(_batch_inputs.shape[2] / self.pad_size_divisor))
* self.pad_size_divisor
)
batch_pad_shape = [(pad_h, pad_w)] * _batch_inputs.shape[0]
else:
raise TypeError(
"Output of `cast_data` should be a list of dict "
"or a tuple with inputs and data_samples, but got "
f"{type(data)}: {data}"
)
return batch_pad_shape
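if __name__ == "__main__":
    # Config sketch (illustrative values, not the repo's actual settings;
    # see the files under configs/ for those): how this preprocessor is
    # typically declared in an mmengine-style config.
    data_preprocessor = dict(
        type="CustomDet3DDataPreprocessor",
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        bgr_to_rgb=True,
        pad_size_divisor=32,
        batchwise_inputs=False,
    )
    print(data_preprocessor)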
================================================
FILE: bip3d/models/data_preprocessors/utils.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Union
import torch
import torch.nn.functional as F
from torch import Tensor
def multiview_img_stack_batch(
tensor_list: List[Tensor],
pad_size_divisor: int = 1,
pad_value: Union[int, float] = 0,
) -> Tensor:
"""Compared to the ``stack_batch`` in `mmengine.model.utils`,
multiview_img_stack_batch further handle the multiview images.
See diff of padded_sizes[:, :-2] = 0 vs padded_sizes[:, 0] = 0 in line 47.
Stack multiple tensors to form a batch and pad the tensor to the max shape
use the right bottom padding mode in these images. If
``pad_size_divisor > 0``, add padding to ensure the shape of each dim is
divisible by ``pad_size_divisor``.
Args:
tensor_list (List[Tensor]): A list of tensors with the same dim.
pad_size_divisor (int): If ``pad_size_divisor > 0``, add padding to
ensure the shape of each dim is divisible by ``pad_size_divisor``.
This depends on the model, and many models need to be divisible by
32. Defaults to 1.
pad_value (int or float): The padding value. Defaults to 0.
Returns:
Tensor: The n dim tensor.
"""
assert isinstance(
tensor_list, list
), f"Expected input type to be list, but got {type(tensor_list)}"
    assert tensor_list, "`tensor_list` must not be an empty list"
    assert len({tensor.ndim for tensor in tensor_list}) == 1, (
        "Expected the dimensions of all tensors to be the same, "
        f"but got {[tensor.ndim for tensor in tensor_list]}"
)
dim = tensor_list[0].dim()
num_img = len(tensor_list)
all_sizes: torch.Tensor = torch.Tensor(
[tensor.shape for tensor in tensor_list]
)
max_sizes = (
torch.ceil(torch.max(all_sizes, dim=0)[0] / pad_size_divisor)
* pad_size_divisor
)
padded_sizes = max_sizes - all_sizes
    # Only the last two dims (H, W) are padded; all leading dims
    # (e.g. num_views and channel) are left untouched.
padded_sizes[:, :-2] = 0
if padded_sizes.sum() == 0:
return torch.stack(tensor_list)
# `pad` is the second arguments of `F.pad`. If pad is (1, 2, 3, 4),
# it means that padding the last dim with 1(left) 2(right), padding the
# penultimate dim to 3(top) 4(bottom). The order of `pad` is opposite of
# the `padded_sizes`. Therefore, the `padded_sizes` needs to be reversed,
# and only odd index of pad should be assigned to keep padding "right" and
# "bottom".
pad = torch.zeros(num_img, 2 * dim, dtype=torch.int)
pad[:, 1::2] = padded_sizes[:, range(dim - 1, -1, -1)]
batch_tensor = []
for idx, tensor in enumerate(tensor_list):
batch_tensor.append(
F.pad(tensor, tuple(pad[idx].tolist()), value=pad_value)
)
return torch.stack(batch_tensor)
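if __name__ == "__main__":
    # Usage sketch: stack two multi-view tensors with different spatial
    # sizes; both are right/bottom padded to 32-divisible H and W.
    a = torch.rand(4, 3, 224, 300)  # (num_views, C, H, W)
    b = torch.rand(4, 3, 240, 320)
    batch = multiview_img_stack_batch([a, b], pad_size_divisor=32)
    print(batch.shape)  # torch.Size([2, 4, 3, 256, 320])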
================================================
FILE: bip3d/models/deformable_aggregation.py
================================================
from typing import List, Optional, Tuple
import numpy as np
import torch
import torch.nn as nn
from torch.cuda.amp.autocast_mode import autocast
from mmcv.cnn import build_norm_layer
from mmcv.cnn import Linear
from mmengine.model import (
Sequential,
BaseModule,
xavier_init,
constant_init,
ModuleList,
)
from mmcv.cnn.bricks.transformer import FFN
from ..ops import deformable_aggregation_func
from .utils import linear_act_ln
from bip3d.registry import MODELS
__all__ = [
"DeformableFeatureAggregation",
]
@MODELS.register_module()
class DeformableFeatureAggregation(BaseModule):
def __init__(
self,
embed_dims: int = 256,
num_groups: int = 8,
num_levels: int = 4,
num_cams: int = 6,
proj_drop: float = 0.0,
attn_drop: float = 0.0,
kps_generator: dict = None,
temporal_fusion_module=None,
use_temporal_anchor_embed=True,
use_deformable_func=False,
use_camera_embed=False,
residual_mode="add",
batch_first=True,
ffn_cfg=None,
with_value_proj=False,
filter_outlier=False,
with_depth=False,
min_depth=None,
max_depth=None,
):
super(DeformableFeatureAggregation, self).__init__()
assert batch_first
if embed_dims % num_groups != 0:
raise ValueError(
f"embed_dims must be divisible by num_groups, "
f"but got {embed_dims} and {num_groups}"
)
self.group_dims = int(embed_dims / num_groups)
self.embed_dims = embed_dims
self.num_levels = num_levels
self.num_groups = num_groups
self.num_cams = num_cams
self.use_temporal_anchor_embed = use_temporal_anchor_embed
self.use_deformable_func = use_deformable_func
self.attn_drop = attn_drop
self.residual_mode = residual_mode
self.proj_drop = nn.Dropout(proj_drop)
if kps_generator is not None:
kps_generator["embed_dims"] = embed_dims
self.kps_generator = MODELS.build(kps_generator)
self.num_pts = self.kps_generator.num_pts
else:
self.kps_generator = None
self.num_pts = 1
if temporal_fusion_module is not None:
if "embed_dims" not in temporal_fusion_module:
temporal_fusion_module["embed_dims"] = embed_dims
self.temp_module = MODELS.build(temporal_fusion_module)
else:
self.temp_module = None
self.output_proj = Linear(embed_dims, embed_dims)
if use_camera_embed:
self.camera_encoder = Sequential(
*linear_act_ln(embed_dims, 1, 2, 12)
)
self.weights_fc = Linear(
embed_dims, num_groups * num_levels * self.num_pts
)
else:
self.camera_encoder = None
self.weights_fc = Linear(
embed_dims, num_groups * num_cams * num_levels * self.num_pts
)
if ffn_cfg is not None:
ffn_cfg["embed_dims"] = embed_dims
self.ffn = FFN(**ffn_cfg)
self.norms = ModuleList(
build_norm_layer(dict(type="LN"), embed_dims)[1]
for _ in range(2)
)
else:
self.ffn = None
self.with_value_proj = with_value_proj
self.filter_outlier = filter_outlier
if self.with_value_proj:
self.value_proj = Linear(embed_dims, embed_dims)
self.with_depth = with_depth
if self.with_depth:
assert min_depth is not None and max_depth is not None
self.min_depth = min_depth
self.max_depth = max_depth
def init_weight(self):
constant_init(self.weights_fc, val=0.0, bias=0.0)
xavier_init(self.output_proj, distribution="uniform", bias=0.0)
    def get_spatial_shape_3D(self, spatial_shape, depth_dim):
        spatial_shape_depth = (
            spatial_shape.new_ones(*spatial_shape.shape[:-1], 1) * depth_dim
        )
        spatial_shape_3D = torch.cat(
            [spatial_shape, spatial_shape_depth], dim=-1
        )
        return spatial_shape_3D.contiguous()
def forward(
self,
instance_feature: torch.Tensor,
anchor: torch.Tensor,
anchor_embed: torch.Tensor,
feature_maps: List[torch.Tensor],
metas: dict,
depth_prob=None,
**kwargs: dict,
):
bs, num_anchor = instance_feature.shape[:2]
if self.kps_generator is not None:
key_points = self.kps_generator(anchor, instance_feature)
else:
key_points = anchor[:, :, None]
points_2d, depth, mask = self.project_points(
key_points,
metas["projection_mat"],
metas.get("image_wh"),
)
weights = self._get_weights(
instance_feature, anchor_embed, metas, mask
)
if self.use_deformable_func:
if self.with_value_proj:
feature_maps[0] = self.value_proj(feature_maps[0])
points_2d = points_2d.permute(0, 2, 3, 1, 4).reshape(
bs, num_anchor * self.num_pts, -1, 2
)
weights = (
weights.permute(0, 1, 4, 2, 3, 5)
.contiguous()
.reshape(
bs,
num_anchor * self.num_pts,
-1,
self.num_levels,
self.num_groups,
)
)
if self.with_depth:
depth = depth.permute(0, 2, 3, 1).reshape(
bs, num_anchor * self.num_pts, -1, 1
)
# normalize depth to [0, depth_prob.shape[-1]-1]
depth = (depth - self.min_depth) / (self.max_depth - self.min_depth)
depth = depth * (depth_prob.shape[-1] - 1)
features = deformable_aggregation_func(
*feature_maps, points_2d, weights, depth_prob, depth
)
else:
features = deformable_aggregation_func(*feature_maps, points_2d, weights)
features = features.reshape(bs, num_anchor, self.num_pts, self.embed_dims)
features = features.sum(dim=2)
else:
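            # Non-deformable fallback via grid_sample; currently disabled
            # (`assert False` below makes this branch unreachable).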
assert False
assert not self.with_value_proj
features = self.feature_sampling(
feature_maps,
key_points,
metas["projection_mat"],
metas.get("image_wh"),
)
features = self.multi_view_level_fusion(features, weights)
features = features.sum(dim=2) # fuse multi-point features
output = self.proj_drop(self.output_proj(features))
if self.residual_mode == "add":
output = output + instance_feature
elif self.residual_mode == "cat":
output = torch.cat([output, instance_feature], dim=-1)
if self.ffn is not None:
output = self.norms[0](output)
output = self.ffn(output)
output = self.norms[1](output)
return output
def _get_weights(
self, instance_feature, anchor_embed, metas=None, mask=None
):
bs, num_anchor = instance_feature.shape[:2]
feature = instance_feature + anchor_embed
if self.camera_encoder is not None:
num_cams = metas["projection_mat"].shape[1]
camera_embed = self.camera_encoder(
metas["projection_mat"][:, :, :3].reshape(bs, num_cams, -1)
)
feature = feature[:, :, None] + camera_embed[:, None]
weights = self.weights_fc(feature)
if mask is not None and self.filter_outlier:
num_cams = weights.shape[2]
mask = mask.permute(0, 2, 1, 3)[..., None, :, None]
weights = weights.reshape(
bs,
num_anchor,
num_cams,
self.num_levels,
self.num_pts,
self.num_groups,
)
weights = weights.masked_fill(
torch.logical_and(~mask, mask.sum(dim=2, keepdim=True) != 0),
float("-inf"),
)
weights = (
weights.reshape(bs, num_anchor, -1, self.num_groups)
.softmax(dim=-2)
.reshape(
bs,
num_anchor,
-1,
self.num_levels,
self.num_pts,
self.num_groups,
)
)
if self.training and self.attn_drop > 0:
            # `torch.rand` cannot take -1 as a size; use the actual number
            # of cameras from `weights`.
            mask = torch.rand(
                bs, num_anchor, weights.shape[2], 1, self.num_pts, 1
            )
mask = mask.to(device=weights.device, dtype=weights.dtype)
weights = ((mask > self.attn_drop) * weights) / (
1 - self.attn_drop
)
return weights
@staticmethod
def project_points(key_points, projection_mat, image_wh=None):
bs, num_anchor, num_pts = key_points.shape[:3]
pts_extend = torch.cat(
[key_points, torch.ones_like(key_points[..., :1])], dim=-1
)
points_2d = torch.matmul(
projection_mat[:, :, None, None], pts_extend[:, None, ..., None]
).squeeze(-1)
depth = points_2d[..., 2]
mask = depth > 1e-5
points_2d = points_2d[..., :2] / torch.clamp(
points_2d[..., 2:3], min=1e-5
)
mask = mask & (points_2d[..., 0] > 0) & (points_2d[..., 1] > 0)
if image_wh is not None:
points_2d = points_2d / image_wh[:, :, None, None]
mask = mask & (points_2d[..., 0] < 1) & (points_2d[..., 1] < 1)
return points_2d, depth, mask
@staticmethod
def feature_sampling(
feature_maps: List[torch.Tensor],
key_points: torch.Tensor,
projection_mat: torch.Tensor,
image_wh: Optional[torch.Tensor] = None,
) -> torch.Tensor:
num_levels = len(feature_maps)
num_cams = feature_maps[0].shape[1]
bs, num_anchor, num_pts = key_points.shape[:3]
        # `project_points` returns (points_2d, depth, mask); only the
        # normalized 2D points are needed here.
        points_2d = DeformableFeatureAggregation.project_points(
            key_points, projection_mat, image_wh
        )[0]
points_2d = points_2d * 2 - 1
points_2d = points_2d.flatten(end_dim=1)
features = []
for fm in feature_maps:
features.append(
torch.nn.functional.grid_sample(
fm.flatten(end_dim=1), points_2d
)
)
features = torch.stack(features, dim=1)
features = features.reshape(
bs, num_cams, num_levels, -1, num_anchor, num_pts
).permute(
0, 4, 1, 2, 5, 3
) # bs, num_anchor, num_cams, num_levels, num_pts, embed_dims
return features
def multi_view_level_fusion(
self,
features: torch.Tensor,
weights: torch.Tensor,
):
bs, num_anchor = weights.shape[:2]
features = weights[..., None] * features.reshape(
features.shape[:-1] + (self.num_groups, self.group_dims)
)
features = features.sum(dim=2).sum(dim=2)
features = features.reshape(
bs, num_anchor, self.num_pts, self.embed_dims
)
return features
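if __name__ == "__main__":
    # Usage sketch for the static projection helper: project one 3D key
    # point through an identity camera matrix. `image_wh` normalizes the
    # pixel coordinates into (0, 1).
    key_points = torch.tensor([[[[0.2, 0.3, 2.0]]]])  # (bs, anchor, pts, 3)
    projection_mat = torch.eye(4)[None, None]  # (bs, cam, 4, 4)
    image_wh = torch.tensor([[[1.0, 1.0]]])  # (bs, cam, 2)
    pts_2d, depth, mask = DeformableFeatureAggregation.project_points(
        key_points, projection_mat, image_wh
    )
    print(pts_2d.flatten(), depth.item(), mask.item())
    # -> tensor([0.1000, 0.1500]) 2.0 True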
================================================
FILE: bip3d/models/feature_enhancer.py
================================================
import torch
from torch import nn
from mmengine.model import BaseModel
from mmdet.models.layers.transformer.utils import get_text_sine_pos_embed
from mmdet.models.layers import SinePositionalEncoding
from mmdet.models.layers.transformer.deformable_detr_layers import (
DeformableDetrTransformerEncoder as DDTE,
)
from mmdet.models.layers.transformer.deformable_detr_layers import (
DeformableDetrTransformerEncoderLayer,
)
from mmdet.models.layers.transformer.detr_layers import (
DetrTransformerEncoderLayer,
)
from mmdet.models.utils.vlfuse_helper import SingleScaleBiAttentionBlock
from bip3d.registry import MODELS
from .utils import deformable_format
@MODELS.register_module()
class TextImageDeformable2DEnhancer(BaseModel):
def __init__(
self,
num_layers,
text_img_attn_block,
img_attn_block,
text_attn_block=None,
embed_dims=256,
num_feature_levels=4,
positional_encoding=None,
**kwargs,
):
super().__init__(**kwargs)
self.num_layers = num_layers
self.num_feature_levels = num_feature_levels
self.embed_dims = embed_dims
self.positional_encoding = positional_encoding
self.text_img_attn_blocks = nn.ModuleList()
self.img_attn_blocks = nn.ModuleList()
self.text_attn_blocks = nn.ModuleList()
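        # NOTE: despite its ``None`` default, ``text_attn_block`` must be
        # provided; it is unpacked unconditionally in the loop below.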
for _ in range(self.num_layers):
self.text_img_attn_blocks.append(
SingleScaleBiAttentionBlock(**text_img_attn_block)
)
self.img_attn_blocks.append(
DeformableDetrTransformerEncoderLayer(**img_attn_block)
)
self.text_attn_blocks.append(
DetrTransformerEncoderLayer(**text_attn_block)
)
self.positional_encoding = SinePositionalEncoding(
**self.positional_encoding
)
self.level_embed = nn.Parameter(
torch.Tensor(self.num_feature_levels, self.embed_dims)
)
def forward(
self,
feature_maps,
text_dict=None,
**kwargs,
):
with_cams = feature_maps[0].dim() == 5
if with_cams:
bs, num_cams = feature_maps[0].shape[:2]
feature_maps = [x.flatten(0, 1) for x in feature_maps]
else:
bs = feature_maps[0].shape[0]
num_cams = 1
pos_2d = self.get_2d_position_embed(feature_maps)
feature_2d, spatial_shapes, level_start_index = deformable_format(
feature_maps
)
reference_points = DDTE.get_encoder_reference_points(
spatial_shapes,
valid_ratios=feature_2d.new_ones(
[bs * num_cams, self.num_feature_levels, 2]
),
device=feature_2d.device,
)
text_feature = text_dict["embedded"]
pos_text = get_text_sine_pos_embed(
text_dict["position_ids"][..., None],
num_pos_feats=self.embed_dims,
exchange_xy=False,
)
for layer_id in range(self.num_layers):
feature_2d_fused = feature_2d[:, level_start_index[-1] :]
if with_cams:
feature_2d_fused = feature_2d_fused.unflatten(
0, (bs, num_cams)
)
feature_2d_fused = feature_2d_fused.flatten(1, 2)
feature_2d_fused, text_feature = self.text_img_attn_blocks[
layer_id
](
feature_2d_fused,
text_feature,
attention_mask_l=text_dict["text_token_mask"],
)
if with_cams:
feature_2d_fused = feature_2d_fused.unflatten(
1, (num_cams, -1)
)
feature_2d_fused = feature_2d_fused.flatten(0, 1)
feature_2d = torch.cat(
[feature_2d[:, : level_start_index[-1]], feature_2d_fused],
dim=1,
)
feature_2d = self.img_attn_blocks[layer_id](
query=feature_2d,
query_pos=pos_2d,
reference_points=reference_points,
spatial_shapes=spatial_shapes,
level_start_index=level_start_index,
key_padding_mask=None,
)
text_attn_mask = text_dict.get("masks")
if text_attn_mask is not None:
text_num_heads = self.text_attn_blocks[
layer_id
].self_attn_cfg.num_heads
text_attn_mask = ~text_attn_mask.repeat(text_num_heads, 1, 1)
text_feature = self.text_attn_blocks[layer_id](
query=text_feature,
query_pos=pos_text,
attn_mask=text_attn_mask,
key_padding_mask=None,
)
feature_2d = deformable_format(
feature_2d, spatial_shapes, batch_size=bs if with_cams else None
)
return feature_2d, text_feature
def get_2d_position_embed(self, feature_maps):
pos_2d = []
for lvl, feat in enumerate(feature_maps):
batch_size, c, h, w = feat.shape
pos = self.positional_encoding(None, feat)
pos = pos.view(batch_size, c, h * w).permute(0, 2, 1)
pos = pos + self.level_embed[lvl]
pos_2d.append(pos)
pos_2d = torch.cat(pos_2d, 1)
return pos_2d
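if __name__ == "__main__":
    # Config sketch (illustrative; the values here are assumptions and the
    # real settings live under configs/): each of the `num_layers` rounds
    # runs text-image bi-attention, deformable image self-attention, and
    # text self-attention, matching the loop in `forward` above.
    feature_enhancer = dict(
        type="TextImageDeformable2DEnhancer",
        embed_dims=256,
        num_layers=4,
        num_feature_levels=4,
        positional_encoding=dict(num_feats=128, normalize=True),
        # text_img_attn_block / img_attn_block / text_attn_block configs
        # are omitted here for brevity.
    )
    print(feature_enhancer)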
================================================
FILE: bip3d/models/instance_bank.py
================================================
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np
from bip3d.registry import MODELS
__all__ = ["InstanceBank"]
def topk(confidence, k, *inputs):
bs, N = confidence.shape[:2]
confidence, indices = torch.topk(confidence, k, dim=1)
indices = (
indices + torch.arange(bs, device=indices.device)[:, None] * N
).reshape(-1)
outputs = []
for input in inputs:
outputs.append(input.flatten(end_dim=1)[indices].reshape(bs, k, -1))
return confidence, outputs
@MODELS.register_module()
class InstanceBank(nn.Module):
def __init__(
self,
num_anchor,
embed_dims,
anchor,
anchor_handler=None,
num_current_instance=None,
num_temp_instances=0,
default_time_interval=0.5,
confidence_decay=0.6,
anchor_grad=True,
feat_grad=True,
max_time_interval=2,
anchor_in_camera=True,
):
super(InstanceBank, self).__init__()
self.embed_dims = embed_dims
self.num_current_instance = num_current_instance
self.num_temp_instances = num_temp_instances
self.default_time_interval = default_time_interval
self.confidence_decay = confidence_decay
self.max_time_interval = max_time_interval
if anchor_handler is not None:
anchor_handler = MODELS.build(anchor_handler)
assert hasattr(anchor_handler, "anchor_projection")
self.anchor_handler = anchor_handler
if isinstance(anchor, str):
anchor = np.load(anchor)
elif isinstance(anchor, (list, tuple)):
anchor = np.array(anchor)
if len(anchor.shape) == 3: # for map
anchor = anchor.reshape(anchor.shape[0], -1)
self.num_anchor = min(len(anchor), num_anchor)
self.anchor = anchor[:num_anchor]
# self.anchor = nn.Parameter(
# torch.tensor(anchor, dtype=torch.float32),
# requires_grad=anchor_grad,
# )
self.anchor_init = anchor
self.instance_feature = nn.Parameter(
torch.zeros([1, self.embed_dims]),
# torch.zeros([self.anchor.shape[0], self.embed_dims]),
requires_grad=feat_grad,
)
self.anchor_in_camera = anchor_in_camera
self.reset()
def init_weight(self):
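        # NOTE: this assumes `self.anchor` is an nn.Parameter (see the
        # commented-out definition in __init__); with the current numpy
        # array, `self.anchor.data.new_tensor` would fail if called.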
self.anchor.data = self.anchor.data.new_tensor(self.anchor_init)
if self.instance_feature.requires_grad:
torch.nn.init.xavier_uniform_(self.instance_feature.data, gain=1)
def reset(self):
self.cached_feature = None
self.cached_anchor = None
self.metas = None
self.mask = None
self.confidence = None
self.temp_confidence = None
self.instance_id = None
self.prev_id = 0
def bbox_transform(self, bbox, matrix):
# bbox: bs, n, 9
# matrix: bs, cam, 4, 4
# output: bs, n*cam, 9
bbox = bbox.unsqueeze(dim=2)
matrix = matrix.unsqueeze(dim=1)
points = bbox[..., :3]
points_extend = torch.concat(
[points, torch.ones_like(points[..., :1])], dim=-1
)
points_trans = torch.matmul(matrix, points_extend[..., None])[
..., :3, 0
]
size = bbox[..., 3:6].tile(1, 1, points_trans.shape[2], 1)
angle = bbox[..., 6:].tile(1, 1, points_trans.shape[2], 1)
bbox = torch.cat([points_trans, size, angle], dim=-1)
bbox = bbox.flatten(1, 2)
return bbox
def get(self, batch_size, metas=None, dn_metas=None):
instance_feature = torch.tile(
self.instance_feature[None], (batch_size, self.anchor.shape[0], 1)
)
anchor = torch.tile(
instance_feature.new_tensor(self.anchor)[None], (batch_size, 1, 1)
)
if self.anchor_in_camera:
cam2global = np.linalg.inv(metas["extrinsic"].cpu().numpy())
cam2global = torch.from_numpy(cam2global).to(anchor)
anchor = self.bbox_transform(anchor, cam2global)
instance_feature = instance_feature.tile(1, cam2global.shape[1], 1)
if (
self.cached_anchor is not None
and batch_size == self.cached_anchor.shape[0]
):
# assert False, "TODO: linxuewu"
# history_time = self.metas["timestamp"]
# time_interval = metas["timestamp"] - history_time
# time_interval = time_interval.to(dtype=instance_feature.dtype)
# self.mask = torch.abs(time_interval) <= self.max_time_interval
last_scan_id = self.metas["scan_id"]
current_scan_id = metas["scan_id"]
            self.mask = torch.tensor(
                [x == y for x, y in zip(last_scan_id, current_scan_id)],
                device=anchor.device,
            )
assert self.mask.shape[0] == 1
if not self.mask:
self.reset()
time_interval = instance_feature.new_tensor(
[self.default_time_interval] * batch_size
)
else:
self.reset()
time_interval = instance_feature.new_tensor(
[self.default_time_interval] * batch_size
)
return (
instance_feature,
anchor,
self.cached_feature,
self.cached_anchor,
time_interval,
)
def update(self, instance_feature, anchor, confidence, num_dn=None):
if self.cached_feature is None:
return instance_feature, anchor
if num_dn is not None and num_dn > 0:
dn_instance_feature = instance_feature[:, -num_dn:]
dn_anchor = anchor[:, -num_dn:]
instance_feature = instance_feature[:, : -num_dn]
anchor = anchor[:, : -num_dn]
confidence = confidence[:, : -num_dn]
N = self.num_current_instance
if N is not None and N < confidence.shape[1]:
confidence = confidence.max(dim=-1).values
_, (selected_feature, selected_anchor) = topk(
confidence, N, instance_feature, anchor
)
else:
selected_feature, selected_anchor = instance_feature, anchor
instance_feature = torch.cat(
[self.cached_feature, selected_feature], dim=1
)
anchor = torch.cat(
[self.cached_anchor, selected_anchor], dim=1
)
if num_dn is not None and num_dn > 0:
instance_feature = torch.cat(
[instance_feature, dn_instance_feature], dim=1
)
anchor = torch.cat([anchor, dn_anchor], dim=1)
return instance_feature, anchor
def cache(
self,
instance_feature,
anchor,
confidence,
metas=None,
feature_maps=None,
):
if self.num_temp_instances <= 0:
return
instance_feature = instance_feature.detach()
anchor = anchor.detach()
confidence = confidence.detach()
self.metas = metas
confidence = confidence.max(dim=-1).values.sigmoid()
if self.confidence is not None:
N = self.confidence.shape[1]
confidence[:, : N] = torch.maximum(
self.confidence * self.confidence_decay,
confidence[:, : N],
)
self.temp_confidence = confidence
if self.num_temp_instances < confidence.shape[1]:
(
self.confidence,
(self.cached_feature, self.cached_anchor),
) = topk(
confidence, self.num_temp_instances, instance_feature, anchor
)
else:
self.confidence, self.cached_feature, self.cached_anchor = (
confidence, instance_feature, anchor
)
def get_instance_id(self, confidence, anchor=None, threshold=None):
confidence = confidence.max(dim=-1).values.sigmoid()
instance_id = confidence.new_full(confidence.shape, -1).long()
if (
self.instance_id is not None
and self.instance_id.shape[0] == instance_id.shape[0]
):
instance_id[:, : self.instance_id.shape[1]] = self.instance_id
mask = instance_id < 0
if threshold is not None:
mask = mask & (confidence >= threshold)
num_new_instance = mask.sum()
new_ids = torch.arange(num_new_instance).to(instance_id) + self.prev_id
instance_id[torch.where(mask)] = new_ids
self.prev_id += num_new_instance
self.update_instance_id(instance_id, confidence)
return instance_id
def update_instance_id(self, instance_id=None, confidence=None):
if self.temp_confidence is None:
if confidence.dim() == 3: # bs, num_anchor, num_cls
temp_conf = confidence.max(dim=-1).values
else: # bs, num_anchor
temp_conf = confidence
else:
temp_conf = self.temp_confidence
instance_id = topk(temp_conf, self.num_temp_instances, instance_id)[1][
0
]
instance_id = instance_id.squeeze(dim=-1)
self.instance_id = F.pad(
instance_id,
(0, self.num_anchor - self.num_temp_instances),
value=-1,
)
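if __name__ == "__main__":
    # Usage sketch for the `topk` helper: keep the 2 highest-confidence
    # anchors per sample and gather the matching features.
    conf = torch.tensor([[0.1, 0.9, 0.5]])      # (bs, N)
    feats = torch.arange(6.0).reshape(1, 3, 2)  # (bs, N, C)
    top_conf, (top_feats,) = topk(conf, 2, feats)
    print(top_conf)   # tensor([[0.9000, 0.5000]])
    print(top_feats)  # features of anchors 1 and 2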
================================================
FILE: bip3d/models/spatial_enhancer.py
================================================
import torch
from torch import nn
from mmengine.model import BaseModel
from mmdet.models.layers.transformer.utils import MLP
from mmcv.cnn.bricks.transformer import FFN
from .utils import deformable_format
from bip3d.registry import MODELS
@MODELS.register_module()
class DepthFusionSpatialEnhancer(BaseModel):
def __init__(
self,
embed_dims=256,
feature_3d_dim=32,
num_depth_layers=2,
min_depth=0.25,
max_depth=10,
num_depth=64,
with_feature_3d=True,
loss_depth_weight=-1,
**kwargs,
):
super().__init__(**kwargs)
self.embed_dims = embed_dims
self.feature_3d_dim = feature_3d_dim
self.num_depth_layers = num_depth_layers
self.min_depth = min_depth
self.max_depth = max_depth
self.num_depth = num_depth
self.with_feature_3d = with_feature_3d
self.loss_depth_weight = loss_depth_weight
fusion_dim = self.embed_dims + self.feature_3d_dim
if self.with_feature_3d:
self.pts_prob_pre_fc = nn.Linear(
self.embed_dims, self.feature_3d_dim
)
dim = self.feature_3d_dim * 2
fusion_dim += self.feature_3d_dim
else:
dim = self.embed_dims
self.pts_prob_fc = MLP(
dim,
dim,
self.num_depth,
self.num_depth_layers,
)
self.pts_fc = nn.Linear(3, self.feature_3d_dim)
self.fusion_fc = nn.Sequential(
FFN(embed_dims=fusion_dim, feedforward_channels=1024),
nn.Linear(fusion_dim, self.embed_dims),
)
self.fusion_norm = nn.LayerNorm(self.embed_dims)
def forward(
self,
feature_maps,
feature_3d=None,
batch_inputs=None,
**kwargs,
):
with_cams = feature_maps[0].dim() == 5
if with_cams:
bs, num_cams = feature_maps[0].shape[:2]
else:
bs = feature_maps[0].shape[0]
num_cams = 1
feature_2d, spatial_shapes, _ = deformable_format(feature_maps)
pts = self.get_pts(
spatial_shapes,
batch_inputs["image_wh"][0, 0],
batch_inputs["projection_mat"],
feature_2d.device,
feature_2d.dtype,
)
if self.with_feature_3d:
feature_3d = deformable_format(feature_3d)[0]
depth_prob_feat = self.pts_prob_pre_fc(feature_2d)
depth_prob_feat = torch.cat([depth_prob_feat, feature_3d], dim=-1)
depth_prob = self.pts_prob_fc(depth_prob_feat).softmax(dim=-1)
feature_fused = [feature_2d, feature_3d]
else:
depth_prob = self.pts_prob_fc(feature_2d).softmax(dim=-1)
feature_fused = [feature_2d]
pts_feature = self.pts_fc(pts)
pts_feature = (depth_prob.unsqueeze(dim=-1) * pts_feature).sum(dim=-2)
feature_fused.append(pts_feature)
feature_fused = torch.cat(feature_fused, dim=-1)
feature_fused = self.fusion_fc(feature_fused) + feature_2d
feature_fused = self.fusion_norm(feature_fused)
feature_fused = deformable_format(feature_fused, spatial_shapes)
if self.loss_depth_weight > 0 and self.training:
loss_depth = self.depth_prob_loss(depth_prob, batch_inputs)
else:
loss_depth = None
return feature_fused, depth_prob, loss_depth
def get_pts(self, spatial_shapes, image_wh, projection_mat, device, dtype):
pixels = []
for i, shape in enumerate(spatial_shapes):
stride = image_wh[0] / shape[1]
u = torch.linspace(
0, image_wh[0] - stride, shape[1], device=device, dtype=dtype
)
v = torch.linspace(
0, image_wh[1] - stride, shape[0], device=device, dtype=dtype
)
u = u[None].tile(shape[0], 1)
v = v[:, None].tile(1, shape[1])
uv = torch.stack([u, v], dim=-1).flatten(0, 1)
pixels.append(uv)
pixels = torch.cat(pixels, dim=0)[:, None]
depths = torch.linspace(
self.min_depth,
self.max_depth,
self.num_depth,
device=device,
dtype=dtype,
)
depths = depths[None, :, None]
pts = pixels * depths
depths = depths.tile(pixels.shape[0], 1, 1)
pts = torch.cat([pts, depths, torch.ones_like(depths)], dim=-1)
        # Solve X @ P^T = pts for X, i.e. back-project each pixel/depth
        # candidate to a 3D point; keep only xyz.
        pts = torch.linalg.solve(
            projection_mat.mT.unsqueeze(dim=2), pts, left=False
        )[..., :3]  # (b, cam, N, 3)
return pts
def depth_prob_loss(self, depth_prob, batch_inputs):
mask = batch_inputs["depth_prob_gt"][..., 0] != 1
loss_depth = (
torch.nn.functional.binary_cross_entropy(
depth_prob[mask], batch_inputs["depth_prob_gt"][mask]
)
* self.loss_depth_weight
)
return loss_depth
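if __name__ == "__main__":
    # Toy illustration of the lifting step in `forward`: a per-token depth
    # distribution weights the candidate 3D point features, and their
    # expectation is what gets fused back into the 2D feature.
    num_tokens, num_depth, dim = 5, 64, 32
    depth_prob = torch.rand(num_tokens, num_depth).softmax(dim=-1)
    pts_feature = torch.rand(num_tokens, num_depth, dim)
    fused = (depth_prob.unsqueeze(dim=-1) * pts_feature).sum(dim=-2)
    print(fused.shape)  # torch.Size([5, 32])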
================================================
FILE: bip3d/models/structure.py
================================================
from torch import nn
from mmdet.models.detectors import BaseDetector
from mmdet.models.detectors.deformable_detr import (
MultiScaleDeformableAttention,
)
from bip3d.registry import MODELS
@MODELS.register_module()
class BIP3D(BaseDetector):
def __init__(
self,
backbone,
decoder,
neck=None,
text_encoder=None,
feature_enhancer=None,
spatial_enhancer=None,
data_preprocessor=None,
backbone_3d=None,
neck_3d=None,
init_cfg=None,
input_2d="imgs",
input_3d="depth_img",
embed_dims=256,
use_img_grid_mask=False,
use_depth_grid_mask=False,
):
super().__init__(data_preprocessor, init_cfg)
def build(cfg):
if cfg is None:
return None
return MODELS.build(cfg)
self.backbone = build(backbone)
self.decoder = build(decoder)
self.neck = build(neck)
self.text_encoder = build(text_encoder)
self.feature_enhancer = build(feature_enhancer)
self.spatial_enhancer = build(spatial_enhancer)
self.backbone_3d = build(backbone_3d)
self.neck_3d = build(neck_3d)
self.input_2d = input_2d
self.input_3d = input_3d
self.embed_dims = embed_dims
if text_encoder is not None:
self.text_feat_map = nn.Linear(
self.text_encoder.language_backbone.body.language_dim,
self.embed_dims,
bias=True,
)
self.use_img_grid_mask = use_img_grid_mask
self.use_depth_grid_mask = use_depth_grid_mask
if use_depth_grid_mask or use_img_grid_mask:
from ..grid_mask import GridMask
self.grid_mask = GridMask(
True, True, rotate=1, offset=False, ratio=0.5, mode=1, prob=0.7
)
def init_weights(self):
"""Initialize weights for Transformer and other components."""
for p in self.feature_enhancer.parameters():
if p.dim() > 1:
nn.init.xavier_uniform_(p)
self.decoder.init_weights()
for m in self.modules():
if isinstance(m, MultiScaleDeformableAttention):
m.init_weights()
nn.init.normal_(self.feature_enhancer.level_embed)
nn.init.constant_(self.text_feat_map.bias.data, 0)
nn.init.xavier_uniform_(self.text_feat_map.weight.data)
def extract_feat(self, batch_inputs_dict, batch_data_samples):
imgs = batch_inputs_dict.get(self.input_2d)
if imgs.dim() == 5:
bs, num_cams = imgs.shape[:2]
imgs = imgs.flatten(end_dim=1)
else:
bs = imgs.shape[0]
num_cams = 1
        if self.use_img_grid_mask and self.training:
            # Fix: the local variable is `imgs`, not `img`.
            imgs = self.grid_mask(
                imgs,
                offset=-self.data_preprocessor.mean
                / self.data_preprocessor.std,
            )
feature_maps = self.backbone(imgs)
if self.neck is not None:
feature_maps = self.neck(feature_maps)
feature_maps = [x.unflatten(0, (bs, num_cams)) for x in feature_maps]
input_3d = batch_inputs_dict.get(self.input_3d)
if self.backbone_3d is not None and input_3d is not None:
if self.input_3d == "depth_img" and input_3d.dim() == 5:
assert input_3d.shape[1] == num_cams
input_3d = input_3d.flatten(end_dim=1)
if self.use_depth_grid_mask and self.training:
input_3d = self.grid_mask(input_3d)
feature_3d = self.backbone_3d(input_3d)
if self.neck_3d is not None:
feature_3d = self.neck_3d(feature_3d)
feature_3d = [x.unflatten(0, (bs, num_cams)) for x in feature_3d]
else:
feature_3d = None
return feature_maps, feature_3d
def extract_text_feature(self, batch_inputs_dict):
if self.text_encoder is not None:
text_dict = self.text_encoder(batch_inputs_dict["text"])
text_dict["embedded"] = self.text_feat_map(text_dict["embedded"])
else:
text_dict = None
return text_dict
def loss(self, batch_inputs, batch_data_samples):
model_outs, text_dict, loss_depth = self._forward(
batch_inputs, batch_data_samples
)
loss = self.decoder.loss(model_outs, batch_inputs, text_dict=text_dict)
if loss_depth is not None:
loss["loss_depth"] = loss_depth
return loss
def predict(self, batch_inputs, batch_data_samples):
model_outs, text_dict = self._forward(batch_inputs, batch_data_samples)
results = self.decoder.post_process(
model_outs, text_dict, batch_inputs, batch_data_samples
)
return results
def _forward(self, batch_inputs, batch_data_samples):
feature_maps, feature_3d = self.extract_feat(
batch_inputs, batch_data_samples
)
text_dict = self.extract_text_feature(batch_inputs)
if self.feature_enhancer is not None:
feature_maps, text_feature = self.feature_enhancer(
feature_maps=feature_maps,
feature_3d=feature_3d,
text_dict=text_dict,
batch_inputs=batch_inputs,
batch_data_samples=batch_data_samples,
)
text_dict["embedded"] = text_feature
if self.spatial_enhancer is not None:
feature_maps, depth_prob, loss_depth = self.spatial_enhancer(
feature_maps=feature_maps,
feature_3d=feature_3d,
text_dict=text_dict,
batch_inputs=batch_inputs,
batch_data_samples=batch_data_samples,
)
else:
loss_depth = depth_prob = None
model_outs = self.decoder(
feature_maps=feature_maps,
feature_3d=feature_3d,
text_dict=text_dict,
batch_inputs=batch_inputs,
batch_data_samples=batch_data_samples,
depth_prob=depth_prob,
)
if self.training:
return model_outs, text_dict, loss_depth
return model_outs, text_dict
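if __name__ == "__main__":
    # Documentation sketch: the order in which `_forward` wires the
    # sub-modules (printed only; building BIP3D requires full configs).
    pipeline = [
        "backbone (+ neck)       -> 2D image feature maps",
        "backbone_3d (+ neck_3d) -> depth feature maps",
        "text_encoder            -> text features",
        "feature_enhancer        -> text-image fused features",
        "spatial_enhancer        -> depth-aware features + depth_prob",
        "decoder                 -> 3D box predictions",
    ]
    print("\n".join(pipeline))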
================================================
FILE: bip3d/models/target.py
================================================
import torch
from torch import nn
import numpy as np
import torch.nn.functional as F
from scipy.optimize import linear_sum_assignment
from .base_target import BaseTargetWithDenoising
from .utils import (
wasserstein_distance,
permutation_corner_distance,
center_distance,
get_positive_map,
)
from bip3d.registry import TASK_UTILS
__all__ = ["Grounding3DTarget"]
X, Y, Z, W, L, H, ALPHA, BETA, GAMMA = range(9)
@TASK_UTILS.register_module()
class Grounding3DTarget(BaseTargetWithDenoising):
def __init__(
self,
cls_weight=1.0,
alpha=0.25,
gamma=2,
eps=1e-12,
box_weight=1.0,
cls_wise_reg_weights=None,
num_dn=0,
dn_noise_scale=0.5,
add_neg_dn=True,
num_temp_dn_groups=0,
with_dn_query=False,
num_classes=None,
embed_dims=256,
label_noise_scale=0.5,
cost_weight_wd=1.0,
cost_weight_pcd=0.0,
cost_weight_cd=0.8,
use_ignore_mask=False,
):
super(Grounding3DTarget, self).__init__(num_dn, num_temp_dn_groups)
self.cls_weight = cls_weight
self.box_weight = box_weight
self.alpha = alpha
self.gamma = gamma
self.eps = eps
self.cls_wise_reg_weights = cls_wise_reg_weights
self.dn_noise_scale = dn_noise_scale
self.add_neg_dn = add_neg_dn
self.with_dn_query = with_dn_query
self.cost_weight_wd = cost_weight_wd
self.cost_weight_pcd = cost_weight_pcd
self.cost_weight_cd = cost_weight_cd
self.use_ignore_mask = use_ignore_mask
if self.with_dn_query:
self.num_classes = num_classes
self.embed_dims = embed_dims
self.label_noise_scale = label_noise_scale
self.label_embedding = nn.Embedding(self.num_classes, self.embed_dims)
def encode_reg_target(self, box_target, device=None):
outputs = []
for box in box_target:
if not isinstance(box, torch.Tensor):
box = torch.cat(
[box.gravity_center, box.tensor[..., 3:]], dim=-1
)
output = torch.cat(
[box[..., [X, Y, Z]], box[..., [W, L, H]], box[..., ALPHA:]],
dim=-1,
)
if device is not None:
output = output.to(device=device)
outputs.append(output)
return outputs
@torch.no_grad()
def sample(
self,
cls_pred,
box_pred,
char_positives,
box_target,
text_dict,
ignore_mask=None,
):
bs, num_pred, num_cls = cls_pred.shape
token_positive_maps = get_positive_map(char_positives, text_dict)
token_positive_maps = [
x.to(cls_pred).bool().float() for x in token_positive_maps
]
cls_cost = self._cls_cost(cls_pred, token_positive_maps, text_dict["text_token_mask"])
box_target = self.encode_reg_target(box_target, box_pred.device)
box_cost = self._box_cost(box_pred, box_target)
indices = []
for i in range(bs):
if cls_cost[i] is not None and box_cost[i] is not None:
cost = (cls_cost[i] + box_cost[i]).detach().cpu().numpy()
cost = np.where(np.isneginf(cost) | np.isnan(cost), 1e8, cost)
assign = linear_sum_assignment(cost)
indices.append(
[cls_pred.new_tensor(x, dtype=torch.int64) for x in assign]
)
else:
indices.append([None, None])
output_cls_target = torch.zeros_like(cls_pred)
output_box_target = torch.zeros_like(box_pred)
output_reg_weights = torch.ones_like(box_pred)
if self.use_ignore_mask:
output_ignore_mask = torch.zeros_like(cls_pred[..., 0], dtype=torch.bool)
ignore_mask = [output_ignore_mask.new_tensor(x) for x in ignore_mask]
else:
output_ignore_mask = None
for i, (pred_idx, target_idx) in enumerate(indices):
if len(box_target[i]) == 0:
continue
output_cls_target[i, pred_idx] = token_positive_maps[i][target_idx]
output_box_target[i, pred_idx] = box_target[i][target_idx]
if self.use_ignore_mask:
output_ignore_mask[i, pred_idx] = ignore_mask[i][target_idx]
self.indices = indices
return output_cls_target, output_box_target, output_reg_weights, output_ignore_mask
def _cls_cost(self, cls_pred, token_positive_maps, text_token_mask):
bs = cls_pred.shape[0]
cls_pred = cls_pred.sigmoid()
cost = []
for i in range(bs):
if len(token_positive_maps[i]) > 0:
pred = cls_pred[i][:, text_token_mask[i]]
neg_cost = (
-(1 - pred + self.eps).log()
* (1 - self.alpha)
* pred.pow(self.gamma)
)
pos_cost = (
-(pred + self.eps).log()
* self.alpha
* (1 - pred).pow(self.gamma)
)
gt = token_positive_maps[i][:, text_token_mask[i]]
cls_cost = torch.einsum(
"nc,mc->nm", pos_cost, gt
) + torch.einsum("nc,mc->nm", neg_cost, (1 - gt))
cost.append(cls_cost)
else:
cost.append(None)
return cost
def _box_cost(self, box_pred, box_target):
bs = box_pred.shape[0]
cost = []
for i in range(bs):
if len(box_target[i]) > 0:
pred = box_pred[i].unsqueeze(dim=-2)
gt = box_target[i].unsqueeze(dim=-3)
_cost = 0
if self.cost_weight_wd > 0:
_cost += self.cost_weight_wd * wasserstein_distance(
pred, gt
)
if self.cost_weight_pcd > 0:
_cost += (
self.cost_weight_pcd
* permutation_corner_distance(pred, gt)
)
if self.cost_weight_cd > 0:
_cost += self.cost_weight_cd * center_distance(pred, gt)
_cost *= self.box_weight
cost.append(_cost)
else:
cost.append(None)
return cost
def get_dn_anchors(self, char_positives, box_target, text_dict, label=None, ignore_mask=None):
if self.num_dn <= 0:
return None
if self.num_temp_dn_groups <= 0:
gt_instance_id = None
char_positives = [x[: self.num_dn] for x in char_positives]
box_target = [x[: self.num_dn] for x in box_target]
max_dn_gt = max([len(x) for x in char_positives] + [1])
token_positive_maps = get_positive_map(char_positives, text_dict)
token_positive_maps = torch.stack(
[
F.pad(x, (0, 0, 0, max_dn_gt - x.shape[0]), value=-1)
for x in token_positive_maps
]
)
box_target = self.encode_reg_target(box_target, box_target[0].device)
box_target = torch.stack(
[F.pad(x, (0, 0, 0, max_dn_gt - x.shape[0])) for x in box_target]
)
token_positive_maps = token_positive_maps.to(box_target)
box_target = torch.where(
(token_positive_maps == -1).all(dim=-1, keepdim=True),
box_target.new_tensor(0),
box_target,
)
bs, num_gt, state_dims = box_target.shape
num_dn_groups = self.num_dn // max(num_gt, 1)
if num_dn_groups > 1:
token_positive_maps = token_positive_maps.tile(num_dn_groups, 1, 1)
box_target = box_target.tile(num_dn_groups, 1, 1)
noise = torch.rand_like(box_target) * 2 - 1
noise *= box_target.new_tensor(self.dn_noise_scale)
noise[..., :3] *= box_target[..., 3:6]
noise[..., 3:6] *= box_target[..., 3:6]
dn_anchor = box_target + noise
if self.add_neg_dn:
noise_neg = torch.rand_like(box_target) + 1
flag = torch.where(
torch.rand_like(box_target) > 0.5,
noise_neg.new_tensor(1),
noise_neg.new_tensor(-1),
)
noise_neg *= flag
noise_neg *= box_target.new_tensor(self.dn_noise_scale)
noise_neg[..., :3] *= box_target[..., 3:6]
noise_neg[..., 3:6] *= box_target[..., 3:6]
dn_anchor = torch.cat([dn_anchor, box_target + noise_neg], dim=1)
num_gt *= 2
box_cost = self._box_cost(dn_anchor, box_target)
dn_box_target = torch.zeros_like(dn_anchor)
        # -3 marks dn anchors that end up unmatched (checked against -3
        # below when building `valid_mask`).
        dn_token_positive_maps = -torch.ones_like(token_positive_maps) * 3
if self.add_neg_dn:
dn_token_positive_maps = torch.cat(
[dn_token_positive_maps, dn_token_positive_maps], dim=1
)
for i in range(dn_anchor.shape[0]):
if box_cost[i] is None:
continue
cost = box_cost[i].cpu().numpy()
anchor_idx, gt_idx = linear_sum_assignment(cost)
anchor_idx = dn_anchor.new_tensor(anchor_idx, dtype=torch.int64)
gt_idx = dn_anchor.new_tensor(gt_idx, dtype=torch.int64)
dn_box_target[i, anchor_idx] = box_target[i, gt_idx]
dn_token_positive_maps[i, anchor_idx] = token_positive_maps[
i, gt_idx
]
dn_anchor = (
dn_anchor.reshape(num_dn_groups, bs, num_gt, state_dims)
.permute(1, 0, 2, 3)
.flatten(1, 2)
)
dn_box_target = (
dn_box_target.reshape(num_dn_groups, bs, num_gt, state_dims)
.permute(1, 0, 2, 3)
.flatten(1, 2)
)
text_length = dn_token_positive_maps.shape[-1]
dn_token_positive_maps = (
dn_token_positive_maps.reshape(
num_dn_groups, bs, num_gt, text_length
)
.permute(1, 0, 2, 3)
.flatten(1, 2)
)
valid_mask = (dn_token_positive_maps >= 0).all(dim=-1)
if self.add_neg_dn:
token_positive_maps = (
torch.cat([token_positive_maps, token_positive_maps], dim=1)
.reshape(num_dn_groups, bs, num_gt, text_length)
.permute(1, 0, 2, 3)
.flatten(1, 2)
)
valid_mask = torch.logical_or(
valid_mask,
(
(token_positive_maps >= 0).all(dim=-1)
& (dn_token_positive_maps == -3).all(dim=-1)
),
        )  # valid_mask marks items that do not come from padding.
attn_mask = dn_box_target.new_ones(
num_gt * num_dn_groups, num_gt * num_dn_groups
)
dn_token_positive_maps = torch.clamp(dn_token_positive_maps, min=0)
for i in range(num_dn_groups):
start = num_gt * i
end = start + num_gt
attn_mask[start:end, start:end] = 0
attn_mask = attn_mask == 1
dn_anchor[..., 3:6] = torch.clamp(dn_anchor[..., 3:6], min=0.01).log()
if label is not None and self.with_dn_query:
label = torch.stack(
[F.pad(x, (0, max_dn_gt - x.shape[0]), value=-1) for x in label]
)
label = label.tile(num_dn_groups, 1)
if self.add_neg_dn:
label = torch.cat([label, label], dim=1)
label = label.reshape(num_dn_groups, bs, num_gt).permute(1, 0, 2).flatten(1, 2)
label_noise_mask = torch.logical_or(
torch.rand_like(label.float()) < self.label_noise_scale * 0.5,
label == -1
)
label = torch.where(
label_noise_mask,
torch.randint_like(label, 0, self.num_classes),
label,
)
dn_query = self.label_embedding(label)
else:
dn_query = None
return (
dn_anchor,
dn_box_target,
dn_token_positive_maps,
attn_mask,
valid_mask,
dn_query,
)
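if __name__ == "__main__":
    # Usage sketch: `encode_reg_target` passes 9-DoF box tensors
    # (x, y, z, w, l, h, alpha, beta, gamma) through unchanged and converts
    # box structures to tensors when needed.
    target = Grounding3DTarget()
    boxes = [torch.rand(2, 9)]
    print(target.encode_reg_target(boxes)[0].shape)  # torch.Size([2, 9])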
================================================
FILE: bip3d/models/utils.py
================================================
import torch
from torch import nn
from pytorch3d.transforms import euler_angles_to_matrix
def deformable_format(
feature_maps,
spatial_shapes=None,
level_start_index=None,
flat_batch=False,
batch_size=None,
):
if spatial_shapes is None:
if flat_batch and feature_maps[0].dim() > 4:
feature_maps = [x.flatten(end_dim=-4) for x in feature_maps]
feat_flatten = []
spatial_shapes = []
for lvl, feat in enumerate(feature_maps):
spatial_shape = torch._shape_as_tensor(feat)[-2:].to(feat.device)
feat = feat.flatten(start_dim=-2).transpose(-1, -2)
feat_flatten.append(feat)
spatial_shapes.append(spatial_shape)
# (bs, num_feat_points, dim)
feat_flatten = torch.cat(feat_flatten, -2)
spatial_shapes = torch.cat(spatial_shapes).view(-1, 2)
level_start_index = torch.cat(
(
spatial_shapes.new_zeros((1,)), # (num_level)
spatial_shapes.prod(1).cumsum(0)[:-1],
)
)
return feat_flatten, spatial_shapes, level_start_index
else:
split_size = (spatial_shapes[:, 0] * spatial_shapes[:, 1]).tolist()
feature_maps = feature_maps.transpose(-1, -2)
feature_maps = list(torch.split(feature_maps, split_size, dim=-1))
for i, feat in enumerate(feature_maps):
feature_maps[i] = feature_maps[i].unflatten(
-1, (spatial_shapes[i, 0], spatial_shapes[i, 1])
)
if batch_size is not None:
if isinstance(batch_size, int):
feature_maps[i] = feature_maps[i].unflatten(
0, (batch_size, -1)
)
else:
feature_maps[i] = feature_maps[i].unflatten(
0, batch_size + (-1,)
)
return feature_maps
def bbox_to_corners(bbox, permutation=False):
assert (
len(bbox.shape) == 2
), "bbox must be 2D tensor of shape (N, 6) or (N, 7) or (N, 9)"
if bbox.shape[-1] == 6:
rot_mat = (
torch.eye(3, device=bbox.device)
.unsqueeze(0)
.repeat(bbox.shape[0], 1, 1)
)
elif bbox.shape[-1] == 7:
angles = bbox[:, 6:]
fake_angles = torch.zeros_like(angles).repeat(1, 2)
angles = torch.cat((angles, fake_angles), dim=1)
rot_mat = euler_angles_to_matrix(angles, "ZXY")
elif bbox.shape[-1] == 9:
rot_mat = euler_angles_to_matrix(bbox[:, 6:], "ZXY")
else:
raise NotImplementedError
centers = bbox[:, :3].unsqueeze(1).repeat(1, 8, 1) # shape (N, 8, 3)
half_sizes = (
bbox[:, 3:6].unsqueeze(1).repeat(1, 8, 1) / 2
) # shape (N, 8, 3)
eight_corners_x = (
torch.tensor([1, 1, 1, 1, -1, -1, -1, -1], device=bbox.device)
.unsqueeze(0)
.repeat(bbox.shape[0], 1)
) # shape (N, 8)
eight_corners_y = (
torch.tensor([1, 1, -1, -1, 1, 1, -1, -1], device=bbox.device)
.unsqueeze(0)
.repeat(bbox.shape[0], 1)
) # shape (N, 8)
eight_corners_z = (
torch.tensor([1, -1, 1, -1, 1, -1, 1, -1], device=bbox.device)
.unsqueeze(0)
.repeat(bbox.shape[0], 1)
) # shape (N, 8)
eight_corners = torch.stack(
(eight_corners_x, eight_corners_y, eight_corners_z), dim=-1
) # shape (N, 8, 3)
eight_corners = eight_corners * half_sizes # shape (N, 8, 3)
# rot_mat: (N, 3, 3), eight_corners: (N, 8, 3)
rotated_corners = torch.matmul(
eight_corners, rot_mat.transpose(1, 2)
) # shape (N, 8, 3)
corners = rotated_corners + centers
if permutation:
# PERMUTE_INDEX is a module-level (48, 8) index table enumerating the
# symmetric corner orderings; its definition is not included in this extract.
corners = corners[:, PERMUTE_INDEX]
return corners
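# Example (added sketch): an axis-aligned unit cube, 6-dim box input.
# corners = bbox_to_corners(torch.tensor([[0., 0., 0., 1., 1., 1.]]))
# # corners[0, 0] == (0.5, 0.5, 0.5); corners[0, 7] == (-0.5, -0.5, -0.5)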
def wasserstein_distance(source, target):
rot_mat_src = euler_angles_to_matrix(source[..., 6:9], "ZXY")
sqrt_sigma_src = rot_mat_src @ (
source[..., 3:6, None] * rot_mat_src.transpose(-2, -1)
)
rot_mat_tgt = euler_angles_to_matrix(target[..., 6:9], "ZXY")
sqrt_sigma_tgt = rot_mat_tgt @ (
target[..., 3:6, None] * rot_mat_tgt.transpose(-2, -1)
)
sigma_distance = sqrt_sigma_src - sqrt_sigma_tgt
sigma_distance = sigma_distance.pow(2).sum(dim=-1).sum(dim=-1)
center_distance = ((source[..., :3] - target[..., :3]) ** 2).sum(dim=-1)
distance = sigma_distance + center_distance
distance = distance.clamp(1e-7).sqrt()
return distance
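# Added note: each box is treated as a Gaussian with mean = center and
# covariance square root R diag(size) R^T, so the value above is the
# 2-Wasserstein distance
#   W_2 = sqrt(||mu_s - mu_t||^2 + ||Sigma_s^(1/2) - Sigma_t^(1/2)||_F^2).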
def permutation_corner_distance(source, target):
source_corners = bbox_to_corners(source) # N, 8, 3
target_corners = bbox_to_corners(target, True) # N, 48, 8, 3
distance = torch.norm(
source_corners.unsqueeze(dim=-2) - target_corners, p=2, dim=-1
)
distance = distance.mean(dim=-1).min(dim=-1).values
return distance
def center_distance(source, target):
return torch.norm(source[..., :3] - target[..., :3], p=2, dim=-1)
def get_positive_map(char_positive, text_dict):
bs, text_length = text_dict["embedded"].shape[:2]
tokenized = text_dict["tokenized"]
positive_maps = []
for i in range(bs):
num_target = len(char_positive[i])
positive_map = torch.zeros(
(num_target, text_length), dtype=torch.float
)
for j, tok_list in enumerate(char_positive[i]):
for beg, end in tok_list:
try:
beg_pos = tokenized.char_to_token(i, beg)
end_pos = tokenized.char_to_token(i, end - 1)
except Exception as e:
print("beg:", beg, "end:", end)
print("token_positive:", char_positive[i])
raise e
if beg_pos is None:
try:
beg_pos = tokenized.char_to_token(i, beg + 1)
if beg_pos is None:
beg_pos = tokenized.char_to_token(i, beg + 2)
except Exception:
beg_pos = None
if end_pos is None:
try:
end_pos = tokenized.char_to_token(i, end - 2)
if end_pos is None:
end_pos = tokenized.char_to_token(i, end - 3)
except Exception:
end_pos = None
if beg_pos is None or end_pos is None:
continue
positive_map[j, beg_pos : end_pos + 1].fill_(1)
positive_map /= (positive_map.sum(-1)[:, None] + 1e-6)
positive_maps.append(positive_map)
return positive_maps
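# Example (added sketch, hypothetical inputs): for text "a red chair" and
# char_positive = [[[(2, 11)]]] (the span "red chair"), the single row of the
# returned map puts uniform mass on the tokens covering characters 2..10.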
def get_entities(text, char_positive, sep_token="[SEP]"):
batch_entities = []
for bs_idx in range(len(char_positive)):
entities = []
for obj_idx in range(len(char_positive[bs_idx])):
entity = ""
for beg, end in char_positive[bs_idx][obj_idx]:
if len(entity) == 0:
entity = text[bs_idx][beg:end]
else:
entity += sep_token + text[bs_idx][beg:end]
entities.append(entity)
batch_entities.append(entities)
return batch_entities
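# Example (added sketch): get_entities(["a red chair by a table"],
# [[[(2, 11)]]]) -> [["red chair"]]; multiple spans for one object are
# joined with sep_token.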
def linear_act_ln(
embed_dims, in_loops, out_loops, input_dims=None,
act_cfg=None,
):
if act_cfg is None:
act_cfg = dict(type='ReLU', inplace=True)
from mmcv.cnn import build_activation_layer
if input_dims is None:
input_dims = embed_dims
layers = []
for _ in range(out_loops):
for _ in range(in_loops):
layers.append(nn.Linear(input_dims, embed_dims))
layers.append(build_activation_layer(act_cfg))
input_dims = embed_dims
layers.append(nn.LayerNorm(embed_dims))
return layers
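# Usage sketch (added): linear_act_ln(256, 1, 2) returns the module list
# [Linear(256, 256), ReLU, LayerNorm(256), Linear(256, 256), ReLU,
# LayerNorm(256)], typically wrapped in nn.Sequential by the caller.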
================================================
FILE: bip3d/ops/__init__.py
================================================
from .deformable_aggregation import (
deformable_aggregation_func,
feature_maps_format
)
================================================
FILE: bip3d/ops/deformable_aggregation.py
================================================
import torch
from torch.autograd.function import Function, once_differentiable
from torch.cuda.amp import custom_fwd, custom_bwd
from . import deformable_aggregation_ext as da
from . import deformable_aggregation_with_depth_ext as dad
class DeformableAggregationFunction(Function):
@staticmethod
@custom_fwd
def forward(
ctx,
mc_ms_feat,
spatial_shape,
scale_start_index,
sampling_location,
weights,
):
# output: [bs, num_pts, num_embeds]
mc_ms_feat = mc_ms_feat.contiguous().float()
spatial_shape = spatial_shape.contiguous().int()
scale_start_index = scale_start_index.contiguous().int()
sampling_location = sampling_location.contiguous().float()
weights = weights.contiguous().float()
output = da.deformable_aggregation_forward(
mc_ms_feat,
spatial_shape,
scale_start_index,
sampling_location,
weights,
)
ctx.save_for_backward(
mc_ms_feat,
spatial_shape,
scale_start_index,
sampling_location,
weights,
)
return output
@staticmethod
@once_differentiable
@custom_bwd
def backward(ctx, grad_output):
(
mc_ms_feat,
spatial_shape,
scale_start_index,
sampling_location,
weights,
) = ctx.saved_tensors
mc_ms_feat = mc_ms_feat.contiguous().float()
spatial_shape = spatial_shape.contiguous().int()
scale_start_index = scale_start_index.contiguous().int()
sampling_location = sampling_location.contiguous().float()
weights = weights.contiguous().float()
grad_mc_ms_feat = torch.zeros_like(mc_ms_feat)
grad_sampling_location = torch.zeros_like(sampling_location)
grad_weights = torch.zeros_like(weights)
da.deformable_aggregation_backward(
mc_ms_feat,
spatial_shape,
scale_start_index,
sampling_location,
weights,
grad_output.contiguous(),
grad_mc_ms_feat,
grad_sampling_location,
grad_weights,
)
# print(grad_mc_ms_feat.abs().mean(), grad_sampling_location.abs().mean(), grad_weights.abs().mean())
return (
grad_mc_ms_feat,
None,
None,
grad_sampling_location,
grad_weights,
)
class DeformableAggregationWithDepthFunction(Function):
@staticmethod
@custom_fwd
def forward(
ctx,
mc_ms_feat,
spatial_shape,
scale_start_index,
sampling_location,
weights,
num_depths,
):
# output: [bs, num_pts, num_embeds]
mc_ms_feat = mc_ms_feat.contiguous().float()
spatial_shape = spatial_shape.contiguous().int()
scale_start_index = scale_start_index.contiguous().int()
sampling_location = sampling_location.contiguous().float()
weights = weights.contiguous().float()
output = dad.deformable_aggregation_with_depth_forward(
mc_ms_feat,
spatial_shape,
scale_start_index,
sampling_location,
weights,
num_depths,
)
ctx.save_for_backward(
mc_ms_feat,
spatial_shape,
scale_start_index,
sampling_location,
weights,
)
ctx._num_depths = num_depths
return output
@staticmethod
@once_differentiable
@custom_bwd
def backward(ctx, grad_output):
(
mc_ms_feat,
spatial_shape,
scale_start_index,
sampling_location,
weights,
) = ctx.saved_tensors
num_depths = ctx._num_depths
mc_ms_feat = mc_ms_feat.contiguous().float()
spatial_shape = spatial_shape.contiguous().int()
scale_start_index = scale_start_index.contiguous().int()
sampling_location = sampling_location.contiguous().float()
weights = weights.contiguous().float()
grad_mc_ms_feat = torch.zeros_like(mc_ms_feat)
grad_sampling_location = torch.zeros_like(sampling_location)
grad_weights = torch.zeros_like(weights)
dad.deformable_aggregation_with_depth_backward(
mc_ms_feat,
spatial_shape,
scale_start_index,
sampling_location,
weights,
num_depths,
grad_output.contiguous(),
grad_mc_ms_feat,
grad_sampling_location,
grad_weights,
)
# print(grad_mc_ms_feat.abs().mean(), grad_sampling_location.abs().mean(), grad_weights.abs().mean())
# print(grad_mc_ms_feat.abs().max(), grad_sampling_location.abs().max(), grad_weights.abs().max())
# print("")
return (
grad_mc_ms_feat,
None,
None,
grad_sampling_location,
grad_weights,
None,
)
def deformable_aggregation_func(
mc_ms_feat,
spatial_shape,
scale_start_index,
sampling_location,
weights,
depth_prob=None,
depth=None
):
if depth_prob is not None and depth is not None:
mc_ms_feat = torch.cat([mc_ms_feat, depth_prob], dim=-1)
sampling_location = torch.cat([sampling_location, depth], dim=-1)
return DeformableAggregationWithDepthFunction.apply(
mc_ms_feat,
spatial_shape,
scale_start_index,
sampling_location,
weights,
depth_prob.shape[-1],
)
else:
return DeformableAggregationFunction.apply(
mc_ms_feat,
spatial_shape,
scale_start_index,
sampling_location,
weights,
)
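# Added note (shapes inferred from the C++ bindings below): mc_ms_feat is
# (bs, cams, feat, C) and sampling_location is (bs, pts, cams, 2); when
# depth_prob (bs, cams, feat, D) and depth are given, they are concatenated
# onto the last dims and the depth-aware kernel interpolates the depth
# distribution as an extra weighting term.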
def feature_maps_format(feature_maps, inverse=False):
bs, num_cams = feature_maps[0].shape[:2]
if not inverse:
spatial_shape = []
scale_start_index = [0]
col_feats = []
for i, feat in enumerate(feature_maps):
spatial_shape.append(feat.shape[-2:])
scale_start_index.append(
feat.shape[-1] * feat.shape[-2] + scale_start_index[-1]
)
col_feats.append(torch.reshape(
feat, (bs, num_cams, feat.shape[2], -1)
))
scale_start_index.pop()
col_feats = torch.cat(col_feats, dim=-1).permute(0, 1, 3, 2)
feature_maps = [
col_feats,
torch.tensor(
spatial_shape,
dtype=torch.int64,
device=col_feats.device,
),
torch.tensor(
scale_start_index,
dtype=torch.int64,
device=col_feats.device,
),
]
else:
spatial_shape = feature_maps[1].int()
split_size = (spatial_shape[:, 0] * spatial_shape[:, 1]).tolist()
feature_maps = feature_maps[0].permute(0, 1, 3, 2)
feature_maps = list(torch.split(feature_maps, split_size, dim=-1))
for i, feat in enumerate(feature_maps):
feature_maps[i] = feat.reshape(
feat.shape[:3] + (spatial_shape[i, 0], spatial_shape[i, 1])
)
return feature_maps
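# Round-trip sketch (added): feature_maps_format(feats) packs per-level maps
# of shape (bs, num_cams, C, H_i, W_i) into [col_feats (bs, num_cams,
# sum(H_i * W_i), C), spatial_shape, scale_start_index]; inverse=True undoes
# the packing.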
================================================
FILE: bip3d/ops/gcc.sh
================================================
export CC=/horizon-bucket/robot_lab/users/xuewu.lin/devtoolset-9/root/usr/bin/gcc
export CXX=/horizon-bucket/robot_lab/users/xuewu.lin/devtoolset-9/root/usr/bin/g++
export LD_LIBRARY_PATH=/horizon-bucket/robot_lab/users/xuewu.lin/devtoolset-9/root/usr/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/horizon-bucket/robot_lab/users/xuewu.lin/devtoolset-9/root/usr/lib/dyninst:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/horizon-bucket/robot_lab/users/xuewu.lin/devtoolset-9/root/usr/lib64:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/horizon-bucket/robot_lab/users/xuewu.lin/devtoolset-9/root/usr/lib64/dyninst:$LD_LIBRARY_PATH
================================================
FILE: bip3d/ops/setup.py
================================================
import os
import torch
from setuptools import setup
from torch.utils.cpp_extension import (
BuildExtension,
CppExtension,
CUDAExtension,
)
def make_cuda_ext(
name,
module,
sources,
sources_cuda=[],
extra_args=[],
extra_include_path=[],
):
define_macros = []
extra_compile_args = {"cxx": [] + extra_args}
if torch.cuda.is_available() or os.getenv("FORCE_CUDA", "0") == "1":
define_macros += [("WITH_CUDA", None)]
extension = CUDAExtension
extra_compile_args["nvcc"] = extra_args + [
"-D__CUDA_NO_HALF_OPERATORS__",
"-D__CUDA_NO_HALF_CONVERSIONS__",
"-D__CUDA_NO_HALF2_OPERATORS__",
]
sources += sources_cuda
else:
print("Compiling {} without CUDA".format(name))
extension = CppExtension
return extension(
name="{}.{}".format(module, name),
sources=[os.path.join(*module.split("."), p) for p in sources],
include_dirs=extra_include_path,
define_macros=define_macros,
extra_compile_args=extra_compile_args,
)
if __name__ == "__main__":
setup(
name="deformable_aggregation_with_depth_ext",
ext_modules=[
make_cuda_ext(
"deformable_aggregation_with_depth_ext",
module=".",
sources=[
f"src/deformable_aggregation_with_depth.cpp",
f"src/deformable_aggregation_with_depth_cuda.cu",
],
),
],
cmdclass={"build_ext": BuildExtension},
)
setup(
name="deformable_aggregation_ext",
ext_modules=[
make_cuda_ext(
"deformable_aggregation_ext",
module=".",
sources=[
f"src/deformable_aggregation.cpp",
f"src/deformable_aggregation_cuda.cu",
],
),
],
cmdclass={"build_ext": BuildExtension},
)
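# Typical invocation (a sketch; the documented workflow may differ):
#   cd bip3d/ops && python setup.py build_ext --inplace
# which builds both extensions next to this file so that the relative imports
# in deformable_aggregation.py resolve.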
================================================
FILE: bip3d/ops/src/deformable_aggregation.cpp
================================================
/* Headers restored; the original names were dropped in extraction. The code
 * below needs the torch extension and CUDA guard headers. */
#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>
void deformable_aggregation(
float* output,
const float* mc_ms_feat,
const int* spatial_shape,
const int* scale_start_index,
const float* sample_location,
const float* weights,
int batch_size,
int num_cams,
int num_feat,
int num_embeds,
int num_scale,
int num_pts,
int num_groups
);
void deformable_aggregation_grad(
const float* mc_ms_feat,
const int* spatial_shape,
const int* scale_start_index,
const float* sample_location,
const float* weights,
const float* grad_output,
float* grad_mc_ms_feat,
float* grad_sampling_location,
float* grad_weights,
int batch_size,
int num_cams,
int num_feat,
int num_embeds,
int num_scale,
int num_pts,
int num_groups
);
/* _mc_ms_feat: b, cam, feat, C */
/* _spatial_shape: scale, 2 */
/* _scale_start_index: scale */
/* _sampling_location: bs, pts, cam, 2 (w, h) */
at::Tensor deformable_aggregation_forward(
const at::Tensor &_mc_ms_feat,
const at::Tensor &_spatial_shape,
const at::Tensor &_scale_start_index,
const at::Tensor &_sampling_location,
const at::Tensor &_weights
) {
at::DeviceGuard guard(_mc_ms_feat.device());
const at::cuda::OptionalCUDAGuard device_guard(device_of(_mc_ms_feat));
int batch_size = _mc_ms_feat.size(0);
int num_cams = _mc_ms_feat.size(1);
int num_feat = _mc_ms_feat.size(2);
int num_embeds = _mc_ms_feat.size(3);
int num_scale = _spatial_shape.size(0);
int num_pts = _sampling_location.size(1);
int num_groups = _weights.size(4);
const float* mc_ms_feat = _mc_ms_feat.data_ptr();
const int* spatial_shape = _spatial_shape.data_ptr();
const int* scale_start_index = _scale_start_index.data_ptr();
const float* sampling_location = _sampling_location.data_ptr();
const float* weights = _weights.data_ptr();
auto output = at::zeros({batch_size, num_pts, num_embeds}, _mc_ms_feat.options());
deformable_aggregation(
output.data_ptr(),
mc_ms_feat, spatial_shape, scale_start_index, sampling_location, weights,
batch_size, num_cams, num_feat, num_embeds, num_scale, num_pts, num_groups
);
return output;
}
void deformable_aggregation_backward(
const at::Tensor &_mc_ms_feat,
const at::Tensor &_spatial_shape,
const at::Tensor &_scale_start_index,
const at::Tensor &_sampling_location,
const at::Tensor &_weights,
const at::Tensor &_grad_output,
at::Tensor &_grad_mc_ms_feat,
at::Tensor &_grad_sampling_location,
at::Tensor &_grad_weights
) {
at::DeviceGuard guard(_mc_ms_feat.device());
const at::cuda::OptionalCUDAGuard device_guard(device_of(_mc_ms_feat));
int batch_size = _mc_ms_feat.size(0);
int num_cams = _mc_ms_feat.size(1);
int num_feat = _mc_ms_feat.size(2);
int num_embeds = _mc_ms_feat.size(3);
int num_scale = _spatial_shape.size(0);
int num_pts = _sampling_location.size(1);
int num_groups = _weights.size(4);
const float* mc_ms_feat = _mc_ms_feat.data_ptr();
const int* spatial_shape = _spatial_shape.data_ptr();
const int* scale_start_index = _scale_start_index.data_ptr();
const float* sampling_location = _sampling_location.data_ptr();
const float* weights = _weights.data_ptr();
const float* grad_output = _grad_output.data_ptr();
float* grad_mc_ms_feat = _grad_mc_ms_feat.data_ptr();
float* grad_sampling_location = _grad_sampling_location.data_ptr();
float* grad_weights = _grad_weights.data_ptr();
deformable_aggregation_grad(
mc_ms_feat, spatial_shape, scale_start_index, sampling_location, weights,
grad_output, grad_mc_ms_feat, grad_sampling_location, grad_weights,
batch_size, num_cams, num_feat, num_embeds, num_scale, num_pts, num_groups
);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def(
"deformable_aggregation_forward",
&deformable_aggregation_forward,
"deformable_aggregation_forward"
);
m.def(
"deformable_aggregation_backward",
&deformable_aggregation_backward,
"deformable_aggregation_backward"
);
}
================================================
FILE: bip3d/ops/src/deformable_aggregation_cuda.cu
================================================
/* Headers restored; the original names were dropped in extraction. The
 * kernels below use only CUDA built-ins, so a minimal plausible set is: */
#include <cuda.h>
#include <cuda_runtime.h>
__device__ float bilinear_sampling(
const float *&bottom_data, const int &height, const int &width,
const int &num_embeds, const float &h_im, const float &w_im,
const int &base_ptr
) {
const int h_low = floorf(h_im);
const int w_low = floorf(w_im);
const int h_high = h_low + 1;
const int w_high = w_low + 1;
const float lh = h_im - h_low;
const float lw = w_im - w_low;
const float hh = 1 - lh, hw = 1 - lw;
const int w_stride = num_embeds;
const int h_stride = width * w_stride;
const int h_low_ptr_offset = h_low * h_stride;
const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
const int w_low_ptr_offset = w_low * w_stride;
const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
float v1 = 0;
if (h_low >= 0 && w_low >= 0) {
const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
v1 = bottom_data[ptr1];
}
float v2 = 0;
if (h_low >= 0 && w_high <= width - 1) {
const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
v2 = bottom_data[ptr2];
}
float v3 = 0;
if (h_high <= height - 1 && w_low >= 0) {
const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
v3 = bottom_data[ptr3];
}
float v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1) {
const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
v4 = bottom_data[ptr4];
}
const float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
const float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
__device__ void bilinear_sampling_grad(
const float *&bottom_data, const float &weight,
const int &height, const int &width,
const int &num_embeds, const float &h_im, const float &w_im,
const int &base_ptr,
const float &grad_output,
float *&grad_mc_ms_feat, float *grad_sampling_location, float *grad_weights) {
const int h_low = floorf(h_im);
const int w_low = floorf(w_im);
const int h_high = h_low + 1;
const int w_high = w_low + 1;
const float lh = h_im - h_low;
const float lw = w_im - w_low;
const float hh = 1 - lh, hw = 1 - lw;
const int w_stride = num_embeds;
const int h_stride = width * w_stride;
const int h_low_ptr_offset = h_low * h_stride;
const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
const int w_low_ptr_offset = w_low * w_stride;
const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
const float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
const float top_grad_mc_ms_feat = grad_output * weight;
float grad_h_weight = 0, grad_w_weight = 0;
float v1 = 0;
if (h_low >= 0 && w_low >= 0) {
const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
v1 = bottom_data[ptr1];
grad_h_weight -= hw * v1;
grad_w_weight -= hh * v1;
atomicAdd(grad_mc_ms_feat + ptr1, w1 * top_grad_mc_ms_feat);
}
float v2 = 0;
if (h_low >= 0 && w_high <= width - 1) {
const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
v2 = bottom_data[ptr2];
grad_h_weight -= lw * v2;
grad_w_weight += hh * v2;
atomicAdd(grad_mc_ms_feat + ptr2, w2 * top_grad_mc_ms_feat);
}
float v3 = 0;
if (h_high <= height - 1 && w_low >= 0) {
const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
v3 = bottom_data[ptr3];
grad_h_weight += hw * v3;
grad_w_weight -= lh * v3;
atomicAdd(grad_mc_ms_feat + ptr3, w3 * top_grad_mc_ms_feat);
}
float v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1) {
const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
v4 = bottom_data[ptr4];
grad_h_weight += lw * v4;
grad_w_weight += lh * v4;
atomicAdd(grad_mc_ms_feat + ptr4, w4 * top_grad_mc_ms_feat);
}
const float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
atomicAdd(grad_weights, grad_output * val);
atomicAdd(grad_sampling_location, width * grad_w_weight * top_grad_mc_ms_feat);
atomicAdd(grad_sampling_location + 1, height * grad_h_weight * top_grad_mc_ms_feat);
}
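/* Added note: one thread per output element. idx decomposes into
 * (batch_index, pts_index, channel_index); each thread accumulates the
 * weighted bilinear samples of its channel over all cameras and scales. */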
__global__ void deformable_aggregation_kernel(
const int num_kernels,
float* output,
const float* mc_ms_feat,
const int* spatial_shape,
const int* scale_start_index,
const float* sample_location,
const float* weights,
int batch_size,
int num_cams,
int num_feat,
int num_embeds,
int num_scale,
int num_pts,
int num_groups
) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= num_kernels) return;
float *output_ptr = output + idx;
const int channel_index = idx % num_embeds;
const int groups_index = channel_index / (num_embeds / num_groups);
idx /= num_embeds;
const int pts_index = idx % num_pts;
idx /= num_pts;
const int batch_index = idx;
const int value_cam_stride = num_feat * num_embeds;
const int weight_cam_stride = num_scale * num_groups;
int loc_offset = (batch_index * num_pts + pts_index) * num_cams << 1;
int value_offset = batch_index * num_cams * value_cam_stride + channel_index;
int weight_offset = (
(batch_index * num_pts + pts_index) * num_cams * weight_cam_stride
+ groups_index
);
float result = 0;
for (int cam_index = 0; cam_index < num_cams; ++cam_index, loc_offset += 2) {
const float loc_w = sample_location[loc_offset];
const float loc_h = sample_location[loc_offset + 1];
if (loc_w > 0 && loc_w < 1 && loc_h > 0 && loc_h < 1) {
for (int scale_index = 0; scale_index < num_scale; ++scale_index) {
const int scale_offset = scale_start_index[scale_index] * num_embeds;
const int spatial_shape_ptr = scale_index << 1;
const int h = spatial_shape[spatial_shape_ptr];
const int w = spatial_shape[spatial_shape_ptr + 1];
const float h_im = loc_h * h - 0.5;
const float w_im = loc_w * w - 0.5;
const int value_ptr = value_offset + scale_offset + value_cam_stride * cam_index;
const float *weights_ptr = (
weights + weight_offset + scale_index * num_groups
+ weight_cam_stride * cam_index
);
result += bilinear_sampling(mc_ms_feat, h, w, num_embeds, h_im, w_im, value_ptr) * *weights_ptr;
}
}
}
*output_ptr = result;
}
__global__ void deformable_aggregation_grad_kernel(
const int num_kernels,
const float* mc_ms_feat,
const int* spatial_shape,
const int* scale_start_index,
const float* sample_location,
const float* weights,
const float* grad_output,
float* grad_mc_ms_feat,
float* grad_sampling_location,
float* grad_weights,
int batch_size,
int num_cams,
int num_feat,
int num_embeds,
int num_scale,
int num_pts,
int num_groups
) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= num_kernels) return;
const float grad = grad_output[idx];
const int channel_index = idx % num_embeds;
const int groups_index = channel_index / (num_embeds / num_groups);
idx /= num_embeds;
const int pts_index = idx % num_pts;
idx /= num_pts;
const int batch_index = idx;
const int value_cam_stride = num_feat * num_embeds;
const int weight_cam_stride = num_scale * num_groups;
int loc_offset = (batch_index * num_pts + pts_index) * num_cams << 1;
int value_offset = batch_index * num_cams * value_cam_stride + channel_index;
int weight_offset = (
(batch_index * num_pts + pts_index) * num_cams * weight_cam_stride
+ groups_index
);
for (int cam_index = 0; cam_index < num_cams; ++cam_index, loc_offset += 2) {
const float loc_w = sample_location[loc_offset];
const float loc_h = sample_location[loc_offset + 1];
if (loc_w > 0 && loc_w < 1 && loc_h > 0 && loc_h < 1) {
for (int scale_index = 0; scale_index < num_scale; ++scale_index) {
const int scale_offset = scale_start_index[scale_index] * num_embeds;
const int spatial_shape_ptr = scale_index << 1;
const int h = spatial_shape[spatial_shape_ptr];
const int w = spatial_shape[spatial_shape_ptr + 1];
const float h_im = loc_h * h - 0.5;
const float w_im = loc_w * w - 0.5;
const int value_ptr = value_offset + scale_offset + value_cam_stride * cam_index;
const int weights_ptr = weight_offset + scale_index * num_groups + weight_cam_stride * cam_index;
const float weight = weights[weights_ptr];
float *grad_location_ptr = grad_sampling_location + loc_offset;
float *grad_weights_ptr = grad_weights + weights_ptr;
bilinear_sampling_grad(
mc_ms_feat, weight, h, w, num_embeds, h_im, w_im,
value_ptr,
grad,
grad_mc_ms_feat, grad_location_ptr, grad_weights_ptr
);
}
}
}
}
void deformable_aggregation(
float* output,
const float* mc_ms_feat,
const int* spatial_shape,
const int* scale_start_index,
const float* sample_location,
const float* weights,
int batch_size,
int num_cams,
int num_feat,
int num_embeds,
int num_scale,
int num_pts,
int num_groups
) {
const int num_kernels = batch_size * num_pts * num_embeds;
deformable_aggregation_kernel
<<<(int)ceil(((double)num_kernels/512)), 512>>>(
num_kernels, output,
mc_ms_feat, spatial_shape, scale_start_index, sample_location, weights,
batch_size, num_cams, num_feat, num_embeds, num_scale, num_pts, num_groups
);
}
void deformable_aggregation_grad(
const float* mc_ms_feat,
const int* spatial_shape,
const int* scale_start_index,
const float* sample_location,
const float* weights,
const float* grad_output,
float* grad_mc_ms_feat,
float* grad_sampling_location,
float* grad_weights,
int batch_size,
int num_cams,
int num_feat,
int num_embeds,
int num_scale,
int num_pts,
int num_groups
) {
const int num_kernels = batch_size * num_pts * num_embeds;
deformable_aggregation_grad_kernel
<<<(int)ceil(((double)num_kernels/512)), 512>>>(
num_kernels,
mc_ms_feat, spatial_shape, scale_start_index, sample_location, weights,
grad_output, grad_mc_ms_feat, grad_sampling_location, grad_weights,
batch_size, num_cams, num_feat, num_embeds, num_scale, num_pts, num_groups
);
}
================================================
FILE: bip3d/ops/src/deformable_aggregation_with_depth.cpp
================================================
/* Headers restored; the original names were dropped in extraction. The code
 * below needs the torch extension and CUDA guard headers. */
#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>
void deformable_aggregation_with_depth(
float* output,
const float* mc_ms_feat,
const int* spatial_shape,
const int* scale_start_index,
const float* sample_location,
const float* weights,
int batch_size,
int num_cams,
int num_feat,
int num_embeds,
int num_scale,
int num_pts,
int num_groups,
int num_depths
);
void deformable_aggregation_with_depth_grad(
const float* mc_ms_feat,
const int* spatial_shape,
const int* scale_start_index,
const float* sample_location,
const float* weights,
const float* grad_output,
float* grad_mc_ms_feat,
float* grad_sampling_location,
float* grad_weights,
int batch_size,
int num_cams,
int num_feat,
int num_embeds,
int num_scale,
int num_pts,
int num_groups,
int num_depths
);
/* _mc_ms_feat: b, cam, feat, C+D */
/* _spatial_shape: scale, 2 */
/* _scale_start_index: scale */
/* _sampling_location: bs, pts, cam, 3 (w, h, depth) */
at::Tensor deformable_aggregation_with_depth_forward(
const at::Tensor &_mc_ms_feat,
const at::Tensor &_spatial_shape,
const at::Tensor &_scale_start_index,
const at::Tensor &_sampling_location,
const at::Tensor &_weights,
const int num_depths
) {
at::DeviceGuard guard(_mc_ms_feat.device());
const at::cuda::OptionalCUDAGuard device_guard(device_of(_mc_ms_feat));
int batch_size = _mc_ms_feat.size(0);
int num_cams = _mc_ms_feat.size(1);
int num_feat = _mc_ms_feat.size(2);
int num_embeds = _mc_ms_feat.size(3) - num_depths;
int num_scale = _spatial_shape.size(0);
int num_pts = _sampling_location.size(1);
int num_groups = _weights.size(4);
const float* mc_ms_feat = _mc_ms_feat.data_ptr();
const int* spatial_shape = _spatial_shape.data_ptr();
const int* scale_start_index = _scale_start_index.data_ptr();
const float* sampling_location = _sampling_location.data_ptr();
const float* weights = _weights.data_ptr();
auto output = at::zeros({batch_size, num_pts, num_embeds}, _mc_ms_feat.options());
deformable_aggregation_with_depth(
output.data_ptr(),
mc_ms_feat, spatial_shape, scale_start_index, sampling_location, weights,
batch_size, num_cams, num_feat, num_embeds, num_scale, num_pts, num_groups,
num_depths
);
return output;
}
void deformable_aggregation_with_depth_backward(
const at::Tensor &_mc_ms_feat,
const at::Tensor &_spatial_shape,
const at::Tensor &_scale_start_index,
const at::Tensor &_sampling_location,
const at::Tensor &_weights,
const int num_depths,
const at::Tensor &_grad_output,
at::Tensor &_grad_mc_ms_feat,
at::Tensor &_grad_sampling_location,
at::Tensor &_grad_weights
) {
at::DeviceGuard guard(_mc_ms_feat.device());
const at::cuda::OptionalCUDAGuard device_guard(device_of(_mc_ms_feat));
int batch_size = _mc_ms_feat.size(0);
int num_cams = _mc_ms_feat.size(1);
int num_feat = _mc_ms_feat.size(2);
int num_embeds = _mc_ms_feat.size(3) - num_depths;
int num_scale = _spatial_shape.size(0);
int num_pts = _sampling_location.size(1);
int num_groups = _weights.size(4);
const float* mc_ms_feat = _mc_ms_feat.data_ptr();
const int* spatial_shape = _spatial_shape.data_ptr();
const int* scale_start_index = _scale_start_index.data_ptr();
const float* sampling_location = _sampling_location.data_ptr();
const float* weights = _weights.data_ptr();
const float* grad_output = _grad_output.data_ptr();
float* grad_mc_ms_feat = _grad_mc_ms_feat.data_ptr();
float* grad_sampling_location = _grad_sampling_location.data_ptr();
float* grad_weights = _grad_weights.data_ptr();
deformable_aggregation_with_depth_grad(
mc_ms_feat, spatial_shape, scale_start_index, sampling_location, weights,
grad_output, grad_mc_ms_feat, grad_sampling_location, grad_weights,
batch_size, num_cams, num_feat, num_embeds, num_scale, num_pts, num_groups,
num_depths
);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def(
"deformable_aggregation_with_depth_forward",
&deformable_aggregation_with_depth_forward,
"deformable_aggregation_with_depth_forward"
);
m.def(
"deformable_aggregation_with_depth_backward",
&deformable_aggregation_with_depth_backward,
"deformable_aggregation_with_depth_backward"
);
}
================================================
FILE: bip3d/ops/src/deformable_aggregation_with_depth_cuda.cu
================================================
/* Headers restored; the original names were dropped in extraction. The
 * kernels below use only CUDA built-ins, so a minimal plausible set is: */
#include <cuda.h>
#include <cuda_runtime.h>
__device__ float bilinear_sampling(const float *&bottom_data, const int &height,
const int &width,
const int &num_embeds_with_depth,
const int &num_depths, const float &h_im,
const float &w_im, const float &loc_d,
const int &base_ptr, const int &depth_ptr) {
const int h_low = floorf(h_im);
const int w_low = floorf(w_im);
const int h_high = h_low + 1;
const int w_high = w_low + 1;
const int d_low = floorf(loc_d);
const int d_high = d_low + 1;
const float lh = h_im - h_low;
const float lw = w_im - w_low;
const float hh = 1 - lh, hw = 1 - lw;
const float ld = loc_d - d_low;
const float hd = 1 - ld;
const int w_stride = num_embeds_with_depth;
const int h_stride = width * w_stride;
const int h_low_ptr_offset = h_low * h_stride;
const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
const int w_low_ptr_offset = w_low * w_stride;
const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
float v1 = 0;
float dp_low1 = 0;
float dp_high1 = 0;
const bool flag_d_low = d_low >= 0 && d_low < num_depths;
const bool flag_d_high = d_high >= 0 && d_high < num_depths;
if (h_low >= 0 && w_low >= 0) {
const int ptr1 = h_low_ptr_offset + w_low_ptr_offset;
v1 = bottom_data[ptr1 + base_ptr];
const int ptr_d1 = ptr1 + depth_ptr + d_low;
if (flag_d_low) {
dp_low1 = bottom_data[ptr_d1];
}
if (flag_d_high) {
dp_high1 = bottom_data[ptr_d1 + 1];
}
}
float v2 = 0;
float dp_low2 = 0;
float dp_high2 = 0;
if (h_low >= 0 && w_high <= width - 1) {
const int ptr2 = h_low_ptr_offset + w_high_ptr_offset;
v2 = bottom_data[ptr2 + base_ptr];
const int ptr_d2 = ptr2 + depth_ptr + d_low;
if (flag_d_low) {
dp_low2 = bottom_data[ptr_d2];
}
if (flag_d_high) {
dp_high2 = bottom_data[ptr_d2 + 1];
}
}
float v3 = 0;
float dp_low3 = 0;
float dp_high3 = 0;
if (h_high <= height - 1 && w_low >= 0) {
const int ptr3 = h_high_ptr_offset + w_low_ptr_offset;
v3 = bottom_data[ptr3 + base_ptr];
const int ptr_d3 = ptr3 + depth_ptr + d_low;
if (flag_d_low) {
dp_low3 = bottom_data[ptr_d3];
}
if (flag_d_high) {
dp_high3 = bottom_data[ptr_d3 + 1];
}
}
float v4 = 0;
float dp_low4 = 0;
float dp_high4 = 0;
if (h_high <= height - 1 && w_high <= width - 1) {
const int ptr4 = h_high_ptr_offset + w_high_ptr_offset;
v4 = bottom_data[ptr4 + base_ptr];
const int ptr_d4 = ptr4 + depth_ptr + d_low;
if (flag_d_low) {
dp_low4 = bottom_data[ptr_d4];
}
if (flag_d_high) {
dp_high4 = bottom_data[ptr_d4 + 1];
}
}
const float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
const float val = hd * (w1 * v1 * dp_low1 + w2 * v2 * dp_low2 +
w3 * v3 * dp_low3 + w4 * v4 * dp_low4) +
ld * (w1 * v1 * dp_high1 + w2 * v2 * dp_high2 +
w3 * v3 * dp_high3 + w4 * v4 * dp_high4);
return val;
}
__device__ void
bilinear_sampling_grad(const float *&bottom_data, const float &weight,
const int &height, const int &width,
const int &num_embeds_with_depth, const int &num_depths,
const float &h_im, const float &w_im, const float &loc_d,
const int &base_ptr, const int &depth_ptr,
const float &grad_output, float *&grad_mc_ms_feat,
float *grad_sampling_location, float *grad_weights) {
const int h_low = floorf(h_im);
const int w_low = floorf(w_im);
const int h_high = h_low + 1;
const int w_high = w_low + 1;
const int d_low = floorf(loc_d);
const int d_high = d_low + 1;
const float lh = h_im - h_low;
const float lw = w_im - w_low;
const float hh = 1 - lh, hw = 1 - lw;
const float ld = loc_d - d_low;
const float hd = 1 - ld;
const int w_stride = num_embeds_with_depth;
const int h_stride = width * w_stride;
const int h_low_ptr_offset = h_low * h_stride;
const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
const int w_low_ptr_offset = w_low * w_stride;
const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
const float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
const float top_grad_mc_ms_feat = grad_output * weight;
const bool flag_d_low = d_low >= 0 && d_low < num_depths;
const bool flag_d_high = d_high >= 0 && d_high < num_depths;
float v1 = 0;
float dp_low1 = 0;
float dp_high1 = 0;
if (h_low >= 0 && w_low >= 0) {
const int ptr1 = h_low_ptr_offset + w_low_ptr_offset;
v1 = bottom_data[ptr1 + base_ptr];
const int ptr_d1 = ptr1 + depth_ptr + d_low;
if (flag_d_low) {
dp_low1 = bottom_data[ptr_d1];
atomicAdd(grad_mc_ms_feat + ptr_d1, w1 * top_grad_mc_ms_feat * v1 * hd);
}
if (flag_d_high) {
dp_high1 = bottom_data[ptr_d1 + 1];
atomicAdd(grad_mc_ms_feat + ptr_d1 + 1,
w1 * top_grad_mc_ms_feat * v1 * ld);
}
atomicAdd(grad_mc_ms_feat + ptr1 + base_ptr,
w1 * top_grad_mc_ms_feat * (dp_low1 * hd + dp_high1 * ld));
}
float v2 = 0;
float dp_low2 = 0;
float dp_high2 = 0;
if (h_low >= 0 && w_high <= width - 1) {
const int ptr2 = h_low_ptr_offset + w_high_ptr_offset;
v2 = bottom_data[ptr2 + base_ptr];
const int ptr_d2 = ptr2 + depth_ptr + d_low;
if (flag_d_low) {
dp_low2 = bottom_data[ptr_d2];
atomicAdd(grad_mc_ms_feat + ptr_d2, w2 * top_grad_mc_ms_feat * v2 * hd);
}
if (flag_d_high) {
dp_high2 = bottom_data[ptr_d2 + 1];
atomicAdd(grad_mc_ms_feat + ptr_d2 + 1,
w2 * top_grad_mc_ms_feat * v2 * ld);
}
atomicAdd(grad_mc_ms_feat + ptr2 + base_ptr,
w2 * top_grad_mc_ms_feat * (dp_low2 * hd + dp_high2 * ld));
}
float v3 = 0;
float dp_low3 = 0;
float dp_high3 = 0;
if (h_high <= height - 1 && w_low >= 0) {
const int ptr3 = h_high_ptr_offset + w_low_ptr_offset;
v3 = bottom_data[ptr3 + base_ptr];
const int ptr_d3 = ptr3 + depth_ptr + d_low;
if (flag_d_low) {
dp_low3 = bottom_data[ptr_d3];
atomicAdd(grad_mc_ms_feat + ptr_d3, w3 * top_grad_mc_ms_feat * v3 * hd);
}
if (flag_d_high) {
dp_high3 = bottom_data[ptr_d3 + 1];
atomicAdd(grad_mc_ms_feat + ptr_d3 + 1,
w3 * top_grad_mc_ms_feat * v3 * ld);
}
atomicAdd(grad_mc_ms_feat + ptr3 + base_ptr,
w3 * top_grad_mc_ms_feat * (dp_low3 * hd + dp_high3 * ld));
}
float v4 = 0;
float dp_low4 = 0;
float dp_high4 = 0;
if (h_high <= height - 1 && w_high <= width - 1) {
const int ptr4 = h_high_ptr_offset + w_high_ptr_offset;
v4 = bottom_data[ptr4 + base_ptr];
const int ptr_d4 = ptr4 + depth_ptr + d_low;
if (flag_d_low) {
dp_low4 = bottom_data[ptr_d4];
atomicAdd(grad_mc_ms_feat + ptr_d4, w4 * top_grad_mc_ms_feat * v4 * hd);
}
if (flag_d_high) {
dp_high4 = bottom_data[ptr_d4 + 1];
atomicAdd(grad_mc_ms_feat + ptr_d4 + 1,
w4 * top_grad_mc_ms_feat * v4 * ld);
}
atomicAdd(grad_mc_ms_feat + ptr4 + base_ptr,
w4 * top_grad_mc_ms_feat * (dp_low4 * hd + dp_high4 * ld));
}
const float val1 = w1 * v1 * dp_low1 + w2 * v2 * dp_low2 + w3 * v3 * dp_low3 +
w4 * v4 * dp_low4;
const float val2 = w1 * v1 * dp_high1 + w2 * v2 * dp_high2 +
w3 * v3 * dp_high3 + w4 * v4 * dp_high4;
const float val = hd * val1 + ld * val2;
atomicAdd(grad_weights, grad_output * val);
const float grad_w_weight = hd * (-hh * v1 * dp_low1 + hh * v2 * dp_low2 -
lh * v3 * dp_low3 + lh * v4 * dp_low4) +
ld * (-hh * v1 * dp_high1 + hh * v2 * dp_high2 -
lh * v3 * dp_high3 + lh * v4 * dp_high4);
const float grad_h_weight = hd * (-hw * v1 * dp_low1 - lw * v2 * dp_low2 +
hw * v3 * dp_low3 + lw * v4 * dp_low4) +
ld * (-hw * v1 * dp_high1 - lw * v2 * dp_high2 +
hw * v3 * dp_high3 + lw * v4 * dp_high4);
/* const float grad_d_weight = -1 * (w1 * v1 * dp_low1 + w2 * v2 * dp_low2 +
* w3 * v3 * dp_low3 + w4 * v4 * dp_low4) */
/* + (w1 * v1 * dp_high1 + w2 * v2 * dp_high2 + w3 * v3 * dp_high3 + w4 *
* v4 * dp_high4); */
const float grad_d_weight = val2 - val1;
/* const float grad_d_weight = w1 * v1 * (dp_high1 - dp_low1) */
/* + w2 * v2 * (dp_high2 - dp_low2) */
/* + w3 * v3 * (dp_high3 - dp_low3) */
/* + w4 * v4 * (dp_high4 - dp_low4); */
atomicAdd(grad_sampling_location,
width * top_grad_mc_ms_feat * grad_w_weight);
atomicAdd(grad_sampling_location + 1,
height * top_grad_mc_ms_feat * grad_h_weight);
atomicAdd(grad_sampling_location + 2, top_grad_mc_ms_feat * grad_d_weight);
}
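/* Added note: relative to the plain kernel, sampling here is trilinear --
 * bilinear over (h, w) and linear over the depth bin d -- and each feature
 * sample is modulated by the interpolated depth probability dp. */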
__global__ void deformable_aggregation_with_depth_kernel(
const int num_kernels, float *output, const float *mc_ms_feat,
const int *spatial_shape, const int *scale_start_index,
const float *sample_location, const float *weights, int batch_size,
int num_cams, int num_feat, int num_embeds, int num_scale, int num_pts,
int num_groups, int num_depths) {
long int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= num_kernels)
return;
float *output_ptr = output + idx;
const int channel_index = idx % num_embeds;
const int groups_index = channel_index / (num_embeds / num_groups);
idx /= num_embeds;
const int pts_index = idx % num_pts;
idx /= num_pts;
const int batch_index = idx;
const int num_embeds_with_depth = num_embeds + num_depths;
const int value_cam_stride = num_feat * num_embeds_with_depth;
const int weight_cam_stride = num_scale * num_groups;
int loc_offset = (batch_index * num_pts + pts_index) * num_cams * 3;
int value_offset = batch_index * num_cams * value_cam_stride;
int depth_offset = value_offset + num_embeds;
value_offset = value_offset + channel_index;
int weight_offset =
((batch_index * num_pts + pts_index) * num_cams * weight_cam_stride +
groups_index);
float result = 0;
for (int cam_index = 0; cam_index < num_cams; ++cam_index, loc_offset += 3) {
const float loc_w = sample_location[loc_offset];
const float loc_h = sample_location[loc_offset + 1];
const float loc_d = sample_location[loc_offset + 2];
if (loc_w > 0 && loc_w < 1 && loc_h > 0 && loc_h < 1 && loc_d > -1 &&
loc_d < num_depths) {
for (int scale_index = 0; scale_index < num_scale; ++scale_index) {
const int scale_offset =
scale_start_index[scale_index] * num_embeds_with_depth;
const int spatial_shape_ptr = scale_index << 1;
const int h = spatial_shape[spatial_shape_ptr];
const int w = spatial_shape[spatial_shape_ptr + 1];
const float h_im = loc_h * h - 0.5;
const float w_im = loc_w * w - 0.5;
const int value_ptr =
value_offset + scale_offset + value_cam_stride * cam_index;
const int depth_ptr =
depth_offset + scale_offset + value_cam_stride * cam_index;
const float *weights_ptr =
(weights + weight_offset + scale_index * num_groups +
weight_cam_stride * cam_index);
result += bilinear_sampling(mc_ms_feat, h, w, num_embeds_with_depth,
num_depths, h_im, w_im, loc_d, value_ptr,
depth_ptr) *
*weights_ptr;
}
}
}
*output_ptr = result;
}
__global__ void deformable_aggregation_with_depth_grad_kernel(
const int num_kernels, const float *mc_ms_feat, const int *spatial_shape,
const int *scale_start_index, const float *sample_location,
const float *weights, const float *grad_output, float *grad_mc_ms_feat,
float *grad_sampling_location, float *grad_weights, int batch_size,
int num_cams, int num_feat, int num_embeds, int num_scale, int num_pts,
int num_groups, int num_depths) {
long int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= num_kernels)
return;
const float grad = grad_output[idx];
const int channel_index = idx % num_embeds;
const int groups_index = channel_index / (num_embeds / num_groups);
idx /= num_embeds;
const int pts_index = idx % num_pts;
idx /= num_pts;
const int batch_index = idx;
const int num_embeds_with_depth = num_embeds + num_depths;
const int value_cam_stride = num_feat * num_embeds_with_depth;
const int weight_cam_stride = num_scale * num_groups;
int loc_offset = (batch_index * num_pts + pts_index) * num_cams * 3;
int value_offset = batch_index * num_cams * value_cam_stride;
int depth_offset = value_offset + num_embeds;
value_offset = value_offset + channel_index;
int weight_offset =
((batch_index * num_pts + pts_index) * num_cams * weight_cam_stride +
groups_index);
for (int cam_index = 0; cam_index < num_cams; ++cam_index, loc_offset += 3) {
const float loc_w = sample_location[loc_offset];
const float loc_h = sample_location[loc_offset + 1];
const float loc_d = sample_location[loc_offset + 2];
if (loc_w > 0 && loc_w < 1 && loc_h > 0 && loc_h < 1 && loc_d > -1 &&
loc_d < num_depths) {
for (int scale_index = 0; scale_index < num_scale; ++scale_index) {
const int scale_offset =
scale_start_index[scale_index] * num_embeds_with_depth;
const int spatial_shape_ptr = scale_index << 1;
const int h = spatial_shape[spatial_shape_ptr];
const int w = spatial_shape[spatial_shape_ptr + 1];
const float h_im = loc_h * h - 0.5;
const float w_im = loc_w * w - 0.5;
const int value_ptr =
value_offset + scale_offset + value_cam_stride * cam_index;
const int depth_ptr =
depth_offset + scale_offset + value_cam_stride * cam_index;
const int weights_ptr = weight_offset + scale_index * num_groups +
weight_cam_stride * cam_index;
const float weight = weights[weights_ptr];
float *grad_location_ptr = grad_sampling_location + loc_offset;
float *grad_weights_ptr = grad_weights + weights_ptr;
bilinear_sampling_grad(mc_ms_feat, weight, h, w, num_embeds_with_depth,
num_depths, h_im, w_im, loc_d, value_ptr,
depth_ptr, grad, grad_mc_ms_feat,
grad_location_ptr, grad_weights_ptr);
}
}
}
}
void deformable_aggregation_with_depth(
float *output, const float *mc_ms_feat, const int *spatial_shape,
const int *scale_start_index, const float *sample_location,
const float *weights, int batch_size, int num_cams, int num_feat,
int num_embeds, int num_scale, int num_pts, int num_groups,
int num_depths) {
const long int num_kernels = batch_size * num_pts * num_embeds;
deformable_aggregation_with_depth_kernel<<<
(int)ceil(((double)num_kernels / 512)), 512>>>(
num_kernels, output, mc_ms_feat, spatial_shape, scale_start_index,
sample_location, weights, batch_size, num_cams, num_feat, num_embeds,
num_scale, num_pts, num_groups, num_depths);
}
void deformable_aggregation_with_depth_grad(
const float *mc_ms_feat, const int *spatial_shape,
const int *scale_start_index, const float *sample_location,
const float *weights, const float *grad_output, float *grad_mc_ms_feat,
float *grad_sampling_location, float *grad_weights, int batch_size,
int num_cams, int num_feat, int num_embeds, int num_scale, int num_pts,
int num_groups, int num_depths) {
const long int num_kernels = batch_size * num_pts * num_embeds;
deformable_aggregation_with_depth_grad_kernel<<<
(int)ceil(((double)num_kernels / 512)), 512>>>(
num_kernels, mc_ms_feat, spatial_shape, scale_start_index,
sample_location, weights, grad_output, grad_mc_ms_feat,
grad_sampling_location, grad_weights, batch_size, num_cams, num_feat,
num_embeds, num_scale, num_pts, num_groups, num_depths);
}
================================================
FILE: bip3d/registry.py
================================================
from mmengine import DATASETS as MMENGINE_DATASETS
from mmengine import METRICS as MMENGINE_METRICS
from mmengine import MODELS as MMENGINE_MODELS
from mmengine import TASK_UTILS as MMENGINE_TASK_UTILS
from mmengine import TRANSFORMS as MMENGINE_TRANSFORMS
from mmengine import VISBACKENDS as MMENGINE_VISBACKENDS
from mmengine import VISUALIZERS as MMENGINE_VISUALIZERS
from mmengine import Registry
MODELS = Registry('model',
parent=MMENGINE_MODELS,
locations=['bip3d.models'])
DATASETS = Registry('dataset',
parent=MMENGINE_DATASETS,
locations=['bip3d.datasets'])
TRANSFORMS = Registry('transform',
parent=MMENGINE_TRANSFORMS,
locations=['bip3d.datasets.transforms'])
METRICS = Registry('metric',
parent=MMENGINE_METRICS,
locations=['bip3d.eval'])
TASK_UTILS = Registry('task util',
parent=MMENGINE_TASK_UTILS,
locations=['bip3d.models'])
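# Usage sketch (added; not part of the original file): components register
# themselves into these registries and are then buildable from config dicts.
# from bip3d.registry import MODELS
# @MODELS.register_module()
# class MyDecoder(torch.nn.Module): ...
# decoder = MODELS.build(dict(type='MyDecoder'))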
================================================
FILE: bip3d/structures/__init__.py
================================================
# Copyright (c) OpenRobotLab. All rights reserved.
from .bbox_3d import (BaseInstance3DBoxes, Box3DMode, Coord3DMode,
EulerDepthInstance3DBoxes, EulerInstance3DBoxes,
get_box_type, get_proj_mat_by_coord_type, limit_period,
mono_cam_box2vis, points_cam2img, points_img2cam,
rotation_3d_in_axis, rotation_3d_in_euler, xywhr2xyxyr)
__all__ = [
'BaseInstance3DBoxes', 'Box3DMode', 'Coord3DMode', 'EulerInstance3DBoxes',
'EulerDepthInstance3DBoxes', 'get_box_type', 'get_proj_mat_by_coord_type',
'limit_period', 'mono_cam_box2vis', 'points_cam2img', 'points_img2cam',
'rotation_3d_in_axis', 'rotation_3d_in_euler', 'xywhr2xyxyr'
]
================================================
FILE: bip3d/structures/bbox_3d/__init__.py
================================================
# Copyright (c) OpenRobotLab. All rights reserved.
from .base_box3d import BaseInstance3DBoxes
from .box_3d_mode import Box3DMode
from .coord_3d_mode import Coord3DMode
from .euler_box3d import EulerInstance3DBoxes
from .euler_depth_box3d import EulerDepthInstance3DBoxes
from .utils import (batch_points_cam2img, get_box_type,
get_proj_mat_by_coord_type, limit_period, mono_cam_box2vis,
points_cam2img, points_img2cam, rotation_3d_in_axis,
rotation_3d_in_euler, xywhr2xyxyr)
__all__ = [
'Box3DMode', 'BaseInstance3DBoxes', 'EulerInstance3DBoxes',
'EulerDepthInstance3DBoxes', 'xywhr2xyxyr', 'get_box_type',
'rotation_3d_in_axis', 'rotation_3d_in_euler', 'limit_period',
'points_cam2img', 'points_img2cam', 'Coord3DMode', 'mono_cam_box2vis',
'batch_points_cam2img', 'get_proj_mat_by_coord_type'
]
================================================
FILE: bip3d/structures/bbox_3d/base_box3d.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from abc import abstractmethod
from typing import Iterator, Optional, Sequence, Tuple, Union
import numpy as np
import torch
from torch import Tensor
from bip3d.structures.points.base_points import BasePoints
from .utils import limit_period
class BaseInstance3DBoxes:
"""Base class for 3D Boxes.
Note:
The box is bottom centered, i.e. the relative position of origin in the
box is (0.5, 0.5, 0).
Args:
tensor (Tensor or np.ndarray or Sequence[Sequence[float]]): The boxes
data with shape (N, box_dim).
box_dim (int): Number of the dimension of a box. Each row is
(x, y, z, x_size, y_size, z_size, yaw). Defaults to 7.
with_yaw (bool): Whether the box is with yaw rotation. If False, the
value of yaw will be set to 0 as minmax boxes. Defaults to True.
origin (Tuple[float]): Relative position of the box origin.
Defaults to (0.5, 0.5, 0). This will guide the box to be converted
to (0.5, 0.5, 0) mode.
Attributes:
tensor (Tensor): Float matrix with shape (N, box_dim).
box_dim (int): Integer indicating the dimension of a box. Each row is
(x, y, z, x_size, y_size, z_size, yaw, ...).
with_yaw (bool): If False, the value of yaw will be set to 0 as
minmax boxes.
"""
YAW_AXIS: int = 0
def __init__(
self,
tensor: Union[Tensor, np.ndarray, Sequence[Sequence[float]]],
box_dim: int = 7,
with_yaw: bool = True,
origin: Tuple[float, float, float] = (0.5, 0.5, 0)
) -> None:
if isinstance(tensor, Tensor):
device = tensor.device
else:
device = torch.device('cpu')
tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
if tensor.numel() == 0:
# Use reshape, so we don't end up creating a new tensor that does
# not depend on the inputs (and consequently confuses jit)
tensor = tensor.reshape((-1, box_dim))
assert tensor.dim() == 2 and tensor.size(-1) == box_dim, \
('The box tensor must be 2-dimensional and the size of the last '
f'dimension must be {box_dim}, but got boxes with shape '
f'{tensor.shape}.')
if tensor.shape[-1] == 6:
# If the dimension of boxes is 6, we expand box_dim by padding 0 as
# a fake yaw and set with_yaw to False
assert box_dim == 6
fake_rot = tensor.new_zeros(tensor.shape[0], 1)
tensor = torch.cat((tensor, fake_rot), dim=-1)
self.box_dim = box_dim + 1
self.with_yaw = False
else:
self.box_dim = box_dim
self.with_yaw = with_yaw
self.tensor = tensor.clone()
if origin != (0.5, 0.5, 0):
dst = self.tensor.new_tensor((0.5, 0.5, 0))
src = self.tensor.new_tensor(origin)
self.tensor[:, :3] += self.tensor[:, 3:6] * (dst - src)
@property
def shape(self) -> torch.Size:
"""torch.Size: Shape of boxes."""
return self.tensor.shape
@property
def volume(self) -> Tensor:
"""Tensor: A vector with volume of each box in shape (N, )."""
return self.tensor[:, 3] * self.tensor[:, 4] * self.tensor[:, 5]
@property
def dims(self) -> Tensor:
"""Tensor: Size dimensions of each box in shape (N, 3)."""
return self.tensor[:, 3:6]
@property
def yaw(self) -> Tensor:
"""Tensor: A vector with yaw of each box in shape (N, )."""
return self.tensor[:, 6]
@property
def height(self) -> Tensor:
"""Tensor: A vector with height of each box in shape (N, )."""
return self.tensor[:, 5]
@property
def top_height(self) -> Tensor:
"""Tensor: A vector with top height of each box in shape (N, )."""
return self.bottom_height + self.height
@property
def bottom_height(self) -> Tensor:
"""Tensor: A vector with bottom height of each box in shape (N, )."""
return self.tensor[:, 2]
@property
def center(self) -> Tensor:
"""Calculate the center of all the boxes.
Note:
In MMDetection3D's convention, the bottom center is usually taken
as the default center.
The relative positions of the centers in different kinds of boxes
are different, e.g., the relative center of a box is
(0.5, 1.0, 0.5) in camera and (0.5, 0.5, 0) in lidar. It is
recommended to use ``bottom_center`` or ``gravity_center`` for
clearer usage.
Returns:
Tensor: A tensor with center of each box in shape (N, 3).
"""
return self.bottom_center
@property
def bottom_center(self) -> Tensor:
"""Tensor: A tensor with center of each box in shape (N, 3)."""
return self.tensor[:, :3]
@property
def gravity_center(self) -> Tensor:
"""Tensor: A tensor with center of each box in shape (N, 3)."""
bottom_center = self.bottom_center
gravity_center = torch.zeros_like(bottom_center)
gravity_center[:, :2] = bottom_center[:, :2]
gravity_center[:, 2] = bottom_center[:, 2] + self.tensor[:, 5] * 0.5
return gravity_center
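# Example (added): a bottom-centered box (x, y, z, dx, dy, dz) has gravity
# center (x, y, z + dz / 2).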
@property
def corners(self) -> Tensor:
"""Tensor: A tensor with 8 corners of each box in shape (N, 8, 3)."""
pass
@property
def bev(self) -> Tensor:
"""Tensor: 2D BEV box of each box with rotation in XYWHR format, in
shape (N, 5)."""
return self.tensor[:, [0, 1, 3, 4, 6]]
@property
def nearest_bev(self) -> Tensor:
"""Tensor: A tensor of 2D BEV box of each box without rotation."""
# Obtain BEV boxes with rotation in XYWHR format
bev_rotated_boxes = self.bev
# convert the rotation to a valid range
rotations = bev_rotated_boxes[:, -1]
normed_rotations = torch.abs(limit_period(rotations, 0.5, np.pi))
# find the center of boxes
conditions = (normed_rotations > np.pi / 4)[..., None]
bboxes_xywh = torch.where(conditions, bev_rotated_boxes[:,
[0, 1, 3, 2]],
bev_rotated_boxes[:, :4])
centers = bboxes_xywh[:, :2]
dims = bboxes_xywh[:, 2:]
bev_boxes = torch.cat([centers - dims / 2, centers + dims / 2], dim=-1)
return bev_boxes
def in_range_bev(
self, box_range: Union[Tensor, np.ndarray,
Sequence[float]]) -> Tensor:
"""Check whether the boxes are in the given range.
Args:
box_range (Tensor or np.ndarray or Sequence[float]): The range of
box in order of (x_min, y_min, x_max, y_max).
Note:
The original implementation of SECOND checks whether boxes are in a
range by checking whether the points are in a convex polygon; we
reduce the burden for simpler cases.
Returns:
Tensor: A binary vector indicating whether each box is inside the
reference range.
"""
in_range_flags = ((self.bev[:, 0] > box_range[0])
& (self.bev[:, 1] > box_range[1])
& (self.bev[:, 0] < box_range[2])
& (self.bev[:, 1] < box_range[3]))
return in_range_flags
@abstractmethod
def rotate(
self,
angle: Union[Tensor, np.ndarray, float],
points: Optional[Union[Tensor, np.ndarray, BasePoints]] = None
) -> Union[Tuple[Tensor, Tensor], Tuple[np.ndarray, np.ndarray], Tuple[
BasePoints, Tensor], None]:
"""Rotate boxes with points (optional) with the given angle or rotation
matrix.
Args:
angle (Tensor or np.ndarray or float): Rotation angle or rotation
matrix.
points (Tensor or np.ndarray or :obj:`BasePoints`, optional):
Points to rotate. Defaults to None.
Returns:
tuple or None: When ``points`` is None, the function returns None,
otherwise it returns the rotated points and the rotation matrix
``rot_mat_T``.
"""
pass
@abstractmethod
def flip(
self,
bev_direction: str = 'horizontal',
points: Optional[Union[Tensor, np.ndarray, BasePoints]] = None
) -> Union[Tensor, np.ndarray, BasePoints, None]:
"""Flip the boxes in BEV along given BEV direction.
Args:
bev_direction (str): Direction by which to flip. Can be chosen from
'horizontal' and 'vertical'. Defaults to 'horizontal'.
points (Tensor or np.ndarray or :obj:`BasePoints`, optional):
Points to flip. Defaults to None.
Returns:
Tensor or np.ndarray or :obj:`BasePoints` or None: When ``points``
is None, the function returns None, otherwise it returns the
flipped points.
"""
pass
def translate(self, trans_vector: Union[Tensor, np.ndarray]) -> None:
"""Translate boxes with the given translation vector.
Args:
trans_vector (Tensor or np.ndarray): Translation vector of size
1x3.
"""
if not isinstance(trans_vector, Tensor):
trans_vector = self.tensor.new_tensor(trans_vector)
self.tensor[:, :3] += trans_vector
def in_range_3d(
self, box_range: Union[Tensor, np.ndarray,
Sequence[float]]) -> Tensor:
"""Check whether the boxes are in the given range.
Args:
box_range (Tensor or np.ndarray or Sequence[float]): The range of
box (x_min, y_min, z_min, x_max, y_max, z_max).
Note:
In the original implementation of SECOND, checking whether a box is
in the range is done by checking whether its points lie in a convex
polygon; here we reduce the burden for simpler cases.
Returns:
Tensor: A binary vector indicating whether each box is inside the
reference range.
"""
in_range_flags = ((self.tensor[:, 0] > box_range[0])
& (self.tensor[:, 1] > box_range[1])
& (self.tensor[:, 2] > box_range[2])
& (self.tensor[:, 0] < box_range[3])
& (self.tensor[:, 1] < box_range[4])
& (self.tensor[:, 2] < box_range[5]))
return in_range_flags
@abstractmethod
def convert_to(self,
dst: int,
rt_mat: Optional[Union[Tensor, np.ndarray]] = None,
correct_yaw: bool = False) -> 'BaseInstance3DBoxes':
"""Convert self to ``dst`` mode.
Args:
dst (int): The target Box mode.
rt_mat (Tensor or np.ndarray, optional): The rotation and
translation matrix between different coordinates.
Defaults to None. The conversion from ``src`` coordinates to
``dst`` coordinates usually comes along the change of sensors,
e.g., from camera to LiDAR. This requires a transformation
matrix.
correct_yaw (bool): Whether to convert the yaw angle to the target
coordinate. Defaults to False.
Returns:
:obj:`BaseInstance3DBoxes`: The converted box of the same type in
the ``dst`` mode.
"""
pass
def scale(self, scale_factor: float) -> None:
"""Scale the box with horizontal and vertical scaling factors.
Args:
scale_factor (float): Scale factor to scale the boxes.
"""
self.tensor[:, :6] *= scale_factor
self.tensor[:, 7:] *= scale_factor # velocity
def limit_yaw(self, offset: float = 0.5, period: float = np.pi) -> None:
"""Limit the yaw to a given period and offset.
Args:
offset (float): The offset of the yaw. Defaults to 0.5.
period (float): The expected period. Defaults to np.pi.
"""
self.tensor[:, 6] = limit_period(self.tensor[:, 6], offset, period)
def nonempty(self, threshold: float = 0.0) -> Tensor:
"""Find boxes that are non-empty.
A box is considered empty if any of its sides is no larger than the
threshold.
Args:
threshold (float): The threshold of minimal sizes. Defaults to 0.0.
Returns:
Tensor: A binary vector which represents whether each box is empty
(False) or non-empty (True).
"""
box = self.tensor
size_x = box[..., 3]
size_y = box[..., 4]
size_z = box[..., 5]
keep = ((size_x > threshold)
& (size_y > threshold) & (size_z > threshold))
return keep
def __getitem__(
self, item: Union[int, slice, np.ndarray,
Tensor]) -> 'BaseInstance3DBoxes':
"""
Args:
item (int or slice or np.ndarray or Tensor): Index of boxes.
Note:
The following usages are allowed:
1. `new_boxes = boxes[3]`: Return a `Boxes` that contains only one
box.
2. `new_boxes = boxes[2:10]`: Return a slice of boxes.
3. `new_boxes = boxes[vector]`: Where vector is a
torch.BoolTensor with `length = len(boxes)`. Nonzero elements in
the vector will be selected.
Note that the returned Boxes might share storage with this Boxes,
subject to PyTorch's indexing semantics.
Returns:
:obj:`BaseInstance3DBoxes`: A new object of
:class:`BaseInstance3DBoxes` after indexing.
"""
original_type = type(self)
if isinstance(item, int):
return original_type(self.tensor[item].view(1, -1),
box_dim=self.box_dim,
with_yaw=self.with_yaw)
b = self.tensor[item]
assert b.dim() == 2, \
f'Indexing on Boxes with {item} failed to return a matrix!'
return original_type(b, box_dim=self.box_dim, with_yaw=self.with_yaw)
def __len__(self) -> int:
"""int: Number of boxes in the current object."""
return self.tensor.shape[0]
def __repr__(self) -> str:
"""str: Return a string that describes the object."""
return self.__class__.__name__ + '(\n ' + str(self.tensor) + ')'
@classmethod
def cat(cls, boxes_list: Sequence['BaseInstance3DBoxes']
) -> 'BaseInstance3DBoxes':
"""Concatenate a list of Boxes into a single Boxes.
Args:
boxes_list (Sequence[:obj:`BaseInstance3DBoxes`]): List of boxes.
Returns:
:obj:`BaseInstance3DBoxes`: The concatenated boxes.
"""
assert isinstance(boxes_list, (list, tuple))
if len(boxes_list) == 0:
return cls(torch.empty(0))
assert all(isinstance(box, cls) for box in boxes_list)
# use torch.cat (v.s. layers.cat)
# so the returned boxes never share storage with input
cat_boxes = cls(torch.cat([b.tensor for b in boxes_list], dim=0),
box_dim=boxes_list[0].box_dim,
with_yaw=boxes_list[0].with_yaw)
return cat_boxes
def numpy(self) -> np.ndarray:
"""Reload ``numpy`` from self.tensor."""
return self.tensor.numpy()
def to(self, device: Union[str, torch.device], *args,
**kwargs) -> 'BaseInstance3DBoxes':
"""Convert current boxes to a specific device.
Args:
device (str or :obj:`torch.device`): The name of the device.
Returns:
:obj:`BaseInstance3DBoxes`: A new boxes object on the specific
device.
"""
original_type = type(self)
return original_type(self.tensor.to(device, *args, **kwargs),
box_dim=self.box_dim,
with_yaw=self.with_yaw)
def cpu(self) -> 'BaseInstance3DBoxes':
"""Convert current boxes to cpu device.
Returns:
:obj:`BaseInstance3DBoxes`: A new boxes object on the cpu device.
"""
original_type = type(self)
return original_type(self.tensor.cpu(),
box_dim=self.box_dim,
with_yaw=self.with_yaw)
def cuda(self, *args, **kwargs) -> 'BaseInstance3DBoxes':
"""Convert current boxes to cuda device.
Returns:
:obj:`BaseInstance3DBoxes`: A new boxes object on the cuda device.
"""
original_type = type(self)
return original_type(self.tensor.cuda(*args, **kwargs),
box_dim=self.box_dim,
with_yaw=self.with_yaw)
def clone(self) -> 'BaseInstance3DBoxes':
"""Clone the boxes.
Returns:
:obj:`BaseInstance3DBoxes`: Box object with the same properties as
self.
"""
original_type = type(self)
return original_type(self.tensor.clone(),
box_dim=self.box_dim,
with_yaw=self.with_yaw)
def detach(self) -> 'BaseInstance3DBoxes':
"""Detach the boxes.
Returns:
:obj:`BaseInstance3DBoxes`: Box object with the same properties as
self.
"""
original_type = type(self)
return original_type(self.tensor.detach(),
box_dim=self.box_dim,
with_yaw=self.with_yaw)
@property
def device(self) -> torch.device:
"""torch.device: The device of the boxes are on."""
return self.tensor.device
def __iter__(self) -> Iterator[Tensor]:
"""Yield a box as a Tensor at a time.
Returns:
Iterator[Tensor]: A box of shape (box_dim, ).
"""
yield from self.tensor
@classmethod
def height_overlaps(cls, boxes1: 'BaseInstance3DBoxes',
boxes2: 'BaseInstance3DBoxes') -> Tensor:
"""Calculate height overlaps of two boxes.
Note:
            This function calculates the height overlaps between ``boxes1``
            and ``boxes2``; ``boxes1`` and ``boxes2`` should be of the same
            type.
Args:
boxes1 (:obj:`BaseInstance3DBoxes`): Boxes 1 contain N boxes.
boxes2 (:obj:`BaseInstance3DBoxes`): Boxes 2 contain M boxes.
Returns:
Tensor: Calculated height overlap of the boxes.
"""
assert isinstance(boxes1, BaseInstance3DBoxes)
assert isinstance(boxes2, BaseInstance3DBoxes)
assert type(boxes1) == type(boxes2), \
'"boxes1" and "boxes2" should be in the same type, ' \
f'but got {type(boxes1)} and {type(boxes2)}.'
boxes1_top_height = boxes1.top_height.view(-1, 1)
boxes1_bottom_height = boxes1.bottom_height.view(-1, 1)
boxes2_top_height = boxes2.top_height.view(1, -1)
boxes2_bottom_height = boxes2.bottom_height.view(1, -1)
        highest_of_bottom = torch.max(boxes1_bottom_height,
                                      boxes2_bottom_height)
        lowest_of_top = torch.min(boxes1_top_height, boxes2_top_height)
        overlaps_h = torch.clamp(lowest_of_top - highest_of_bottom, min=0)
return overlaps_h
def new_box(
self, data: Union[Tensor, np.ndarray, Sequence[Sequence[float]]]
) -> 'BaseInstance3DBoxes':
"""Create a new box object with data.
        The new box and its tensor have properties similar to those of self
        and self.tensor, respectively.
Args:
data (Tensor or np.ndarray or Sequence[Sequence[float]]): Data to
be copied.
Returns:
:obj:`BaseInstance3DBoxes`: A new bbox object with ``data``, the
object's other properties are similar to ``self``.
"""
new_tensor = self.tensor.new_tensor(data) \
if not isinstance(data, Tensor) else data.to(self.device)
original_type = type(self)
return original_type(new_tensor,
box_dim=self.box_dim,
with_yaw=self.with_yaw)
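
# --- Illustrative usage sketch (editor addition, not part of the original
# module). It exercises ``in_range_3d`` and boolean indexing through the
# concrete ``EulerDepthInstance3DBoxes`` subclass from this package; the box
# values below are made up.
if __name__ == '__main__':
    from bip3d.structures.bbox_3d.euler_depth_box3d import \
        EulerDepthInstance3DBoxes

    boxes = EulerDepthInstance3DBoxes(
        torch.tensor([[0., 0., 0., 1., 1., 1., 0., 0., 0.],
                      [5., 5., 5., 2., 2., 2., 0., 0., 0.]]))
    # keep only the boxes whose centers fall inside the given xyz range
    mask = boxes.in_range_3d([-1., -1., -1., 2., 2., 2.])
    print(mask)  # tensor([ True, False])
    print(len(boxes[mask]))  # 1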
================================================
FILE: bip3d/structures/bbox_3d/box_3d_mode.py
================================================
# Copyright (c) OpenRobotLab. All rights reserved.
from enum import IntEnum, unique
from typing import Optional, Sequence, Union
import numpy as np
import torch
from torch import Tensor
from .base_box3d import BaseInstance3DBoxes
from .utils import limit_period
@unique
class Box3DMode(IntEnum):
"""Enum of different ways to represent a box.
Coordinates in LiDAR:
.. code-block:: none
up z
^ x front
| /
| /
left y <------ 0
The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0),
and the yaw is around the z axis, thus the rotation axis=2.
Coordinates in Camera:
.. code-block:: none
z front
/
/
0 ------> x right
|
|
v
down y
The relative coordinate of bottom center in a CAM box is (0.5, 1.0, 0.5),
and the yaw is around the y axis, thus the rotation axis=1.
Coordinates in Depth:
.. code-block:: none
up z
^ y front
| /
| /
0 ------> x right
The relative coordinate of bottom center in a DEPTH box is (0.5, 0.5, 0),
and the yaw is around the z axis, thus the rotation axis=2.
"""
LIDAR = 0
CAM = 1
DEPTH = 2
EULER_CAM = 3
EULER_DEPTH = 4
@staticmethod
def convert(
box: Union[Sequence[float], np.ndarray, Tensor, BaseInstance3DBoxes],
src: 'Box3DMode',
dst: 'Box3DMode',
rt_mat: Optional[Union[np.ndarray, Tensor]] = None,
with_yaw: bool = True,
correct_yaw: bool = False
) -> Union[Sequence[float], np.ndarray, Tensor, BaseInstance3DBoxes]:
"""Convert boxes from ``src`` mode to ``dst`` mode.
Args:
box (Sequence[float] or np.ndarray or Tensor or
:obj:`BaseInstance3DBoxes`): Can be a k-tuple, k-list or an Nxk
array/tensor.
src (:obj:`Box3DMode`): The source box mode.
dst (:obj:`Box3DMode`): The target box mode.
rt_mat (np.ndarray or Tensor, optional): The rotation and
translation matrix between different coordinates.
Defaults to None. The conversion from ``src`` coordinates to
``dst`` coordinates usually comes along the change of sensors,
e.g., from camera to LiDAR. This requires a transformation
matrix.
with_yaw (bool): If ``box`` is an instance of
:obj:`BaseInstance3DBoxes`, whether or not it has a yaw angle.
Defaults to True.
            correct_yaw (bool): Whether the yaw should be corrected by
                ``rt_mat``. Defaults to False.
Returns:
Sequence[float] or np.ndarray or Tensor or
:obj:`BaseInstance3DBoxes`: The converted box of the same type.
"""
if src == dst:
return box
is_numpy = isinstance(box, np.ndarray)
is_Instance3DBoxes = isinstance(box, BaseInstance3DBoxes)
single_box = isinstance(box, (list, tuple))
if single_box:
assert len(box) >= 7, (
'Box3DMode.convert takes either a k-tuple/list or '
'an Nxk array/tensor, where k >= 7')
arr = torch.tensor(box)[None, :]
else:
# avoid modifying the input box
if is_numpy:
arr = torch.from_numpy(np.asarray(box)).clone()
elif is_Instance3DBoxes:
arr = box.tensor.clone()
else:
arr = box.clone()
if is_Instance3DBoxes:
with_yaw = box.with_yaw
# convert box from `src` mode to `dst` mode.
x_size, y_size, z_size = arr[..., 3:4], arr[..., 4:5], arr[..., 5:6]
if with_yaw:
yaw = arr[..., 6:7]
if src == Box3DMode.LIDAR and dst == Box3DMode.CAM:
if rt_mat is None:
rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]])
xyz_size = torch.cat([x_size, z_size, y_size], dim=-1)
if with_yaw:
if correct_yaw:
yaw_vector = torch.cat([
torch.cos(yaw),
torch.sin(yaw),
torch.zeros_like(yaw)
],
dim=1)
else:
yaw = -yaw - np.pi / 2
yaw = limit_period(yaw, period=np.pi * 2)
elif src == Box3DMode.CAM and dst == Box3DMode.LIDAR:
if rt_mat is None:
rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]])
xyz_size = torch.cat([x_size, z_size, y_size], dim=-1)
if with_yaw:
if correct_yaw:
yaw_vector = torch.cat([
torch.cos(-yaw),
torch.zeros_like(yaw),
torch.sin(-yaw)
],
dim=1)
else:
yaw = -yaw - np.pi / 2
yaw = limit_period(yaw, period=np.pi * 2)
elif src == Box3DMode.DEPTH and dst == Box3DMode.CAM:
if rt_mat is None:
rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]])
xyz_size = torch.cat([x_size, z_size, y_size], dim=-1)
if with_yaw:
if correct_yaw:
yaw_vector = torch.cat([
torch.cos(yaw),
torch.sin(yaw),
torch.zeros_like(yaw)
],
dim=1)
else:
yaw = -yaw
elif src == Box3DMode.CAM and dst == Box3DMode.DEPTH:
if rt_mat is None:
rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]])
xyz_size = torch.cat([x_size, z_size, y_size], dim=-1)
if with_yaw:
if correct_yaw:
yaw_vector = torch.cat([
torch.cos(-yaw),
torch.zeros_like(yaw),
torch.sin(-yaw)
],
dim=1)
else:
yaw = -yaw
elif src == Box3DMode.LIDAR and dst == Box3DMode.DEPTH:
if rt_mat is None:
rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]])
xyz_size = torch.cat([x_size, y_size, z_size], dim=-1)
if with_yaw:
if correct_yaw:
yaw_vector = torch.cat([
torch.cos(yaw),
torch.sin(yaw),
torch.zeros_like(yaw)
],
dim=1)
else:
yaw = yaw + np.pi / 2
yaw = limit_period(yaw, period=np.pi * 2)
elif src == Box3DMode.DEPTH and dst == Box3DMode.LIDAR:
if rt_mat is None:
rt_mat = arr.new_tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]])
xyz_size = torch.cat([x_size, y_size, z_size], dim=-1)
if with_yaw:
if correct_yaw:
yaw_vector = torch.cat([
torch.cos(yaw),
torch.sin(yaw),
torch.zeros_like(yaw)
],
dim=1)
else:
yaw = yaw - np.pi / 2
yaw = limit_period(yaw, period=np.pi * 2)
else: # TODO: add transformation between euler boxes
raise NotImplementedError(
f'Conversion from Box3DMode {src} to {dst} '
'is not supported yet')
if not isinstance(rt_mat, Tensor):
rt_mat = arr.new_tensor(rt_mat)
if rt_mat.size(1) == 4:
extended_xyz = torch.cat(
[arr[..., :3], arr.new_ones(arr.size(0), 1)], dim=-1)
xyz = extended_xyz @ rt_mat.t()
else:
xyz = arr[..., :3] @ rt_mat.t()
# Note: we only use rotation in rt_mat
# so don't need to extend yaw_vector
if with_yaw and correct_yaw:
rot_yaw_vector = yaw_vector @ rt_mat[:3, :3].t()
if dst == Box3DMode.CAM:
yaw = torch.atan2(-rot_yaw_vector[:, [2]], rot_yaw_vector[:,
[0]])
elif dst in [Box3DMode.LIDAR, Box3DMode.DEPTH]:
yaw = torch.atan2(rot_yaw_vector[:, [1]], rot_yaw_vector[:,
[0]])
yaw = limit_period(yaw, period=np.pi * 2)
if with_yaw:
remains = arr[..., 7:]
arr = torch.cat([xyz[..., :3], xyz_size, yaw, remains], dim=-1)
else:
remains = arr[..., 6:]
arr = torch.cat([xyz[..., :3], xyz_size, remains], dim=-1)
# convert arr to the original type
original_type = type(box)
if single_box:
return original_type(arr.flatten().tolist())
if is_numpy:
return arr.numpy()
elif is_Instance3DBoxes:
raise NotImplementedError(
f'Conversion to {dst} through {original_type} '
'is not supported yet')
else:
return arr
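
# --- Illustrative usage sketch (editor addition, not part of the original
# module): converting a single 7-DoF (x, y, z, dx, dy, dz, yaw) LiDAR box to
# camera coordinates with the default rt_mat. The box values are made up.
if __name__ == '__main__':
    lidar_box = np.array([[1.0, 2.0, -1.0, 4.0, 1.8, 1.6, 0.3]])
    cam_box = Box3DMode.convert(lidar_box, Box3DMode.LIDAR, Box3DMode.CAM)
    print(cam_box.shape)  # (1, 7), sizes permuted and yaw re-expressed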
================================================
FILE: bip3d/structures/bbox_3d/coord_3d_mode.py
================================================
# Copyright (c) OpenRobotLab. All rights reserved.
from enum import IntEnum, unique
from typing import Optional, Sequence, Union
import numpy as np
import torch
from torch import Tensor
from bip3d.structures.points import (BasePoints, CameraPoints,
DepthPoints, LiDARPoints)
from .base_box3d import BaseInstance3DBoxes
from .box_3d_mode import Box3DMode
@unique
class Coord3DMode(IntEnum):
"""Enum of different ways to represent a box and point cloud.
Coordinates in LiDAR:
.. code-block:: none
up z
^ x front
| /
| /
left y <------ 0
The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0),
and the yaw is around the z axis, thus the rotation axis=2.
Coordinates in Camera:
.. code-block:: none
z front
/
/
0 ------> x right
|
|
v
down y
The relative coordinate of bottom center in a CAM box is (0.5, 1.0, 0.5),
and the yaw is around the y axis, thus the rotation axis=1.
Coordinates in Depth:
.. code-block:: none
up z
^ y front
| /
| /
0 ------> x right
The relative coordinate of bottom center in a DEPTH box is (0.5, 0.5, 0),
and the yaw is around the z axis, thus the rotation axis=2.
"""
LIDAR = 0
CAM = 1
DEPTH = 2
@staticmethod
def convert(input: Union[Sequence[float], np.ndarray, Tensor,
BaseInstance3DBoxes, BasePoints],
src: Union[Box3DMode, 'Coord3DMode'],
dst: Union[Box3DMode, 'Coord3DMode'],
rt_mat: Optional[Union[np.ndarray, Tensor]] = None,
with_yaw: bool = True,
correct_yaw: bool = False,
is_point: bool = True):
"""Convert boxes or points from ``src`` mode to ``dst`` mode.
Args:
input (Sequence[float] or np.ndarray or Tensor or
:obj:`BaseInstance3DBoxes` or :obj:`BasePoints`): Can be a
k-tuple, k-list or an Nxk array/tensor.
src (:obj:`Box3DMode` or :obj:`Coord3DMode`): The source mode.
dst (:obj:`Box3DMode` or :obj:`Coord3DMode`): The target mode.
rt_mat (np.ndarray or Tensor, optional): The rotation and
translation matrix between different coordinates.
Defaults to None. The conversion from ``src`` coordinates to
``dst`` coordinates usually comes along the change of sensors,
e.g., from camera to LiDAR. This requires a transformation
matrix.
with_yaw (bool): If ``box`` is an instance of
:obj:`BaseInstance3DBoxes`, whether or not it has a yaw angle.
Defaults to True.
            correct_yaw (bool): Whether the yaw should be corrected by
                ``rt_mat``. Defaults to False.
is_point (bool): If ``input`` is neither an instance of
:obj:`BaseInstance3DBoxes` nor an instance of
:obj:`BasePoints`, whether or not it is point data.
Defaults to True.
Returns:
Sequence[float] or np.ndarray or Tensor or
:obj:`BaseInstance3DBoxes` or :obj:`BasePoints`: The converted box
or points of the same type.
"""
if isinstance(input, BaseInstance3DBoxes):
return Coord3DMode.convert_box(input,
src,
dst,
rt_mat=rt_mat,
with_yaw=with_yaw,
correct_yaw=correct_yaw)
elif isinstance(input, BasePoints):
return Coord3DMode.convert_point(input, src, dst, rt_mat=rt_mat)
elif isinstance(input, (tuple, list, np.ndarray, Tensor)):
if is_point:
return Coord3DMode.convert_point(input,
src,
dst,
rt_mat=rt_mat)
else:
return Coord3DMode.convert_box(input,
src,
dst,
rt_mat=rt_mat,
with_yaw=with_yaw,
correct_yaw=correct_yaw)
else:
raise NotImplementedError
@staticmethod
def convert_box(
box: Union[Sequence[float], np.ndarray, Tensor, BaseInstance3DBoxes],
src: Box3DMode,
dst: Box3DMode,
rt_mat: Optional[Union[np.ndarray, Tensor]] = None,
with_yaw: bool = True,
correct_yaw: bool = False
) -> Union[Sequence[float], np.ndarray, Tensor, BaseInstance3DBoxes]:
"""Convert boxes from ``src`` mode to ``dst`` mode.
Args:
box (Sequence[float] or np.ndarray or Tensor or
:obj:`BaseInstance3DBoxes`): Can be a k-tuple, k-list or an Nxk
array/tensor.
src (:obj:`Box3DMode`): The source box mode.
dst (:obj:`Box3DMode`): The target box mode.
rt_mat (np.ndarray or Tensor, optional): The rotation and
translation matrix between different coordinates.
Defaults to None. The conversion from ``src`` coordinates to
``dst`` coordinates usually comes along the change of sensors,
e.g., from camera to LiDAR. This requires a transformation
matrix.
with_yaw (bool): If ``box`` is an instance of
:obj:`BaseInstance3DBoxes`, whether or not it has a yaw angle.
Defaults to True.
            correct_yaw (bool): Whether the yaw should be corrected by
                ``rt_mat``. Defaults to False.
Returns:
Sequence[float] or np.ndarray or Tensor or
:obj:`BaseInstance3DBoxes`: The converted box of the same type.
"""
return Box3DMode.convert(box,
src,
dst,
rt_mat=rt_mat,
with_yaw=with_yaw,
correct_yaw=correct_yaw)
@staticmethod
def convert_point(
point: Union[Sequence[float], np.ndarray, Tensor, BasePoints],
src: 'Coord3DMode',
dst: 'Coord3DMode',
rt_mat: Optional[Union[np.ndarray, Tensor]] = None,
) -> Union[Sequence[float], np.ndarray, Tensor, BasePoints]:
"""Convert points from ``src`` mode to ``dst`` mode.
Args:
            point (Sequence[float] or np.ndarray or Tensor or
                :obj:`BasePoints`): Can be a k-tuple, k-list or an Nxk
                array/tensor.
src (:obj:`Coord3DMode`): The source point mode.
dst (:obj:`Coord3DMode`): The target point mode.
rt_mat (np.ndarray or Tensor, optional): The rotation and
translation matrix between different coordinates.
Defaults to None. The conversion from ``src`` coordinates to
``dst`` coordinates usually comes along the change of sensors,
e.g., from camera to LiDAR. This requires a transformation
matrix.
Returns:
Sequence[float] or np.ndarray or Tensor or :obj:`BasePoints`: The
converted point of the same type.
"""
if src == dst:
return point
is_numpy = isinstance(point, np.ndarray)
is_InstancePoints = isinstance(point, BasePoints)
single_point = isinstance(point, (list, tuple))
if single_point:
assert len(point) >= 3, (
'Coord3DMode.convert takes either a k-tuple/list or '
'an Nxk array/tensor, where k >= 3')
arr = torch.tensor(point)[None, :]
else:
# avoid modifying the input point
if is_numpy:
arr = torch.from_numpy(np.asarray(point)).clone()
elif is_InstancePoints:
arr = point.tensor.clone()
else:
arr = point.clone()
# convert point from `src` mode to `dst` mode.
if src == Coord3DMode.LIDAR and dst == Coord3DMode.CAM:
if rt_mat is None:
rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]])
elif src == Coord3DMode.CAM and dst == Coord3DMode.LIDAR:
if rt_mat is None:
rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]])
elif src == Coord3DMode.DEPTH and dst == Coord3DMode.CAM:
if rt_mat is None:
rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]])
elif src == Coord3DMode.CAM and dst == Coord3DMode.DEPTH:
if rt_mat is None:
rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]])
elif src == Coord3DMode.LIDAR and dst == Coord3DMode.DEPTH:
if rt_mat is None:
rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]])
elif src == Coord3DMode.DEPTH and dst == Coord3DMode.LIDAR:
if rt_mat is None:
rt_mat = arr.new_tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]])
else:
raise NotImplementedError(
f'Conversion from Coord3DMode {src} to {dst} '
'is not supported yet')
if not isinstance(rt_mat, Tensor):
rt_mat = arr.new_tensor(rt_mat)
if rt_mat.size(1) == 4:
extended_xyz = torch.cat(
[arr[..., :3], arr.new_ones(arr.size(0), 1)], dim=-1)
xyz = extended_xyz @ rt_mat.t()
else:
xyz = arr[..., :3] @ rt_mat.t()
remains = arr[..., 3:]
arr = torch.cat([xyz[..., :3], remains], dim=-1)
# convert arr to the original type
original_type = type(point)
if single_point:
return original_type(arr.flatten().tolist())
if is_numpy:
return arr.numpy()
elif is_InstancePoints:
if dst == Coord3DMode.CAM:
target_type = CameraPoints
elif dst == Coord3DMode.LIDAR:
target_type = LiDARPoints
elif dst == Coord3DMode.DEPTH:
target_type = DepthPoints
else:
raise NotImplementedError(
f'Conversion to {dst} through {original_type} '
'is not supported yet')
return target_type(arr,
points_dim=arr.size(-1),
attribute_dims=point.attribute_dims)
else:
return arr
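
# --- Illustrative usage sketch (editor addition, not part of the original
# module): converting raw depth-coordinate points to camera coordinates with
# the default rt_mat. The point values are made up.
if __name__ == '__main__':
    pts = torch.tensor([[1.0, 2.0, 3.0]])  # (x right, y front, z up)
    cam_pts = Coord3DMode.convert_point(pts, Coord3DMode.DEPTH,
                                        Coord3DMode.CAM)
    print(cam_pts)  # tensor([[1., -3., 2.]]): y_cam = -z_depth, z_cam = y_depth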
================================================
FILE: bip3d/structures/bbox_3d/euler_box3d.py
================================================
# Copyright (c) OpenRobotLab. All rights reserved.
import numpy as np
import torch
from pytorch3d.ops import box3d_overlap
from pytorch3d.transforms import euler_angles_to_matrix, matrix_to_euler_angles
from ..points.base_points import BasePoints
from .base_box3d import BaseInstance3DBoxes
from .utils import rotation_3d_in_euler
class EulerInstance3DBoxes(BaseInstance3DBoxes):
"""3D boxes with 1-D orientation represented by three Euler angles.
See https://en.wikipedia.org/wiki/Euler_angles for
regarding the definition of Euler angles.
Attributes:
tensor (torch.Tensor): Float matrix of N x box_dim.
box_dim (int): Integer indicates the dimension of a box
Each row is (x, y, z, x_size, y_size, z_size, alpha, beta, gamma).
"""
def __init__(self, tensor, box_dim=9, origin=(0.5, 0.5, 0.5)):
if isinstance(tensor, torch.Tensor):
device = tensor.device
else:
device = torch.device('cpu')
tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
if tensor.numel() == 0:
# Use reshape, so we don't end up creating a new tensor that
# does not depend on the inputs (and consequently confuses jit)
tensor = tensor.reshape((0, box_dim)).to(dtype=torch.float32,
device=device)
assert tensor.dim() == 2 and tensor.size(-1) == box_dim, tensor.size()
if tensor.shape[-1] == 6:
# If the dimension of boxes is 6, we expand box_dim by padding
# (0, 0, 0) as a fake euler angle.
assert box_dim == 6
fake_rot = tensor.new_zeros(tensor.shape[0], 3)
tensor = torch.cat((tensor, fake_rot), dim=-1)
self.box_dim = box_dim + 3
elif tensor.shape[-1] == 7:
assert box_dim == 7
fake_euler = tensor.new_zeros(tensor.shape[0], 2)
tensor = torch.cat((tensor, fake_euler), dim=-1)
self.box_dim = box_dim + 2
else:
assert tensor.shape[-1] == 9
self.box_dim = box_dim
self.tensor = tensor.clone()
self.origin = origin
if origin != (0.5, 0.5, 0.5):
dst = self.tensor.new_tensor((0.5, 0.5, 0.5))
src = self.tensor.new_tensor(origin)
self.tensor[:, :3] += self.tensor[:, 3:6] * (dst - src)
def get_corners(self, tensor1):
"""torch.Tensor: Coordinates of corners of all the boxes
in shape (N, 8, 3).
Convert the boxes to corners in clockwise order, in form of
``(x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0)``
.. code-block:: none
up z
front y ^
/ |
/ |
(x0, y1, z1) + ----------- + (x1, y1, z1)
/| / |
/ | / |
(x0, y0, z1) + ----------- + + (x1, y1, z0)
| / . | /
| / origin | /
(x0, y0, z0) + ----------- + --------> right x
(x1, y0, z0)
"""
if tensor1.numel() == 0:
return torch.empty([0, 8, 3], device=tensor1.device)
dims = tensor1[:, 3:6]
corners_norm = torch.from_numpy(
np.stack(np.unravel_index(np.arange(8), [2] * 3),
axis=1)).to(device=dims.device, dtype=dims.dtype)
corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]]
# use relative origin
assert self.origin == (0.5, 0.5, 0.5), \
'self.origin != (0.5, 0.5, 0.5) needs to be checked!'
corners_norm = corners_norm - dims.new_tensor(self.origin)
corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3])
# rotate
corners = rotation_3d_in_euler(corners, tensor1[:, 6:])
corners += tensor1[:, :3].view(-1, 1, 3)
return corners
@classmethod
def overlaps(cls, boxes1, boxes2, mode='iou', eps=1e-4):
"""Calculate 3D overlaps of two boxes.
Note:
            This function calculates the overlaps between ``boxes1`` and
            ``boxes2``; ``boxes1`` and ``boxes2`` should be of the same type.
Args:
boxes1 (:obj:`EulerInstance3DBoxes`): Boxes 1 contain N boxes.
boxes2 (:obj:`EulerInstance3DBoxes`): Boxes 2 contain M boxes.
mode (str): Mode of iou calculation. Defaults to 'iou'.
            eps (float): Epsilon. Defaults to 1e-4.
Returns:
torch.Tensor: Calculated 3D overlaps of the boxes.
"""
assert isinstance(boxes1, EulerInstance3DBoxes)
assert isinstance(boxes2, EulerInstance3DBoxes)
        assert type(boxes1) == type(boxes2), '"boxes1" and "boxes2" should ' \
            f'be of the same type, got {type(boxes1)} and {type(boxes2)}.'
assert mode in ['iou']
rows = len(boxes1)
cols = len(boxes2)
if rows * cols == 0:
return boxes1.tensor.new(rows, cols)
corners1 = boxes1.corners
corners2 = boxes2.corners
_, iou3d = box3d_overlap(corners1, corners2, eps=eps)
return iou3d
@property
def gravity_center(self):
"""torch.Tensor: A tensor with center of each box in shape (N, 3)."""
return self.tensor[:, :3]
@property
def corners(self):
"""torch.Tensor: Coordinates of corners of all the boxes
in shape (N, 8, 3).
Convert the boxes to corners in clockwise order, in form of
``(x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0)``
.. code-block:: none
up z
front y ^
/ |
/ |
(x0, y1, z1) + ----------- + (x1, y1, z1)
/| / |
/ | / |
(x0, y0, z1) + ----------- + + (x1, y1, z0)
| / . | /
| / origin | /
(x0, y0, z0) + ----------- + --------> right x
(x1, y0, z0)
"""
if self.tensor.numel() == 0:
return torch.empty([0, 8, 3], device=self.tensor.device)
dims = self.dims
corners_norm = torch.from_numpy(
np.stack(np.unravel_index(np.arange(8), [2] * 3),
axis=1)).to(device=dims.device, dtype=dims.dtype)
corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]]
# use relative origin
assert self.origin == (0.5, 0.5, 0.5), \
'self.origin != (0.5, 0.5, 0.5) needs to be checked!'
corners_norm = corners_norm - dims.new_tensor(self.origin)
corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3])
# rotate
corners = rotation_3d_in_euler(corners, self.tensor[:, 6:])
corners += self.tensor[:, :3].view(-1, 1, 3)
return corners
def transform(self, matrix):
if self.tensor.shape[0] == 0:
return
if not isinstance(matrix, torch.Tensor):
matrix = self.tensor.new_tensor(matrix)
points = self.tensor[:, :3]
constant = points.new_ones(points.shape[0], 1)
points_extend = torch.concat([points, constant], dim=-1)
points_trans = torch.matmul(points_extend, matrix.transpose(-2,
-1))[:, :3]
size = self.tensor[:, 3:6]
# angle_delta = matrix_to_euler_angles(matrix[:3,:3], 'ZXY')
# angle = self.tensor[:,6:] + angle_delta
ori_matrix = euler_angles_to_matrix(self.tensor[:, 6:], 'ZXY')
rot_matrix = matrix[:3, :3].expand_as(ori_matrix)
final = torch.bmm(rot_matrix, ori_matrix)
angle = matrix_to_euler_angles(final, 'ZXY')
self.tensor = torch.cat([points_trans, size, angle], dim=-1)
def scale(self, scale_factor: float) -> None:
"""Scale the box with horizontal and vertical scaling factors.
Args:
scale_factors (float): Scale factors to scale the boxes.
"""
self.tensor[:, :6] *= scale_factor
def rotate(self, angle, points=None):
"""Rotate boxes with points (optional) with the given angle or rotation
matrix.
Args:
angle (float | torch.Tensor | np.ndarray):
Rotation angle or rotation matrix.
points (torch.Tensor | np.ndarray | :obj:`BasePoints`, optional):
Points to rotate. Defaults to None.
Returns:
tuple or None: When ``points`` is None, the function returns
None, otherwise it returns the rotated points and the
rotation matrix ``rot_mat_T``.
"""
if not isinstance(angle, torch.Tensor):
angle = self.tensor.new_tensor(angle)
if angle.numel() == 1: # only given yaw angle for rotation
angle = self.tensor.new_tensor([angle, 0., 0.])
rot_matrix = euler_angles_to_matrix(angle, 'ZXY')
elif angle.numel() == 3:
rot_matrix = euler_angles_to_matrix(angle, 'ZXY')
elif angle.shape == torch.Size([3, 3]):
rot_matrix = angle
else:
raise NotImplementedError
rot_mat_T = rot_matrix.T
transform_matrix = torch.eye(4)
transform_matrix[:3, :3] = rot_matrix
self.transform(transform_matrix)
if points is not None:
if isinstance(points, torch.Tensor):
points[:, :3] = points[:, :3] @ rot_mat_T
elif isinstance(points, np.ndarray):
rot_mat_T = rot_mat_T.cpu().numpy()
points[:, :3] = np.dot(points[:, :3], rot_mat_T)
elif isinstance(points, BasePoints):
points.rotate(rot_mat_T)
else:
raise ValueError
return points, rot_mat_T
else:
return rot_mat_T
def flip(self, direction='X'):
"""Flip the boxes along the corresponding axis.
Args:
direction (str, optional): Flip axis. Defaults to 'X'.
"""
assert direction in ['X', 'Y', 'Z']
if direction == 'X':
self.tensor[:, 0] = -self.tensor[:, 0]
self.tensor[:, 6] = -self.tensor[:, 6] + np.pi
self.tensor[:, 8] = -self.tensor[:, 8]
elif direction == 'Y':
self.tensor[:, 1] = -self.tensor[:, 1]
self.tensor[:, 6] = -self.tensor[:, 6]
self.tensor[:, 7] = -self.tensor[:, 7] + np.pi
elif direction == 'Z':
self.tensor[:, 2] = -self.tensor[:, 2]
self.tensor[:, 7] = -self.tensor[:, 7]
self.tensor[:, 8] = -self.tensor[:, 8] + np.pi
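
# --- Illustrative usage sketch (editor addition, not part of the original
# module; requires pytorch3d at runtime): rotating a box by 90 degrees
# around the z axis and computing its IoU with itself. Values are made up.
if __name__ == '__main__':
    boxes = EulerInstance3DBoxes(
        torch.tensor([[0., 0., 0., 2., 1., 1., 0., 0., 0.]]))
    boxes.rotate(torch.tensor([np.pi / 2, 0., 0.]))
    print(boxes.tensor[0, 6])  # yaw is now ~pi/2
    print(EulerInstance3DBoxes.overlaps(boxes, boxes))  # ~1.0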
================================================
FILE: bip3d/structures/bbox_3d/euler_depth_box3d.py
================================================
# Copyright (c) OpenRobotLab. All rights reserved.
import numpy as np
import torch
from mmcv.ops import points_in_boxes_all, points_in_boxes_part
from ..points.base_points import BasePoints
from .euler_box3d import EulerInstance3DBoxes
class EulerDepthInstance3DBoxes(EulerInstance3DBoxes):
"""3D boxes of instances in Depth coordinates.
We keep the "Depth" coordinate system definition in MMDet3D just for
clarification of the points coordinates and the flipping augmentation.
Coordinates in Depth:
.. code-block:: none
up z y front (alpha=0.5*pi)
^ ^
| /
| /
0 ------> x right (alpha=0)
The relative coordinate of bottom center in a Depth box is (0.5, 0.5, 0),
and the yaw is around the z axis, thus the rotation axis=2.
The yaw is 0 at the positive direction of x axis, and decreases from
the positive direction of x to the positive direction of y.
Also note that rotation of DepthInstance3DBoxes is counterclockwise,
which is reverse to the definition of the yaw angle (clockwise).
Attributes:
tensor (torch.Tensor): Float matrix of N x box_dim.
box_dim (int): Integer indicates the dimension of a box
Each row is (x, y, z, x_size, y_size, z_size, alpha, beta, gamma).
        with_yaw (bool): Whether the box has a yaw angle. If False, the
            value of yaw will be set to 0 as for minmax boxes.
"""
def __init__(self,
tensor,
box_dim=9,
with_yaw=True,
origin=(0.5, 0.5, 0.5)):
super().__init__(tensor, box_dim, origin)
self.with_yaw = with_yaw
def flip(self, bev_direction='horizontal', points=None):
"""Flip the boxes in BEV along given BEV direction.
In Depth coordinates, it flips x (horizontal) or y (vertical) axis.
Args:
bev_direction (str, optional): Flip direction
(horizontal or vertical). Defaults to 'horizontal'.
points (torch.Tensor | np.ndarray | :obj:`BasePoints`, optional):
Points to flip. Defaults to None.
Returns:
torch.Tensor, numpy.ndarray or None: Flipped points.
"""
assert bev_direction in ('horizontal', 'vertical')
if bev_direction == 'horizontal':
super().flip(direction='X')
elif bev_direction == 'vertical':
super().flip(direction='Y')
if points is not None:
assert isinstance(points, (torch.Tensor, np.ndarray, BasePoints))
if isinstance(points, (torch.Tensor, np.ndarray)):
if bev_direction == 'horizontal':
points[:, 0] = -points[:, 0]
elif bev_direction == 'vertical':
points[:, 1] = -points[:, 1]
elif isinstance(points, BasePoints):
points.flip(bev_direction)
return points
def convert_to(self, dst, rt_mat=None):
"""Convert self to ``dst`` mode.
Args:
dst (:obj:`Box3DMode`): The target Box mode.
rt_mat (np.ndarray | torch.Tensor, optional): The rotation and
translation matrix between different coordinates.
Defaults to None.
The conversion from ``src`` coordinates to ``dst`` coordinates
usually comes along the change of sensors, e.g., from camera
to LiDAR. This requires a transformation matrix.
Returns:
            :obj:`EulerDepthInstance3DBoxes`:
                The converted box of the same type in the ``dst`` mode.
"""
from .box_3d_mode import Box3DMode
assert dst == Box3DMode.EULER_DEPTH
return self
def points_in_boxes_part(self, points, boxes_override=None):
"""Find the box in which each point is.
Args:
points (torch.Tensor): Points in shape (1, M, 3) or (M, 3),
3 dimensions are (x, y, z) in LiDAR or depth coordinate.
boxes_override (torch.Tensor, optional): Boxes to override
`self.tensor`. Defaults to None.
Returns:
torch.Tensor: The index of the first box that each point
is in, in shape (M, ). Default value is -1
(if the point is not enclosed by any box).
Note:
If a point is enclosed by multiple boxes, the index of the
first box will be returned.
"""
if boxes_override is not None:
boxes = boxes_override
else:
boxes = self.tensor
if points.dim() == 2:
points = points.unsqueeze(0)
# TODO: take euler angles into consideration
aligned_boxes = boxes[..., :7].clone()
aligned_boxes[..., 6] = 0
box_idx = points_in_boxes_part(
points,
aligned_boxes.unsqueeze(0).to(points.device)).squeeze(0)
return box_idx
def points_in_boxes_all(self, points, boxes_override=None):
"""Find all boxes in which each point is.
Args:
points (torch.Tensor): Points in shape (1, M, 3) or (M, 3),
3 dimensions are (x, y, z) in LiDAR or depth coordinate.
boxes_override (torch.Tensor, optional): Boxes to override
`self.tensor`. Defaults to None.
Returns:
torch.Tensor: A tensor indicating whether a point is in a box,
in shape (M, T). T is the number of boxes. Denote this
                tensor as A, if the m^th point is in the t^th box, then
                `A[m, t] == 1`, otherwise `A[m, t] == 0`.
"""
if boxes_override is not None:
boxes = boxes_override
else:
boxes = self.tensor
points_clone = points.clone()[..., :3]
if points_clone.dim() == 2:
points_clone = points_clone.unsqueeze(0)
else:
assert points_clone.dim() == 3 and points_clone.shape[0] == 1
boxes = boxes.to(points_clone.device).unsqueeze(0)
# TODO: take euler angles into consideration
aligned_boxes = boxes[..., :7].clone()
aligned_boxes[..., 6] = 0
box_idxs_of_pts = points_in_boxes_all(points_clone, aligned_boxes)
return box_idxs_of_pts.squeeze(0)
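
# --- Illustrative usage sketch (editor addition, not part of the original
# module): a horizontal BEV flip negates x for both the boxes and the
# accompanying points. Values are made up.
if __name__ == '__main__':
    boxes = EulerDepthInstance3DBoxes(
        torch.tensor([[1., 2., 0., 2., 1., 1., 0.3, 0., 0.]]))
    pts = torch.tensor([[1.0, 2.0, 0.5]])
    flipped_pts = boxes.flip('horizontal', points=pts)
    print(boxes.tensor[0, 0])  # tensor(-1.)
    print(flipped_pts)  # tensor([[-1., 2., 0.5]])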
================================================
FILE: bip3d/structures/bbox_3d/utils.py
================================================
# Copyright (c) OpenRobotLab. All rights reserved.
import warnings
from typing import Tuple, Union
import numpy as np
import torch
from pytorch3d.transforms import euler_angles_to_matrix, quaternion_to_matrix
from torch import Tensor
from bip3d.utils.array_converter import array_converter
@array_converter(apply_to=('val', ))
def limit_period(val: Union[np.ndarray, Tensor],
offset: float = 0.5,
period: float = np.pi) -> Union[np.ndarray, Tensor]:
"""Limit the value into a period for periodic function.
Args:
val (np.ndarray or Tensor): The value to be converted.
offset (float): Offset to set the value range. Defaults to 0.5.
period (float): Period of the value. Defaults to np.pi.
Returns:
np.ndarray or Tensor: Value in the range of
[-offset * period, (1-offset) * period].
"""
limited_val = val - torch.floor(val / period + offset) * period
return limited_val
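
# A quick numeric check (editor addition, not part of the original module):
# with the default offset and period, values are wrapped into [-pi/2, pi/2).
if __name__ == '__main__':
    angles = torch.tensor([-0.5 * np.pi, 1.5 * np.pi])
    print(limit_period(angles))  # tensor([-1.5708, -1.5708])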
@array_converter(apply_to=('points', 'angles'))
def rotation_3d_in_euler(points, angles, return_mat=False, clockwise=False):
"""Rotate points by angles according to axis.
Args:
points (np.ndarray | torch.Tensor | list | tuple ):
Points of shape (N, M, 3).
angles (np.ndarray | torch.Tensor | list | tuple):
Vector of angles in shape (N, 3)
return_mat: Whether or not return the rotation matrix (transposed).
Defaults to False.
clockwise: Whether the rotation is clockwise. Defaults to False.
Raises:
ValueError: when the axis is not in range [0, 1, 2], it will
raise value error.
Returns:
(torch.Tensor | np.ndarray): Rotated points in shape (N, M, 3).
"""
batch_free = len(points.shape) == 2
if batch_free:
points = points[None]
if len(angles.shape) == 1:
angles = angles.expand(points.shape[:1] + (3, ))
# angles = torch.full(points.shape[:1], angles)
    assert len(points.shape) == 3 and len(angles.shape) == 2 \
        and points.shape[0] == angles.shape[0], 'Incorrect shape of points ' \
        f'and angles: {points.shape}, {angles.shape}'
assert points.shape[-1] in [2, 3], \
f'Points size should be 2 or 3 instead of {points.shape[-1]}'
if angles.shape[1] == 3:
rot_mat_T = euler_angles_to_matrix(angles, 'ZXY') # N, 3,3
else:
rot_mat_T = quaternion_to_matrix(angles) # N, 3,3
rot_mat_T = rot_mat_T.transpose(-2, -1)
if clockwise:
raise NotImplementedError('clockwise')
if points.shape[0] == 0:
points_new = points
else:
points_new = torch.bmm(points, rot_mat_T)
if batch_free:
points_new = points_new.squeeze(0)
if return_mat:
if batch_free:
rot_mat_T = rot_mat_T.squeeze(0)
return points_new, rot_mat_T
else:
return points_new
@array_converter(apply_to=('points', 'angles'))
def rotation_3d_in_axis(
points: Union[np.ndarray, Tensor],
angles: Union[np.ndarray, Tensor, float],
axis: int = 0,
return_mat: bool = False,
clockwise: bool = False
) -> Union[Tuple[np.ndarray, np.ndarray], Tuple[Tensor, Tensor], np.ndarray,
Tensor]:
"""Rotate points by angles according to axis.
Args:
points (np.ndarray or Tensor): Points with shape (N, M, 3).
angles (np.ndarray or Tensor or float): Vector of angles with shape
(N, ).
axis (int): The axis to be rotated. Defaults to 0.
return_mat (bool): Whether or not to return the rotation matrix
(transposed). Defaults to False.
clockwise (bool): Whether the rotation is clockwise. Defaults to False.
Raises:
ValueError: When the axis is not in range [-3, -2, -1, 0, 1, 2], it
will raise ValueError.
Returns:
Tuple[np.ndarray, np.ndarray] or Tuple[Tensor, Tensor] or np.ndarray or
Tensor: Rotated points with shape (N, M, 3) and rotation matrix with
shape (N, 3, 3).
"""
batch_free = len(points.shape) == 2
if batch_free:
points = points[None]
if isinstance(angles, float) or len(angles.shape) == 0:
angles = torch.full(points.shape[:1], angles)
    assert len(points.shape) == 3 and len(angles.shape) == 1 and \
        points.shape[0] == angles.shape[0], 'Incorrect shape of points ' \
        f'and angles: {points.shape}, {angles.shape}'
assert points.shape[-1] in [2, 3], \
f'Points size should be 2 or 3 instead of {points.shape[-1]}'
rot_sin = torch.sin(angles)
rot_cos = torch.cos(angles)
ones = torch.ones_like(rot_cos)
zeros = torch.zeros_like(rot_cos)
if points.shape[-1] == 3:
if axis == 1 or axis == -2:
rot_mat_T = torch.stack([
torch.stack([rot_cos, zeros, -rot_sin]),
torch.stack([zeros, ones, zeros]),
torch.stack([rot_sin, zeros, rot_cos])
])
elif axis == 2 or axis == -1:
rot_mat_T = torch.stack([
torch.stack([rot_cos, rot_sin, zeros]),
torch.stack([-rot_sin, rot_cos, zeros]),
torch.stack([zeros, zeros, ones])
])
elif axis == 0 or axis == -3:
rot_mat_T = torch.stack([
torch.stack([ones, zeros, zeros]),
torch.stack([zeros, rot_cos, rot_sin]),
torch.stack([zeros, -rot_sin, rot_cos])
])
else:
raise ValueError(
f'axis should in range [-3, -2, -1, 0, 1, 2], got {axis}')
else:
rot_mat_T = torch.stack([
torch.stack([rot_cos, rot_sin]),
torch.stack([-rot_sin, rot_cos])
])
if clockwise:
rot_mat_T = rot_mat_T.transpose(0, 1)
if points.shape[0] == 0:
points_new = points
else:
points_new = torch.einsum('aij,jka->aik', points, rot_mat_T)
if batch_free:
points_new = points_new.squeeze(0)
if return_mat:
rot_mat_T = torch.einsum('jka->ajk', rot_mat_T)
if batch_free:
rot_mat_T = rot_mat_T.squeeze(0)
return points_new, rot_mat_T
else:
return points_new
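
# A quick numeric check (editor addition, not part of the original module):
# rotating the unit x vector by pi/2 around the z axis yields the unit y
# vector (up to floating-point error).
if __name__ == '__main__':
    pts = torch.tensor([[[1.0, 0.0, 0.0]]])
    print(rotation_3d_in_axis(pts, torch.tensor([np.pi / 2]), axis=2))
    # ~tensor([[[0., 1., 0.]]])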
@array_converter(apply_to=('boxes_xywhr', ))
def xywhr2xyxyr(
boxes_xywhr: Union[Tensor, np.ndarray]) -> Union[Tensor, np.ndarray]:
"""Convert a rotated boxes in XYWHR format to XYXYR format.
Args:
boxes_xywhr (Tensor or np.ndarray): Rotated boxes in XYWHR format.
Returns:
Tensor or np.ndarray: Converted boxes in XYXYR format.
"""
boxes = torch.zeros_like(boxes_xywhr)
half_w = boxes_xywhr[..., 2] / 2
half_h = boxes_xywhr[..., 3] / 2
boxes[..., 0] = boxes_xywhr[..., 0] - half_w
boxes[..., 1] = boxes_xywhr[..., 1] - half_h
boxes[..., 2] = boxes_xywhr[..., 0] + half_w
boxes[..., 3] = boxes_xywhr[..., 1] + half_h
boxes[..., 4] = boxes_xywhr[..., 4]
return boxes
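
# A quick numeric check (editor addition, not part of the original module):
# the center/size box (2, 3, 4, 2) becomes the corner box (0, 2, 4, 4), and
# the rotation is carried over unchanged.
if __name__ == '__main__':
    print(xywhr2xyxyr(torch.tensor([[2.0, 3.0, 4.0, 2.0, 0.1]])))
    # tensor([[0.0000, 2.0000, 4.0000, 4.0000, 0.1000]])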
def get_box_type(box_type: str) -> Tuple[type, int]:
"""Get the type and mode of box structure.
    We temporarily only support ``EulerDepthInstance3DBoxes`` for 9-DoF box
    operations and will consider refactoring this function as more box types
    are supported.
    Args:
        box_type (str): The type of box structure. The only valid value at
            the moment is "Euler-Depth" (case-insensitive).
    Raises:
        ValueError: A ValueError is raised when ``box_type`` is not a valid
            type.
Returns:
tuple: Box type and box mode.
"""
from .box_3d_mode import Box3DMode
from .euler_depth_box3d import EulerDepthInstance3DBoxes
box_type_lower = box_type.lower()
if box_type_lower == 'euler-depth':
box_type_3d = EulerDepthInstance3DBoxes
box_mode_3d = Box3DMode.EULER_DEPTH
# elif box_type_lower == 'euler-camera':
# box_type_3d = EulerCameraInstance3DBoxes
# box_mode_3d = Box3DMode.EULER_CAM
else:
raise ValueError(
'Only "box_type" of "camera", "lidar", "depth", "euler"'
f' are supported, got {box_type}')
return box_type_3d, box_mode_3d
@array_converter(apply_to=('points_3d', 'proj_mat'))
def points_cam2img(points_3d: Union[Tensor, np.ndarray],
proj_mat: Union[Tensor, np.ndarray],
with_depth: bool = False) -> Union[Tensor, np.ndarray]:
"""Project points in camera coordinates to image coordinates.
Args:
points_3d (Tensor or np.ndarray): Points in shape (N, 3).
proj_mat (Tensor or np.ndarray): Transformation matrix between
coordinates.
with_depth (bool): Whether to keep depth in the output.
Defaults to False.
Returns:
Tensor or np.ndarray: Points in image coordinates with shape [N, 2] if
``with_depth=False``, else [N, 3].
"""
points_shape = list(points_3d.shape)
points_shape[-1] = 1
assert len(proj_mat.shape) == 2, \
'The dimension of the projection matrix should be 2 ' \
f'instead of {len(proj_mat.shape)}.'
d1, d2 = proj_mat.shape[:2]
assert (d1 == 3 and d2 == 3) or (d1 == 3 and d2 == 4) or \
(d1 == 4 and d2 == 4), 'The shape of the projection matrix ' \
f'({d1}*{d2}) is not supported.'
if d1 == 3:
proj_mat_expanded = torch.eye(4,
device=proj_mat.device,
dtype=proj_mat.dtype)
proj_mat_expanded[:d1, :d2] = proj_mat
proj_mat = proj_mat_expanded
    # the previous implementation used new_zeros; new_ones yields better
    # results
points_4 = torch.cat([points_3d, points_3d.new_ones(points_shape)], dim=-1)
point_2d = points_4 @ proj_mat.T
point_2d_res = point_2d[..., :2] / point_2d[..., 2:3]
if with_depth:
point_2d_res = torch.cat([point_2d_res, point_2d[..., 2:3]], dim=-1)
return point_2d_res
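
# A quick numeric check (editor addition, not part of the original module)
# with a hypothetical pinhole intrinsic matrix: u = fx * x / z + cx and
# v = fy * y / z + cy.
if __name__ == '__main__':
    cam2img = torch.tensor([[500., 0., 320.],
                            [0., 500., 240.],
                            [0., 0., 1.]])
    pts = torch.tensor([[1.0, 0.5, 5.0]])
    print(points_cam2img(pts, cam2img))  # tensor([[420., 290.]])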
@array_converter(apply_to=('points_3d', 'proj_mat'))
def batch_points_cam2img(points_3d, proj_mat, with_depth=False):
"""Project points in camera coordinates to image coordinates.
Args:
points_3d (torch.Tensor | np.ndarray): Points in shape (N, D, 3)
proj_mat (torch.Tensor | np.ndarray):
Transformation matrix between coordinates.
with_depth (bool, optional): Whether to keep depth in the output.
Defaults to False.
Returns:
(torch.Tensor | np.ndarray): Points in image coordinates,
with shape [N, D, 2] if `with_depth=False`, else [N, D, 3].
"""
points_shape = list(points_3d.shape)
points_shape[-1] = 1
    assert len(proj_mat.shape) == 3, 'The dimension of the projection'\
        f' matrix should be 3 instead of {len(proj_mat.shape)}.'
d0, d1, d2 = proj_mat.shape[:3]
assert (d1 == 3 and d2 == 3) or (d1 == 3 and d2 == 4) or (
d1 == 4 and d2 == 4), 'The shape of the projection matrix'\
f' ({d1}*{d2}) is not supported.'
if d1 == 3:
proj_mat_expanded = torch.eye(4,
device=proj_mat.device,
dtype=proj_mat.dtype)
proj_mat_expanded = proj_mat_expanded[None, :, :].expand(d0, -1, -1)
proj_mat_expanded[:, :d1, :d2] = proj_mat
proj_mat = proj_mat_expanded
    # the previous implementation used new_zeros; new_ones yields better
    # results
points_4 = torch.cat([points_3d, points_3d.new_ones(points_shape)], dim=-1)
# do the batch wise operation
point_2d = torch.bmm(points_4, proj_mat.permute(0, 2, 1))
# point_2d = points_4 @ proj_mat.T
point_2d_res = point_2d[..., :2] / point_2d[..., 2:3].clamp(min=1e-3)
if with_depth:
point_2d_res = torch.cat([point_2d_res, point_2d[..., 2:3]], dim=-1)
return point_2d_res
@array_converter(apply_to=('points', 'cam2img'))
def points_img2cam(
points: Union[Tensor, np.ndarray],
cam2img: Union[Tensor, np.ndarray]) -> Union[Tensor, np.ndarray]:
"""Project points in image coordinates to camera coordinates.
Args:
points (Tensor or np.ndarray): 2.5D points in 2D images with shape
[N, 3], 3 corresponds with x, y in the image and depth.
cam2img (Tensor or np.ndarray): Camera intrinsic matrix. The shape can
be [3, 3], [3, 4] or [4, 4].
Returns:
Tensor or np.ndarray: Points in 3D space with shape [N, 3], 3
corresponds with x, y, z in 3D space.
"""
assert cam2img.shape[0] <= 4
assert cam2img.shape[1] <= 4
assert points.shape[1] == 3
xys = points[:, :2]
depths = points[:, 2].view(-1, 1)
unnormed_xys = torch.cat([xys * depths, depths], dim=1)
pad_cam2img = torch.eye(4, dtype=xys.dtype, device=xys.device)
pad_cam2img[:cam2img.shape[0], :cam2img.shape[1]] = cam2img
inv_pad_cam2img = torch.inverse(pad_cam2img).transpose(0, 1)
# Do operation in homogeneous coordinates.
num_points = unnormed_xys.shape[0]
homo_xys = torch.cat([unnormed_xys, xys.new_ones((num_points, 1))], dim=1)
points3D = torch.mm(homo_xys, inv_pad_cam2img)[:, :3]
return points3D
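
# A quick numeric check (editor addition, not part of the original module):
# unprojecting the pixel produced by the ``points_cam2img`` example above,
# together with its depth, recovers the original camera-frame point.
if __name__ == '__main__':
    cam2img = torch.tensor([[500., 0., 320.],
                            [0., 500., 240.],
                            [0., 0., 1.]])
    print(points_img2cam(torch.tensor([[420.0, 290.0, 5.0]]), cam2img))
    # tensor([[1.0000, 0.5000, 5.0000]])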
def mono_cam_box2vis(cam_box):
"""This is a post-processing function on the bboxes from Mono-3D task. If
we want to perform projection visualization, we need to:
1. rotate the box along x-axis for np.pi / 2 (roll)
2. change orientation from local yaw to global yaw
3. convert yaw by (np.pi / 2 - yaw)
After applying this function, we can project and draw it on 2D images.
Args:
cam_box (:obj:`CameraInstance3DBoxes`): 3D bbox in camera coordinate
system before conversion. Could be gt bbox loaded from dataset or
network prediction output.
Returns:
:obj:`CameraInstance3DBoxes`: Box after conversion.
"""
    warnings.warn('DeprecationWarning: The hack of yaw and dimension in the '
                  'monocular 3D detection on nuScenes has been removed. The '
                  'function mono_cam_box2vis will be deprecated.')
from .cam_box3d import CameraInstance3DBoxes
assert isinstance(cam_box, CameraInstance3DBoxes), \
'input bbox should be CameraInstance3DBoxes!'
loc = cam_box.gravity_center
dim = cam_box.dims
yaw = cam_box.yaw
feats = cam_box.tensor[:, 7:]
# rotate along x-axis for np.pi / 2
# see also here: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/nuscenes_mono_dataset.py#L557 # noqa
dim[:, [1, 2]] = dim[:, [2, 1]]
# change local yaw to global yaw for visualization
# refer to https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/nuscenes_mono_dataset.py#L164-L166 # noqa
yaw += torch.atan2(loc[:, 0], loc[:, 2])
# convert yaw by (-yaw - np.pi / 2)
# this is because mono 3D box class such as `NuScenesBox` has different
# definition of rotation with our `CameraInstance3DBoxes`
yaw = -yaw - np.pi / 2
cam_box = torch.cat([loc, dim, yaw[:, None], feats], dim=1)
cam_box = CameraInstance3DBoxes(cam_box,
box_dim=cam_box.shape[-1],
origin=(0.5, 0.5, 0.5))
return cam_box
def get_proj_mat_by_coord_type(img_meta: dict, coord_type: str) -> Tensor:
"""Obtain image features using points.
Args:
img_meta (dict): Meta information.
coord_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'. Can be case-
insensitive.
Returns:
Tensor: Transformation matrix.
"""
coord_type = coord_type.upper()
mapping = {'LIDAR': 'lidar2img', 'DEPTH': 'depth2img', 'CAMERA': 'cam2img'}
assert coord_type in mapping.keys()
return img_meta[mapping[coord_type]]
def yaw2local(yaw: Tensor, loc: Tensor) -> Tensor:
"""Transform global yaw to local yaw (alpha in kitti) in camera
coordinates, ranges from -pi to pi.
Args:
        yaw (Tensor): A vector with the global yaw of each box in shape
            (N, ).
Returns:
Tensor: Local yaw (alpha in kitti).
"""
local_yaw = yaw - torch.atan2(loc[:, 0], loc[:, 2])
larger_idx = (local_yaw > np.pi).nonzero(as_tuple=False)
small_idx = (local_yaw < -np.pi).nonzero(as_tuple=False)
if len(larger_idx) != 0:
local_yaw[larger_idx] -= 2 * np.pi
if len(small_idx) != 0:
local_yaw[small_idx] += 2 * np.pi
return local_yaw
def get_lidar2img(cam2img: Tensor, lidar2cam: Tensor) -> Tensor:
"""Get the projection matrix of lidar2img.
Args:
cam2img (torch.Tensor): A 3x3 or 4x4 projection matrix.
lidar2cam (torch.Tensor): A 3x3 or 4x4 projection matrix.
Returns:
Tensor: Transformation matrix with shape 4x4.
"""
if cam2img.shape == (3, 3):
temp = cam2img.new_zeros(4, 4)
temp[:3, :3] = cam2img
temp[3, 3] = 1
cam2img = temp
if lidar2cam.shape == (3, 3):
temp = lidar2cam.new_zeros(4, 4)
temp[:3, :3] = lidar2cam
temp[3, 3] = 1
lidar2cam = temp
return torch.matmul(cam2img, lidar2cam)
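
# A quick shape check (editor addition, not part of the original module):
# 3x3 inputs are padded to 4x4 before composition, so the result is always
# a 4x4 matrix.
if __name__ == '__main__':
    print(get_lidar2img(torch.eye(3), torch.eye(4)).shape)  # (4, 4)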
================================================
FILE: bip3d/structures/ops/__init__.py
================================================
# Copyright (c) OpenRobotLab. All rights reserved.
# yapf:disable
from .box_np_ops import (box2d_to_corner_jit, box3d_to_bbox,
box_camera_to_lidar, boxes3d_to_corners3d_lidar,
camera_to_lidar, center_to_corner_box2d,
center_to_corner_box3d, center_to_minmax_2d,
corner_to_standup_nd_jit, corner_to_surfaces_3d,
corner_to_surfaces_3d_jit, corners_nd,
create_anchors_3d_range, depth_to_lidar_points,
depth_to_points, get_frustum, iou_jit,
minmax_to_corner_2d, points_in_convex_polygon_3d_jit,
points_in_convex_polygon_jit, points_in_rbbox,
projection_matrix_to_CRT_kitti, rbbox2d_to_near_bbox,
remove_outside_points, rotation_points_single_angle,
surface_equ_3d)
# yapf:enable
from .iou3d_calculator import (AxisAlignedBboxOverlaps3D, BboxOverlaps3D,
BboxOverlapsNearest3D,
axis_aligned_bbox_overlaps_3d, bbox_overlaps_3d,
bbox_overlaps_nearest_3d)
from .transforms import bbox3d2result, bbox3d2roi, bbox3d_mapping_back
__all__ = [
'box2d_to_corner_jit', 'box3d_to_bbox', 'box_camera_to_lidar',
'boxes3d_to_corners3d_lidar', 'camera_to_lidar', 'center_to_corner_box2d',
'center_to_corner_box3d', 'center_to_minmax_2d',
'corner_to_standup_nd_jit', 'corner_to_surfaces_3d',
'corner_to_surfaces_3d_jit', 'corners_nd', 'create_anchors_3d_range',
'depth_to_lidar_points', 'depth_to_points', 'get_frustum', 'iou_jit',
'minmax_to_corner_2d', 'points_in_convex_polygon_3d_jit',
'points_in_convex_polygon_jit', 'points_in_rbbox',
'projection_matrix_to_CRT_kitti', 'rbbox2d_to_near_bbox',
'remove_outside_points', 'rotation_points_single_angle', 'surface_equ_3d',
'BboxOverlapsNearest3D', 'BboxOverlaps3D', 'bbox_overlaps_nearest_3d',
'bbox_overlaps_3d', 'AxisAlignedBboxOverlaps3D',
'axis_aligned_bbox_overlaps_3d', 'bbox3d_mapping_back', 'bbox3d2roi',
'bbox3d2result'
]
================================================
FILE: bip3d/structures/ops/box_np_ops.py
================================================
# Copyright (c) OpenRobotLab. All rights reserved.
# TODO: clean the functions in this file and move the APIs into box bbox_3d
# in the future
# NOTICE: All functions in this file are valid for LiDAR or depth boxes only
# if we use default parameters.
import numba
import numpy as np
from bip3d.structures.bbox_3d import (limit_period, points_cam2img,
rotation_3d_in_axis)
def camera_to_lidar(points, r_rect, velo2cam):
"""Convert points in camera coordinate to lidar coordinate.
Note:
This function is for KITTI only.
Args:
points (np.ndarray, shape=[N, 3]): Points in camera coordinate.
r_rect (np.ndarray, shape=[4, 4]): Matrix to project points in
specific camera coordinate (e.g. CAM2) to CAM0.
velo2cam (np.ndarray, shape=[4, 4]): Matrix to project points in
camera coordinate to lidar coordinate.
Returns:
np.ndarray, shape=[N, 3]: Points in lidar coordinate.
"""
points_shape = list(points.shape[0:-1])
if points.shape[-1] == 3:
points = np.concatenate([points, np.ones(points_shape + [1])], axis=-1)
lidar_points = points @ np.linalg.inv((r_rect @ velo2cam).T)
return lidar_points[..., :3]
def box_camera_to_lidar(data, r_rect, velo2cam):
"""Convert boxes in camera coordinate to lidar coordinate.
Note:
This function is for KITTI only.
Args:
data (np.ndarray, shape=[N, 7]): Boxes in camera coordinate.
r_rect (np.ndarray, shape=[4, 4]): Matrix to project points in
specific camera coordinate (e.g. CAM2) to CAM0.
velo2cam (np.ndarray, shape=[4, 4]): Matrix to project points in
camera coordinate to lidar coordinate.
    Returns:
        np.ndarray, shape=[N, 7]: Boxes in lidar coordinate.
"""
xyz = data[:, 0:3]
x_size, y_size, z_size = data[:, 3:4], data[:, 4:5], data[:, 5:6]
r = data[:, 6:7]
xyz_lidar = camera_to_lidar(xyz, r_rect, velo2cam)
    # yaw and dims also need to be converted
r_new = -r - np.pi / 2
r_new = limit_period(r_new, period=np.pi * 2)
return np.concatenate([xyz_lidar, x_size, z_size, y_size, r_new], axis=1)
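
# A quick shape check (editor addition, not part of the original module):
# with identity r_rect/velo2cam the translation is unchanged, but the size
# order and yaw are still re-expressed for the lidar convention.
if __name__ == '__main__':
    cam_boxes = np.array([[1., 2., 3., 1.5, 1.6, 3.9, 0.1]])
    print(box_camera_to_lidar(cam_boxes, np.eye(4), np.eye(4)).shape)  # (1, 7)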
def corners_nd(dims, origin=0.5):
"""Generate relative box corners based on length per dim and origin point.
Args:
        dims (np.ndarray, shape=[N, ndim]): Array of lengths per dim.
        origin (list or array or float, optional): Origin point relative to
            the smallest point. Defaults to 0.5.
Returns:
np.ndarray, shape=[N, 2 ** ndim, ndim]: Returned corners.
point layout example: (2d) x0y0, x0y1, x1y0, x1y1;
(3d) x0y0z0, x0y0z1, x0y1z0, x0y1z1, x1y0z0, x1y0z1, x1y1z0, x1y1z1
where x0 < x1, y0 < y1, z0 < z1.
"""
ndim = int(dims.shape[1])
corners_norm = np.stack(np.unravel_index(np.arange(2**ndim), [2] * ndim),
axis=1).astype(dims.dtype)
# now corners_norm has format: (2d) x0y0, x0y1, x1y0, x1y1
# (3d) x0y0z0, x0y0z1, x0y1z0, x0y1z1, x1y0z0, x1y0z1, x1y1z0, x1y1z1
    # so we need to convert them to a format that is convenient for other
    # computations.
    # for 2d boxes, the format is clockwise starting from the minimum point.
    # for 3d boxes, please draw the lines by hand.
if ndim == 2:
# generate clockwise box corners
corners_norm = corners_norm[[0, 1, 3, 2]]
elif ndim == 3:
corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]]
corners_norm = corners_norm - np.array(origin, dtype=dims.dtype)
corners = dims.reshape([-1, 1, ndim]) * corners_norm.reshape(
[1, 2**ndim, ndim])
return corners
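
# A quick shape check (editor addition, not part of the original module):
# a single 2D box of size (2, 4) yields 4 corners ordered clockwise from
# the minimum point.
if __name__ == '__main__':
    print(corners_nd(np.array([[2.0, 4.0]])).shape)  # (1, 4, 2)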
def center_to_corner_box2d(centers, dims, angles=None, origin=0.5):
"""Convert kitti locations, dimensions and angles to corners.
format: center(xy), dims(xy), angles(counterclockwise when positive)
Args:
centers (np.ndarray): Locations in kitti label file with shape (N, 2).
dims (np.ndarray): Dimensions in kitti label file with shape (N, 2).
angles (np.ndarray, optional): Rotation_y in kitti label file with
shape (N). Defaults to None.
        origin (list or array or float, optional): Origin point relative to
            the smallest point. Defaults to 0.5.
Returns:
np.ndarray: Corners with the shape of (N, 4, 2).
"""
# 'length' in kitti format is in x axis.
# xyz(hwl)(kitti label file)<->xyz(lhw)(camera)<->z(-x)(-y)(wlh)(lidar)
# center in kitti format is [0.5, 1.0, 0.5] in xyz.
corners = corners_nd(dims, origin=origin)
# corners: [N, 4, 2]
if angles is not None:
corners = rotation_3d_in_axis(corners, angles)
corners += centers.reshape([-1, 1, 2])
return corners
@numba.jit(nopython=True)
def depth_to_points(depth, trunc_pixel):
"""Convert depth map to points.
Args:
        depth (np.ndarray, shape=[H, W]): Depth map in which the rows
            [0, `trunc_pixel`) are truncated.
        trunc_pixel (int): The number of truncated rows.
Returns:
np.ndarray: Points in camera coordinates.
"""
num_pts = np.sum(depth[trunc_pixel:, ] > 0.1)
points = np.zeros((num_pts, 3), dtype=depth.dtype)
x = np.array([0, 0, 1], dtype=depth.dtype)
k = 0
for i in range(trunc_pixel, depth.shape[0]):
for j in range(depth.shape[1]):
if depth[i, j] > 0.1:
x = np.array([j, i, 1], dtype=depth.dtype)
points[k] = x * depth[i, j]
k += 1
return points
def depth_to_lidar_points(depth, trunc_pixel, P2, r_rect, velo2cam):
"""Convert depth map to points in lidar coordinate.
Args:
        depth (np.ndarray, shape=[H, W]): Depth map in which the rows
            [0, `trunc_pixel`) are truncated.
        trunc_pixel (int): The number of truncated rows.
        P2 (np.ndarray, shape=[4, 4]): Intrinsics of Camera2.
r_rect (np.ndarray, shape=[4, 4]): Matrix to project points in
specific camera coordinate (e.g. CAM2) to CAM0.
velo2cam (np.ndarray, shape=[4, 4]): Matrix to project points in
camera coordinate to lidar coordinate.
Returns:
np.ndarray: Points in lidar coordinates.
"""
pts = depth_to_points(depth, trunc_pixel)
points_shape = list(pts.shape[0:-1])
points = np.concatenate([pts, np.ones(points_shape + [1])], axis=-1)
points = points @ np.linalg.inv(P2.T)
lidar_points = camera_to_lidar(points, r_rect, velo2cam)
return lidar_points
def center_to_corner_box3d(centers,
dims,
angles=None,
origin=(0.5, 1.0, 0.5),
axis=1):
"""Convert kitti locations, dimensions and angles to corners.
Args:
centers (np.ndarray): Locations in kitti label file with shape (N, 3).
dims (np.ndarray): Dimensions in kitti label file with shape (N, 3).
angles (np.ndarray, optional): Rotation_y in kitti label file with
shape (N). Defaults to None.
        origin (list or array or float, optional): Origin point relative to
            the smallest point. Use (0.5, 1.0, 0.5) in camera and
            (0.5, 0.5, 0) in lidar. Defaults to (0.5, 1.0, 0.5).
axis (int, optional): Rotation axis. 1 for camera and 2 for lidar.
Defaults to 1.
Returns:
np.ndarray: Corners with the shape of (N, 8, 3).
"""
# 'length' in kitti format is in x axis.
# yzx(hwl)(kitti label file)<->xyz(lhw)(camera)<->z(-x)(-y)(lwh)(lidar)
# center in kitti format is [0.5, 1.0, 0.5] in xyz.
corners = corners_nd(dims, origin=origin)
# corners: [N, 8, 3]
if angles is not None:
corners = rotation_3d_in_axis(corners, angles, axis=axis)
corners += centers.reshape([-1, 1, 3])
return corners
@numba.jit(nopython=True)
def box2d_to_corner_jit(boxes):
"""Convert box2d to corner.
Args:
boxes (np.ndarray, shape=[N, 5]): Boxes2d with rotation.
Returns:
box_corners (np.ndarray, shape=[N, 4, 2]): Box corners.
"""
num_box = boxes.shape[0]
corners_norm = np.zeros((4, 2), dtype=boxes.dtype)
corners_norm[1, 1] = 1.0
corners_norm[2] = 1.0
corners_norm[3, 0] = 1.0
corners_norm -= np.array([0.5, 0.5], dtype=boxes.dtype)
corners = boxes.reshape(num_box, 1, 5)[:, :, 2:4] * corners_norm.reshape(
1, 4, 2)
rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype)
box_corners = np.zeros((num_box, 4, 2), dtype=boxes.dtype)
for i in range(num_box):
rot_sin = np.sin(boxes[i, -1])
rot_cos = np.cos(boxes[i, -1])
rot_mat_T[0, 0] = rot_cos
rot_mat_T[0, 1] = rot_sin
rot_mat_T[1, 0] = -rot_sin
rot_mat_T[1, 1] = rot_cos
box_corners[i] = corners[i] @ rot_mat_T + boxes[i, :2]
return box_corners
@numba.njit
def corner_to_standup_nd_jit(boxes_corner):
"""Convert boxes_corner to aligned (min-max) boxes.
Args:
boxes_corner (np.ndarray, shape=[N, 2**dim, dim]): Boxes corners.
Returns:
np.ndarray, shape=[N, dim*2]: Aligned (min-max) boxes.
"""
num_boxes = boxes_corner.shape[0]
ndim = boxes_corner.shape[-1]
result = np.zeros((num_boxes, ndim * 2), dtype=boxes_corner.dtype)
for i in range(num_boxes):
for j in range(ndim):
result[i, j] = np.min(boxes_corner[i, :, j])
for j in range(ndim):
result[i, j + ndim] = np.max(boxes_corner[i, :, j])
return result
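# A minimal usage sketch (illustrative): axis-align the corners of a rotated
# 2D box produced by ``center_to_corner_box2d``.
# >>> corners = center_to_corner_box2d(
# ...     np.zeros((1, 2)), np.ones((1, 2)), angles=np.array([np.pi / 4]))
# >>> corner_to_standup_nd_jit(corners).shape  # (xmin, ymin, xmax, ymax)
# (1, 4)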
@numba.jit(nopython=True)
def corner_to_surfaces_3d_jit(corners):
"""Convert 3d box corners from corner function above to surfaces that
normal vectors all direct to internal.
Args:
corners (np.ndarray): 3d box corners with the shape of (N, 8, 3).
Returns:
np.ndarray: Surfaces with the shape of (N, 6, 4, 3).
"""
# box_corners: [N, 8, 3], must from corner functions in this module
num_boxes = corners.shape[0]
surfaces = np.zeros((num_boxes, 6, 4, 3), dtype=corners.dtype)
corner_idxes = np.array([
0, 1, 2, 3, 7, 6, 5, 4, 0, 3, 7, 4, 1, 5, 6, 2, 0, 4, 5, 1, 3, 2, 6, 7
]).reshape(6, 4)
for i in range(num_boxes):
for j in range(6):
for k in range(4):
surfaces[i, j, k] = corners[i, corner_idxes[j, k]]
return surfaces
def rotation_points_single_angle(points, angle, axis=0):
"""Rotate points with a single angle.
Args:
        points (np.ndarray, shape=[N, 3]): Points to rotate.
        angle (np.ndarray, shape=[1]): Rotation angle.
        axis (int, optional): Axis to rotate at. Defaults to 0.
    Returns:
        tuple[np.ndarray]: Rotated points and the transposed rotation matrix.
"""
# points: [N, 3]
rot_sin = np.sin(angle)
rot_cos = np.cos(angle)
if axis == 1:
rot_mat_T = np.array(
[[rot_cos, 0, rot_sin], [0, 1, 0], [-rot_sin, 0, rot_cos]],
dtype=points.dtype)
elif axis == 2 or axis == -1:
rot_mat_T = np.array(
[[rot_cos, rot_sin, 0], [-rot_sin, rot_cos, 0], [0, 0, 1]],
dtype=points.dtype)
elif axis == 0:
rot_mat_T = np.array(
[[1, 0, 0], [0, rot_cos, rot_sin], [0, -rot_sin, rot_cos]],
dtype=points.dtype)
else:
        raise ValueError('axis should be in range')
return points @ rot_mat_T, rot_mat_T
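# A minimal usage sketch (illustrative): rotate a point by pi/2 about the
# z-axis (axis=2).
# >>> pts = np.array([[1.0, 0.0, 0.0]])
# >>> rotated, _ = rotation_points_single_angle(pts, np.pi / 2, axis=2)
# >>> np.round(rotated, 6)
# array([[0., 1., 0.]])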
def box3d_to_bbox(box3d, P2):
"""Convert box3d in camera coordinates to bbox in image coordinates.
Args:
        box3d (np.ndarray, shape=[N, 7]): Boxes in camera coordinates.
        P2 (np.ndarray, shape=[4, 4]): Intrinsics of Camera2.
Returns:
np.ndarray, shape=[N, 4]: Boxes 2d in image coordinates.
"""
box_corners = center_to_corner_box3d(box3d[:, :3],
box3d[:, 3:6],
box3d[:, 6], [0.5, 1.0, 0.5],
axis=1)
box_corners_in_image = points_cam2img(box_corners, P2)
# box_corners_in_image: [N, 8, 2]
minxy = np.min(box_corners_in_image, axis=1)
maxxy = np.max(box_corners_in_image, axis=1)
bbox = np.concatenate([minxy, maxxy], axis=1)
return bbox
def corner_to_surfaces_3d(corners):
"""convert 3d box corners from corner function above to surfaces that
normal vectors all direct to internal.
Args:
corners (np.ndarray): 3D box corners with shape of (N, 8, 3).
Returns:
np.ndarray: Surfaces with the shape of (N, 6, 4, 3).
"""
# box_corners: [N, 8, 3], must from corner functions in this module
surfaces = np.array([
[corners[:, 0], corners[:, 1], corners[:, 2], corners[:, 3]],
[corners[:, 7], corners[:, 6], corners[:, 5], corners[:, 4]],
[corners[:, 0], corners[:, 3], corners[:, 7], corners[:, 4]],
[corners[:, 1], corners[:, 5], corners[:, 6], corners[:, 2]],
[corners[:, 0], corners[:, 4], corners[:, 5], corners[:, 1]],
[corners[:, 3], corners[:, 2], corners[:, 6], corners[:, 7]],
]).transpose([2, 0, 1, 3])
return surfaces
def points_in_rbbox(points, rbbox, z_axis=2, origin=(0.5, 0.5, 0)):
"""Check points in rotated bbox and return indices.
Note:
This function is for counterclockwise boxes.
Args:
points (np.ndarray, shape=[N, 3+dim]): Points to query.
rbbox (np.ndarray, shape=[M, 7]): Boxes3d with rotation.
z_axis (int, optional): Indicate which axis is height.
Defaults to 2.
origin (tuple[int], optional): Indicate the position of
box center. Defaults to (0.5, 0.5, 0).
Returns:
np.ndarray, shape=[N, M]: Indices of points in each box.
"""
    # TODO: this function is different from PointCloud3D, be careful
    # when starting to use nuScenes, check the input
rbbox_corners = center_to_corner_box3d(rbbox[:, :3],
rbbox[:, 3:6],
rbbox[:, 6],
origin=origin,
axis=z_axis)
surfaces = corner_to_surfaces_3d(rbbox_corners)
indices = points_in_convex_polygon_3d_jit(points[:, :3], surfaces)
return indices
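# A minimal usage sketch (illustrative): boxes are (x, y, z, dx, dy, dz, yaw)
# with a bottom-center origin by default; values are arbitrary.
# >>> points = np.array([[0.0, 0.0, 0.5], [5.0, 5.0, 5.0]])
# >>> boxes = np.array([[0.0, 0.0, 0.0, 2.0, 2.0, 2.0, 0.0]])
# >>> points_in_rbbox(points, boxes)  # -> [[ True], [False]]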
def minmax_to_corner_2d(minmax_box):
"""Convert minmax box to corners2d.
Args:
minmax_box (np.ndarray, shape=[N, dims]): minmax boxes.
Returns:
np.ndarray: 2d corners of boxes
"""
ndim = minmax_box.shape[-1] // 2
center = minmax_box[..., :ndim]
dims = minmax_box[..., ndim:] - center
return center_to_corner_box2d(center, dims, origin=0.0)
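# A minimal usage sketch (illustrative): expand an axis-aligned
# (xmin, ymin, xmax, ymax) box to its four corners.
# >>> minmax_to_corner_2d(np.array([[0.0, 0.0, 2.0, 1.0]])).shape
# (1, 4, 2)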
def create_anchors_3d_range(feature_size,
anchor_range,
sizes=((3.9, 1.6, 1.56), ),
rotations=(0, np.pi / 2),
dtype=np.float32):
"""Create anchors 3d by range.
Args:
feature_size (list[float] | tuple[float]): Feature map size. It is
either a list of a tuple of [D, H, W](in order of z, y, and x).
anchor_range (torch.Tensor | list[float]): Range of anchors with
shape [6]. The order is consistent with that of anchors, i.e.,
(x_min, y_min, z_min, x_max, y_max, z_max).
sizes (list[list] | np.ndarray | torch.Tensor, optional):
Anchor size with shape [N, 3], in order of x, y, z.
Defaults to ((3.9, 1.6, 1.56), ).
rotations (list[float] | np.ndarray | torch.Tensor, optional):
Rotations of anchors in a single feature grid.
Defaults to (0, np.pi / 2).
dtype (type, optional): Data type. Defaults to np.float32.
Returns:
np.ndarray: Range based anchors with shape of
(*feature_size, num_sizes, num_rots, 7).
"""
anchor_range = np.array(anchor_range, dtype)
z_centers = np.linspace(anchor_range[2],
anchor_range[5],
feature_size[0],
dtype=dtype)
y_centers = np.linspace(anchor_range[1],
anchor_range[4],
feature_size[1],
dtype=dtype)
x_centers = np.linspace(anchor_range[0],
anchor_range[3],
feature_size[2],
dtype=dtype)
sizes = np.reshape(np.array(sizes, dtype=dtype), [-1, 3])
rotations = np.array(rotations, dtype=dtype)
rets = np.meshgrid(x_centers,
y_centers,
z_centers,
rotations,
indexing='ij')
tile_shape = [1] * 5
tile_shape[-2] = int(sizes.shape[0])
for i in range(len(rets)):
rets[i] = np.tile(rets[i][..., np.newaxis, :], tile_shape)
rets[i] = rets[i][..., np.newaxis] # for concat
sizes = np.reshape(sizes, [1, 1, 1, -1, 1, 3])
tile_size_shape = list(rets[0].shape)
tile_size_shape[3] = 1
sizes = np.tile(sizes, tile_size_shape)
rets.insert(3, sizes)
ret = np.concatenate(rets, axis=-1)
return np.transpose(ret, [2, 1, 0, 3, 4, 5])
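# A minimal usage sketch (illustrative): one anchor size and two yaw
# rotations on a 1 x 200 x 176 (z, y, x) feature map.
# >>> anchors = create_anchors_3d_range(
# ...     [1, 200, 176], [0, -40.0, -1.78, 70.4, 40.0, -1.78])
# >>> anchors.shape
# (1, 200, 176, 1, 2, 7)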
def center_to_minmax_2d(centers, dims, origin=0.5):
"""Center to minmax.
Args:
centers (np.ndarray): Center points.
dims (np.ndarray): Dimensions.
        origin (list or array or float, optional): Origin point relative to
            the smallest point. Defaults to 0.5.
Returns:
np.ndarray: Minmax points.
"""
if origin == 0.5:
return np.concatenate([centers - dims / 2, centers + dims / 2],
axis=-1)
corners = center_to_corner_box2d(centers, dims, origin=origin)
return corners[:, [0, 2]].reshape([-1, 4])
def rbbox2d_to_near_bbox(rbboxes):
"""convert rotated bbox to nearest 'standing' or 'lying' bbox.
Args:
rbboxes (np.ndarray): Rotated bboxes with shape of
(N, 5(x, y, xdim, ydim, rad)).
Returns:
np.ndarray: Bounding boxes with the shape of
(N, 4(xmin, ymin, xmax, ymax)).
"""
rots = rbboxes[..., -1]
rots_0_pi_div_2 = np.abs(limit_period(rots, 0.5, np.pi))
cond = (rots_0_pi_div_2 > np.pi / 4)[..., np.newaxis]
bboxes_center = np.where(cond, rbboxes[:, [0, 1, 3, 2]], rbboxes[:, :4])
bboxes = center_to_minmax_2d(bboxes_center[:, :2], bboxes_center[:, 2:])
return bboxes
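# A minimal usage sketch (illustrative; assumes the standard ``limit_period``
# defined earlier in this module): a box rotated by ~pi/2 swaps its x/y
# extents before the min-max conversion.
# >>> rbbox2d_to_near_bbox(np.array([[0.0, 0.0, 4.0, 2.0, np.pi / 2]]))
# array([[-1., -2.,  1.,  2.]])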
@numba.jit(nopython=True)
def iou_jit(boxes, query_boxes, mode='iou', eps=0.0):
"""Calculate box iou. Note that jit version runs ~10x faster than the
box_overlaps function in mmdet3d.core.evaluation.
Note:
This function is for counterclockwise boxes.
Args:
boxes (np.ndarray): Input bounding boxes with shape of (N, 4).
query_boxes (np.ndarray): Query boxes with shape of (K, 4).
mode (str, optional): IoU mode. Defaults to 'iou'.
eps (float, optional): Value added to denominator. Defaults to 0.
Returns:
np.ndarray: Overlap between boxes and query_boxes
with the shape of [N, K].
"""
N = boxes.shape[0]
K = query_boxes.shape[0]
overlaps = np.zeros((N, K), dtype=boxes.dtype)
for k in range(K):
box_area = ((query_boxes[k, 2] - query_boxes[k, 0] + eps) *
(query_boxes[k, 3] - query_boxes[k, 1] + eps))
for n in range(N):
iw = (min(boxes[n, 2], query_boxes[k, 2]) -
max(boxes[n, 0], query_boxes[k, 0]) + eps)
if iw > 0:
ih = (min(boxes[n, 3], query_boxes[k, 3]) -
max(boxes[n, 1], query_boxes[k, 1]) + eps)
if ih > 0:
if mode == 'iou':
ua = ((boxes[n, 2] - boxes[n, 0] + eps) *
(boxes[n, 3] - boxes[n, 1] + eps) + box_area -
iw * ih)
else:
ua = ((boxes[n, 2] - boxes[n, 0] + eps) *
(boxes[n, 3] - boxes[n, 1] + eps))
overlaps[n, k] = iw * ih / ua
return overlaps
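# A minimal usage sketch (illustrative): two 2x2 boxes overlapping on half
# their area.
# >>> a = np.array([[0.0, 0.0, 2.0, 2.0]])
# >>> b = np.array([[1.0, 0.0, 3.0, 2.0]])
# >>> iou_jit(a, b)  # intersection 2, union 6
# array([[0.33333333]])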
def projection_matrix_to_CRT_kitti(proj):
"""Split projection matrix of KITTI.
Note:
This function is for KITTI only.
        P = C @ [R|T]
        C is an upper-triangular matrix, so we invert CR and apply a QR
        decomposition, which is stable for all KITTI camera projection
        matrices.
    Args:
        proj (np.ndarray, shape=[4, 4]): Projection matrix of the camera.
    Returns:
        tuple[np.ndarray]: Split matrices C, R and T.
"""
CR = proj[0:3, 0:3]
CT = proj[0:3, 3]
RinvCinv = np.linalg.inv(CR)
Rinv, Cinv = np.linalg.qr(RinvCinv)
C = np.linalg.inv(Cinv)
R = np.linalg.inv(Rinv)
T = Cinv @ CT
return C, R, T
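# A minimal usage sketch (illustrative; ``proj`` is a placeholder 4x4 KITTI
# projection matrix, not defined here): the factors satisfy
# proj[:3, :3] == C @ R and proj[:3, 3] == C @ T up to numerical error.
# >>> C, R, T = projection_matrix_to_CRT_kitti(proj)
# >>> np.allclose(proj[:3, :3], C @ R) and np.allclose(proj[:3, 3], C @ T)
# True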
def remove_outside_points(points, rect, Trv2c, P2, image_shape):
"""Remove points which are outside of image.
Note:
This function is for KITTI only.
Args:
points (np.ndarray, shape=[N, 3+dims]): Total points.
rect (np.ndarray, shape=[4, 4]): Matrix to project points in
specific camera coordinate (e.g. CAM2) to CAM0.
Trv2c (np.ndarray, shape=[4, 4]): Matrix to project points in
camera coordinate to lidar coordinate.
        P2 (np.ndarray, shape=[4, 4]): Intrinsics of Camera2.
image_shape (list[int]): Shape of image.
Returns:
np.ndarray, shape=[N, 3+dims]: Filtered points.
"""
    # 5x faster than remove_outside_points_v1 (2 ms vs 10 ms)
C, R, T = projection_matrix_to_CRT_kitti(P2)
image_bbox = [0, 0, image_shape[1], image_shape[0]]
frustum = get_frustum(image_bbox, C)
frustum -= T
frustum = np.linalg.inv(R) @ frustum.T
frustum = camera_to_lidar(frustum.T, rect, Trv2c)
frustum_surfaces = corner_to_surfaces_3d_jit(frustum[np.newaxis, ...])
indices = points_in_convex_polygon_3d_jit(points[:, :3], frustum_surfaces)
points = points[indices.reshape([-1])]
return points
def get_frustum(bbox_image, C, near_clip=0.001, far_clip=100):
"""Get frustum corners in camera coordinates.
Args:
bbox_image (list[int]): box in image coordinates.
C (np.ndarray): Intrinsics.
near_clip (float, optional): Nearest distance of frustum.
Defaults to 0.001.
far_clip (float, optional): Farthest distance of frustum.
Defaults to 100.
Returns:
np.ndarray, shape=[8, 3]: coordinates of frustum corners.
"""
fku = C[0, 0]
fkv = -C[1, 1]
u0v0 = C[0:2, 2]
z_points = np.array([near_clip] * 4 + [far_clip] * 4,
dtype=C.dtype)[:, np.newaxis]
b = bbox_image
box_corners = np.array(
[[b[0], b[1]], [b[0], b[3]], [b[2], b[3]], [b[2], b[1]]],
dtype=C.dtype)
near_box_corners = (box_corners - u0v0) / np.array(
[fku / near_clip, -fkv / near_clip], dtype=C.dtype)
far_box_corners = (box_corners - u0v0) / np.array(
[fku / far_clip, -fkv / far_clip], dtype=C.dtype)
ret_xy = np.concatenate([near_box_corners, far_box_corners],
axis=0) # [8, 2]
ret_xyz = np.concatenate([ret_xy, z_points], axis=1)
return ret_xyz
def surface_equ_3d(polygon_surfaces):
"""
Args:
polygon_surfaces (np.ndarray): Polygon surfaces with shape of
[num_polygon, max_num_surfaces, max_num_points_of_surface, 3].
All surfaces' normal vector must direct to internal.
Max_num_points_of_surface must at least 3.
Returns:
tuple: normal vector and its direction.
"""
# return [a, b, c], d in ax+by+cz+d=0
# polygon_surfaces: [num_polygon, num_surfaces, num_points_of_polygon, 3]
surface_vec = polygon_surfaces[:, :, :2, :] - \
polygon_surfaces[:, :, 1:3, :]
# normal_vec: [..., 3]
normal_vec = np.cross(surface_vec[:, :, 0, :], surface_vec[:, :, 1, :])
# print(normal_vec.shape, points[..., 0, :].shape)
# d = -np.inner(normal_vec, points[..., 0, :])
d = np.einsum('aij, aij->ai', normal_vec, polygon_surfaces[:, :, 0, :])
return normal_vec, -d
@numba.njit
def _points_in_convex_polygon_3d_jit(points, polygon_surfaces, normal_vec, d,
num_surfaces):
"""
Args:
points (np.ndarray): Input points with shape of (num_points, 3).
polygon_surfaces (np.ndarray): Polygon surfaces with shape of
(num_polygon, max_num_surfaces, max_num_points_of_surface, 3).
All surfaces' normal vector must direct to internal.
Max_num_points_of_surface must at least 3.
normal_vec (np.ndarray): Normal vector of polygon_surfaces.
d (int): Directions of normal vector.
num_surfaces (np.ndarray): Number of surfaces a polygon contains
shape of (num_polygon).
Returns:
np.ndarray: Result matrix with the shape of [num_points, num_polygon].
"""
max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3]
num_points = points.shape[0]
num_polygons = polygon_surfaces.shape[0]
ret = np.ones((num_points, num_polygons), dtype=np.bool_)
sign = 0.0
for i in range(num_points):
for j in range(num_polygons):
for k in range(max_num_surfaces):
if k > num_surfaces[j]:
break
sign = (points[i, 0] * normal_vec[j, k, 0] +
points[i, 1] * normal_vec[j, k, 1] +
points[i, 2] * normal_vec[j, k, 2] + d[j, k])
if sign >= 0:
ret[i, j] = False
break
return ret
def points_in_convex_polygon_3d_jit(points,
polygon_surfaces,
num_surfaces=None):
"""Check points is in 3d convex polygons.
Args:
points (np.ndarray): Input points with shape of (num_points, 3).
polygon_surfaces (np.ndarray): Polygon surfaces with shape of
(num_polygon, max_num_surfaces, max_num_points_of_surface, 3).
All surfaces' normal vector must direct to internal.
Max_num_points_of_surface must at least 3.
num_surfaces (np.ndarray, optional): Number of surfaces a polygon
contains shape of (num_polygon). Defaults to None.
Returns:
np.ndarray: Result matrix with the shape of [num_points, num_polygon].
"""
max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3]
# num_points = points.shape[0]
num_polygons = polygon_surfaces.shape[0]
if num_surfaces is None:
num_surfaces = np.full((num_polygons, ), 9999999, dtype=np.int64)
normal_vec, d = surface_equ_3d(polygon_surfaces[:, :, :3, :])
# normal_vec: [num_polygon, max_num_surfaces, 3]
# d: [num_polygon, max_num_surfaces]
return _points_in_convex_polygon_3d_jit(points, polygon_surfaces,
normal_vec, d, num_surfaces)
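# A minimal usage sketch (illustrative): test a point against a unit cube
# built from the corner helpers above.
# >>> box = np.array([[0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0]])
# >>> corners = center_to_corner_box3d(
# ...     box[:, :3], box[:, 3:6], box[:, 6], origin=(0.5, 0.5, 0.5), axis=2)
# >>> surfaces = corner_to_surfaces_3d(corners)
# >>> points_in_convex_polygon_3d_jit(np.array([[0.0, 0.0, 0.0]]), surfaces)
# array([[ True]])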
@numba.njit
def points_in_convex_polygon_jit(points, polygon, clockwise=False):
"""Check points is in 2d convex polygons. True when point in polygon.
Args:
points (np.ndarray): Input points with the shape of [num_points, 2].
polygon (np.ndarray): Input polygon with the shape of
[num_polygon, num_points_of_polygon, 2].
clockwise (bool, optional): Indicate polygon is clockwise. Defaults
to True.
Returns:
np.ndarray: Result matrix with the shape of [num_points, num_polygon].
"""
# first convert polygon to directed lines
num_points_of_polygon = polygon.shape[1]
num_points = points.shape[0]
num_polygons = polygon.shape[0]
# vec for all the polygons
if clockwise:
vec1 = polygon - polygon[:,
np.array([num_points_of_polygon - 1] +
list(range(num_points_of_polygon -
1))), :]
else:
vec1 = polygon[:,
np.array([num_points_of_polygon - 1] +
list(range(num_points_of_polygon -
1))), :] - polygon
ret = np.zeros((num_points, num_polygons), dtype=np.bool_)
success = True
cross = 0.0
for i in range(num_points):
for j in range(num_polygons):
success = True
for k in range(num_points_of_polygon):
vec = vec1[j, k]
cross = vec[1] * (polygon[j, k, 0] - points[i, 0])
cross -= vec[0] * (polygon[j, k, 1] - points[i, 1])
if cross >= 0:
success = False
break
ret[i, j] = success
return ret
def boxes3d_to_corners3d_lidar(boxes3d, bottom_center=True):
"""Convert kitti center boxes to corners.
7 -------- 4
/| /|
6 -------- 5 .
| | | |
. 3 -------- 0
|/ |/
2 -------- 1
Note:
This function is for LiDAR boxes only.
Args:
boxes3d (np.ndarray): Boxes with shape of (N, 7)
[x, y, z, x_size, y_size, z_size, ry] in LiDAR coords,
see the definition of ry in KITTI dataset.
bottom_center (bool, optional): Whether z is on the bottom center
of object. Defaults to True.
Returns:
np.ndarray: Box corners with the shape of [N, 8, 3].
"""
boxes_num = boxes3d.shape[0]
x_size, y_size, z_size = boxes3d[:, 3], boxes3d[:, 4], boxes3d[:, 5]
x_corners = np.array([
x_size / 2., -x_size / 2., -x_size / 2., x_size / 2., x_size / 2.,
-x_size / 2., -x_size / 2., x_size / 2.
],
dtype=np.float32).T
y_corners = np.array([
-y_size / 2., -y_size / 2., y_size / 2., y_size / 2., -y_size / 2.,
-y_size / 2., y_size / 2., y_size / 2.
],
dtype=np.float32).T
if bottom_center:
z_corners = np.zeros((boxes_num, 8), dtype=np.float32)
z_corners[:, 4:8] = z_size.reshape(boxes_num,
1).repeat(4, axis=1) # (N, 8)
else:
z_corners = np.array([
-z_size / 2., -z_size / 2., -z_size / 2., -z_size / 2.,
z_size / 2., z_size / 2., z_size / 2., z_size / 2.
],
dtype=np.float32).T
ry = boxes3d[:, 6]
zeros, ones = np.zeros(ry.size,
dtype=np.float32), np.ones(ry.size,
dtype=np.float32)
rot_list = np.array([[np.cos(ry), np.sin(ry), zeros],
[-np.sin(ry), np.cos(ry), zeros],
[zeros, zeros, ones]]) # (3, 3, N)
R_list = np.transpose(rot_list, (2, 0, 1)) # (N, 3, 3)
temp_corners = np.concatenate((x_corners.reshape(
-1, 8, 1), y_corners.reshape(-1, 8, 1), z_corners.reshape(-1, 8, 1)),
axis=2) # (N, 8, 3)
rotated_corners = np.matmul(temp_corners, R_list) # (N, 8, 3)
x_corners = rotated_corners[:, :, 0]
y_corners = rotated_corners[:, :, 1]
z_corners = rotated_corners[:, :, 2]
x_loc, y_loc, z_loc = boxes3d[:, 0], boxes3d[:, 1], boxes3d[:, 2]
x = x_loc.reshape(-1, 1) + x_corners.reshape(-1, 8)
y = y_loc.reshape(-1, 1) + y_corners.reshape(-1, 8)
z = z_loc.reshape(-1, 1) + z_corners.reshape(-1, 8)
corners = np.concatenate(
(x.reshape(-1, 8, 1), y.reshape(-1, 8, 1), z.reshape(-1, 8, 1)),
axis=2)
return corners.astype(np.float32)
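# A minimal usage sketch (illustrative): with ``bottom_center=True`` the four
# bottom corners sit at the box's z coordinate.
# >>> boxes = np.array([[0.0, 0.0, 0.0, 4.0, 2.0, 1.5, 0.0]], dtype=np.float32)
# >>> corners = boxes3d_to_corners3d_lidar(boxes)
# >>> corners.shape, corners[0, :4, 2].tolist()
# ((1, 8, 3), [0.0, 0.0, 0.0, 0.0])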
================================================
FILE: bip3d/structures/ops/iou3d_calculator.py
================================================
# Copyright (c) OpenRobotLab. All rights reserved.
import torch
from mmdet.structures.bbox import bbox_overlaps
from bip3d.registry import TASK_UTILS
from bip3d.structures.bbox_3d import get_box_type
@TASK_UTILS.register_module()
class BboxOverlapsNearest3D(object):
"""Nearest 3D IoU Calculator.
Note:
This IoU calculator first finds the nearest 2D boxes in bird eye view
(BEV), and then calculates the 2D IoU using :meth:`bbox_overlaps`.
Args:
coordinate (str): 'camera', 'lidar', or 'depth' coordinate system.
"""
def __init__(self, coordinate='lidar'):
assert coordinate in ['camera', 'lidar', 'depth']
self.coordinate = coordinate
def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False):
"""Calculate nearest 3D IoU.
Note:
If ``is_aligned`` is ``False``, then it calculates the ious between
each bbox of bboxes1 and bboxes2, otherwise it calculates the ious
between each aligned pair of bboxes1 and bboxes2.
Args:
            bboxes1 (torch.Tensor): shape (N, 7+C)
                [x, y, z, x_size, y_size, z_size, ry, v*].
            bboxes2 (torch.Tensor): shape (M, 7+C)
                [x, y, z, x_size, y_size, z_size, ry, v*].
            mode (str): "iou" (intersection over union) or "iof"
                (intersection over foreground).
            is_aligned (bool): Whether the calculation is aligned.
        Return:
            torch.Tensor: If ``is_aligned`` is ``False``, return ious between
                bboxes1 and bboxes2 with shape (N, M). If ``is_aligned`` is
                ``True``, return ious with shape (N, ).
"""
return bbox_overlaps_nearest_3d(bboxes1, bboxes2, mode, is_aligned,
self.coordinate)
def __repr__(self):
"""str: Return a string that describes the module."""
repr_str = self.__class__.__name__
        repr_str += f'(coordinate={self.coordinate})'
return repr_str
@TASK_UTILS.register_module()
class BboxOverlaps3D(object):
"""3D IoU Calculator.
Args:
coordinate (str): The coordinate system, valid options are
'camera', 'lidar', and 'depth'.
"""
def __init__(self, coordinate):
assert coordinate in ['camera', 'lidar', 'depth']
self.coordinate = coordinate
def __call__(self, bboxes1, bboxes2, mode='iou'):
"""Calculate 3D IoU using cuda implementation.
Note:
This function calculate the IoU of 3D boxes based on their volumes.
IoU calculator ``:class:BboxOverlaps3D`` uses this function to
calculate the actual 3D IoUs of boxes.
Args:
bboxes1 (torch.Tensor): with shape (N, 7+C),
(x, y, z, x_size, y_size, z_size, ry, v*).
bboxes2 (torch.Tensor): with shape (M, 7+C),
(x, y, z, x_size, y_size, z_size, ry, v*).
mode (str): "iou" (intersection over union) or
iof (intersection over foreground).
Return:
torch.Tensor: Bbox overlaps results of bboxes1 and bboxes2
with shape (M, N) (aligned mode is not supported currently).
"""
return bbox_overlaps_3d(bboxes1, bboxes2, mode, self.coordinate)
def __repr__(self):
"""str: return a string that describes the module"""
repr_str = self.__class__.__name__
        repr_str += f'(coordinate={self.coordinate})'
return repr_str
def bbox_overlaps_nearest_3d(bboxes1,
bboxes2,
mode='iou',
is_aligned=False,
coordinate='lidar'):
"""Calculate nearest 3D IoU.
Note:
This function first finds the nearest 2D boxes in bird eye view
(BEV), and then calculates the 2D IoU using :meth:`bbox_overlaps`.
This IoU calculator :class:`BboxOverlapsNearest3D` uses this
function to calculate IoUs of boxes.
If ``is_aligned`` is ``False``, then it calculates the ious between
each bbox of bboxes1 and bboxes2, otherwise the ious between each
aligned pair of bboxes1 and bboxes2.
Args:
bboxes1 (torch.Tensor): with shape (N, 7+C),
(x, y, z, x_size, y_size, z_size, ry, v*).
bboxes2 (torch.Tensor): with shape (M, 7+C),
(x, y, z, x_size, y_size, z_size, ry, v*).
mode (str): "iou" (intersection over union) or iof
(intersection over foreground).
        is_aligned (bool): Whether the calculation is aligned.
    Return:
        torch.Tensor: If ``is_aligned`` is ``False``, return ious between
            bboxes1 and bboxes2 with shape (N, M). If ``is_aligned`` is
            ``True``, return ious with shape (N, ).
"""
assert bboxes1.size(-1) == bboxes2.size(-1) >= 7
box_type, _ = get_box_type(coordinate)
bboxes1 = box_type(bboxes1, box_dim=bboxes1.shape[-1])
bboxes2 = box_type(bboxes2, box_dim=bboxes2.shape[-1])
# Change the bboxes to bev
# box conversion and iou calculation in torch version on CUDA
# is 10x faster than that in numpy version
bboxes1_bev = bboxes1.nearest_bev
bboxes2_bev = bboxes2.nearest_bev
ret = bbox_overlaps(bboxes1_bev,
bboxes2_bev,
mode=mode,
is_aligned=is_aligned)
return ret
def bbox_overlaps_3d(bboxes1, bboxes2, mode='iou', coordinate='camera'):
"""Calculate 3D IoU using cuda implementation.
Note:
This function calculates the IoU of 3D boxes based on their volumes.
IoU calculator :class:`BboxOverlaps3D` uses this function to
calculate the actual IoUs of boxes.
Args:
bboxes1 (torch.Tensor): with shape (N, 7+C),
(x, y, z, x_size, y_size, z_size, ry, v*).
bboxes2 (torch.Tensor): with shape (M, 7+C),
(x, y, z, x_size, y_size, z_size, ry, v*).
mode (str): "iou" (intersection over union) or
iof (intersection over foreground).
coordinate (str): 'camera' or 'lidar' coordinate system.
Return:
torch.Tensor: Bbox overlaps results of bboxes1 and bboxes2
with shape (M, N) (aligned mode is not supported currently).
"""
assert bboxes1.size(-1) == bboxes2.size(-1) >= 7
box_type, _ = get_box_type(coordinate)
bboxes1 = box_type(bboxes1, box_dim=bboxes1.shape[-1])
bboxes2 = box_type(bboxes2, box_dim=bboxes2.shape[-1])
return bboxes1.overlaps(bboxes1, bboxes2, mode=mode)
@TASK_UTILS.register_module()
class AxisAlignedBboxOverlaps3D(object):
"""Axis-aligned 3D Overlaps (IoU) Calculator."""
def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False):
"""Calculate IoU between 2D bboxes.
Args:
            bboxes1 (Tensor): shape (B, m, 6) in
                (x_min, y_min, z_min, x_max, y_max, z_max) format or empty.
            bboxes2 (Tensor): shape (B, n, 6) in
                (x_min, y_min, z_min, x_max, y_max, z_max) format or empty.
                B indicates the batch dim, in shape (B1, B2, ..., Bn).
If ``is_aligned`` is ``True``, then m and n must be equal.
mode (str): "iou" (intersection over union) or "giou" (generalized
intersection over union).
is_aligned (bool, optional): If True, then m and n must be equal.
Defaults to False.
Returns:
Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,)
"""
assert bboxes1.size(-1) == bboxes2.size(-1) == 6
return axis_aligned_bbox_overlaps_3d(bboxes1, bboxes2, mode,
is_aligned)
def __repr__(self):
"""str: a string describing the module"""
repr_str = self.__class__.__name__ + '()'
return repr_str
def axis_aligned_bbox_overlaps_3d(bboxes1,
bboxes2,
mode='iou',
is_aligned=False,
eps=1e-6):
"""Calculate overlap between two set of axis aligned 3D bboxes. If
``is_aligned`` is ``False``, then calculate the overlaps between each bbox
of bboxes1 and bboxes2, otherwise the overlaps between each aligned pair of
bboxes1 and bboxes2.
Args:
        bboxes1 (Tensor): shape (B, m, 6) in
            (x_min, y_min, z_min, x_max, y_max, z_max) format or empty.
        bboxes2 (Tensor): shape (B, n, 6) in
            (x_min, y_min, z_min, x_max, y_max, z_max) format or empty.
            B indicates the batch dim, in shape (B1, B2, ..., Bn).
If ``is_aligned`` is ``True``, then m and n must be equal.
mode (str): "iou" (intersection over union) or "giou" (generalized
intersection over union).
is_aligned (bool, optional): If True, then m and n must be equal.
Defaults to False.
eps (float, optional): A value added to the denominator for numerical
stability. Defaults to 1e-6.
Returns:
Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,)
Example:
>>> bboxes1 = torch.FloatTensor([
>>> [0, 0, 0, 10, 10, 10],
>>> [10, 10, 10, 20, 20, 20],
>>> [32, 32, 32, 38, 40, 42],
>>> ])
>>> bboxes2 = torch.FloatTensor([
>>> [0, 0, 0, 10, 20, 20],
>>> [0, 10, 10, 10, 19, 20],
>>> [10, 10, 10, 20, 20, 20],
>>> ])
>>> overlaps = axis_aligned_bbox_overlaps_3d(bboxes1, bboxes2)
>>> assert overlaps.shape == (3, 3)
>>> overlaps = bbox_overlaps(bboxes1, bboxes2, is_aligned=True)
>>> assert overlaps.shape == (3, )
Example:
>>> empty = torch.empty(0, 6)
>>> nonempty = torch.FloatTensor([[0, 0, 0, 10, 9, 10]])
>>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1)
>>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0)
>>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0)
"""
assert mode in ['iou', 'giou'], f'Unsupported mode {mode}'
    # Either the boxes are empty or the last dimension of the boxes is 6
assert (bboxes1.size(-1) == 6 or bboxes1.size(0) == 0)
assert (bboxes2.size(-1) == 6 or bboxes2.size(0) == 0)
# Batch dim must be the same
# Batch dim: (B1, B2, ... Bn)
assert bboxes1.shape[:-2] == bboxes2.shape[:-2]
batch_shape = bboxes1.shape[:-2]
rows = bboxes1.size(-2)
cols = bboxes2.size(-2)
if is_aligned:
assert rows == cols
if rows * cols == 0:
if is_aligned:
return bboxes1.new(batch_shape + (rows, ))
else:
return bboxes1.new(batch_shape + (rows, cols))
area1 = (bboxes1[..., 3] - bboxes1[..., 0]) * (
bboxes1[..., 4] - bboxes1[..., 1]) * (bboxes1[..., 5] -
bboxes1[..., 2])
area2 = (bboxes2[..., 3] - bboxes2[..., 0]) * (
bboxes2[..., 4] - bboxes2[..., 1]) * (bboxes2[..., 5] -
bboxes2[..., 2])
if is_aligned:
lt = torch.max(bboxes1[..., :3], bboxes2[..., :3]) # [B, rows, 3]
rb = torch.min(bboxes1[..., 3:], bboxes2[..., 3:]) # [B, rows, 3]
        wh = (rb - lt).clamp(min=0)  # [B, rows, 3]
overlap = wh[..., 0] * wh[..., 1] * wh[..., 2]
if mode in ['iou', 'giou']:
union = area1 + area2 - overlap
else:
union = area1
if mode == 'giou':
enclosed_lt = torch.min(bboxes1[..., :3], bboxes2[..., :3])
enclosed_rb = torch.max(bboxes1[..., 3:], bboxes2[..., 3:])
else:
lt = torch.max(bboxes1[..., :, None, :3],
bboxes2[..., None, :, :3]) # [B, rows, cols, 3]
rb = torch.min(bboxes1[..., :, None, 3:],
bboxes2[..., None, :, 3:]) # [B, rows, cols, 3]
wh = (rb - lt).clamp(min=0) # [B, rows, cols, 3]
overlap = wh[..., 0] * wh[..., 1] * wh[..., 2]
if mode in ['iou', 'giou']:
union = area1[..., None] + area2[..., None, :] - overlap
if mode == 'giou':
enclosed_lt = torch.min(bboxes1[..., :, None, :3],
bboxes2[..., None, :, :3])
enclosed_rb = torch.max(bboxes1[..., :, None, 3:],
bboxes2[..., None, :, 3:])
eps = union.new_tensor([eps])
union = torch.max(union, eps)
ious = overlap / union
if mode in ['iou']:
return ious
# calculate gious
enclose_wh = (enclosed_rb - enclosed_lt).clamp(min=0)
enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1] * enclose_wh[..., 2]
enclose_area = torch.max(enclose_area, eps)
gious = ious - (enclose_area - union) / enclose_area
return gious
================================================
FILE: bip3d/structures/ops/transforms.py
================================================
# Copyright (c) OpenRobotLab. All rights reserved.
import torch
def bbox3d_mapping_back(bboxes, scale_factor, flip_horizontal, flip_vertical):
"""Map bboxes from testing scale to original image scale.
Args:
bboxes (:obj:`BaseInstance3DBoxes`): Boxes to be mapped back.
scale_factor (float): Scale factor.
flip_horizontal (bool): Whether to flip horizontally.
flip_vertical (bool): Whether to flip vertically.
Returns:
:obj:`BaseInstance3DBoxes`: Boxes mapped back.
"""
new_bboxes = bboxes.clone()
if flip_horizontal:
new_bboxes.flip('horizontal')
if flip_vertical:
new_bboxes.flip('vertical')
new_bboxes.scale(1 / scale_factor)
return new_bboxes
def bbox3d2roi(bbox_list):
"""Convert a list of bounding boxes to roi format.
Args:
bbox_list (list[torch.Tensor]): A list of bounding boxes
corresponding to a batch of images.
Returns:
torch.Tensor: Region of interests in shape (n, c), where
the channels are in order of [batch_ind, x, y ...].
"""
rois_list = []
for img_id, bboxes in enumerate(bbox_list):
if bboxes.size(0) > 0:
img_inds = bboxes.new_full((bboxes.size(0), 1), img_id)
rois = torch.cat([img_inds, bboxes], dim=-1)
else:
rois = torch.zeros_like(bboxes)
rois_list.append(rois)
rois = torch.cat(rois_list, 0)
return rois
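# A minimal usage sketch (illustrative): two samples with 2 and 1 boxes
# produce rois of shape (3, 8), the first column holding the batch index.
# >>> rois = bbox3d2roi([torch.rand(2, 7), torch.rand(1, 7)])
# >>> rois.shape
# torch.Size([3, 8])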
# TODO delete this
def bbox3d2result(bboxes, scores, labels, attrs=None):
"""Convert detection results to a list of numpy arrays.
Args:
        bboxes (torch.Tensor): Bounding boxes with shape (N, box_dim).
        scores (torch.Tensor): Scores with shape (N, ).
        labels (torch.Tensor): Labels with shape (N, ).
        attrs (torch.Tensor, optional): Attributes with shape (N, ).
            Defaults to None.
Returns:
dict[str, torch.Tensor]: Bounding box results in cpu mode.
- boxes_3d (torch.Tensor): 3D boxes.
- scores (torch.Tensor): Prediction scores.
- labels_3d (torch.Tensor): Box labels.
- attrs_3d (torch.Tensor, optional): Box attributes.
"""
result_dict = dict(bboxes_3d=bboxes.to('cpu'),
scores_3d=scores.cpu(),
labels_3d=labels.cpu())
if attrs is not None:
result_dict['attr_labels'] = attrs.cpu()
return result_dict
================================================
FILE: bip3d/structures/points/__init__.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from .base_points import BasePoints
from .cam_points import CameraPoints
from .depth_points import DepthPoints
from .lidar_points import LiDARPoints
__all__ = ['BasePoints', 'CameraPoints', 'DepthPoints', 'LiDARPoints']
def get_points_type(points_type: str) -> type:
"""Get the class of points according to coordinate type.
Args:
        points_type (str): The type of points coordinates. The valid values
            are "CAMERA", "LIDAR" and "DEPTH".
Returns:
type: Points type.
"""
points_type_upper = points_type.upper()
if points_type_upper == 'CAMERA':
points_cls = CameraPoints
elif points_type_upper == 'LIDAR':
points_cls = LiDARPoints
elif points_type_upper == 'DEPTH':
points_cls = DepthPoints
else:
raise ValueError('Only "points_type" of "CAMERA", "LIDAR" and "DEPTH" '
f'are supported, got {points_type}')
return points_cls
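# A minimal usage sketch (illustrative):
# >>> get_points_type('depth') is DepthPoints
# True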
================================================
FILE: bip3d/structures/points/base_points.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import warnings
from abc import abstractmethod
from typing import Iterator, Optional, Sequence, Union
import numpy as np
import torch
from torch import Tensor
from bip3d.structures.bbox_3d.utils import (rotation_3d_in_axis,
rotation_3d_in_euler)
class BasePoints:
"""Base class for Points.
Args:
tensor (Tensor or np.ndarray or Sequence[Sequence[float]]): The points
data with shape (N, points_dim).
points_dim (int): Integer indicating the dimension of a point. Each row
is (x, y, z, ...). Defaults to 3.
attribute_dims (dict, optional): Dictionary to indicate the meaning of
extra dimension. Defaults to None.
Attributes:
tensor (Tensor): Float matrix with shape (N, points_dim).
points_dim (int): Integer indicating the dimension of a point. Each row
is (x, y, z, ...).
attribute_dims (dict, optional): Dictionary to indicate the meaning of
extra dimension. Defaults to None.
rotation_axis (int): Default rotation axis for points rotation.
"""
def __init__(self,
tensor: Union[Tensor, np.ndarray, Sequence[Sequence[float]]],
points_dim: int = 3,
attribute_dims: Optional[dict] = None) -> None:
if isinstance(tensor, Tensor):
device = tensor.device
else:
device = torch.device('cpu')
tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
if tensor.numel() == 0:
# Use reshape, so we don't end up creating a new tensor that does
# not depend on the inputs (and consequently confuses jit)
tensor = tensor.reshape((-1, points_dim))
assert tensor.dim() == 2 and tensor.size(-1) == points_dim, \
('The points dimension must be 2 and the length of the last '
f'dimension must be {points_dim}, but got points with shape '
f'{tensor.shape}.')
self.tensor = tensor.clone()
self.points_dim = points_dim
self.attribute_dims = attribute_dims
self.rotation_axis = 0
@property
def coord(self) -> Tensor:
"""Tensor: Coordinates of each point in shape (N, 3)."""
return self.tensor[:, :3]
@coord.setter
def coord(self, tensor: Union[Tensor, np.ndarray]) -> None:
"""Set the coordinates of each point.
Args:
tensor (Tensor or np.ndarray): Coordinates of each point with shape
(N, 3).
"""
try:
tensor = tensor.reshape(self.shape[0], 3)
except (RuntimeError, ValueError): # for torch.Tensor and np.ndarray
raise ValueError(f'got unexpected shape {tensor.shape}')
if not isinstance(tensor, Tensor):
tensor = self.tensor.new_tensor(tensor)
self.tensor[:, :3] = tensor
@property
def height(self) -> Union[Tensor, None]:
"""Tensor or None: Returns a vector with height of each point in shape
(N, )."""
if self.attribute_dims is not None and \
'height' in self.attribute_dims.keys():
return self.tensor[:, self.attribute_dims['height']]
else:
return None
@height.setter
def height(self, tensor: Union[Tensor, np.ndarray]) -> None:
"""Set the height of each point.
Args:
tensor (Tensor or np.ndarray): Height of each point with shape
(N, ).
"""
try:
tensor = tensor.reshape(self.shape[0])
except (RuntimeError, ValueError): # for torch.Tensor and np.ndarray
raise ValueError(f'got unexpected shape {tensor.shape}')
if not isinstance(tensor, Tensor):
tensor = self.tensor.new_tensor(tensor)
if self.attribute_dims is not None and \
'height' in self.attribute_dims.keys():
self.tensor[:, self.attribute_dims['height']] = tensor
else:
# add height attribute
if self.attribute_dims is None:
self.attribute_dims = dict()
attr_dim = self.shape[1]
self.tensor = torch.cat([self.tensor, tensor.unsqueeze(1)], dim=1)
self.attribute_dims.update(dict(height=attr_dim))
self.points_dim += 1
@property
def color(self) -> Union[Tensor, None]:
"""Tensor or None: Returns a vector with color of each point in shape
(N, 3)."""
if self.attribute_dims is not None and \
'color' in self.attribute_dims.keys():
return self.tensor[:, self.attribute_dims['color']]
else:
return None
@color.setter
def color(self, tensor: Union[Tensor, np.ndarray]) -> None:
"""Set the color of each point.
Args:
tensor (Tensor or np.ndarray): Color of each point with shape
(N, 3).
"""
try:
tensor = tensor.reshape(self.shape[0], 3)
except (RuntimeError, ValueError): # for torch.Tensor and np.ndarray
raise ValueError(f'got unexpected shape {tensor.shape}')
if tensor.max() >= 256 or tensor.min() < 0:
warnings.warn('point got color value beyond [0, 255]')
if not isinstance(tensor, Tensor):
tensor = self.tensor.new_tensor(tensor)
if self.attribute_dims is not None and \
'color' in self.attribute_dims.keys():
self.tensor[:, self.attribute_dims['color']] = tensor
else:
# add color attribute
if self.attribute_dims is None:
self.attribute_dims = dict()
attr_dim = self.shape[1]
self.tensor = torch.cat([self.tensor, tensor], dim=1)
self.attribute_dims.update(
dict(color=[attr_dim, attr_dim + 1, attr_dim + 2]))
self.points_dim += 3
@property
def shape(self) -> torch.Size:
"""torch.Size: Shape of points."""
return self.tensor.shape
def shuffle(self) -> Tensor:
"""Shuffle the points.
Returns:
Tensor: The shuffled index.
"""
idx = torch.randperm(self.__len__(), device=self.tensor.device)
self.tensor = self.tensor[idx]
return idx
def rotate(self,
rotation: Union[Tensor, np.ndarray, float],
axis: Optional[int] = None) -> Tensor:
"""Rotate points with the given rotation matrix or angle.
Args:
rotation (Tensor or np.ndarray or float): Rotation matrix or angle.
axis (int, optional): Axis to rotate at. Defaults to None.
Returns:
Tensor: Rotation matrix.
"""
if not isinstance(rotation, Tensor):
rotation = self.tensor.new_tensor(rotation)
assert rotation.shape == torch.Size([3, 3]) or rotation.numel() == 1, \
f'invalid rotation shape {rotation.shape}'
if axis is None:
axis = self.rotation_axis
if rotation.numel() == 1:
rotated_points, rot_mat_T = rotation_3d_in_axis(
self.tensor[:, :3][None], rotation, axis=axis, return_mat=True)
self.tensor[:, :3] = rotated_points.squeeze(0)
rot_mat_T = rot_mat_T.squeeze(0)
elif rotation.numel() == 3:
rotated_points, rot_mat_T = rotation_3d_in_euler(
self.tensor[:, :3][None], rotation, return_mat=True)
self.tensor[:, :3] = rotated_points.squeeze(0)
rot_mat_T = rot_mat_T.squeeze(0)
else:
# rotation.numel() == 9
self.tensor[:, :3] = self.tensor[:, :3] @ rotation
rot_mat_T = rotation
return rot_mat_T
@abstractmethod
def flip(self, bev_direction: str = 'horizontal') -> None:
"""Flip the points along given BEV direction.
Args:
bev_direction (str): Flip direction (horizontal or vertical).
Defaults to 'horizontal'.
"""
pass
def translate(self, trans_vector: Union[Tensor, np.ndarray]) -> None:
"""Translate points with the given translation vector.
Args:
trans_vector (Tensor or np.ndarray): Translation vector of size 3
or nx3.
"""
if not isinstance(trans_vector, Tensor):
trans_vector = self.tensor.new_tensor(trans_vector)
trans_vector = trans_vector.squeeze(0)
if trans_vector.dim() == 1:
assert trans_vector.shape[0] == 3
elif trans_vector.dim() == 2:
assert trans_vector.shape[0] == self.tensor.shape[0] and \
trans_vector.shape[1] == 3
else:
raise NotImplementedError(
f'Unsupported translation vector of shape {trans_vector.shape}'
)
self.tensor[:, :3] += trans_vector
def in_range_3d(
self, point_range: Union[Tensor, np.ndarray,
Sequence[float]]) -> Tensor:
"""Check whether the points are in the given range.
Args:
point_range (Tensor or np.ndarray or Sequence[float]): The range of
point (x_min, y_min, z_min, x_max, y_max, z_max).
        Note:
            In the original implementation of SECOND, checking whether a box
            is in the range is done by checking whether its points lie in a
            convex polygon; here we reduce that to a simple axis-aligned
            comparison for this simpler case.
Returns:
Tensor: A binary vector indicating whether each point is inside the
reference range.
"""
in_range_flags = ((self.tensor[:, 0] > point_range[0])
& (self.tensor[:, 1] > point_range[1])
& (self.tensor[:, 2] > point_range[2])
& (self.tensor[:, 0] < point_range[3])
& (self.tensor[:, 1] < point_range[4])
& (self.tensor[:, 2] < point_range[5]))
return in_range_flags
@property
def bev(self) -> Tensor:
"""Tensor: BEV of the points in shape (N, 2)."""
return self.tensor[:, [0, 1]]
def in_range_bev(
self, point_range: Union[Tensor, np.ndarray,
Sequence[float]]) -> Tensor:
"""Check whether the points are in the given range.
Args:
point_range (Tensor or np.ndarray or Sequence[float]): The range of
point in order of (x_min, y_min, x_max, y_max).
Returns:
Tensor: A binary vector indicating whether each point is inside the
reference range.
"""
in_range_flags = ((self.bev[:, 0] > point_range[0])
& (self.bev[:, 1] > point_range[1])
& (self.bev[:, 0] < point_range[2])
& (self.bev[:, 1] < point_range[3]))
return in_range_flags
@abstractmethod
def convert_to(self,
dst: int,
rt_mat: Optional[Union[Tensor,
np.ndarray]] = None) -> 'BasePoints':
"""Convert self to ``dst`` mode.
Args:
dst (int): The target Point mode.
rt_mat (Tensor or np.ndarray, optional): The rotation and
translation matrix between different coordinates.
Defaults to None. The conversion from ``src`` coordinates to
``dst`` coordinates usually comes along the change of sensors,
e.g., from camera to LiDAR. This requires a transformation
matrix.
Returns:
:obj:`BasePoints`: The converted point of the same type in the
``dst`` mode.
"""
pass
def scale(self, scale_factor: float) -> None:
"""Scale the points with horizontal and vertical scaling factors.
Args:
            scale_factor (float): Scale factor to scale the points.
"""
self.tensor[:, :3] *= scale_factor
def __getitem__(
self, item: Union[int, tuple, slice, np.ndarray,
Tensor]) -> 'BasePoints':
"""
Args:
item (int or tuple or slice or np.ndarray or Tensor): Index of
points.
Note:
The following usage are allowed:
1. `new_points = points[3]`: Return a `Points` that contains only
one point.
2. `new_points = points[2:10]`: Return a slice of points.
            3. `new_points = points[vector]`: Where vector is a
               torch.BoolTensor with `length = len(points)`. Nonzero elements
               in the vector will be selected.
4. `new_points = points[3:11, vector]`: Return a slice of points
and attribute dims.
5. `new_points = points[4:12, 2]`: Return a slice of points with
single attribute.
Note that the returned Points might share storage with this Points,
subject to PyTorch's indexing semantics.
Returns:
:obj:`BasePoints`: A new object of :class:`BasePoints` after
indexing.
"""
original_type = type(self)
if isinstance(item, int):
return original_type(self.tensor[item].view(1, -1),
points_dim=self.points_dim,
attribute_dims=self.attribute_dims)
elif isinstance(item, tuple) and len(item) == 2:
if isinstance(item[1], slice):
start = 0 if item[1].start is None else item[1].start
stop = self.tensor.shape[1] \
if item[1].stop is None else item[1].stop
step = 1 if item[1].step is None else item[1].step
item = list(item)
item[1] = list(range(start, stop, step))
item = tuple(item)
elif isinstance(item[1], int):
item = list(item)
item[1] = [item[1]]
item = tuple(item)
p = self.tensor[item[0], item[1]]
keep_dims = list(
set(item[1]).intersection(set(range(3, self.tensor.shape[1]))))
if self.attribute_dims is not None:
attribute_dims = self.attribute_dims.copy()
for key in self.attribute_dims.keys():
cur_attribute_dims = attribute_dims[key]
if isinstance(cur_attribute_dims, int):
cur_attribute_dims = [cur_attribute_dims]
intersect_attr = list(
set(cur_attribute_dims).intersection(set(keep_dims)))
if len(intersect_attr) == 1:
attribute_dims[key] = intersect_attr[0]
elif len(intersect_attr) > 1:
attribute_dims[key] = intersect_attr
else:
attribute_dims.pop(key)
else:
attribute_dims = None
elif isinstance(item, (slice, np.ndarray, Tensor)):
p = self.tensor[item]
attribute_dims = self.attribute_dims
else:
raise NotImplementedError(f'Invalid slice {item}!')
assert p.dim() == 2, \
f'Indexing on Points with {item} failed to return a matrix!'
return original_type(p,
points_dim=p.shape[1],
attribute_dims=attribute_dims)
def __len__(self) -> int:
"""int: Number of points in the current object."""
return self.tensor.shape[0]
def __repr__(self) -> str:
"""str: Return a string that describes the object."""
return self.__class__.__name__ + '(\n ' + str(self.tensor) + ')'
@classmethod
def cat(cls, points_list: Sequence['BasePoints']) -> 'BasePoints':
"""Concatenate a list of Points into a single Points.
Args:
points_list (Sequence[:obj:`BasePoints`]): List of points.
Returns:
:obj:`BasePoints`: The concatenated points.
"""
assert isinstance(points_list, (list, tuple))
if len(points_list) == 0:
return cls(torch.empty(0))
assert all(isinstance(points, cls) for points in points_list)
# use torch.cat (v.s. layers.cat)
# so the returned points never share storage with input
cat_points = cls(torch.cat([p.tensor for p in points_list], dim=0),
points_dim=points_list[0].points_dim,
attribute_dims=points_list[0].attribute_dims)
return cat_points
def numpy(self) -> np.ndarray:
"""Reload ``numpy`` from self.tensor."""
return self.tensor.numpy()
def to(self, device: Union[str, torch.device], *args,
**kwargs) -> 'BasePoints':
"""Convert current points to a specific device.
Args:
device (str or :obj:`torch.device`): The name of the device.
Returns:
:obj:`BasePoints`: A new points object on the specific device.
"""
original_type = type(self)
return original_type(self.tensor.to(device, *args, **kwargs),
points_dim=self.points_dim,
attribute_dims=self.attribute_dims)
def cpu(self) -> 'BasePoints':
"""Convert current points to cpu device.
Returns:
:obj:`BasePoints`: A new points object on the cpu device.
"""
original_type = type(self)
return original_type(self.tensor.cpu(),
points_dim=self.points_dim,
attribute_dims=self.attribute_dims)
def cuda(self, *args, **kwargs) -> 'BasePoints':
"""Convert current points to cuda device.
Returns:
:obj:`BasePoints`: A new points object on the cuda device.
"""
original_type = type(self)
return original_type(self.tensor.cuda(*args, **kwargs),
points_dim=self.points_dim,
attribute_dims=self.attribute_dims)
def clone(self) -> 'BasePoints':
"""Clone the points.
Returns:
:obj:`BasePoints`: Point object with the same properties as self.
"""
original_type = type(self)
return original_type(self.tensor.clone(),
points_dim=self.points_dim,
attribute_dims=self.attribute_dims)
def detach(self) -> 'BasePoints':
"""Detach the points.
Returns:
:obj:`BasePoints`: Point object with the same properties as self.
"""
original_type = type(self)
return original_type(self.tensor.detach(),
points_dim=self.points_dim,
attribute_dims=self.attribute_dims)
@property
def device(self) -> torch.device:
"""torch.device: The device of the points are on."""
return self.tensor.device
def __iter__(self) -> Iterator[Tensor]:
"""Yield a point as a Tensor at a time.
Returns:
Iterator[Tensor]: A point of shape (points_dim, ).
"""
yield from self.tensor
def new_point(
self, data: Union[Tensor, np.ndarray, Sequence[Sequence[float]]]
) -> 'BasePoints':
"""Create a new point object with data.
        The new point and its tensor have the same properties as self and
        self.tensor, respectively.
Args:
data (Tensor or np.ndarray or Sequence[Sequence[float]]): Data to
be copied.
Returns:
:obj:`BasePoints`: A new point object with ``data``, the object's
other properties are similar to ``self``.
"""
new_tensor = self.tensor.new_tensor(data) \
if not isinstance(data, Tensor) else data.to(self.device)
original_type = type(self)
return original_type(new_tensor,
points_dim=self.points_dim,
attribute_dims=self.attribute_dims)
================================================
FILE: bip3d/structures/points/cam_points.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Optional, Sequence, Union
import numpy as np
from torch import Tensor
from .base_points import BasePoints
class CameraPoints(BasePoints):
"""Points of instances in CAM coordinates.
Args:
tensor (Tensor or np.ndarray or Sequence[Sequence[float]]): The points
data with shape (N, points_dim).
points_dim (int): Integer indicating the dimension of a point. Each row
is (x, y, z, ...). Defaults to 3.
attribute_dims (dict, optional): Dictionary to indicate the meaning of
extra dimension. Defaults to None.
Attributes:
tensor (Tensor): Float matrix with shape (N, points_dim).
points_dim (int): Integer indicating the dimension of a point. Each row
is (x, y, z, ...).
attribute_dims (dict, optional): Dictionary to indicate the meaning of
extra dimension. Defaults to None.
rotation_axis (int): Default rotation axis for points rotation.
"""
def __init__(self,
tensor: Union[Tensor, np.ndarray, Sequence[Sequence[float]]],
points_dim: int = 3,
attribute_dims: Optional[dict] = None) -> None:
super(CameraPoints, self).__init__(tensor,
points_dim=points_dim,
attribute_dims=attribute_dims)
self.rotation_axis = 1
def flip(self, bev_direction: str = 'horizontal') -> None:
"""Flip the points along given BEV direction.
Args:
bev_direction (str): Flip direction (horizontal or vertical).
Defaults to 'horizontal'.
"""
assert bev_direction in ('horizontal', 'vertical')
if bev_direction == 'horizontal':
self.tensor[:, 0] = -self.tensor[:, 0]
elif bev_direction == 'vertical':
self.tensor[:, 2] = -self.tensor[:, 2]
@property
def bev(self) -> Tensor:
"""Tensor: BEV of the points in shape (N, 2)."""
return self.tensor[:, [0, 2]]
def convert_to(self,
dst: int,
rt_mat: Optional[Union[Tensor,
np.ndarray]] = None) -> 'BasePoints':
"""Convert self to ``dst`` mode.
Args:
dst (int): The target Point mode.
rt_mat (Tensor or np.ndarray, optional): The rotation and
translation matrix between different coordinates.
Defaults to None. The conversion from ``src`` coordinates to
``dst`` coordinates usually comes along the change of sensors,
e.g., from camera to LiDAR. This requires a transformation
matrix.
Returns:
:obj:`BasePoints`: The converted point of the same type in the
``dst`` mode.
"""
        from bip3d.structures.bbox_3d import Coord3DMode
return Coord3DMode.convert_point(point=self,
src=Coord3DMode.CAM,
dst=dst,
rt_mat=rt_mat)
================================================
FILE: bip3d/structures/points/depth_points.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Optional, Sequence, Union
import numpy as np
from torch import Tensor
from .base_points import BasePoints
class DepthPoints(BasePoints):
"""Points of instances in DEPTH coordinates.
Args:
tensor (Tensor or np.ndarray or Sequence[Sequence[float]]): The points
data with shape (N, points_dim).
points_dim (int): Integer indicating the dimension of a point. Each row
is (x, y, z, ...). Defaults to 3.
attribute_dims (dict, optional): Dictionary to indicate the meaning of
extra dimension. Defaults to None.
Attributes:
tensor (Tensor): Float matrix with shape (N, points_dim).
points_dim (int): Integer indicating the dimension of a point. Each row
is (x, y, z, ...).
attribute_dims (dict, optional): Dictionary to indicate the meaning of
extra dimension. Defaults to None.
rotation_axis (int): Default rotation axis for points rotation.
"""
def __init__(self,
tensor: Union[Tensor, np.ndarray, Sequence[Sequence[float]]],
points_dim: int = 3,
attribute_dims: Optional[dict] = None) -> None:
super(DepthPoints, self).__init__(tensor,
points_dim=points_dim,
attribute_dims=attribute_dims)
self.rotation_axis = 2
def flip(self, bev_direction: str = 'horizontal') -> None:
"""Flip the points along given BEV direction.
Args:
bev_direction (str): Flip direction (horizontal or vertical).
Defaults to 'horizontal'.
"""
assert bev_direction in ('horizontal', 'vertical')
if bev_direction == 'horizontal':
self.tensor[:, 0] = -self.tensor[:, 0]
elif bev_direction == 'vertical':
self.tensor[:, 1] = -self.tensor[:, 1]
def convert_to(self,
dst: int,
rt_mat: Optional[Union[Tensor,
np.ndarray]] = None) -> 'BasePoints':
"""Convert self to ``dst`` mode.
Args:
dst (int): The target Point mode.
rt_mat (Tensor or np.ndarray, optional): The rotation and
translation matrix between different coordinates.
Defaults to None. The conversion from ``src`` coordinates to
``dst`` coordinates usually comes along the change of sensors,
e.g., from camera to LiDAR. This requires a transformation
matrix.
Returns:
:obj:`BasePoints`: The converted point of the same type in the
``dst`` mode.
"""
        from bip3d.structures.bbox_3d import Coord3DMode
return Coord3DMode.convert_point(point=self,
src=Coord3DMode.DEPTH,
dst=dst,
rt_mat=rt_mat)
================================================
FILE: bip3d/structures/points/lidar_points.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Optional, Sequence, Union
import numpy as np
from torch import Tensor
from .base_points import BasePoints
class LiDARPoints(BasePoints):
"""Points of instances in LIDAR coordinates.
Args:
tensor (Tensor or np.ndarray or Sequence[Sequence[float]]): The points
data with shape (N, points_dim).
points_dim (int): Integer indicating the dimension of a point. Each row
is (x, y, z, ...). Defaults to 3.
attribute_dims (dict, optional): Dictionary to indicate the meaning of
extra dimension. Defaults to None.
Attributes:
tensor (Tensor): Float matrix with shape (N, points_dim).
points_dim (int): Integer indicating the dimension of a point. Each row
is (x, y, z, ...).
attribute_dims (dict, optional): Dictionary to indicate the meaning of
extra dimension. Defaults to None.
rotation_axis (int): Default rotation axis for points rotation.
"""
def __init__(self,
tensor: Union[Tensor, np.ndarray, Sequence[Sequence[float]]],
points_dim: int = 3,
attribute_dims: Optional[dict] = None) -> None:
super(LiDARPoints, self).__init__(tensor,
points_dim=points_dim,
attribute_dims=attribute_dims)
self.rotation_axis = 2
def flip(self, bev_direction: str = 'horizontal') -> None:
"""Flip the points along given BEV direction.
Args:
bev_direction (str): Flip direction (horizontal or vertical).
Defaults to 'horizontal'.
"""
assert bev_direction in ('horizontal', 'vertical')
if bev_direction == 'horizontal':
self.tensor[:, 1] = -self.tensor[:, 1]
elif bev_direction == 'vertical':
self.tensor[:, 0] = -self.tensor[:, 0]
def convert_to(self,
dst: int,
rt_mat: Optional[Union[Tensor,
np.ndarray]] = None) -> 'BasePoints':
"""Convert self to ``dst`` mode.
Args:
dst (int): The target Point mode.
rt_mat (Tensor or np.ndarray, optional): The rotation and
translation matrix between different coordinates.
Defaults to None. The conversion from ``src`` coordinates to
``dst`` coordinates usually comes along the change of sensors,
e.g., from camera to LiDAR. This requires a transformation
matrix.
Returns:
:obj:`BasePoints`: The converted point of the same type in the
``dst`` mode.
"""
        from bip3d.structures.bbox_3d import Coord3DMode
return Coord3DMode.convert_point(point=self,
src=Coord3DMode.LIDAR,
dst=dst,
rt_mat=rt_mat)
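# A minimal usage sketch (illustrative): LiDAR points use z (axis 2) as the
# default rotation axis and expose x/y as the BEV plane.
# >>> import numpy as np
# >>> pts = LiDARPoints(np.array([[1.0, 0.0, 0.0]]))
# >>> pts.translate([1.0, 2.0, 3.0])
# >>> pts.bev
# tensor([[2., 2.]])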
================================================
FILE: bip3d/utils/__init__.py
================================================
from .array_converter import ArrayConverter, array_converter
from .typing_config import ConfigType, OptConfigType, OptMultiConfig
__all__ = [
'ConfigType', 'OptConfigType', 'OptMultiConfig', 'ArrayConverter',
'array_converter'
]
================================================
FILE: bip3d/utils/array_converter.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import functools
from inspect import getfullargspec
from typing import Callable, Optional, Tuple, Type, Union
import numpy as np
import torch
TemplateArrayType = Union[np.ndarray, torch.Tensor, list, tuple, int, float]
def array_converter(to_torch: bool = True,
apply_to: Tuple[str, ...] = tuple(),
template_arg_name_: Optional[str] = None,
recover: bool = True) -> Callable:
"""Wrapper function for data-type agnostic processing.
    First converts input arrays to PyTorch tensors or NumPy arrays for the
    intermediate calculation, then converts the output back to the original
    data type if `recover=True`.
Args:
to_torch (bool): Whether to convert to PyTorch tensors for middle
calculation. Defaults to True.
apply_to (Tuple[str]): The arguments to which we apply data-type
conversion. Defaults to an empty tuple.
template_arg_name_ (str, optional): Argument serving as the template
(return arrays should have the same dtype and device as the
template). Defaults to None. If None, we will use the first
argument in `apply_to` as the template argument.
recover (bool): Whether or not to recover the wrapped function outputs
to the `template_arg_name_` type. Defaults to True.
Raises:
ValueError: When template_arg_name_ is not among all args, or when
apply_to contains an arg which is not among all args, a ValueError
will be raised. When the template argument or an argument to
convert is a list or tuple, and cannot be converted to a NumPy
array, a ValueError will be raised.
        TypeError: When the type of the template argument or an argument to
            convert does not belong to the above range, or the contents of
            such a list-or-tuple-type argument do not share the same data
            type, a TypeError will be raised.
Returns:
Callable: Wrapped function.
Examples:
>>> import torch
>>> import numpy as np
>>>
>>> # Use torch addition for a + b,
>>> # and convert return values to the type of a
>>> @array_converter(apply_to=('a', 'b'))
>>> def simple_add(a, b):
>>> return a + b
>>>
>>> a = np.array([1.1])
>>> b = np.array([2.2])
>>> simple_add(a, b)
>>>
>>> # Use numpy addition for a + b,
>>> # and convert return values to the type of b
>>> @array_converter(to_torch=False, apply_to=('a', 'b'),
>>> template_arg_name_='b')
>>> def simple_add(a, b):
>>> return a + b
>>>
>>> simple_add(a, b)
>>>
>>> # Use torch funcs for floor(a) if flag=True else ceil(a),
>>> # and return the torch tensor
>>> @array_converter(apply_to=('a',), recover=False)
>>> def floor_or_ceil(a, flag=True):
>>> return torch.floor(a) if flag else torch.ceil(a)
>>>
>>> floor_or_ceil(a, flag=False)
"""
def array_converter_wrapper(func):
"""Outer wrapper for the function."""
@functools.wraps(func)
def new_func(*args, **kwargs):
"""Inner wrapper for the arguments."""
if len(apply_to) == 0:
return func(*args, **kwargs)
func_name = func.__name__
arg_spec = getfullargspec(func)
arg_names = arg_spec.args
arg_num = len(arg_names)
default_arg_values = arg_spec.defaults
if default_arg_values is None:
default_arg_values = []
no_default_arg_num = len(arg_names) - len(default_arg_values)
kwonly_arg_names = arg_spec.kwonlyargs
kwonly_default_arg_values = arg_spec.kwonlydefaults
if kwonly_default_arg_values is None:
kwonly_default_arg_values = {}
all_arg_names = arg_names + kwonly_arg_names
# in case there are args in the form of *args
if len(args) > arg_num:
named_args = args[:arg_num]
nameless_args = args[arg_num:]
else:
named_args = args
nameless_args = []
# template argument data type is used for all array-like arguments
if template_arg_name_ is None:
template_arg_name = apply_to[0]
else:
template_arg_name = template_arg_name_
if template_arg_name not in all_arg_names:
raise ValueError(f'{template_arg_name} is not among the '
f'argument list of function {func_name}')
# inspect apply_to
for arg_to_apply in apply_to:
if arg_to_apply not in all_arg_names:
raise ValueError(
f'{arg_to_apply} is not an argument of {func_name}')
new_args = []
new_kwargs = {}
converter = ArrayConverter()
target_type = torch.Tensor if to_torch else np.ndarray
# non-keyword arguments
for i, arg_value in enumerate(named_args):
if arg_names[i] in apply_to:
new_args.append(
converter.convert(input_array=arg_value,
target_type=target_type))
else:
new_args.append(arg_value)
if arg_names[i] == template_arg_name:
template_arg_value = arg_value
kwonly_default_arg_values.update(kwargs)
kwargs = kwonly_default_arg_values
# keyword arguments and non-keyword arguments using default value
for i in range(len(named_args), len(all_arg_names)):
arg_name = all_arg_names[i]
if arg_name in kwargs:
if arg_name in apply_to:
new_kwargs[arg_name] = converter.convert(
input_array=kwargs[arg_name],
target_type=target_type)
else:
new_kwargs[arg_name] = kwargs[arg_name]
else:
default_value = default_arg_values[i - no_default_arg_num]
if arg_name in apply_to:
new_kwargs[arg_name] = converter.convert(
input_array=default_value, target_type=target_type)
else:
new_kwargs[arg_name] = default_value
if arg_name == template_arg_name:
template_arg_value = kwargs[arg_name]
# add nameless args provided by *args (if exists)
new_args += nameless_args
return_values = func(*new_args, **new_kwargs)
converter.set_template(template_arg_value)
def recursive_recover(input_data):
if isinstance(input_data, (tuple, list)):
new_data = []
for item in input_data:
new_data.append(recursive_recover(item))
return tuple(new_data) if isinstance(input_data,
tuple) else new_data
elif isinstance(input_data, dict):
new_data = {}
for k, v in input_data.items():
new_data[k] = recursive_recover(v)
return new_data
elif isinstance(input_data, (torch.Tensor, np.ndarray)):
return converter.recover(input_data)
else:
return input_data
if recover:
return recursive_recover(return_values)
else:
return return_values
return new_func
return array_converter_wrapper
class ArrayConverter:
"""Utility class for data-type agnostic processing.
Args:
template_array (np.ndarray or torch.Tensor or list or tuple or int or
float, optional): Template array. Defaults to None.
"""
SUPPORTED_NON_ARRAY_TYPES = (int, float, np.int8, np.int16, np.int32,
np.int64, np.uint8, np.uint16, np.uint32,
np.uint64, np.float16, np.float32, np.float64)
def __init__(self,
template_array: Optional[TemplateArrayType] = None) -> None:
if template_array is not None:
self.set_template(template_array)
def set_template(self, array: TemplateArrayType) -> None:
"""Set template array.
Args:
array (np.ndarray or torch.Tensor or list or tuple or int or
float): Template array.
Raises:
ValueError: If input is list or tuple and cannot be converted to a
NumPy array, a ValueError is raised.
TypeError: If input type does not belong to the above range, or the
contents of a list or tuple do not share the same data type, a
TypeError is raised.
"""
self.array_type = type(array)
self.is_num = False
self.device = 'cpu'
if isinstance(array, np.ndarray):
self.dtype = array.dtype
elif isinstance(array, torch.Tensor):
self.dtype = array.dtype
self.device = array.device
elif isinstance(array, (list, tuple)):
try:
array = np.array(array)
if array.dtype not in self.SUPPORTED_NON_ARRAY_TYPES:
raise TypeError
self.dtype = array.dtype
except (ValueError, TypeError):
print('The following list cannot be converted to a numpy '
f'array of supported dtype:\n{array}')
raise
elif isinstance(array, (int, float)):
self.array_type = np.ndarray
self.is_num = True
self.dtype = np.dtype(type(array))
else:
raise TypeError(
f'Template type {self.array_type} is not supported.')
def convert(
self,
input_array: TemplateArrayType,
target_type: Optional[Type] = None,
target_array: Optional[Union[np.ndarray, torch.Tensor]] = None
) -> Union[np.ndarray, torch.Tensor]:
"""Convert input array to target data type.
Args:
input_array (np.ndarray or torch.Tensor or list or tuple or int or
float): Input array.
target_type (Type, optional): Type to which input array is
converted. It should be `np.ndarray` or `torch.Tensor`.
Defaults to None.
target_array (np.ndarray or torch.Tensor, optional): Template array
to which input array is converted. Defaults to None.
Raises:
ValueError: If input is list or tuple and cannot be converted to a
NumPy array, a ValueError is raised.
TypeError: If input type does not belong to the above range, or the
contents of a list or tuple do not share the same data type, a
TypeError is raised.
Returns:
np.ndarray or torch.Tensor: The converted array.
"""
if isinstance(input_array, (list, tuple)):
try:
input_array = np.array(input_array)
if input_array.dtype not in self.SUPPORTED_NON_ARRAY_TYPES:
raise TypeError
except (ValueError, TypeError):
print('The input cannot be converted to a single-type numpy '
f'array:\n{input_array}')
raise
elif isinstance(input_array, self.SUPPORTED_NON_ARRAY_TYPES):
input_array = np.array(input_array)
array_type = type(input_array)
assert target_type is not None or target_array is not None, \
'must specify a target'
if target_type is not None:
assert target_type in (np.ndarray, torch.Tensor), \
'invalid target type'
if target_type == array_type:
return input_array
elif target_type == np.ndarray:
# default dtype is float32
converted_array = input_array.cpu().numpy().astype(np.float32)
else:
# default dtype is float32, device is 'cpu'
converted_array = torch.tensor(input_array,
dtype=torch.float32)
else:
assert isinstance(target_array, (np.ndarray, torch.Tensor)), \
'invalid target array type'
if isinstance(target_array, array_type):
return input_array
elif isinstance(target_array, np.ndarray):
converted_array = input_array.cpu().numpy().astype(
target_array.dtype)
else:
converted_array = target_array.new_tensor(input_array)
return converted_array
def recover(
self, input_array: Union[np.ndarray, torch.Tensor]
) -> Union[np.ndarray, torch.Tensor, int, float]:
"""Recover input type to original array type.
Args:
input_array (np.ndarray or torch.Tensor): Input array.
Returns:
np.ndarray or torch.Tensor or int or float: Converted array.
"""
assert isinstance(input_array, (np.ndarray, torch.Tensor)), \
'invalid input array type'
if isinstance(input_array, self.array_type):
return input_array
elif isinstance(input_array, torch.Tensor):
converted_array = input_array.cpu().numpy().astype(self.dtype)
else:
converted_array = torch.tensor(input_array,
dtype=self.dtype,
device=self.device)
if self.is_num:
converted_array = converted_array.item()
return converted_array
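# --- Illustrative usage sketch (not part of the original module) ---
# Round-trip a float64 NumPy array through torch with ArrayConverter: convert
# to a float32 tensor for computation, then recover the template's type/dtype.
if __name__ == '__main__':
    a = np.array([1.0, 2.0], dtype=np.float64)
    conv = ArrayConverter(template_array=a)
    t = conv.convert(input_array=a, target_type=torch.Tensor)  # float32 tensor
    out = conv.recover(t * 2)  # back to an np.ndarray with the template dtype
    assert isinstance(out, np.ndarray) and out.dtype == np.float64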
================================================
FILE: bip3d/utils/default_color_map.py
================================================
DEFAULT_COLOR_MAP = {
'floor': [255, 193, 193],
'wall': [137, 54, 74],
'chair': [153, 69, 1],
'cabinet': [134, 199, 156],
'door': [92, 136, 89],
'table': [255.0, 187.0, 120.0],
'couch': [3, 95, 161],
'shelf': [255, 160, 98],
'window': [183, 121, 142],
'bed': [123, 104, 238],
'curtain': [210, 170, 100],
'plant': [163, 255, 0],
'stairs': [104, 84, 109],
'pillow': [76, 91, 113],
'counter': [146, 112, 198],
'bench': [250, 0, 30],
'rail': [230, 150, 140],
'sink': [135, 206, 250],
'mirror': [154, 208, 0],
'toilet': [0, 165, 120],
'refrigerator': [59, 105, 106],
'book': [142, 108, 45],
'tv': [183, 130, 88],
'blanket': [147, 211, 203],
'rack': [255, 208, 186],
'towel': [225, 199, 255],
'backpack': [255, 179, 240],
'roof': [92, 82, 55],
'bag': [209, 0, 151],
'board': [133, 129, 255],
'bicycle': [119, 11, 32],
'oven': [178, 90, 62],
'microwave': [79, 210, 114],
'desk': [109, 63, 54],
'doorframe': [199, 100, 0],
'wardrobe': [7, 246, 231],
'picture': [171, 134, 1],
'bathtub': [92, 0, 73],
'box': [188, 208, 182],
'stand': [146, 139, 141],
'clothes': [96, 96, 96],
'lamp': [107, 255, 200],
'dresser': [206, 186, 171],
'stool': [73, 77, 174],
'fireplace': [255, 77, 255],
'commode': [102, 102, 156],
'washing machine': [152, 251, 152],
'monitor': [208, 195, 210],
'window frame': [227, 255, 205],
'radiator': [191, 162, 208],
'mat': [250, 141, 255],
'shower': [154, 255, 154],
'ottoman': [95, 32, 0],
'column': [60, 143, 255],
'blinds': [134, 134, 103],
'stove': [128, 64, 128],
'bar': [72, 0, 118],
'pillar': [220, 20, 60],
'bin': [187, 255, 255],
'heater': [209, 226, 140],
'clothes dryer': [100, 170, 30],
'blackboard': [0, 82, 0],
'decoration': [107, 142, 35],
'steps': [120, 166, 157],
'windowsill': [9, 80, 61],
'cushion': [0, 228, 0],
'carpet': [175, 116, 175],
'copier': [241, 129, 0],
'countertop': [207, 138, 255],
'basket': [0, 0, 70],
'mailbox': [150, 100, 100],
'kitchen island': [220, 220, 0],
'washbasin': [0, 80, 100],
'drawer': [0, 220, 176],
'piano': [78, 180, 255],
'exercise equipment': [151, 0, 95],
'beam': [255, 255, 128],
'partition': [168, 171, 172],
'printer': [179, 0, 194],
'frame': [255, 180, 195],
'object': [0, 0, 0],
'adhesive tape': [0, 220, 176],
'air conditioner': [109, 63, 54],
'alarm': [0, 114, 143],
'album': [147, 186, 208],
'arch': [135, 158, 223],
'balcony': [70, 70, 70],
'ball': [96, 96, 96],
'banister': [196, 172, 0],
'barricade': [45, 89, 255],
'baseboard': [153, 69, 1],
'basin': [255.0, 187.0, 120.0],
'beanbag': [190, 153, 153],
'bidet': [123, 104, 238],
'body loofah': [196, 172, 0],
'boots': [134, 199, 156],
'bottle': [241, 129, 0],
'bowl': [92, 136, 89],
'bread': [119, 11, 32],
'broom': [0, 226, 252],
'brush': [255, 255, 128],
'bucket': [255, 73, 97],
'calendar': [76, 91, 113],
'camera': [72, 0, 118],
'can': [109, 63, 54],
'candle': [78, 180, 255],
'candlestick': [104, 84, 109],
'cap': [128, 76, 255],
'car': [107, 142, 35],
'cart': [255, 255, 128],
'case': [0, 0, 230],
'chandelier': [169, 164, 131],
'cleanser': [0, 165, 120],
'clock': [190, 153, 153],
'coat hanger': [179, 0, 194],
'coffee maker': [0, 82, 0],
'coil': [255, 179, 240],
'computer': [225, 199, 255],
'conducting wire': [150, 100, 100],
'container': [0, 0, 70],
'control': [255, 77, 255],
'cosmetics': [142, 108, 45],
'crate': [0, 226, 252],
'crib': [169, 164, 131],
'cube': [116, 112, 0],
'cup': [175, 116, 175],
'detergent': [255, 208, 186],
'device': [146, 139, 141],
'dish rack': [0, 0, 142],
'dishwasher': [92, 82, 55],
'dispenser': [95, 32, 0],
'divider': [219, 142, 185],
'door knob': [166, 74, 118],
'doorway': [134, 134, 103],
'dress': [0, 114, 143],
'drum': [107, 142, 35],
'duct': [0, 80, 100],
'dumbbell': [0, 0, 192],
'dustpan': [78, 180, 255],
'dvd': [0, 143, 149],
'eraser': [0, 82, 0],
'fan': [0, 0, 70],
'faucet': [84, 105, 51],
'fence': [190, 153, 153],
'file': [255, 228, 255],
'fire extinguisher': [107, 255, 200],
'flowerpot': [9, 80, 61],
'flush': [227, 255, 205],
'folder': [208, 229, 228],
'food': [109, 63, 54],
'footstool': [133, 129, 255],
'fruit': [179, 0, 194],
'furniture': [220, 20, 60],
'garage door': [217, 17, 255],
'garbage': [0, 82, 0],
'glass': [255, 99, 164],
'globe': [255, 77, 255],
'glove': [166, 196, 102],
'grab bar': [145, 148, 174],
'grass': [0, 60, 100],
'guitar': [73, 77, 174],
'hair dryer': [169, 164, 131],
'hamper': [241, 129, 0],
'handle': [142, 108, 45],
'hanger': [150, 100, 100],
'hat': [154, 208, 0],
'headboard': [171, 134, 1],
'headphones': [124, 74, 181],
'helmets': [209, 226, 140],
'holder': [151, 0, 95],
'hook': [92, 136, 89],
'humidifier': [209, 99, 106],
'ironware': [127, 167, 115],
'jacket': [255, 73, 97],
'jalousie': [255, 179, 240],
'jar': [106, 154, 176],
'kettle': [196, 172, 0],
'keyboard': [0, 125, 92],
'kitchenware': [74, 65, 105],
'knife': [70, 130, 180],
'label': [0, 228, 0],
'ladder': [0, 114, 143],
'laptop': [255, 180, 195],
'ledge': [58, 41, 149],
'letter': [0, 0, 192],
'light': [78, 180, 255],
'luggage': [0, 226, 252],
'machine': [197, 226, 255],
'magazine': [199, 100, 0],
'map': [183, 121, 142],
'mask': [74, 65, 105],
'mattress': [255, 179, 240],
'menu': [255, 255, 128],
'molding': [104, 84, 109],
'mop': [199, 100, 0],
'mouse': [5, 121, 0],
'napkins': [165, 42, 42],
'notebook': [175, 116, 175],
'pack': [0, 143, 149],
'package': [166, 196, 102],
'pad': [208, 229, 228],
'pan': [209, 99, 106],
'panel': [201, 57, 1],
'paper': [255, 179, 240],
'paper cutter': [207, 138, 255],
'pedestal': [64, 170, 64],
'pen': [193, 0, 92],
'person': [7, 246, 231],
'pipe': [255, 180, 195],
'pitcher': [220, 20, 60],
'plate': [142, 108, 45],
'player': [0, 143, 149],
'plug': [255, 77, 255],
'plunger': [165, 42, 42],
'pool': [153, 69, 1],
'pool table': [0, 0, 230],
'poster': [130, 114, 135],
'pot': [96, 36, 108],
'price tag': [255, 77, 255],
'projector': [179, 0, 194],
'purse': [0, 228, 0],
'radio': [116, 112, 0],
'range hood': [199, 100, 0],
'remote control': [188, 208, 182],
'ridge': [59, 105, 106],
'rod': [207, 138, 255],
'roll': [123, 104, 238],
'rope': [110, 76, 0],
'sack': [190, 153, 153],
'salt': [250, 0, 30],
'scale': [58, 41, 149],
'scissors': [60, 143, 255],
'screen': [0, 82, 0],
'seasoning': [254, 212, 124],
'shampoo': [70, 130, 180],
'sheet': [151, 0, 95],
'shirt': [190, 153, 153],
'shoe': [199, 100, 0],
'shovel': [241, 129, 0],
'sign': [208, 195, 210],
'soap': [109, 63, 54],
'soap dish': [166, 74, 118],
'soap dispenser': [95, 32, 0],
'socket': [255, 255, 255],
'speaker': [65, 70, 15],
'sponge': [0, 220, 176],
'spoon': [134, 134, 103],
'stall': [0, 60, 100],
'stapler': [246, 0, 122],
'statue': [196, 172, 0],
'stick': [0, 165, 120],
'stopcock': [0, 60, 100],
'structure': [220, 20, 60],
'sunglasses': [142, 108, 45],
'support': [209, 226, 140],
'switch': [7, 246, 231],
'tablet': [137, 54, 74],
'teapot': [0, 80, 100],
'telephone': [220, 220, 0],
'thermostat': [128, 76, 255],
'tissue': [73, 77, 174],
'tissue box': [96, 96, 96],
'toaster': [106, 0, 228],
'toilet paper': [84, 105, 51],
'toiletry': [128, 64, 128],
'tool': [220, 20, 60],
'toothbrush': [130, 114, 135],
'toothpaste': [0, 143, 149],
'toy': [255.0, 187.0, 120.0],
'tray': [255, 179, 240],
'treadmill': [166, 74, 118],
'trophy': [0, 220, 176],
'tube': [255, 255, 128],
'umbrella': [250, 0, 30],
'urn': [152, 251, 152],
'utensil': [220, 220, 0],
'vacuum cleaner': [96, 36, 108],
'vanity': [5, 121, 0],
'vase': [255, 193, 193],
'vent': [209, 226, 140],
'ventilation': [123, 104, 238],
'water cooler': [255, 255, 128],
'water heater': [145, 148, 174],
'wine': [220, 220, 0],
'wire': [96, 36, 108],
'wood': [127, 167, 115],
'wrap': [175, 116, 175],
}
================================================
FILE: bip3d/utils/dist_utils.py
================================================
import torch.distributed as dist
def reduce_mean(tensor):
""""Obtain the mean of tensor on different GPUs."""
if not (dist.is_available() and dist.is_initialized()):
return tensor
tensor = tensor.clone()
dist.all_reduce(tensor.div_(dist.get_world_size()), op=dist.ReduceOp.SUM)
return tensor
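# Illustrative usage sketch (assumes an initialized distributed process group):
#   loss = criterion(pred, target)          # scalar tensor on this rank
#   loss_avg = reduce_mean(loss.detach())   # mean over all ranks, e.g. for logging
# With a single non-distributed process, the input tensor is returned unchanged.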
================================================
FILE: bip3d/utils/line_mesh.py
================================================
"""Adapted from https://github.com/isl-
org/Open3D/pull/738#issuecomment-564785941.
Module which creates mesh lines from a line set
Open3D relies upon using glLineWidth to set line width on a LineSet
However, this method is now deprecated and not fully supported in
newer OpenGL versions.
See:
Open3D Github Pull Request:
https://github.com/intel-isl/Open3D/pull/738
Other Framework Issues:
https://github.com/openframeworks/openFrameworks/issues/3460
This module aims to solve this by converting a line into a triangular mesh
(which has thickness). The basic idea is to create a cylinder for each line
segment, translate it, and then rotate it.
License: MIT
"""
import numpy as np
import open3d as o3d
def align_vector_to_another(a=np.array([0, 0, 1]), b=np.array([1, 0, 0])):
"""Aligns vector a to vector b with axis angle rotation."""
if np.array_equal(a, b):
return None, None
axis_ = np.cross(a, b)
axis_ = axis_ / np.linalg.norm(axis_)
angle = np.arccos(np.dot(a, b))
return axis_, angle
def normalized(a, axis=-1, order=2):
"""Normalizes a numpy array of points."""
l2 = np.atleast_1d(np.linalg.norm(a, order, axis))
l2[l2 == 0] = 1
return a / np.expand_dims(l2, axis), l2
class LineMesh(object):
def __init__(self, points, lines=None, colors=[0, 1, 0], radius=0.15):
"""Creates a line represented as sequence of cylinder triangular
meshes.
Arguments:
points {ndarray} -- Numpy array of points, Nx3.
Keyword Arguments:
lines {list[list] or None} -- List of point index pairs denoting
line segments. If None, implicit lines from ordered pairwise
points. (default: {None})
colors {list} -- list of colors, or single color of the line
(default: {[0, 1, 0]})
radius {float} -- radius of cylinder (default: {0.15})
"""
self.points = np.array(points)
self.lines = np.array(
lines) if lines is not None else self.lines_from_ordered_points(
self.points)
self.colors = np.array(colors)
self.radius = radius
self.cylinder_segments = []
self.create_line_mesh()
@staticmethod
def lines_from_ordered_points(points):
lines = [[i, i + 1] for i in range(0, points.shape[0] - 1, 1)]
return np.array(lines)
def create_line_mesh(self):
first_points = self.points[self.lines[:, 0], :]
second_points = self.points[self.lines[:, 1], :]
line_segments = second_points - first_points
line_segments_unit, line_lengths = normalized(line_segments)
z_axis = np.array([0, 0, 1])
# Create triangular mesh cylinder segments of line
for i in range(line_segments_unit.shape[0]):
line_segment = line_segments_unit[i, :]
line_length = line_lengths[i]
# get axis-angle rotation to align cylinder with line segment
axis, angle = align_vector_to_another(z_axis, line_segment)
# Get translation vector
translation = first_points[i, :] + \
line_segment * line_length * 0.5
# create cylinder and apply transformations
cylinder_segment = o3d.geometry.TriangleMesh.create_cylinder(
self.radius, line_length)
cylinder_segment = cylinder_segment.translate(translation,
relative=False)
if axis is not None:
axis_a = axis * angle
cylinder_segment = cylinder_segment.rotate(
R=o3d.geometry.get_rotation_matrix_from_axis_angle(
axis_a)) # center=True)
# cylinder_segment = cylinder_segment.rotate(
# axis_a, center=True,
# type=o3d.geometry.RotationType.AxisAngle)
# color cylinder
color = self.colors if self.colors.ndim == 1 \
else self.colors[i, :]
cylinder_segment.paint_uniform_color(color)
self.cylinder_segments.append(cylinder_segment)
def add_line(self, vis):
"""Adds this line to the visualizer."""
for cylinder in self.cylinder_segments:
vis.add_geometry(cylinder)
def remove_line(self, vis):
"""Removes this line from the visualizer."""
for cylinder in self.cylinder_segments:
vis.remove_geometry(cylinder)
if __name__ == '__main__':
def main():
print('Demonstrating LineMesh vs LineSet')
# Create Line Set
points = [[0, 0, 0], [1, 0, 0], [0, 1, 0], [1, 1, 0], [0, 0, 1],
[1, 0, 1], [0, 1, 1], [1, 1, 1]]
lines = [[0, 1], [0, 2], [1, 3], [2, 3], [4, 5], [4, 6], [5, 7],
[6, 7], [0, 4], [1, 5], [2, 6], [3, 7]]
colors = [[1, 0, 0] for i in range(len(lines))]
line_set = o3d.geometry.LineSet()
line_set.points = o3d.utility.Vector3dVector(points)
line_set.lines = o3d.utility.Vector2iVector(lines)
line_set.colors = o3d.utility.Vector3dVector(colors)
# Create Line Mesh 1
points = np.array(points) + [0, 0, 2]
line_mesh1 = LineMesh(points, lines, colors, radius=0.02)
line_mesh1_geoms = line_mesh1.cylinder_segments
# Create Line Mesh 2
points = np.array(points) + [0, 2, 0]
line_mesh2 = LineMesh(points, radius=0.03)
line_mesh2_geoms = line_mesh2.cylinder_segments
o3d.visualization.draw_geometries(
[line_set, *line_mesh1_geoms, *line_mesh2_geoms])
main()
================================================
FILE: bip3d/utils/typing_config.py
================================================
from collections.abc import Sized
from typing import Dict, List, Optional, Tuple, Union
import numpy as np
import torch
from mmdet.models.task_modules.samplers import SamplingResult
from mmengine.config import ConfigDict
from mmengine.structures import BaseDataElement, InstanceData
class Det3DDataElement(BaseDataElement):
@property
def gt_instances_3d(self) -> InstanceData:
return self._gt_instances_3d
@gt_instances_3d.setter
def gt_instances_3d(self, value: InstanceData) -> None:
self.set_field(value, '_gt_instances_3d', dtype=InstanceData)
@gt_instances_3d.deleter
def gt_instances_3d(self) -> None:
del self._gt_instances_3d
@property
def pred_instances_3d(self) -> InstanceData:
return self._pred_instances_3d
@pred_instances_3d.setter
def pred_instances_3d(self, value: InstanceData) -> None:
self.set_field(value, '_pred_instances_3d', dtype=InstanceData)
@pred_instances_3d.deleter
def pred_instances_3d(self) -> None:
del self._pred_instances_3d
IndexType = Union[str, slice, int, list, torch.LongTensor,
torch.cuda.LongTensor, torch.BoolTensor,
torch.cuda.BoolTensor, np.ndarray]
class PointData(BaseDataElement):
"""Data structure for point-level annotations or predictions.
All data items in ``data_fields`` of ``PointData`` meet the following
requirements:
- They are all one-dimensional.
- They should all have the same length.
`PointData` is used to save point-level semantic and instance masks;
it can also temporarily hold `instances_labels` and `instances_scores`.
In the future, we may consider moving the instance-level info into
`gt_instances_3d` and `pred_instances_3d`.
Examples:
>>> metainfo = dict(
... sample_idx=random.randint(0, 100))
>>> points = np.random.randint(0, 255, (100, 3))
>>> point_data = PointData(metainfo=metainfo,
... points=points)
>>> print(len(point_data))
100
>>> # slice
>>> slice_data = point_data[10:60]
>>> assert len(slice_data) == 50
>>> # set
>>> point_data.pts_semantic_mask = torch.randint(0, 255, (100,))
>>> point_data.pts_instance_mask = torch.randint(0, 255, (100,))
>>> assert tuple(point_data.pts_semantic_mask.shape) == (100,)
>>> assert tuple(point_data.pts_instance_mask.shape) == (100,)
"""
def __setattr__(self, name: str, value: Sized) -> None:
"""setattr is only used to set data.
The value must have the `__len__` attribute and the same length as
`PointData`.
"""
if name in ('_metainfo_fields', '_data_fields'):
if not hasattr(self, name):
super().__setattr__(name, value)
else:
raise AttributeError(f'{name} has been used as a '
'private attribute, which is immutable.')
else:
assert isinstance(value,
Sized), 'value must contain `__len__` attribute'
# TODO: make sure the input value share the same length
super().__setattr__(name, value)
__setitem__ = __setattr__
def __getitem__(self, item: IndexType) -> 'PointData':
"""
Args:
item (str, int, list, :obj:`slice`, :obj:`numpy.ndarray`,
:obj:`torch.LongTensor`, :obj:`torch.BoolTensor`):
Get the corresponding values according to item.
Returns:
:obj:`PointData`: Corresponding values.
"""
if isinstance(item, list):
item = np.array(item)
if isinstance(item, np.ndarray):
# The default int type of numpy is platform dependent, int32 for
# windows and int64 for linux. `torch.Tensor` requires the index
# to be int64, therefore we simply convert it to int64 here.
# More details in https://github.com/numpy/numpy/issues/9464
item = item.astype(np.int64) if item.dtype == np.int32 else item
item = torch.from_numpy(item)
assert isinstance(
item, (str, slice, int, torch.LongTensor, torch.cuda.LongTensor,
torch.BoolTensor, torch.cuda.BoolTensor))
if isinstance(item, str):
return getattr(self, item)
if isinstance(item, int):
if item >= len(self) or item < -len(self): # type: ignore
raise IndexError(f'Index {item} out of range!')
else:
# keep the dimension
item = slice(item, None, len(self))
new_data = self.__class__(metainfo=self.metainfo)
if isinstance(item, torch.Tensor):
assert item.dim() == 1, 'Only support to get the' \
' values along the first dimension.'
if isinstance(item, (torch.BoolTensor, torch.cuda.BoolTensor)):
assert len(item) == len(self), 'The shape of the ' \
'input(BoolTensor) ' \
f'{len(item)} ' \
'does not match the shape ' \
'of the indexed tensor ' \
'in results_field ' \
f'{len(self)} at ' \
'first dimension.'
for k, v in self.items():
if isinstance(v, torch.Tensor):
new_data[k] = v[item]
elif isinstance(v, np.ndarray):
new_data[k] = v[item.cpu().numpy()]
elif isinstance(
v, (str, list, tuple)) or (hasattr(v, '__getitem__')
and hasattr(v, 'cat')):
# convert to indexes from BoolTensor
if isinstance(item,
(torch.BoolTensor, torch.cuda.BoolTensor)):
indexes = torch.nonzero(item).view(
-1).cpu().numpy().tolist()
else:
indexes = item.cpu().numpy().tolist()
slice_list = []
if indexes:
for index in indexes:
slice_list.append(slice(index, None, len(v)))
else:
slice_list.append(slice(None, 0, None))
r_list = [v[s] for s in slice_list]
if isinstance(v, (str, list, tuple)):
new_value = r_list[0]
for r in r_list[1:]:
new_value = new_value + r
else:
new_value = v.cat(r_list)
new_data[k] = new_value
else:
raise ValueError(
f'The type of `{k}` is `{type(v)}`, which has no '
'attribute of `cat`, so it does not '
'support slice with `bool`')
else:
# item is a slice
for k, v in self.items():
new_data[k] = v[item]
return new_data # type: ignore
def __len__(self) -> int:
"""int: The length of `PointData`."""
if len(self._data_fields) > 0:
return len(self.values()[0])
else:
return 0
# Type hint of config data
ConfigType = Union[ConfigDict, dict]
OptConfigType = Optional[ConfigType]
# Type hint of one or more config data
MultiConfig = Union[ConfigType, List[ConfigType]]
OptMultiConfig = Optional[MultiConfig]
InstanceList = List[InstanceData]
OptInstanceList = Optional[InstanceList]
ForwardResults = Union[Dict[str, torch.Tensor], List[Det3DDataElement],
Tuple[torch.Tensor], torch.Tensor]
SamplingResultList = List[SamplingResult]
OptSamplingResultList = Optional[SamplingResultList]
SampleList = List[Det3DDataElement]
OptSampleList = Optional[SampleList]
================================================
FILE: configs/__init__.py
================================================
================================================
FILE: configs/bip3d_det.py
================================================
_base_ = ["./default_runtime.py"]
import os
from bip3d.datasets.embodiedscan_det_grounding_dataset import (
class_names, head_labels, common_labels, tail_labels
)
DEBUG = os.environ.get("CLUSTER") is None
del os
backend_args = None
metainfo = dict(classes="all")
depth = True
depth_loss = True
z_range=[-0.2, 3]
min_depth = 0.25
max_depth = 10
num_depth = 64
model = dict(
type="BIP3D",
input_3d="depth_img",
use_depth_grid_mask=True,
data_preprocessor=dict(
type="CustomDet3DDataPreprocessor",
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True,
pad_size_divisor=32,
),
backbone=dict(
type="mmdet.SwinTransformer",
embed_dims=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=7,
mlp_ratio=4,
qkv_bias=True,
qk_scale=None,
drop_rate=0.0,
attn_drop_rate=0.0,
drop_path_rate=0.2,
patch_norm=True,
out_indices=(1, 2, 3),
with_cp=True,
convert_weights=False,
),
neck=dict(
type="mmdet.ChannelMapper",
in_channels=[192, 384, 768],
kernel_size=1,
out_channels=256,
act_cfg=None,
bias=True,
norm_cfg=dict(type="GN", num_groups=32),
num_outs=4,
),
text_encoder=dict(
type="BertModel",
special_tokens_list=["[CLS]", "[SEP]"],
name="./ckpt/bert-base-uncased",
pad_to_max=False,
use_sub_sentence_represent=True,
add_pooling_layer=False,
max_tokens=768,
use_checkpoint=True,
return_tokenized=True,
),
backbone_3d=(
dict(
type="mmdet.ResNet",
depth=34,
in_channels=1,
base_channels=4,
num_stages=4,
out_indices=(1, 2, 3),
norm_cfg=dict(type="BN", requires_grad=True),
norm_eval=True,
with_cp=True,
style="pytorch",
)
if depth
else None
),
neck_3d=(
dict(
type="mmdet.ChannelMapper",
in_channels=[8, 16, 32],
kernel_size=1,
out_channels=32,
act_cfg=None,
bias=True,
norm_cfg=dict(type="GN", num_groups=4),
num_outs=4,
)
if depth
else None
),
feature_enhancer=dict(
type="TextImageDeformable2DEnhancer",
num_layers=6,
text_img_attn_block=dict(
v_dim=256, l_dim=256, embed_dim=1024, num_heads=4, init_values=1e-4
),
img_attn_block=dict(
self_attn_cfg=dict(
embed_dims=256, num_levels=4, dropout=0.0, im2col_step=64
),
ffn_cfg=dict(
embed_dims=256, feedforward_channels=2048, ffn_drop=0.0
),
),
text_attn_block=dict(
self_attn_cfg=dict(num_heads=4, embed_dims=256, dropout=0.0),
ffn_cfg=dict(
embed_dims=256, feedforward_channels=1024, ffn_drop=0.0
),
),
embed_dims=256,
num_feature_levels=4,
positional_encoding=dict(
num_feats=128, normalize=True, offset=0.0, temperature=20
),
),
spatial_enhancer=dict(
type="DepthFusionSpatialEnhancer",
embed_dims=256,
feature_3d_dim=32,
num_depth_layers=2,
min_depth=min_depth,
max_depth=max_depth,
num_depth=num_depth,
with_feature_3d=depth,
loss_depth_weight=1.0 if depth_loss else -1,
),
decoder=dict(
type="BBox3DDecoder",
look_forward_twice=True,
instance_bank=dict(
type="InstanceBank",
num_anchor=50,
anchor="anchor_files/embodiedscan_kmeans.npy",
embed_dims=256,
anchor_in_camera=True,
),
anchor_encoder=dict(
type="DoF9BoxEncoder",
embed_dims=256,
rot_dims=3,
),
graph_model=dict(
type="MultiheadAttention",
embed_dims=256,
num_heads=8,
dropout=0.0,
batch_first=True,
),
ffn=dict(
type="FFN",
embed_dims=256,
feedforward_channels=2048,
ffn_drop=0.0,
),
norm_layer=dict(type="LN", normalized_shape=256),
deformable_model=dict(
type="DeformableFeatureAggregation",
embed_dims=256,
num_groups=8,
num_levels=4,
use_camera_embed=True,
use_deformable_func=True,
with_depth=True,
min_depth=min_depth,
max_depth=max_depth,
kps_generator=dict(
type="SparseBox3DKeyPointsGenerator",
fix_scale=[
[0, 0, 0],
[0.45, 0, 0],
[-0.45, 0, 0],
[0, 0.45, 0],
[0, -0.45, 0],
[0, 0, 0.45],
[0, 0, -0.45],
],
num_learnable_pts=9,
),
with_value_proj=True,
filter_outlier=True,
),
text_cross_attn=dict(
type="MultiheadAttention",
embed_dims=256,
num_heads=8,
dropout=0.0,
batch_first=True,
),
refine_layer=dict(
type="GroundingRefineClsHead",
embed_dims=256,
output_dim=9,
cls_bias=True,
# cls_layers=True,
),
loss_cls=dict(
type="mmdet.FocalLoss",
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0,
),
loss_reg=dict(
type="DoF9BoxLoss",
loss_weight_wd=1.0,
loss_weight_pcd=0.0,
loss_weight_cd=0.8,
),
sampler=dict(
type="Grounding3DTarget",
cls_weight=1.0,
box_weight=1.0,
num_dn=100,
cost_weight_wd=1.0,
cost_weight_pcd=0.0,
cost_weight_cd=0.8,
with_dn_query=True,
num_classes=284,
embed_dims=256,
),
gt_reg_key="gt_bboxes_3d",
gt_cls_key="tokens_positive",
post_processor=dict(
type="GroundingBox3DPostProcess",
num_output=1000,
),
with_instance_id=False,
),
)
dataset_type = "EmbodiedScanDetGroundingDataset"
data_root = "data"
metainfo = dict(
classes=class_names,
classes_split=(head_labels, common_labels, tail_labels),
box_type_3d="euler-depth",
)
image_wh = (512, 512)
rotate_3rscan = True
sep_token = "[SEP]"
cam_standardization = True
if cam_standardization:
resize = dict(
type="CamIntrisicStandardization",
dst_intrinsic=[
[432.57943431339237, 0.0, 256],
[0.0, 539.8570854208559, 256],
[0.0, 0.0, 1.0],
],
dst_wh=image_wh,
)
else:
resize = dict(type="CustomResize", scale=image_wh, keep_ratio=False)
train_pipeline = [
dict(type="LoadAnnotations3D"),
dict(
type="MultiViewPipeline",
n_images=1,
max_n_images=18,
transforms=[
dict(type="LoadImageFromFile", backend_args=backend_args),
dict(type="LoadDepthFromFile", backend_args=backend_args),
resize,
dict(
type="ResizeCropFlipImage",
data_aug_conf={
"resize_lim": (1.0, 1.0),
"final_dim": image_wh,
"bot_pct_lim": (0.0, 0.05),
"rot_lim": (0, 0),
"H": image_wh[1],
"W": image_wh[0],
"rand_flip": False,
},
),
],
rotate_3rscan=rotate_3rscan,
ordered=False,
),
dict(
type="CategoryGroundingDataPrepare",
classes=class_names,
filter_others=True,
sep_token=sep_token,
max_class=128,
training=True,
z_range=z_range,
),
dict(
type="Pack3DDetInputs",
keys=["img", "depth_img", "gt_bboxes_3d", "gt_labels_3d"],
),
]
if depth_loss:
train_pipeline.append(
dict(
type="DepthProbLabelGenerator",
origin_stride=4,
min_depth=min_depth,
max_depth=max_depth,
num_depth=num_depth,
),
)
test_pipeline = [
dict(type="LoadAnnotations3D"),
dict(
type="MultiViewPipeline",
n_images=1,
max_n_images=50,
ordered=True,
transforms=[
dict(type="LoadImageFromFile", backend_args=backend_args),
dict(type="LoadDepthFromFile", backend_args=backend_args),
resize,
],
rotate_3rscan=rotate_3rscan,
),
dict(
type="CategoryGroundingDataPrepare",
classes=class_names,
sep_token=sep_token,
training=False,
filter_others=False,
),
dict(
type="Pack3DDetInputs",
keys=["img", "depth_img", "gt_bboxes_3d", "gt_labels_3d"],
),
]
trainval = False
data_version = "v1"
if data_version == "v1":
train_ann_file = "embodiedscan/embodiedscan_infos_train.pkl"
val_ann_file = "embodiedscan/embodiedscan_infos_val.pkl"
elif data_version == "v2":
train_ann_file = "embodiedscan-v2/embodiedscan_infos_train.pkl"
val_ann_file = "embodiedscan-v2/embodiedscan_infos_val.pkl"
else:
assert False
train_dataset = dict(
type=dataset_type,
data_root=data_root,
ann_file=train_ann_file,
pipeline=train_pipeline,
test_mode=False,
filter_empty_gt=True,
box_type_3d="Euler-Depth",
metainfo=metainfo,
)
if trainval:
train_dataset = dict(
type="ConcatDataset",
datasets=[
train_dataset,
dict(
type=dataset_type,
data_root=data_root,
ann_file=val_ann_file,
pipeline=train_pipeline,
test_mode=False,
filter_empty_gt=True,
box_type_3d="Euler-Depth",
metainfo=metainfo,
)
]
)
train_dataloader = dict(
batch_size=1,
num_workers=4 if not DEBUG else 0,
persistent_workers=False,
sampler=dict(type="DefaultSampler", shuffle=True),
dataset=dict(
type="RepeatDataset",
times=10,
dataset=train_dataset,
),
)
val_dataloader = dict(
batch_size=1,
num_workers=4,
persistent_workers=False,
drop_last=False,
sampler=dict(type="DefaultSampler", shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=val_ann_file,
pipeline=test_pipeline,
test_mode=True,
filter_empty_gt=True,
box_type_3d="Euler-Depth",
metainfo=metainfo,
),
)
test_dataloader = val_dataloader
val_evaluator = dict(
type="IndoorDetMetric",
collect_dir="/job_data/.dist_test" if not DEBUG else None,
# collect_device="gpu"
)
test_evaluator = val_evaluator
max_epochs = 24
train_cfg = dict(
type="EpochBasedTrainLoop", max_epochs=max_epochs, val_interval=1
)
val_cfg = dict(type="ValLoop")
test_cfg = dict(type="TestLoop")
lr = 2e-4
optim_wrapper = dict(
type="OptimWrapper",
optimizer=dict(type="AdamW", lr=lr, weight_decay=0.0005),
paramwise_cfg=dict(
custom_keys={
"backbone.": dict(lr_mult=0.1),
"text_encoder": dict(lr_mult=0.05),
"absolute_pos_embed": dict(decay_mult=0.0),
}
),
clip_grad=dict(max_norm=10, norm_type=2),
)
# learning rate
param_scheduler = [
dict(
type="LinearLR", start_factor=0.001, by_epoch=False, begin=0, end=500
),
dict(
type="MultiStepLR",
begin=0,
end=max_epochs,
by_epoch=True,
milestones=[int(max_epochs / 12 * 8), int(max_epochs / 12 * 11)],
gamma=0.1,
),
]
custom_hooks = [dict(type="EmptyCacheHook", after_iter=False)]
default_hooks = dict(
checkpoint=dict(type="CheckpointHook", interval=1, max_keep_ckpts=3),
)
vis_backends = [
dict(
type="TensorboardVisBackend",
save_dir="/job_tboard" if not DEBUG else "./work-dir",
),
]
visualizer = dict(
type="Visualizer",
vis_backends=vis_backends,
name="visualizer",
)
load_from = "ckpt/groundingdino_swint_ogc_mmdet-822d7e9d-rename.pth"
================================================
FILE: configs/bip3d_det_grounding.py
================================================
_base_ = ["./bip3d_grounding.py"]
from mmengine import Config
import os
det_config = Config.fromfile("configs/bip3d_det.py")
det_train_dataset = det_config.train_dataset
del Config, det_config
train_dataloader = _base_.train_dataloader
DEBUG = os.environ.get("CLUSTER") is None
train_dataloader["dataset"] = dict(
type="ConcatDataset",
datasets=[
dict(
type="RepeatDataset",
times=20,
dataset=det_train_dataset,
),
train_dataloader["dataset"],
]
)
max_iters = 50000
train_cfg = dict(
type="IterBasedTrainLoop", max_iters=max_iters, val_interval=25000,
_delete_=True,
)
val_cfg = dict(type="ValLoop")
test_cfg = dict(type="TestLoop")
lr = 2e-4
optim_wrapper = dict(
type="OptimWrapper",
optimizer=dict(type="AdamW", lr=lr, weight_decay=0.0005),
paramwise_cfg=dict(
custom_keys={
"backbone.": dict(lr_mult=0.1),
"text_encoder": dict(lr_mult=0.05),
"absolute_pos_embed": dict(decay_mult=0.0),
}
),
clip_grad=dict(max_norm=10, norm_type=2),
)
# learning rate
param_scheduler = [
dict(
type="LinearLR", start_factor=0.001, by_epoch=False, begin=0, end=500
),
dict(
type="MultiStepLR",
begin=0,
end=max_iters,
by_epoch=False,
milestones=[int(max_iters*0.8), int(max_iters*0.95)],
gamma=0.1,
),
]
custom_hooks = [dict(type="EmptyCacheHook", after_iter=False)]
default_hooks = dict(
logger=dict(
type="LoggerHook",
interval=25,
log_metric_by_epoch=False,
),
checkpoint=dict(
type="CheckpointHook", interval=25000, max_keep_ckpts=3, by_epoch=False
),
)
vis_backends = [
dict(
type="TensorboardVisBackend",
save_dir="/job_tboard" if not DEBUG else "./work-dir",
),
]
visualizer = dict(
type="Visualizer",
vis_backends=vis_backends,
name="visualizer",
)
log_processor = dict(type="LogProcessor", window_size=50, by_epoch=False)
load_from = "ckpt/bip3d_det.pth"
================================================
FILE: configs/bip3d_det_rgb.py
================================================
_base_ = ["./bip3d_det.py"]
model = dict(
backbone_3d=None,
neck_3d=None,
spatial_enhancer=dict(with_feature_3d=False),
decoder=dict(deformable_model=dict(with_depth=False)),
)
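# Note: this config inherits bip3d_det.py via `_base_` and yields the RGB-only
# variant by overriding the depth branch (`backbone_3d`/`neck_3d`) with None
# and disabling the depth-dependent fusion options above.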
================================================
FILE: configs/bip3d_grounding.py
================================================
_base_ = ["./default_runtime.py"]
import os
from bip3d.datasets.embodiedscan_det_grounding_dataset import (
class_names, head_labels, common_labels, tail_labels
)
DEBUG = os.environ.get("CLUSTER") is None
del os
backend_args = None
metainfo = dict(classes="all")
depth = True
depth_loss = True
z_range=[-0.2, 3]
min_depth = 0.25
max_depth = 10
num_depth = 64
model = dict(
type="BIP3D",
input_3d="depth_img",
use_depth_grid_mask=True,
data_preprocessor=dict(
type="CustomDet3DDataPreprocessor",
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True,
pad_size_divisor=32,
),
backbone=dict(
type="mmdet.SwinTransformer",
embed_dims=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=7,
mlp_ratio=4,
qkv_bias=True,
qk_scale=None,
drop_rate=0.0,
attn_drop_rate=0.0,
drop_path_rate=0.2,
patch_norm=True,
out_indices=(1, 2, 3),
with_cp=True,
convert_weights=False,
),
neck=dict(
type="mmdet.ChannelMapper",
in_channels=[192, 384, 768],
kernel_size=1,
out_channels=256,
act_cfg=None,
bias=True,
norm_cfg=dict(type="GN", num_groups=32),
num_outs=4,
),
text_encoder=dict(
type="BertModel",
special_tokens_list=["[CLS]", "[SEP]"],
name="./ckpt/bert-base-uncased",
pad_to_max=False,
use_sub_sentence_represent=True,
add_pooling_layer=False,
max_tokens=768,
use_checkpoint=True,
return_tokenized=True,
),
backbone_3d=(
dict(
type="mmdet.ResNet",
depth=34,
in_channels=1,
base_channels=4,
num_stages=4,
out_indices=(1, 2, 3),
norm_cfg=dict(type="BN", requires_grad=True),
norm_eval=True,
with_cp=True,
style="pytorch",
)
if depth
else None
),
neck_3d=(
dict(
type="mmdet.ChannelMapper",
in_channels=[8, 16, 32],
kernel_size=1,
out_channels=32,
act_cfg=None,
bias=True,
norm_cfg=dict(type="GN", num_groups=4),
num_outs=4,
)
if depth
else None
),
feature_enhancer=dict(
type="TextImageDeformable2DEnhancer",
num_layers=6,
text_img_attn_block=dict(
v_dim=256, l_dim=256, embed_dim=1024, num_heads=4, init_values=1e-4
),
img_attn_block=dict(
self_attn_cfg=dict(
embed_dims=256, num_levels=4, dropout=0.0, im2col_step=64
),
ffn_cfg=dict(
embed_dims=256, feedforward_channels=2048, ffn_drop=0.0
),
),
text_attn_block=dict(
self_attn_cfg=dict(num_heads=4, embed_dims=256, dropout=0.0),
ffn_cfg=dict(
embed_dims=256, feedforward_channels=1024, ffn_drop=0.0
),
),
embed_dims=256,
num_feature_levels=4,
positional_encoding=dict(
num_feats=128, normalize=True, offset=0.0, temperature=20
),
),
spatial_enhancer=dict(
type="DepthFusionSpatialEnhancer",
embed_dims=256,
feature_3d_dim=32,
num_depth_layers=2,
min_depth=min_depth,
max_depth=max_depth,
num_depth=num_depth,
with_feature_3d=depth,
loss_depth_weight=1.0 if depth_loss else -1,
),
decoder=dict(
type="BBox3DDecoder",
look_forward_twice=True,
instance_bank=dict(
type="InstanceBank",
num_anchor=50,
anchor="anchor_files/embodiedscan_kmeans.npy",
embed_dims=256,
anchor_in_camera=True,
),
anchor_encoder=dict(
type="DoF9BoxEncoder",
embed_dims=256,
rot_dims=3,
),
graph_model=dict(
type="MultiheadAttention",
embed_dims=256,
num_heads=8,
dropout=0.0,
batch_first=True,
),
ffn=dict(
type="FFN",
embed_dims=256,
feedforward_channels=2048,
ffn_drop=0.0,
),
norm_layer=dict(type="LN", normalized_shape=256),
deformable_model=dict(
type="DeformableFeatureAggregation",
embed_dims=256,
num_groups=8,
num_levels=4,
use_camera_embed=True,
use_deformable_func=True,
with_depth=True,
min_depth=min_depth,
max_depth=max_depth,
kps_generator=dict(
type="SparseBox3DKeyPointsGenerator",
fix_scale=[
[0, 0, 0],
[0.45, 0, 0],
[-0.45, 0, 0],
[0, 0.45, 0],
[0, -0.45, 0],
[0, 0, 0.45],
[0, 0, -0.45],
],
num_learnable_pts=9,
),
with_value_proj=True,
filter_outlier=True,
),
text_cross_attn=dict(
type="MultiheadAttention",
embed_dims=256,
num_heads=8,
dropout=0.0,
batch_first=True,
),
refine_layer=dict(
type="GroundingRefineClsHead",
embed_dims=256,
output_dim=9,
cls_bias=True,
# cls_layers=True,
),
loss_cls=dict(
type="mmdet.FocalLoss",
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0,
),
loss_reg=dict(
type="DoF9BoxLoss",
loss_weight_wd=1.0,
loss_weight_pcd=0.0,
loss_weight_cd=0.8,
),
sampler=dict(
type="Grounding3DTarget",
cls_weight=1.0,
box_weight=1.0,
num_dn=100,
cost_weight_wd=1.0,
cost_weight_pcd=0.0,
cost_weight_cd=0.8,
with_dn_query=True,
num_classes=284,
embed_dims=256,
),
gt_reg_key="gt_bboxes_3d",
gt_cls_key="tokens_positive",
post_processor=dict(
type="GroundingBox3DPostProcess",
num_output=1000,
),
with_instance_id=False,
),
)
dataset_type = "EmbodiedScanDetGroundingDataset"
data_root = "data"
metainfo = dict(
classes=class_names,
classes_split=(head_labels, common_labels, tail_labels),
box_type_3d="euler-depth",
)
image_wh = (512, 512)
rotate_3rscan = True
sep_token = "[SEP]"
cam_standardization = True
if cam_standardization:
resize = dict(
type="CamIntrisicStandardization",
dst_intrinsic=[
[432.57943431339237, 0.0, 256],
[0.0, 539.8570854208559, 256],
[0.0, 0.0, 1.0],
],
dst_wh=image_wh,
)
else:
resize = dict(type="CustomResize", scale=image_wh, keep_ratio=False)
train_pipeline = [
dict(type="LoadAnnotations3D"),
dict(
type="MultiViewPipeline",
n_images=1,
max_n_images=18,
transforms=[
dict(type="LoadImageFromFile", backend_args=backend_args),
dict(type="LoadDepthFromFile", backend_args=backend_args),
resize,
dict(
type="ResizeCropFlipImage",
data_aug_conf={
"resize_lim": (1.0, 1.0),
"final_dim": image_wh,
"bot_pct_lim": (0.0, 0.05),
"rot_lim": (0, 0),
"H": image_wh[1],
"W": image_wh[0],
"rand_flip": False,
},
),
],
rotate_3rscan=rotate_3rscan,
ordered=True,
),
dict(
type="Pack3DDetInputs",
keys=["img", "depth_img", "gt_bboxes_3d", "gt_labels_3d"],
),
]
if depth_loss:
train_pipeline.append(
dict(
type="DepthProbLabelGenerator",
origin_stride=4,
min_depth=min_depth,
max_depth=max_depth,
num_depth=num_depth,
),
)
test_pipeline = [
dict(type="LoadAnnotations3D"),
dict(
type="MultiViewPipeline",
n_images=1,
max_n_images=50,
ordered=True,
transforms=[
dict(type="LoadImageFromFile", backend_args=backend_args),
dict(type="LoadDepthFromFile", backend_args=backend_args),
resize,
],
rotate_3rscan=rotate_3rscan,
),
dict(
type="Pack3DDetInputs",
keys=["img", "depth_img", "gt_bboxes_3d", "gt_labels_3d"],
),
]
trainval = False
data_version = "v1"
if data_version == "v1":
train_ann_file = "embodiedscan/embodiedscan_infos_train.pkl"
val_ann_file = "embodiedscan/embodiedscan_infos_val.pkl"
train_vg_file = "embodiedscan/embodiedscan_train_vg_all.json"
val_vg_file = "embodiedscan/embodiedscan_val_vg_all.json"
elif data_version == "v1-mini":
train_ann_file = "embodiedscan/embodiedscan_infos_train.pkl"
val_ann_file = "embodiedscan/embodiedscan_infos_val.pkl"
train_vg_file = "embodiedscan/embodiedscan_train_mini_vg.json"
val_vg_file = "embodiedscan/embodiedscan_val_mini_vg.json"
elif data_version == "v2":
train_ann_file = "embodiedscan-v2/embodiedscan_infos_train.pkl"
val_ann_file = "embodiedscan-v2/embodiedscan_infos_val.pkl"
train_vg_file = "embodiedscan-v2/embodiedscan_train_vg.json"
val_vg_file = "embodiedscan-v2/embodiedscan_val_vg.json"
test_ann_file = "embodiedscan/embodiedscan_infos_test.pkl"
test_vg_file = "embodiedscan/embodiedscan_test_vg.json"
train_dataset = dict(
type=dataset_type,
data_root=data_root,
ann_file=train_ann_file,
pipeline=train_pipeline,
test_mode=False,
filter_empty_gt=True,
box_type_3d="Euler-Depth",
metainfo=metainfo,
mode="grounding",
vg_file=train_vg_file,
num_text=10,
sep_token=sep_token,
)
if trainval:
train_dataset = dict(
type="ConcatDataset",
datasets=[
train_dataset,
dict(
type=dataset_type,
data_root=data_root,
ann_file=val_ann_file,
pipeline=train_pipeline,
test_mode=False,
filter_empty_gt=True,
box_type_3d="Euler-Depth",
metainfo=metainfo,
mode="grounding",
vg_file=val_vg_file,
num_text=10,
sep_token=sep_token,
)
],
)
train_dataloader = dict(
batch_size=1,
num_workers=4 if not DEBUG else 0,
persistent_workers=False,
sampler=dict(type="DefaultSampler", shuffle=True),
dataset=train_dataset,
)
val_dataloader = dict(
batch_size=1,
num_workers=4 if not DEBUG else 0,
persistent_workers=False,
drop_last=False,
sampler=dict(type="DefaultSampler", shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=val_ann_file,
pipeline=test_pipeline,
test_mode=True,
filter_empty_gt=True,
box_type_3d="Euler-Depth",
metainfo=metainfo,
mode="grounding",
vg_file=val_vg_file,
),
)
test_dataloader = dict(
batch_size=1,
num_workers=4 if not DEBUG else 0,
persistent_workers=False,
drop_last=False,
sampler=dict(type="DefaultSampler", shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=test_ann_file,
pipeline=[
dict(type="LoadAnnotations3D"),
dict(
type="MultiViewPipeline",
n_images=1,
max_n_images=50,
ordered=True,
transforms=[
dict(type="LoadImageFromFile", backend_args=backend_args),
dict(type="LoadDepthFromFile", backend_args=backend_args),
resize,
],
rotate_3rscan=rotate_3rscan,
),
dict(
type="Pack3DDetInputs",
keys=["img", "depth_img", "gt_bboxes_3d", "gt_labels_3d"],
),
],
test_mode=True,
filter_empty_gt=True,
box_type_3d="Euler-Depth",
metainfo=metainfo,
mode="grounding",
vg_file=test_vg_file,
),
)
val_evaluator = dict(
type="GroundingMetric",
collect_dir="/job_data/.dist_test" if not DEBUG else None,
)
test_evaluator = dict(
type="GroundingMetric",
collect_dir="/job_data/.dist_test" if not DEBUG else None,
format_only=True,
submit_info={
'method': 'BIP3D',
'team': 'robot-lab manipulation team',
'authors': ['xuewu lin'],
'e-mail': '878585984@qq.com',
'institution': 'Horizon',
'country': 'China',
},
result_dir="/job_data" if not DEBUG else "./",
)
max_epochs = 2
train_cfg = dict(
type="EpochBasedTrainLoop", max_epochs=max_epochs, val_interval=1
)
val_cfg = dict(type="ValLoop")
test_cfg = dict(type="TestLoop")
lr = 2e-4
optim_wrapper = dict(
type="OptimWrapper",
optimizer=dict(type="AdamW", lr=lr, weight_decay=0.0005),
paramwise_cfg=dict(
custom_keys={
"backbone.": dict(lr_mult=0.1),
# "text_encoder": dict(lr_mult=0.05),
"absolute_pos_embed": dict(decay_mult=0.0),
}
),
clip_grad=dict(max_norm=10, norm_type=2),
)
# learning rate
param_scheduler = [
dict(
type="LinearLR", start_factor=0.001, by_epoch=False, begin=0, end=500
),
dict(
type="MultiStepLR",
begin=0,
end=62000,
by_epoch=False,
milestones=[40000, 55000],
gamma=0.1,
),
]
custom_hooks = [dict(type="EmptyCacheHook", after_iter=False)]
default_hooks = dict(
checkpoint=dict(type="CheckpointHook", interval=1, max_keep_ckpts=3),
)
vis_backends = [
dict(
type="TensorboardVisBackend",
save_dir="/job_tboard" if not DEBUG else "./work-dir",
),
]
visualizer = dict(
type="Visualizer",
vis_backends=vis_backends,
name="visualizer",
)
load_from = "ckpt/bip3d_det.pth"
================================================
FILE: configs/bip3d_grounding_rgb.py
================================================
_base_ = ["./bip3d_grounding.py"]
model = dict(
backbone_3d=None,
neck_3d=None,
spatial_enhancer=dict(with_feature_3d=False),
decoder=dict(deformable_model=dict(with_depth=False)),
)
load_from = "ckpt/bip3d_det_rgb.pth"
================================================
FILE: configs/default_runtime.py
================================================
default_scope = "bip3d"
default_hooks = dict(
timer=dict(type="IterTimerHook"),
logger=dict(
type="LoggerHook",
interval=25,
),
param_scheduler=dict(type="ParamSchedulerHook"),
checkpoint=dict(type="CheckpointHook", interval=1, max_keep_ckpts=4),
sampler_seed=dict(type="DistSamplerSeedHook"),
)
# visualization=dict(type='Det3DVisualizationHook'))
env_cfg = dict(
cudnn_benchmark=False,
mp_cfg=dict(mp_start_method="fork", opencv_num_threads=0),
dist_cfg=dict(backend="nccl"),
)
log_processor = dict(type="LogProcessor", window_size=50, by_epoch=True)
log_level = "INFO"
load_from = None
resume = False
# TODO: support auto scaling lr
randomness = dict(seed=0)
find_unused_parameters = False
================================================
FILE: docs/quick_start.md
================================================
# Quick Start
### Set up python environment
```bash
virtualenv mm_bip3d --python=python3.8
source mm_bip3d/bin/activate
pip3 install --upgrade pip
bip3d_path="path/to/bip3d"
cd ${bip3d_path}
# MMCV is best installed from a pre-built wheel; wheel index url: https://download.openmmlab.com/mmcv/dist/cu{$cuda_version}/torch{$torch_version}/index.html
pip3 install -r requirements.txt
```
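For example, with the CUDA 11.8 / torch 2.1 pins from `requirements.txt`, the wheel index would look like the following (adjust `cu118` and `torch2.1` to your environment):
```bash
pip3 install mmcv==2.1.0 -f https://download.openmmlab.com/mmcv/dist/cu118/torch2.1/index.html
```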
### Compile the deformable_aggregation CUDA op
```bash
cd bip3d/ops
python3 setup.py develop
cd ../../
```
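A quick sanity check that the extension compiled and imports correctly (assuming the op module loads the compiled CUDA extension on import):
```bash
python3 -c "from bip3d.ops import deformable_aggregation; print('ok')"
```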
### Prepare the data
Download the [EmbodiedScan dataset](https://github.com/OpenRobotLab/EmbodiedScan) and create symbolic links.
```bash
cd ${bip3d_path}
mkdir data
ln -s path/to/embodiedscan ./data/embodiedscan
```
Download datasets [ScanNet](https://github.com/ScanNet/ScanNet), [3RScan](https://github.com/WaldJohannaU/3RScan), [Matterport3D](https://github.com/niessner/Matterport), and optionally download [ARKitScenes](https://github.com/apple/ARKitScenes). Adjust the data directory structure as follows:
```bash
${bip3d_path}
└──data
├──embodiedscan
│ ├──embodiedscan_infos_train.pkl
│ ├──embodiedscan_infos_val.pkl
│ ...
├──3rscan
│ ├──00d42bed-778d-2ac6-86a7-0e0e5f5f5660
│ ...
├──scannet
│ └──posed_images
├──matterport3d
│ ├──17DRP5sb8fy
│ ...
└──arkitscenes
├──Training
└──Validation
```
### Prepare pre-trained weights
Download the required Grounding-DINO pre-trained weights: [Swin-Tiny](https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/groundingdino_swint_ogc_mmdet-822d7e9d.pth) and [Swin-Base](https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/groundingdino_swinb_cogcoor_mmdet-55949c9c.pth).
```bash
mkdir ckpt
# Swin-Tiny
wget https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/groundingdino_swint_ogc_mmdet-822d7e9d.pth -O ckpt/groundingdino_swint_ogc_mmdet-822d7e9d.pth
python tools/ckpt_rename.py ckpt/groundingdino_swint_ogc_mmdet-822d7e9d.pth
# Swin-Base
wget https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/groundingdino_swinb_cogcoor_mmdet-55949c9c.pth -O ckpt/groundingdino_swinb_cogcoor_mmdet-55949c9c.pth
python tools/ckpt_rename.py ckpt/groundingdino_swinb_cogcoor_mmdet-55949c9c.pth
```
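The rename script is expected to write checkpoints with a `-rename` suffix (e.g., `ckpt/groundingdino_swint_ogc_mmdet-822d7e9d-rename.pth`), which matches the `load_from` path in `configs/bip3d_det.py`.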
Download the BERT config and pretrained weights from [Hugging Face](https://huggingface.co/google-bert/bert-base-uncased/tree/main).
```bash
${bip3d_path}
└──ckpt
├──groundingdino_swint_ogc_mmdet-822d7e9d.pth
├──groundingdino_swinb_cogcoor_mmdet-55949c9c.pth
└──bert-base-uncased
├──config.json
├──tokenizer_config.json
├──tokenizer.json
├──pytorch_model.bin
...
```
### Generate anchors by K-means
```bash
mkdir anchor_files
python3 tools/anchor_bbox3d_kmeans.py \
--ann_file data/embodiedscan/embodiedscan_infos_train.pkl \
--output_file anchor_files/embodiedscan_kmeans.npy
```
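The resulting anchor file holds the K-means cluster centers, one 9-DoF box per row with the size dimensions log-encoded (see `tools/anchor_bbox3d_kmeans.py`). A quick check of its shape:
```bash
python3 -c "import numpy as np; print(np.load('anchor_files/embodiedscan_kmeans.npy').shape)"
# expected: (100, 9)
```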
You can also download the anchor file we provide.
### Modify config
Modify config items as needed, such as [`DEBUG`](../configs/bip3d_det.py#L8), [`collect_dir`](../configs/bip3d_det.py#L425), [`save_dir`](../configs/bip3d_det.py#L475), and [`load_from`](../configs/bip3d_det.py#L485).
If performing multi-machine training, ensure that [`collect_dir`](../configs/bip3d_det.py#L425) is a shared folder accessible by all machines.
### Run local training and testing
```bash
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# train
bash engine.sh configs/xxx.py
# test
bash test.sh configs/xxx.py path/to/checkpoint
```
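For example, to train the detection model with its default config and then evaluate a checkpoint (the checkpoint path below is illustrative):
```bash
bash engine.sh configs/bip3d_det.py
bash test.sh configs/bip3d_det.py work-dirs/epoch_24.pth
```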
================================================
FILE: engine.sh
================================================
export PYTHONPATH=./:$PYTHONPATH
CONFIG=configs/bip3d_det.py
if ! [[ -z $1 ]]; then
echo $1
CONFIG=$1
fi
nvcc -V
which nvcc
gpus=(${CUDA_VISIBLE_DEVICES//,/ })
gpu_num=${#gpus[@]}
echo "number of gpus: "${gpu_num}
echo $CONFIG
if [ ${gpu_num} -gt 1 ]
then
bash ./tools/dist_train.sh \
${CONFIG} \
${gpu_num} \
--work-dir=work-dirs
else
python ./tools/train.py \
${CONFIG} \
--work-dir ./work-dir
fi
================================================
FILE: requirements.txt
================================================
einops==0.8.0
huggingface-hub==0.26.5
mmcv==2.1.0
mmdet==3.3.0
mmengine==0.10.4
ninja==1.11.1.3
numpy==1.26.4
opencv-python==4.10.0.84
pytorch3d==0.7.5
requests==2.32.3
scipy==1.14.1
tensorboard==2.18.0
timm==1.0.3
tokenizers==0.21.0
torch==2.1.0+cu118
torchaudio==2.1.0+cu118
torchmetrics==1.6.1
torchvision==0.16.0+cu118
================================================
FILE: test.sh
================================================
export PYTHONPATH=./:$PYTHONPATH
CONFIG=$1
CKPT=$2
nvcc -V
which nvcc
gpus=(${CUDA_VISIBLE_DEVICES//,/ })
gpu_num=${#gpus[@]}
echo "number of gpus: "${gpu_num}
echo "ckeckpoint: "$CKPT
echo $CONFIG
if [ ${gpu_num} -gt 1 ]
then
bash ./tools/dist_test.sh \
${CONFIG} \
$CKPT \
$gpu_num \
--work-dir ./work-dir
else
python3 tools/test.py \
$CONFIG \
$CKPT \
--work-dir ./work-dir
fi
================================================
FILE: tools/anchor_bbox3d_kmeans.py
================================================
import torch
import pickle
import tqdm
from sklearn.cluster import KMeans
import numpy as np
from bip3d.structures.bbox_3d import EulerDepthInstance3DBoxes
def sample(ids, n):
if n == len(ids):
return np.copy(ids)
elif n > len(ids):
return np.concatenate(
[ids, sample(ids, n - len(ids))]
)
else:
interval = len(ids) / n
output = []
for i in range(n):
output.append(ids[int(interval*i)])
return np.array(output)
def kmeans(
ann_file,
output_file,
z_min=-0.2,
z_max=3,
):
ann = pickle.load(open(ann_file, "rb"))
all_cam_bbox = []
for x in tqdm.tqdm(ann["data_list"]):
bbox = np.array([y["bbox_3d"] for y in x["instances"]])
ids = np.arange(len(x["images"]))
ids = sample(ids, 50)
for idx in ids:
image = x["images"][idx]
global2cam = np.linalg.inv(x['axis_align_matrix'] @ image['cam2global'])
_bbox = EulerDepthInstance3DBoxes(np.copy(bbox[image["visible_instance_ids"]]))
mask = torch.logical_and(
_bbox.tensor[:, 2] > z_min,
_bbox.tensor[:, 2] < z_max
)
_bbox = _bbox[mask]
_bbox.transform(global2cam)
all_cam_bbox.append(_bbox.tensor.numpy())
all_cam_bbox = np.concatenate(all_cam_bbox)
print("running k-means, please wait")
cluster_cam = KMeans(n_clusters=100).fit(all_cam_bbox).cluster_centers_
# store the box sizes (dims 3:6) in log scale
cluster_cam[:, 3:6] = np.log(cluster_cam[:, 3:6])
np.save(output_file, cluster_cam)
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(
description='anchor bbox3d kmeans for embodiedscan dataset')
parser.add_argument("ann_file")
parser.add_argument("--output_file")
parser.add_argument("--z_min", defaule=-0.2)
parser.add_argument("--z_max", defaule=3)
args = parser.parse_args()
kmeans(args.ann_file, args.output_file, args.z_min, args.z_max)
================================================
FILE: tools/benchmark.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import os
from mmengine import MMLogger
from mmengine.config import Config, DictAction
from mmengine.dist import init_dist
from mmengine.registry import init_default_scope
from mmengine.utils import mkdir_or_exist
from mmdet.utils.benchmark import (DataLoaderBenchmark, DatasetBenchmark,
InferenceBenchmark)
from bip3d import *  # register BIP3D modules (the package is bip3d, not embodiedscan)
def parse_args():
parser = argparse.ArgumentParser(description='MMDet benchmark')
parser.add_argument('config', help='test config file path')
parser.add_argument('--checkpoint', help='checkpoint file')
parser.add_argument(
'--task',
choices=['inference', 'dataloader', 'dataset'],
default='dataloader',
help='Which task to benchmark')
parser.add_argument(
'--repeat-num',
type=int,
default=1,
help='number of repeat times of measurement for averaging the results')
parser.add_argument(
'--max-iter', type=int, default=2000, help='num of max iter')
parser.add_argument(
'--log-interval', type=int, default=50, help='interval of logging')
parser.add_argument(
'--num-warmup', type=int, default=5, help='Number of warmup')
parser.add_argument(
'--fuse-conv-bn',
action='store_true',
help='Whether to fuse conv and bn; this will slightly increase '
'the inference speed')
parser.add_argument(
'--dataset-type',
choices=['train', 'val', 'test'],
default='test',
help='Benchmark dataset type; only train, val and test are supported')
parser.add_argument(
'--work-dir',
help='the directory to save the file containing '
'benchmark metrics')
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. If the value to '
'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
'Note that the quotation marks are necessary and that no white space '
'is allowed.')
parser.add_argument(
'--launcher',
choices=['none', 'pytorch', 'slurm', 'mpi'],
default='none',
help='job launcher')
parser.add_argument('--local_rank', type=int, default=0)
args = parser.parse_args()
if 'LOCAL_RANK' not in os.environ:
os.environ['LOCAL_RANK'] = str(args.local_rank)
return args
def inference_benchmark(args, cfg, distributed, logger):
benchmark = InferenceBenchmark(
cfg,
args.checkpoint,
distributed,
args.fuse_conv_bn,
args.max_iter,
args.log_interval,
args.num_warmup,
logger=logger)
return benchmark
def dataloader_benchmark(args, cfg, distributed, logger):
benchmark = DataLoaderBenchmark(
cfg,
distributed,
args.dataset_type,
args.max_iter,
args.log_interval,
args.num_warmup,
logger=logger)
return benchmark
def dataset_benchmark(args, cfg, distributed, logger):
benchmark = DatasetBenchmark(
cfg,
args.dataset_type,
args.max_iter,
args.log_interval,
args.num_warmup,
logger=logger)
return benchmark
def main():
args = parse_args()
cfg = Config.fromfile(args.config)
if args.cfg_options is not None:
cfg.merge_from_dict(args.cfg_options)
init_default_scope(cfg.get('default_scope', 'mmdet'))
distributed = False
if args.launcher != 'none':
init_dist(args.launcher, **cfg.get('env_cfg', {}).get('dist_cfg', {}))
distributed = True
log_file = None
if args.work_dir:
log_file = os.path.join(args.work_dir, 'benchmark.log')
mkdir_or_exist(args.work_dir)
logger = MMLogger.get_instance(
'mmdet', log_file=log_file, log_level='INFO')
benchmark_builders = {
'inference': inference_benchmark,
'dataloader': dataloader_benchmark,
'dataset': dataset_benchmark,
}
benchmark = benchmark_builders[args.task](args, cfg, distributed, logger)
benchmark.run(args.repeat_num)
if __name__ == '__main__':
main()
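# Example invocations (paths are illustrative, not shipped with the repo):
#   python3 tools/benchmark.py configs/bip3d_det.py --task dataloader --dataset-type val
#   python3 tools/benchmark.py configs/bip3d_det.py --checkpoint ckpt/bip3d.pth --task inference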
================================================
FILE: tools/ckpt_rename.py
================================================
import os
import torch
def get_renamed_ckpt(file, output="./"):
"""Remap mmdet Grounding-DINO checkpoint keys to BIP3D module names."""
ckpt_rename = dict()
ckpt = torch.load(file)
if "state_dict" in ckpt:
ckpt = ckpt["state_dict"]
for key, value in ckpt.items():
k = None
if key.startswith("backbone") or key.startswith("neck"):
k = key
elif key.startswith("language_model."):
k = key.replace("language_model.", "text_encoder.")
elif key.startswith("encoder."):
if key.startswith("encoder.layers."):
k = key.replace("encoder.layers.", "feature_enhancer.img_attn_blocks.")
elif key.startswith("encoder.fusion_layers."):
k = key.replace("encoder.fusion_layers.", "feature_enhancer.text_img_attn_blocks.")
elif key.startswith("encoder.text_layers."):
k = key.replace("encoder.text_layers.", "feature_enhancer.text_attn_blocks.")
elif key.startswith("decoder.layers."):
ops = [
"gnn",
"norm",
"text_cross_attn",
"norm",
"deformable",
"norm",
"ffn",
"norm",
"refine",
]
layer_id = int(key.split(".")[2])
name = key.split(".")[3]
for op in ops:
if name == "self_attn":
block_id = 0
elif name == "cross_attn_text":
block_id = 2
elif name == "cross_attn":
block_id = 4
elif name == "ffn":
block_id = 6
elif name == "norms":
norm_id = int(key.split(".")[4])
block_id = (norm_id + 1) * 2 - 1
op_id = block_id + layer_id * len(ops)
k = f"decoder.layers.{op_id}."
if name == "norms":
k += ".".join(key.split(".")[5:])
else:
k += ".".join(key.split(".")[4:])
elif key.startswith("bbox_head."):
layer_id = int(key.split(".")[2])
# the refine head sits at index 8 within each 9-op decoder layer
op_id = 8 + layer_id * 9
if "reg_branches" in key:
k = f"decoder.layers.{op_id}." + ".".join(key.split(".")[3:])
elif "cls_branches" in key:
k = f"decoder.layers.{op_id}.bias"
elif "pts_prob_fc" in key:
k = "spatial_enhancer." + key
elif key in [
"pts_prob_pre_fc.weight",
"pts_prob_pre_fc.bias",
"pts_fc.weight",
"pts_fc.bias",
"fusion_fc.0.layers.0.0.weight",
"fusion_fc.0.layers.0.0.bias",
"fusion_fc.0.layers.1.weight",
"fusion_fc.0.layers.1.bias",
"fusion_fc.1.weight",
"fusion_fc.1.bias",
"fusion_norm.weight",
"fusion_norm.bias",
]:
k = "spatial_enhancer." + key
elif key == "level_embed":
k = "feature_enhancer.level_embed"
elif key == "query_embedding.weight":
k = "decoder.instance_bank.instance_feature"
if k is None:
# print(key)
k = key
if k == "decoder.norm.weight":
print(key)
ckpt_rename[k] = value
path, file_name = os.path.split(file)
file_name = file_name[:-4]+"-rename.pth"
output_file = os.path.join(output, file_name)
torch.save(ckpt_rename, output_file)
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(
description='mmdet Grounding-DINO checkpoint rename to BIP3D')
parser.add_argument("file")
parser.add_argument("--output", default=None)
args = parser.parse_args()
if args.output is None:
output = "./"
get_renamed_ckpt(args.file, output)
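# Example (hypothetical checkpoint name):
#   python3 tools/ckpt_rename.py grounding_dino_swin-t.pth --output ckpt/
# writes ckpt/grounding_dino_swin-t-rename.pth with keys remapped for BIP3D.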
================================================
FILE: tools/dist_test.sh
================================================
#!/usr/bin/env bash
CONFIG=$1
CHECKPOINT=$2
GPUS=$3
PORT=${PORT:-11000}
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
python3 -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
$(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4}
================================================
FILE: tools/dist_train.sh
================================================
#!/usr/bin/env bash
CONFIG=$1
GPUS=$2
PORT=${PORT:-12050}
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
python3 -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
$(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3}
================================================
FILE: tools/test.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import time
import os
def wait_before_import_config():
# stagger the first mmengine import across local ranks so that all
# processes do not parse/compile the config machinery at the same time
t = int(os.environ.get('LOCAL_RANK', 0))
time.sleep(t * 0.5)
def wait_after_import_config():
# re-synchronize: ranks that imported early wait for the remaining ones
t = int(os.environ.get('WORLD_SIZE', 0)) - int(os.environ.get('LOCAL_RANK', 0))
time.sleep(t * 0.5)
wait_before_import_config()
from mmengine.config import Config
wait_after_import_config()
import argparse
import os.path as osp
from mmengine.config import Config, ConfigDict, DictAction
from mmengine.registry import RUNNERS
from mmengine.runner import Runner
from bip3d import *  # noqa: F401,F403 register BIP3D modules, as in tools/train.py
import torch
torch.multiprocessing.set_sharing_strategy('file_system')
# from embodiedscan.utils import replace_ceph_backend
# TODO: support fuse_conv_bn and format_only
def parse_args():
parser = argparse.ArgumentParser(
description='MMDet3D test (and eval) a model')
parser.add_argument('config', help='test config file path')
parser.add_argument('checkpoint', help='checkpoint file')
parser.add_argument(
'--work-dir',
help='the directory to save the file containing evaluation metrics')
parser.add_argument('--task-name', help='task names')
parser.add_argument('--ceph',
action='store_true',
help='Use ceph as data storage backend')
parser.add_argument('--show',
action='store_true',
help='show prediction results')
parser.add_argument('--show-dir',
help='directory where painted images will be saved. '
'If specified, it will be automatically saved '
'to the work_dir/timestamp/show_dir')
parser.add_argument('--score-thr',
type=float,
default=0.1,
help='bbox score threshold')
parser.add_argument(
'--task',
type=str,
choices=[
'mono_det', 'multi-view_det', 'lidar_det', 'lidar_seg',
'multi-modality_det'
],
help='Determine the visualization method depending on the task.')
parser.add_argument('--wait-time',
type=float,
default=2,
help='the interval of show (s)')
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. If the value to '
'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
'Note that the quotation marks are necessary and that no white space '
'is allowed.')
parser.add_argument('--launcher',
choices=['none', 'pytorch', 'slurm', 'mpi'],
default='none',
help='job launcher')
parser.add_argument('--tta',
action='store_true',
help='Test time augmentation')
# When using PyTorch version >= 2.0.0, the `torch.distributed.launch`
# will pass the `--local-rank` parameter to `tools/test.py` instead
# of `--local_rank`.
parser.add_argument('--local_rank', '--local-rank', type=int, default=0)
args = parser.parse_args()
if 'LOCAL_RANK' not in os.environ:
os.environ['LOCAL_RANK'] = str(args.local_rank)
return args
def trigger_visualization_hook(cfg, args):
default_hooks = cfg.default_hooks
if 'visualization' in default_hooks:
visualization_hook = default_hooks['visualization']
# Turn on visualization
visualization_hook['draw'] = True
if args.show:
visualization_hook['show'] = True
visualization_hook['wait_time'] = args.wait_time
if args.show_dir:
visualization_hook['test_out_dir'] = args.show_dir
all_task_choices = [
'mono_det', 'multi-view_det', 'lidar_det', 'lidar_seg',
'multi-modality_det'
]
assert args.task in all_task_choices, 'You must set '\
f"'--task' in {all_task_choices} in the command " \
'if you want to use visualization hook'
visualization_hook['vis_task'] = args.task
visualization_hook['score_thr'] = args.score_thr
else:
raise RuntimeError(
'VisualizationHook must be included in default_hooks.'
'refer to usage '
'"visualization=dict(type=\'VisualizationHook\')"')
return cfg
def main():
args = parse_args()
# load config
cfg = Config.fromfile(args.config)
# TODO: We will unify the ceph support approach with other OpenMMLab repos
# if args.ceph:
# cfg = replace_ceph_backend(cfg)
cfg.launcher = args.launcher
if args.cfg_options is not None:
cfg.merge_from_dict(args.cfg_options)
# work_dir is determined in this priority: CLI > segment in file > filename
if args.work_dir is not None:
# update configs according to CLI args if args.work_dir is not None
cfg.work_dir = args.work_dir
elif args.task_name is not None:
cfg.work_dir = osp.join('./work_dirs', args.task_name)
elif cfg.get('work_dir', None) is None:
# use config filename as default work_dir if cfg.work_dir is None
cfg.work_dir = osp.join('./work_dirs',
osp.splitext(osp.basename(args.config))[0])
cfg.load_from = args.checkpoint
if args.show or args.show_dir:
cfg = trigger_visualization_hook(cfg, args)
if args.tta:
# Currently, we only support tta for 3D segmentation
# TODO: Support tta for 3D detection
assert 'tta_model' in cfg, 'Cannot find ``tta_model`` in config.'
assert 'tta_pipeline' in cfg, 'Cannot find ``tta_pipeline`` in config.'
cfg.test_dataloader.dataset.pipeline = cfg.tta_pipeline
cfg.model = ConfigDict(**cfg.tta_model, module=cfg.model)
# build the runner from config
if 'runner_type' not in cfg:
# build the default runner
runner = Runner.from_cfg(cfg)
else:
# build customized runner from the registry
# if 'runner_type' is set in the cfg
runner = RUNNERS.build(cfg)
# start testing
runner.test()
if __name__ == '__main__':
main()
================================================
FILE: tools/train.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import time
import os
def wait_before_import_config():
# stagger the first mmengine import across local ranks so that all
# processes do not parse/compile the config machinery at the same time
t = int(os.environ.get('LOCAL_RANK', 0))
time.sleep(t * 0.5)
def wait_after_import_config():
# re-synchronize: ranks that imported early wait for the remaining ones
t = int(os.environ.get('WORLD_SIZE', 0)) - int(os.environ.get('LOCAL_RANK', 0))
time.sleep(t * 0.5)
wait_before_import_config()
from mmengine.config import Config
wait_after_import_config()
import argparse
import logging
import os.path as osp
from mmengine.config import Config, DictAction
from mmengine.logging import print_log
from mmengine.registry import RUNNERS
from mmengine.runner import Runner
from bip3d import *
import torch
# torch.autograd.set_detect_anomaly(True)
torch.multiprocessing.set_sharing_strategy('file_system')
def parse_args():
parser = argparse.ArgumentParser(description='Train a 3D detector')
parser.add_argument('config', help='train config file path')
parser.add_argument('--work-dir', help='the dir to save logs and models')
parser.add_argument('--task-name', help='task names')
parser.add_argument('--amp',
action='store_true',
default=False,
help='enable automatic-mixed-precision training')
parser.add_argument('--auto-scale-lr',
action='store_true',
help='enable automatically scaling LR.')
parser.add_argument(
'--resume',
nargs='?',
type=str,
const='auto',
help='If specify checkpoint path, resume from it, while if not '
'specify, try to auto resume from the latest checkpoint '
'in the work directory.')
parser.add_argument('--ceph',
action='store_true',
help='Use ceph as data storage backend')
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. If the value to '
'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
'Note that the quotation marks are necessary and that no white space '
'is allowed.')
parser.add_argument('--launcher',
choices=['none', 'pytorch', 'slurm', 'mpi'],
default='none',
help='job launcher')
# When using PyTorch version >= 2.0.0, the `torch.distributed.launch`
# will pass the `--local-rank` parameter to `tools/train.py` instead
# of `--local_rank`.
parser.add_argument('--local_rank', '--local-rank', type=int, default=0)
args = parser.parse_args()
if 'LOCAL_RANK' not in os.environ:
os.environ['LOCAL_RANK'] = str(args.local_rank)
return args
def main():
args = parse_args()
# load config
cfg = Config.fromfile(args.config)
# TODO: We will unify the ceph support approach with other OpenMMLab repos
# if args.ceph:
# cfg = replace_ceph_backend(cfg)
cfg.launcher = args.launcher
if args.cfg_options is not None:
cfg.merge_from_dict(args.cfg_options)
# work_dir is determined in this priority: CLI > segment in file > filename
if args.work_dir is not None:
# update configs according to CLI args if args.work_dir is not None
cfg.work_dir = args.work_dir
elif args.task_name is not None:
cfg.work_dir = osp.join('./work_dirs', args.task_name)
elif cfg.get('work_dir', None) is None:
# use config filename as default work_dir if cfg.work_dir is None
cfg.work_dir = osp.join('./work_dirs',
osp.splitext(osp.basename(args.config))[0])
# enable automatic-mixed-precision training
if args.amp is True:
optim_wrapper = cfg.optim_wrapper.type
if optim_wrapper == 'AmpOptimWrapper':
print_log('AMP training is already enabled in your config.',
logger='current',
level=logging.WARNING)
else:
assert optim_wrapper == 'OptimWrapper', (
'`--amp` is only supported when the optimizer wrapper type is '
f'`OptimWrapper` but got {optim_wrapper}.')
cfg.optim_wrapper.type = 'AmpOptimWrapper'
cfg.optim_wrapper.loss_scale = 'dynamic'
# enable automatically scaling LR
if args.auto_scale_lr:
if 'auto_scale_lr' in cfg and \
'enable' in cfg.auto_scale_lr and \
'base_batch_size' in cfg.auto_scale_lr:
cfg.auto_scale_lr.enable = True
else:
raise RuntimeError('Can not find "auto_scale_lr" or '
'"auto_scale_lr.enable" or '
'"auto_scale_lr.base_batch_size" in your'
' configuration file.')
# resume is determined in this priority: resume from > auto_resume
if args.resume == 'auto':
cfg.resume = True
cfg.load_from = None
elif args.resume is not None:
cfg.resume = True
cfg.load_from = args.resume
# build the runner from config
if 'runner_type' not in cfg:
# build the default runner
runner = Runner.from_cfg(cfg)
else:
# build customized runner from the registry
# if 'runner_type' is set in the cfg
runner = RUNNERS.build(cfg)
# start training
runner.train()
if __name__ == '__main__':
main()