Repository: HorizonRobotics/BIP3D Branch: main Commit: b1ed6a84faaf Files: 87 Total size: 582.4 KB Directory structure: gitextract_nz633hq0/ ├── .gitignore ├── LICENSE ├── README.md ├── bip3d/ │ ├── __init__.py │ ├── converter/ │ │ ├── extract_occupancy_ann.py │ │ ├── generate_image_3rscan.py │ │ ├── generate_image_matterport3d.py │ │ └── generate_image_scannet.py │ ├── datasets/ │ │ ├── __init__.py │ │ ├── embodiedscan_det_grounding_dataset.py │ │ ├── transforms/ │ │ │ ├── __init__.py │ │ │ ├── augmentation.py │ │ │ ├── formatting.py │ │ │ ├── loading.py │ │ │ ├── multiview.py │ │ │ └── transform.py │ │ └── utils.py │ ├── eval/ │ │ ├── __init__.py │ │ ├── indoor_eval.py │ │ └── metrics/ │ │ ├── __init__.py │ │ ├── det_metric.py │ │ └── grounding_metric.py │ ├── grid_mask.py │ ├── models/ │ │ ├── __init__.py │ │ ├── base_target.py │ │ ├── bbox3d_decoder.py │ │ ├── bert.py │ │ ├── data_preprocessors/ │ │ │ ├── __init__.py │ │ │ ├── custom_data_preprocessor.py │ │ │ └── utils.py │ │ ├── deformable_aggregation.py │ │ ├── feature_enhancer.py │ │ ├── instance_bank.py │ │ ├── spatial_enhancer.py │ │ ├── structure.py │ │ ├── target.py │ │ └── utils.py │ ├── ops/ │ │ ├── __init__.py │ │ ├── deformable_aggregation.py │ │ ├── gcc.sh │ │ ├── setup.py │ │ └── src/ │ │ ├── deformable_aggregation.cpp │ │ ├── deformable_aggregation_cuda.cu │ │ ├── deformable_aggregation_with_depth.cpp │ │ └── deformable_aggregation_with_depth_cuda.cu │ ├── registry.py │ ├── structures/ │ │ ├── __init__.py │ │ ├── bbox_3d/ │ │ │ ├── __init__.py │ │ │ ├── base_box3d.py │ │ │ ├── box_3d_mode.py │ │ │ ├── coord_3d_mode.py │ │ │ ├── euler_box3d.py │ │ │ ├── euler_depth_box3d.py │ │ │ └── utils.py │ │ ├── ops/ │ │ │ ├── __init__.py │ │ │ ├── box_np_ops.py │ │ │ ├── iou3d_calculator.py │ │ │ └── transforms.py │ │ └── points/ │ │ ├── __init__.py │ │ ├── base_points.py │ │ ├── cam_points.py │ │ ├── depth_points.py │ │ └── lidar_points.py │ └── utils/ │ ├── __init__.py │ ├── array_converter.py │ ├── 
default_color_map.py │ ├── dist_utils.py │ ├── line_mesh.py │ └── typing_config.py ├── configs/ │ ├── __init__.py │ ├── bip3d_det.py │ ├── bip3d_det_grounding.py │ ├── bip3d_det_rgb.py │ ├── bip3d_grounding.py │ ├── bip3d_grounding_rgb.py │ └── default_runtime.py ├── docs/ │ └── quick_start.md ├── engine.sh ├── requirements.txt ├── test.sh └── tools/ ├── anchor_bbox3d_kmeans.py ├── benchmark.py ├── ckpt_rename.py ├── dist_test.sh ├── dist_train.sh ├── test.py └── train.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST *.so # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
# However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don’t work, or not # install at all, so you might want to exclude Pipfile.lock from version control. Pipfile.lock # PEP 582; used by e.g. github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # VS Code .vscode/ ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2024 Horizon Robotics Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # BIP3D: Bridging 2D Images and 3D Perception for Embodied Intelligence
Xuewu Lin, Tianwei Lin, Lichao Huang, Hongyu Xie, Zhizhong Su
Code Homepage Hugging Face Paper
## :rocket: News **01/Jun/2025**: We have refactored and integrated the BIP3D code into [robo_orchard_lab](https://github.com/HorizonRobotics/robo_orchard_lab/tree/master/projects/bip3d_grounding), removing the dependency on the MM series. The environment is now easier to set up, and the performance is improved. Welcome to try it out! **14/Mar/2025**: Our code has been released. **27/Feb/2025**: Our paper has been accepted by CVPR 2025. **22/Nov/2024**: We released our paper on [arXiv](https://arxiv.org/abs/2411.14869). ## :open_book: Quick Start [Quick Start](docs/quick_start.md) ## :link: Framework
BIP3D

The Architecture Diagram of BIP3D, where the red stars indicate the parts that have been modified or added compared to the base model, GroundingDINO, and dashed lines indicate optional elements.

## :trophy: Results on EmbodiedScan Benchmark We made several improvements based on the original paper, achieving better 3D perception results. The main improvements include the following two points: 1. **New Fusion Operation**: We enhanced the decoder by replacing the deformable aggregation (DAG) with a 3D deformable attention mechanism (DAT). Specifically, we improved the feature sampling process by transitioning from bilinear interpolation to trilinear interpolation, which leverages depth distribution for more accurate feature extraction. 2. **Mixed Data Training**: To optimize the grounding model's performance, we adopted a mixed-data training strategy by integrating detection data with grounding data during the grounding finetuning process. ### 1. Results on Multi-view 3D Detection Validation Dataset Op DAG denotes deformable aggregation, and DAT denotes 3D deformable attention. Set [`with_depth=True`](configs/bip3d_det.py#L175) to activate the DAT. The metric in the table is `AP@0.25`. For more metrics, please refer to the logs. 
|Model | Inputs | Op | Overall | Head | Common | Tail | Small | Medium | Large | ScanNet | 3RScan | MP3D | ckpt | log | | :---- | :---: | :---: | :---: |:---: | :---: | :---: | :---:| :---:|:---:|:---: | :---: | :----: | :----: | :---: | |[BIP3D](configs/bip3d_det_rgb.py) | RGB | DAG | 16.57|23.29|13.84|12.29|2.67|17.85|12.89|19.71|26.76|8.50 | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/det_rgb_dag/model_checkpoint.pth) | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/det_rgb_dag/job.log) | |[BIP3D](configs/bip3d_det_rgb.py) | RGB | DAT | 16.67|22.41|14.19|13.18|3.32|17.25|14.89|20.80|24.18|9.91 | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/det_rgb_dat/model_checkpoint.pth) | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/det_rgb_dat/job.log) | |[BIP3D](configs/bip3d_det.py) |RGB-D | DAG | 22.53|28.89|20.51|17.83|6.95|24.21|15.46|24.77|35.29|10.34 | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/det_rgbd_dag/model_checkpoint.pth) | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/det_rgbd_dag/job.log) | |[BIP3D](configs/bip3d_det.py) |RGB-D | DAT | 23.24|31.51|20.20|17.62|7.31|24.09|15.82|26.35|36.29|11.44 | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/det_rgbd_dat/model_checkpoint.pth) | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/det_rgbd_dat/job.log) | ### 2. Results on Multi-view 3D Grounding Mini Dataset To train and validate with mini dataset, set [`data_version="v1-mini"`](configs/bip3d_grounding.py#L333). 
|Model | Inputs | Op | Overall | Easy | Hard | View-dep | View-indep | ScanNet | 3RScan | MP3D | ckpt | log | | :---- | :---: | :---: | :---: | :---: | :---:| :---:|:---:|:---: | :---: | :----: |:---: | :----: | |[BIP3D](configs/bip3d_grounding_rgb.py) | RGB | DAG | 44.00|44.39|39.56|46.05|42.92|48.62|42.47|36.40 | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_mini_rgb_dag/model_checkpoint.pth) | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_mini_rgb_dag/job.log) | |[BIP3D](configs/bip3d_grounding_rgb.py) | RGB | DAT | 44.43|44.74|41.02|45.17|44.04|49.70|41.81|37.28 | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_mini_rgb_dat/model_checkpoint.pth) | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_mini_rgb_dat/job.log) | |[BIP3D](configs/bip3d_grounding.py) | RGB-D | DAG | 45.79|46.22|40.91|45.93|45.71|48.94|46.61|37.36 | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_mini_rgbd_dag/model_checkpoint.pth) | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_mini_rgbd_dag/job.log) | |[BIP3D](configs/bip3d_grounding.py) | RGB-D | DAT | 58.47|59.02|52.23|60.20|57.56|66.63|54.79|46.72 | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_mini_rgbd_dat/model_checkpoint.pth) | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_mini_rgbd_dat/job.log) | ### 3. 
Results on Multi-view 3D Grounding Validation Dataset |Model | Inputs | Op | Mixed Data | Overall | Easy | Hard | View-dep | View-indep | ScanNet | 3RScan | MP3D | ckpt | log | | :---- | :---: | :---: | :---: |:---: | :---: | :---:| :---:|:---:|:---: | :---: | :----: |:---: | :----: | |[BIP3D](configs/bip3d_grounding_rgb.py) | RGB | DAG |No| 45.81|46.21|41.34|47.07|45.09|50.40|47.53|32.97 | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_rgb_dag/model_checkpoint.pth) | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_rgb_dag/job.log) | |[BIP3D](configs/bip3d_grounding_rgb.py) | RGB | DAT |No| 47.29|47.82|41.42|48.58|46.56|52.74|47.85|34.60 | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_rgb_dat/model_checkpoint.pth) | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_rgb_dat/job.log) | |[BIP3D](configs/bip3d_grounding.py) | RGB-D | DAG |No| 53.75|53.87|52.43|55.21|52.93|60.05|54.92|38.20 | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_rgbd_dag/model_checkpoint.pth) | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_rgbd_dag/job.log) | |[BIP3D](configs/bip3d_grounding.py) | RGB-D | DAT |No|61.36|61.88|55.58|62.43|60.76|66.96|62.75|46.92 | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_rgbd_dat/model_checkpoint.pth) | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_rgbd_dat/job.log) | |[BIP3D](configs/bip3d_det_grounding.py) | RGB-D | DAT |Yes|66.58|66.99|62.07|67.95|65.81|72.43|68.26|51.14 | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_rgbd_dat_mixdata/model_checkpoint.pth) | [link](https://huggingface.co/HorizonRobotics/BIP3D/blob/main/grounding_rgbd_dat_mixdata/job.log) | ### 4. 
[Results on Multi-view 3D Grounding Test Dataset](https://huggingface.co/spaces/AGC2024/visual-grounding-2024) |Model | Overall | Easy | Hard | View-dep | View-indep | ckpt | log | | :---- | :---: | :---: | :---: | :---: | :---:| :---:|:---:| |[EmbodiedScan](https://github.com/OpenRobotLab/EmbodiedScan) | 39.67 | 40.52 | 30.24 | 39.05 | 39.94 | - | - | |[SAG3D*](https://opendrivelab.github.io/Challenge%202024/multiview_Mi-Robot.pdf) | 46.92 | 47.72 | 38.03 | 46.31 | 47.18 | - | - | |[DenseG*](https://opendrivelab.github.io/Challenge%202024/multiview_THU-LenovoAI.pdf) | 59.59 | 60.39 | 50.81 | 60.50 | 59.20 | - | - | |[BIP3D](configs/bip3d_det_grounding.py) | 67.38 | 68.12 | 59.08 | 67.88 | 67.16 | - | - | |BIP3D-B | 70.53 | 71.22 | 62.91 | 70.69 | 70.47 | - | - | `*` denotes model ensemble, and note that our BIP3D does not use the ensemble trick. This differs from what is mentioned in the paper and shows significant improvements. Our best model, BIP3D-B, is based on GroundingDINO-base and is trained with the addition ARKitScenes dataset. 
def extract_occupancy(dataset, src, dst):
    """Copy the occupancy annotations of one dataset into the dataset root.

    For every scene found under ``src/dataset``, the matching scene folder
    under ``dst/dataset`` receives an ``occupancy`` directory, copied whole
    when absent or completed file-by-file when it already exists.
    """
    print('Processing dataset', dataset)
    src_dataset = os.path.join(src, dataset)
    dst_dataset = os.path.join(dst, dataset)
    if not os.path.exists(dst_dataset):
        print('Missing dataset:', dataset)
        return
    for scene in tqdm(os.listdir(src_dataset)):
        # ScanNet keeps its scenes one level deeper, under ``scans``.
        scene_root = (
            os.path.join(dst_dataset, 'scans', scene)
            if dataset == 'scannet'
            else os.path.join(dst_dataset, scene)
        )
        if not os.path.exists(scene_root):
            print(f'Missing scene {scene} in dataset {dataset}')
            continue
        src_scene = os.path.join(src_dataset, scene)
        occ_dir = os.path.join(scene_root, 'occupancy')
        if not os.path.exists(occ_dir):
            # No occupancy folder yet: copy the entire annotation tree.
            shutil.copytree(src_scene, occ_dir)
            continue
        # Folder already present: only fill in missing files.
        for name in os.listdir(src_scene):
            target = os.path.join(occ_dir, name)
            if not os.path.exists(target):
                shutil.copyfile(os.path.join(src_scene, name), target)
def process_scene(path, output_folder, scene_name):
    """Extract the ``sens.zip`` archive of a single Matterport3D scene.

    Args:
        path (str): Root folder of the raw dataset.
        output_folder (str): Folder that receives the extracted data.
        scene_name (str): Name of the scene directory under ``path``.

    The archive is unpacked into ``output_folder/scene_name/sens``.  Other
    files in the scene directory are ignored.  (The original loop filtered
    down to exactly ``sens.zip`` anyway, so its second branch extracting
    other archives to ``output_folder`` was unreachable dead code; the
    docstring also wrongly said "3Rscan".)
    """
    scene_dir = os.path.join(path, scene_name)
    archive = os.path.join(scene_dir, "sens.zip")
    # Scenes without a sens.zip are silently skipped, matching the original
    # behavior of iterating the directory and matching nothing.
    if not os.path.isfile(archive):
        return
    with zipfile.ZipFile(archive, 'r') as zip_ref:
        # file[:-4] in the original strips ".zip", i.e. the target is "sens".
        zip_ref.extractall(os.path.join(output_folder, scene_name, "sens"))
# Compression codes stored in the .sens header, mapped to readable names.
COMPRESSION_TYPE_COLOR = {-1: 'unknown', 0: 'raw', 1: 'png', 2: 'jpeg'}
COMPRESSION_TYPE_DEPTH = {
    -1: 'unknown',
    0: 'raw_ushort',
    1: 'zlib_ushort',
    2: 'occi_ushort'
}


class RGBDFrame:
    """A single RGB-D frame parsed from a ScanNet ``.sens`` stream."""

    def load(self, file_handle):
        """Read the pose, timestamps and compressed image payloads."""
        unpack = struct.unpack
        read = file_handle.read
        # Camera-to-world pose: 16 little-endian floats forming a 4x4 matrix.
        pose = unpack('f' * 16, read(16 * 4))
        self.camera_to_world = np.asarray(pose, dtype=np.float32).reshape(4, 4)
        self.timestamp_color = unpack('Q', read(8))[0]
        self.timestamp_depth = unpack('Q', read(8))[0]
        self.color_size_bytes = unpack('Q', read(8))[0]
        self.depth_size_bytes = unpack('Q', read(8))[0]
        # Payloads are kept compressed; decompress_* decode them on demand.
        self.color_data = b''.join(
            unpack('c' * self.color_size_bytes, read(self.color_size_bytes)))
        self.depth_data = b''.join(
            unpack('c' * self.depth_size_bytes, read(self.depth_size_bytes)))

    def decompress_depth(self, compression_type):
        """Return the raw depth buffer; only zlib compression is supported."""
        assert compression_type == 'zlib_ushort'
        return zlib.decompress(self.depth_data)

    def decompress_color(self, compression_type):
        """Decode the JPEG color payload into an image array."""
        assert compression_type == 'jpeg'
        return imageio.imread(self.color_data)
class SensorData:
    """Parser for a single ScanNet ``.sens`` scene file.

    A scene file bundles the sensor calibration and a sequence of
    compressed RGB-D frames; the ``export_*`` methods write the decoded
    data to disk.
    """

    def __init__(self, filename, fast=False):
        # Only version-4 .sens files are supported (asserted in load()).
        self.version = 4
        self.load(filename, fast)

    def load(self, filename, fast):
        """Load a single scene data with multiple RGBD frames.

        Args:
            filename (str): Path to the ``.sens`` file.
            fast (bool): If True, keep only every 10th frame.
        """
        with open(filename, 'rb') as f:
            version = struct.unpack('I', f.read(4))[0]
            assert self.version == version
            strlen = struct.unpack('Q', f.read(8))[0]
            self.sensor_name = b''.join(
                struct.unpack('c' * strlen, f.read(strlen)))
            # 4x4 calibration matrices, each stored as 16 floats.
            self.intrinsic_color = np.asarray(struct.unpack(
                'f' * 16, f.read(16 * 4)), dtype=np.float32).reshape(4, 4)
            self.extrinsic_color = np.asarray(struct.unpack(
                'f' * 16, f.read(16 * 4)), dtype=np.float32).reshape(4, 4)
            self.intrinsic_depth = np.asarray(struct.unpack(
                'f' * 16, f.read(16 * 4)), dtype=np.float32).reshape(4, 4)
            self.extrinsic_depth = np.asarray(struct.unpack(
                'f' * 16, f.read(16 * 4)), dtype=np.float32).reshape(4, 4)
            self.color_compression_type = COMPRESSION_TYPE_COLOR[
                struct.unpack('i', f.read(4))[0]]
            self.depth_compression_type = COMPRESSION_TYPE_DEPTH[
                struct.unpack('i', f.read(4))[0]]
            self.color_width = struct.unpack('I', f.read(4))[0]
            self.color_height = struct.unpack('I', f.read(4))[0]
            self.depth_width = struct.unpack('I', f.read(4))[0]
            self.depth_height = struct.unpack('I', f.read(4))[0]
            self.depth_shift = struct.unpack('f', f.read(4))[0]
            num_frames = struct.unpack('Q', f.read(8))[0]
            self.num_frames = num_frames
            self.frames = []
            # Fast mode subsamples every 10th frame; all frames must still
            # be read from the stream to keep the file offset consistent.
            if fast:
                index = list(range(num_frames))[::10]
            else:
                index = list(range(num_frames))
            self.index = index
            index_set = set(index)  # O(1) membership in the loop below
            for i in range(num_frames):
                frame = RGBDFrame()
                frame.load(f)
                if i in index_set:
                    self.frames.append(frame)

    def export_depth_images(self, output_path):
        """Export depth images (16-bit PNG) to the output path."""
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        for f in range(len(self.frames)):
            depth_data = self.frames[f].decompress_depth(
                self.depth_compression_type)
            # np.frombuffer replaces np.fromstring, which is deprecated for
            # binary input and removed in recent NumPy versions.
            depth = np.frombuffer(depth_data, dtype=np.uint16).reshape(
                self.depth_height, self.depth_width)
            imageio.imwrite(
                os.path.join(output_path,
                             self.index_to_str(self.index[f]) + '.png'),
                depth)

    def export_color_images(self, output_path):
        """Export RGB images to the output path."""
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        for f in range(len(self.frames)):
            color = self.frames[f].decompress_color(
                self.color_compression_type)
            imageio.imwrite(
                os.path.join(output_path,
                             self.index_to_str(self.index[f]) + '.jpg'),
                color)

    @staticmethod
    def index_to_str(index):
        """Convert the sample index to a zero-padded 5-digit string."""
        return str(index).zfill(5)

    @staticmethod
    def save_mat_to_file(matrix, filename):
        """Save a matrix to file, one row per line."""
        with open(filename, 'w') as f:
            for line in matrix:
                np.savetxt(f, line[np.newaxis], fmt='%f')

    def export_poses(self, output_path):
        """Export per-frame camera-to-world poses to the output path."""
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        for f in range(len(self.frames)):
            self.save_mat_to_file(
                self.frames[f].camera_to_world,
                os.path.join(output_path,
                             self.index_to_str(self.index[f]) + '.txt'))

    def export_intrinsics(self, output_path):
        """Export the color intrinsic matrix to the output path."""
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        self.save_mat_to_file(self.intrinsic_color,
                              os.path.join(output_path, 'intrinsic.txt'))

    def export_depth_intrinsics(self, output_path):
        """Export the depth intrinsic matrix to the output path."""
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        self.save_mat_to_file(
            self.intrinsic_depth,
            os.path.join(output_path, 'depth_intrinsic.txt'))
""" try: print(path, fast, idx) data = SensorData(os.path.join(path, idx, f'{idx}.sens'), fast) # output_path = os.path.join('/bucket/output/robot_lab/users/xuewu.lin/scannet_posed_images', idx) output_path = os.path.join('/horizon-bucket/robot_lab/users/xuewu.lin/scannet_posed_images', idx) data.export_color_images(output_path) data.export_intrinsics(output_path) data.export_poses(output_path) data.export_depth_images(output_path) data.export_depth_intrinsics(output_path) except Exception as e: print("bug info: ", path, fast, idx) raise e def process_directory(path, fast, nproc): """Process the files in a directory with parallel support.""" mmengine.track_parallel_progress(func=partial(process_scene, path, fast), # tasks=['scene0175_00'], tasks=['scene0288_01'], # tasks=['scene0288_01', 'scene0175_00'], # tasks=os.listdir(path), nproc=nproc) if __name__ == '__main__': parser = ArgumentParser() parser.add_argument('--dataset_folder', default=None, help='folder of the dataset.') parser.add_argument('--nproc', type=int, default=8) parser.add_argument('--fast', action='store_true') args = parser.parse_args() if args.dataset_folder is not None: os.chdir(args.dataset_folder) # process train and val scenes if os.path.exists('scans'): process_directory('scans', args.fast, args.nproc) ================================================ FILE: bip3d/datasets/__init__.py ================================================ from .embodiedscan_det_grounding_dataset import EmbodiedScanDetGroundingDataset from .transforms import * # noqa: F401,F403 __all__ = [ 'EmbodiedScanDetGroundingDataset' ] ================================================ FILE: bip3d/datasets/embodiedscan_det_grounding_dataset.py ================================================ import math import pickle import copy import tqdm import os import warnings from typing import Callable, List, Optional, Union import mmengine import numpy as np from mmengine.dataset import BaseDataset, force_full_init from mmengine.fileio 
import load from mmengine.logging import print_log from bip3d.registry import DATASETS from bip3d.structures import get_box_type from .utils import sample class_names = ( 'adhesive tape', 'air conditioner', 'alarm', 'album', 'arch', 'backpack', 'bag', 'balcony', 'ball', 'banister', 'bar', 'barricade', 'baseboard', 'basin', 'basket', 'bathtub', 'beam', 'beanbag', 'bed', 'bench', 'bicycle', 'bidet', 'bin', 'blackboard', 'blanket', 'blinds', 'board', 'body loofah', 'book', 'boots', 'bottle', 'bowl', 'box', 'bread', 'broom', 'brush', 'bucket', 'cabinet', 'calendar', 'camera', 'can', 'candle', 'candlestick', 'cap', 'car', 'carpet', 'cart', 'case', 'chair', 'chandelier', 'cleanser', 'clock', 'clothes', 'clothes dryer', 'coat hanger', 'coffee maker', 'coil', 'column', 'commode', 'computer', 'conducting wire', 'container', 'control', 'copier', 'cosmetics', 'couch', 'counter', 'countertop', 'crate', 'crib', 'cube', 'cup', 'curtain', 'cushion', 'decoration', 'desk', 'detergent', 'device', 'dish rack', 'dishwasher', 'dispenser', 'divider', 'door', 'door knob', 'doorframe', 'doorway', 'drawer', 'dress', 'dresser', 'drum', 'duct', 'dumbbell', 'dustpan', 'dvd', 'eraser', 'excercise equipment', 'fan', 'faucet', 'fence', 'file', 'fire extinguisher', 'fireplace', 'flowerpot', 'flush', 'folder', 'food', 'footstool', 'frame', 'fruit', 'furniture', 'garage door', 'garbage', 'glass', 'globe', 'glove', 'grab bar', 'grass', 'guitar', 'hair dryer', 'hamper', 'handle', 'hanger', 'hat', 'headboard', 'headphones', 'heater', 'helmets', 'holder', 'hook', 'humidifier', 'ironware', 'jacket', 'jalousie', 'jar', 'kettle', 'keyboard', 'kitchen island', 'kitchenware', 'knife', 'label', 'ladder', 'lamp', 'laptop', 'ledge', 'letter', 'light', 'luggage', 'machine', 'magazine', 'mailbox', 'map', 'mask', 'mat', 'mattress', 'menu', 'microwave', 'mirror', 'molding', 'monitor', 'mop', 'mouse', 'napkins', 'notebook', 'ottoman', 'oven', 'pack', 'package', 'pad', 'pan', 'panel', 'paper', 'paper cutter', 
'partition', 'pedestal', 'pen', 'person', 'piano', 'picture', 'pillar', 'pillow', 'pipe', 'pitcher', 'plant', 'plate', 'player', 'plug', 'plunger', 'pool', 'pool table', 'poster', 'pot', 'price tag', 'printer', 'projector', 'purse', 'rack', 'radiator', 'radio', 'rail', 'range hood', 'refrigerator', 'remote control', 'ridge', 'rod', 'roll', 'roof', 'rope', 'sack', 'salt', 'scale', 'scissors', 'screen', 'seasoning', 'shampoo', 'sheet', 'shelf', 'shirt', 'shoe', 'shovel', 'shower', 'sign', 'sink', 'soap', 'soap dish', 'soap dispenser', 'socket', 'speaker', 'sponge', 'spoon', 'stairs', 'stall', 'stand', 'stapler', 'statue', 'steps', 'stick', 'stool', 'stopcock', 'stove', 'structure', 'sunglasses', 'support', 'switch', 'table', 'tablet', 'teapot', 'telephone', 'thermostat', 'tissue', 'tissue box', 'toaster', 'toilet', 'toilet paper', 'toiletry', 'tool', 'toothbrush', 'toothpaste', 'towel', 'toy', 'tray', 'treadmill', 'trophy', 'tube', 'tv', 'umbrella', 'urn', 'utensil', 'vacuum cleaner', 'vanity', 'vase', 'vent', 'ventilation', 'wardrobe', 'washbasin', 'washing machine', 'water cooler', 'water heater', 'window', 'window frame', 'windowsill', 'wine', 'wire', 'wood', 'wrap') head_labels = [ 48, 177, 82, 179, 37, 243, 28, 277, 32, 84, 215, 145, 182, 170, 22, 72, 30, 141, 65, 257, 221, 225, 52, 75, 231, 158, 236, 156, 47, 74, 6, 18, 71, 242, 217, 251, 66, 263, 5, 45, 14, 73, 278, 198, 24, 23, 196, 252, 19, 135, 26, 229, 183, 200, 107, 272, 246, 269, 125, 59, 279, 15, 163, 258, 57, 195, 51, 88, 97, 58, 102, 36, 137, 31, 80, 160, 155, 61, 238, 96, 190, 25, 219, 152, 142, 201, 274, 249, 178, 192 ] common_labels = [ 189, 164, 101, 205, 273, 233, 131, 180, 86, 220, 67, 268, 224, 270, 53, 203, 237, 226, 10, 133, 248, 41, 55, 16, 199, 134, 99, 185, 2, 20, 234, 194, 253, 35, 174, 8, 223, 13, 91, 262, 230, 121, 49, 63, 119, 162, 79, 168, 245, 267, 122, 104, 100, 1, 176, 280, 140, 209, 259, 143, 165, 147, 117, 85, 105, 95, 109, 207, 68, 175, 106, 60, 4, 46, 171, 204, 111, 211, 108, 
120, 157, 222, 17, 264, 151, 98, 38, 261, 123, 78, 118, 127, 240, 124 ] tail_labels = [ 76, 149, 173, 250, 275, 255, 34, 77, 266, 283, 112, 115, 186, 136, 256, 40, 254, 172, 9, 212, 213, 181, 154, 94, 191, 193, 3, 130, 146, 70, 128, 167, 126, 81, 7, 11, 148, 228, 239, 247, 21, 42, 89, 153, 161, 244, 110, 0, 29, 114, 132, 159, 218, 232, 260, 56, 92, 116, 282, 33, 113, 138, 12, 188, 44, 150, 197, 271, 169, 206, 90, 235, 103, 281, 184, 208, 216, 202, 214, 241, 129, 210, 276, 64, 27, 87, 139, 227, 187, 62, 43, 50, 69, 93, 144, 166, 265, 54, 83, 39 ] @DATASETS.register_module() class EmbodiedScanDetGroundingDataset(BaseDataset): def __init__( self, data_root: str, ann_file: str, vg_file=None, metainfo: Optional[dict] = None, pipeline: List[Union[dict, Callable]] = [], test_mode: bool = False, load_eval_anns: bool = True, filter_empty_gt: bool = True, remove_dontcare: bool = False, box_type_3d: str = "Euler-Depth", dataset_length=None, mode="detection", max_n_images=50, n_images_per_sample=1, drop_last_per_scene=False, part=None, temporal=False, num_text=1, tokens_positive_rebuild=True, sep_token="[SEP]", **kwargs, ): self.box_type_3d, self.box_mode_3d = get_box_type(box_type_3d) self.filter_empty_gt = filter_empty_gt self.remove_dontcare = remove_dontcare self.load_eval_anns = load_eval_anns self.dataset_length = dataset_length self.part = part self.mode = mode assert self.mode in ["detection", "continuous", "grounding"] super().__init__( ann_file=ann_file, metainfo=metainfo, data_root=data_root, pipeline=pipeline, test_mode=test_mode, serialize_data=self.mode == "detection", **kwargs, ) if self.mode == "continuous": self.max_n_images = max_n_images self.n_images_per_sample = n_images_per_sample self.drop_last_per_scene = drop_last_per_scene self.convert_to_continuous() elif self.mode == "grounding": self.vg_file = vg_file self.num_text = num_text self.tokens_positive_rebuild = tokens_positive_rebuild self.sep_token = sep_token self.load_language_data() self.data_bytes, 
self.data_address = self._serialize_data() self.serialize_data = True print_log(f"dataset length : {self.__len__()}") def process_metainfo(self): assert "categories" in self._metainfo if "classes" not in self._metainfo: self._metainfo.setdefault( "classes", list(self._metainfo["categories"].keys()) ) self.label_mapping = np.full( max(list(self._metainfo["categories"].values())) + 1, -1, dtype=int ) for key, value in self._metainfo["categories"].items(): if key in self._metainfo["classes"]: self.label_mapping[value] = self._metainfo["classes"].index( key ) def parse_data_info(self, info: dict): info["box_type_3d"] = self.box_type_3d info["axis_align_matrix"] = self._get_axis_align_matrix(info) info["scan_id"] = info["sample_idx"] ann_dataset = info["sample_idx"].split("/")[0] if ann_dataset == "matterport3d": info["depth_shift"] = 4000.0 else: info["depth_shift"] = 1000.0 # Because multi-view settings are different from original designs # we temporarily follow the ori design in ImVoxelNet info["img_path"] = [] info["depth_img_path"] = [] if "cam2img" in info: cam2img = info["cam2img"].astype(np.float32) else: cam2img = [] extrinsics = [] for i in range(len(info["images"])): img_path = os.path.join( self.data_prefix.get("img_path", ""), info["images"][i]["img_path"], ) depth_img_path = os.path.join( self.data_prefix.get("img_path", ""), info["images"][i]["depth_path"], ) info["img_path"].append(img_path) info["depth_img_path"].append(depth_img_path) align_global2cam = np.linalg.inv( info["axis_align_matrix"] @ info["images"][i]["cam2global"] ) extrinsics.append(align_global2cam.astype(np.float32)) if "cam2img" not in info: cam2img.append(info["images"][i]["cam2img"].astype(np.float32)) info["depth2img"] = dict( extrinsic=extrinsics, intrinsic=cam2img, origin=np.array([0.0, 0.0, 0.5]).astype(np.float32), ) if "depth_cam2img" not in info: info["depth_cam2img"] = cam2img if not self.test_mode: info["ann_info"] = self.parse_ann_info(info) if self.test_mode and 
self.load_eval_anns: info["ann_info"] = self.parse_ann_info(info) info["eval_ann_info"] = self._remove_dontcare(info["ann_info"]) return info def parse_ann_info(self, info: dict): """Process the `instances` in data info to `ann_info`. Args: info (dict): Info dict. Returns: dict: Processed `ann_info`. """ ann_info = None if "instances" in info and len(info["instances"]) > 0: ann_info = dict( gt_bboxes_3d=np.zeros( (len(info["instances"]), 9), dtype=np.float32 ), gt_labels_3d=np.zeros( (len(info["instances"]),), dtype=np.int64 ), gt_names=[], bbox_id=np.zeros((len(info["instances"]),), dtype=np.int64) - 1, ) for idx, instance in enumerate(info["instances"]): ann_info["gt_bboxes_3d"][idx] = instance["bbox_3d"] ann_info["gt_labels_3d"][idx] = self.label_mapping[ instance["bbox_label_3d"] ] ann_info["gt_names"].append( self._metainfo["classes"][ann_info["gt_labels_3d"][idx]] if ann_info["gt_labels_3d"][idx] >= 0 else "others" ) ann_info["bbox_id"][idx] = instance["bbox_id"] # pack ann_info for return if ann_info is None: ann_info = dict() ann_info["gt_bboxes_3d"] = np.zeros((0, 9), dtype=np.float32) ann_info["gt_labels_3d"] = np.zeros((0,), dtype=np.int64) ann_info["bbox_id"] = np.zeros((0,), dtype=np.int64) - 1 ann_info["gt_names"] = [] # post-processing/filtering ann_info if not empty gt if "visible_instance_ids" in info["images"][0]: ids = [] for i in range(len(info["images"])): ids.append(info["images"][i]["visible_instance_ids"]) mask_length = ann_info["gt_labels_3d"].shape[0] ann_info["visible_instance_masks"] = self._ids2masks( ids, mask_length ) if self.remove_dontcare: ann_info = self._remove_dontcare(ann_info) ann_dataset = info["sample_idx"].split("/")[0] ann_info["gt_bboxes_3d"] = self.box_type_3d( ann_info["gt_bboxes_3d"], box_dim=ann_info["gt_bboxes_3d"].shape[-1], with_yaw=True, origin=(0.5, 0.5, 0.5), ) return ann_info @staticmethod def _get_axis_align_matrix(info: dict): """Get axis_align_matrix from info. If not exist, return identity mat. 
Args: info (dict): Info of a single sample data. Returns: np.ndarray: 4x4 transformation matrix. """ if "axis_align_matrix" in info: return np.array(info["axis_align_matrix"]) else: warnings.warn( "axis_align_matrix is not found in ScanNet data info, please " "use new pre-process scripts to re-generate ScanNet data" ) return np.eye(4).astype(np.float32) def _ids2masks(self, ids, mask_length): """Change visible_instance_ids to visible_instance_masks.""" masks = [] for idx in range(len(ids)): mask = np.zeros((mask_length,), dtype=bool) mask[ids[idx]] = 1 masks.append(mask) return masks def _remove_dontcare(self, ann_info: dict): """Remove annotations that do not need to be cared. -1 indicates dontcare in MMDet3d. Args: ann_info (dict): Dict of annotation infos. The instance with label `-1` will be removed. Returns: dict: Annotations after filtering. """ img_filtered_annotations = {} filter_mask = ann_info["gt_labels_3d"] > -1 for key in ann_info.keys(): if key == "instances": img_filtered_annotations[key] = ann_info[key] elif key == "visible_instance_masks": img_filtered_annotations[key] = [] for idx in range(len(ann_info[key])): img_filtered_annotations[key].append( ann_info[key][idx][filter_mask] ) elif key == "gt_names": img_filtered_annotations[key] = [ x for i, x in enumerate(ann_info[key]) if filter_mask[i] ] else: img_filtered_annotations[key] = ann_info[key][filter_mask] return img_filtered_annotations def load_data_list(self): annotations = load(self.ann_file) if not isinstance(annotations, dict): raise TypeError( f"The annotations loaded from annotation file " f"should be a dict, but got {type(annotations)}!" 
) if "data_list" not in annotations or "metainfo" not in annotations: raise ValueError( "Annotation must have data_list and metainfo " "keys" ) metainfo = annotations["metainfo"] raw_data_list = annotations["data_list"] # Meta information load from annotation file will not influence the # existed meta information load from `BaseDataset.METAINFO` and # `metainfo` arguments defined in constructor. for k, v in metainfo.items(): self._metainfo.setdefault(k, v) self.process_metainfo() # load and parse data_infos. data_list = [] for raw_data_info in tqdm.tqdm( raw_data_list, mininterval=10, desc=f"Loading {'Test' if self.test_mode else 'Train'} dataset", ): if self.part is not None: valid = False for x in self.part: if x in raw_data_info["sample_idx"]: valid = True break if not valid: continue data_info = self.parse_data_info(raw_data_info) if data_info is None: continue assert isinstance(data_info, dict) data_list.append(data_info) if ( self.dataset_length is not None and len(data_list) >= self.dataset_length ): break return data_list @staticmethod def _get_axis_align_matrix(info: dict): if 'axis_align_matrix' in info: return np.array(info['axis_align_matrix']) else: warnings.warn( 'axis_align_matrix is not found in ScanNet data info, please ' 'use new pre-process scripts to re-generate ScanNet data') return np.eye(4).astype(np.float32) @staticmethod def _is_view_dep(text): """Check whether to augment based on sr3d utterance.""" rels = [ 'front', 'behind', 'back', 'left', 'right', 'facing', 'leftmost', 'rightmost', 'looking', 'across' ] words = set(text.split()) return any(rel in words for rel in rels) def convert_info_to_scan(self): self.scans = dict() for data in self.data_list: scan_id = data['scan_id'] self.scans[scan_id] = data self.scan_ids = list(self.scans.keys()) def load_language_data(self): self.convert_info_to_scan() if isinstance(self.vg_file, str): language_annotations = load(os.path.join(self.data_root, self.vg_file)) else: language_annotations = [] for x 
in self.vg_file: language_annotations.extend(load(os.path.join(self.data_root, x))) if self.dataset_length is not None: interval = len(language_annotations) / self.dataset_length output = [] for i in range(self.dataset_length): output.append(ids[int(interval*i)]) language_annotations = output self.data_list = language_annotations self.scan_id_to_data_idx = {} for scan_id in self.scan_ids: self.scan_id_to_data_idx[scan_id] = [] for i, d in enumerate(self.data_list): self.scan_id_to_data_idx[d["scan_id"]].append(i) def convert_to_continuous(self): self.convert_info_to_scan() data_list = [] self.flag = [] self.image_id_dict = {} for flag, scan_id in enumerate(self.scan_ids): total_n = len(self.scans[scan_id]["images"]) sample_n = min(self.max_n_images, total_n) ids = sample(total_n, sample_n, True).tolist() self.image_id_dict[scan_id] = ids if self.n_images_per_sample > 1: if self.drop_last_per_scene: sample_n = math.floor(sample_n / self.n_images_per_sample) else: sample_n = math.ceil(sample_n / self.n_images_per_sample) ids = [ ids[i*self.n_images_per_sample : (i+1)*self.n_images_per_sample] for i in range(sample_n) ] data_list.extend( [dict(scan_id=scan_id, image_id=i) for i in ids] ) self.flag.extend([flag] * len(ids)) self.data_list = data_list self.flag = np.array(self.flag) def get_data_info_continuous(self, data_info): scan_id = data_info["scan_id"] data = copy.deepcopy(self.scans[scan_id]) img_idx = data_info["image_id"] if isinstance(img_idx, int): img_idx = [img_idx] for k in ["images", "img_path", "depth_img_path"]: data[k] = index(data[k], img_idx) seen_image_id = self.image_id_dict[scan_id] seen_image_id = seen_image_id[:seen_image_id.index(img_idx[0])] if len(seen_image_id) != 0: last_visible_mask = index( data["ann_info"]["visible_instance_masks"], seen_image_id ) last_visible_mask = np.stack(last_visible_mask).any(axis=0) else: last_visible_mask = data["ann_info"]["visible_instance_masks"][0] * False visible_instance_masks = index( 
data["ann_info"]["visible_instance_masks"], img_idx ) current_visible_mask = np.stack(visible_instance_masks).any(axis=0) ignore_mask = np.logical_and( last_visible_mask, ~current_visible_mask ) all_visible_mask = np.logical_or( last_visible_mask, current_visible_mask, ) data["visible_instance_masks"] = visible_instance_masks data["all_visible_mask"] = all_visible_mask data["ignore_mask"] = ignore_mask data["depth2img"]["extrinsic"] = index( data["depth2img"]["extrinsic"], img_idx ) if isinstance(data["depth2img"]["intrinsic"], list): data["depth2img"]["intrinsic"] = index( data["depth2img"]["intrinsic"], img_idx ) return data def merge_grounding_data(self, data_infos): output = dict( text="", ) for key in ["target_id", "distractor_ids", "target", "anchors", "anchor_ids", "tokens_positive"]: if key in data_infos[0]: output[key] = [] for data_info in data_infos: if "target_id" in data_info and data_info["target_id"] in output["target_id"]: continue if self.tokens_positive_rebuild and "target" in data_info: start_idx = data_info["text"].find(data_info["target"]) end_idx = start_idx + len(data_info["target"]) tokens_positive = [[start_idx, end_idx]] elif "tokens_positive" in data_info: tokens_positive = data_info["tokens_positive"] else: tokens_positive = None if len(output["text"]) == 0: output["text"] = data_info["text"] else: if tokens_positive is not None: tokens_positive = np.array(tokens_positive) tokens_positive += len(output["text"]) + len(self.sep_token) tokens_positive = tokens_positive.tolist() output["text"] += self.sep_token + data_info["text"] if tokens_positive is not None: output["tokens_positive"].append(tokens_positive) for k in ["target_id", "distractor_ids", "target", "anchors", "anchor_ids"]: if k not in data_info: continue output[k].append(data_info[k]) output["scan_id"] = data_infos[0]["scan_id"] return output def get_data_info_grounding(self, data_info): flags = {} if "distractor_ids" in data_info: flags["is_unique"] = 
len(data_info["distractor_ids"]) == 0 flags["is_hard"] = len(data_info["distractor_ids"]) > 3 if "text" in data_info: flags["is_view_dep"] = self._is_view_dep(data_info["text"]) scan_id = data_info["scan_id"] scan_data = copy.deepcopy(self.scans[scan_id]) data_info = [data_info] if self.num_text > 1: data_idx = self.scan_id_to_data_idx[scan_id] sample_idx = sample( len(data_idx), max(min(int(np.random.rand()*self.num_text), len(data_idx))-1, 1), fix_interval=False ) for i in sample_idx: data_info.append(super().get_data_info(data_idx[i])) data_info = self.merge_grounding_data(data_info) scan_data["text"] = data_info["text"] if "ann_info" in scan_data and "target" in data_info: tokens_positive = [] obj_idx = [] for i, (target_name, id) in enumerate( zip(data_info["target"], data_info["target_id"]) ): mask = np.logical_and( scan_data["ann_info"]["bbox_id"] == id, np.array(scan_data["ann_info"]["gt_names"]) == target_name ) if np.sum(mask) != 1: continue obj_idx.append(np.where(mask)[0][0]) tokens_positive.append(data_info["tokens_positive"][i]) obj_idx = np.array(obj_idx, dtype=np.int32) scan_data["ann_info"]["gt_bboxes_3d"] = scan_data["ann_info"]["gt_bboxes_3d"][obj_idx] scan_data["ann_info"]["gt_labels_3d"] = scan_data["ann_info"]["gt_labels_3d"][obj_idx] scan_data["ann_info"]["gt_names"] = [ scan_data["ann_info"]["gt_names"][i] for i in obj_idx ] if "visible_instance_masks" in scan_data["ann_info"]: scan_data["ann_info"]["visible_instance_masks"] = [ visible_instance_mask[obj_idx] for visible_instance_mask in scan_data["ann_info"]["visible_instance_masks"] ] scan_data["tokens_positive"] = tokens_positive scan_data["eval_ann_info"] = scan_data["ann_info"] scan_data["eval_ann_info"].update(flags) elif "tokens_positive" in data_info: scan_data["tokens_positive"] = data_info.get("tokens_positive") return scan_data @force_full_init def get_data_info(self, idx): data_info = super().get_data_info(idx) if self.mode == "detection": return data_info elif self.mode == 
"continuous": return self.get_data_info_continuous(data_info) elif self.mode == "grounding": return self.get_data_info_grounding(data_info) def index(input, idx): if isinstance(idx, int): idx = [idx] output = [] for i in idx: output.append(input[i]) return output ================================================ FILE: bip3d/datasets/transforms/__init__.py ================================================ from .augmentation import ( GlobalRotScaleTrans, RandomFlip3D, ResizeCropFlipImage, ) from .formatting import Pack3DDetInputs from .loading import LoadAnnotations3D, LoadDepthFromFile from .multiview import MultiViewPipeline from .transform import ( CategoryGroundingDataPrepare, CamIntrisicStandardization, CustomResize, DepthProbLabelGenerator, ) ================================================ FILE: bip3d/datasets/transforms/augmentation.py ================================================ from typing import List, Union import cv2 import numpy as np from mmcv.transforms import BaseTransform from mmdet.datasets.transforms import RandomFlip from bip3d.registry import TRANSFORMS @TRANSFORMS.register_module() class RandomFlip3D(RandomFlip): """Flip the points & bbox. If the input dict contains the key "flip", then the flag will be used, otherwise it will be randomly decided by a ratio specified in the init method. Required Keys: - points (np.float32) - gt_bboxes_3d (np.float32) Modified Keys: - points (np.float32) - gt_bboxes_3d (np.float32) Added Keys: - points (np.float32) - pcd_trans (np.float32) - pcd_rotation (np.float32) - pcd_rotation_angle (np.float32) - pcd_scale_factor (np.float32) Args: sync_2d (bool): Whether to apply flip according to the 2D images. If True, it will apply the same flip as that to 2D images. If False, it will decide whether to flip randomly and independently to that of 2D images. Defaults to True. flip_2d (bool): Whether to apply flip for the img data. If True, it will adopt the flip augmentation for the img. 
False occurs on bev augmentation for bev-based image 3d det. Defaults to True. flip_3d (bool): Whether to apply flip for the 3d point cloud data. If True, it will adopt the flip augmentation for the point cloud. Defaults to True. flip_ratio_bev_horizontal (float): The flipping probability in horizontal direction. Defaults to 0.0. flip_ratio_bev_vertical (float): The flipping probability in vertical direction. Defaults to 0.0. flip_box3d (bool): Whether to flip bounding box. In most of the case, the box should be fliped. In cam-based bev detection, this is set to False, since the flip of 2D images does not influence the 3D box. Defaults to True. """ def __init__(self, sync_2d: bool = True, flip_2d: bool = True, flip_3d: bool = True, flip_ratio_bev_horizontal: float = 0.0, flip_ratio_bev_vertical: float = 0.0, flip_box3d: bool = True, update_lidar2cam: bool = False, **kwargs) -> None: # `flip_ratio_bev_horizontal` is equal to # for flip prob of 2d image when # `sync_2d` is True super(RandomFlip3D, self).__init__(prob=flip_ratio_bev_horizontal, direction='horizontal', **kwargs) self.sync_2d = sync_2d self.flip_2d = flip_2d self.flip_3d = flip_3d self.flip_ratio_bev_horizontal = flip_ratio_bev_horizontal self.flip_ratio_bev_vertical = flip_ratio_bev_vertical self.flip_box3d = flip_box3d self.update_lidar2cam = update_lidar2cam if flip_ratio_bev_horizontal is not None: assert isinstance(flip_ratio_bev_horizontal, (int, float)) \ and 0 <= flip_ratio_bev_horizontal <= 1 if flip_ratio_bev_vertical is not None: assert isinstance(flip_ratio_bev_vertical, (int, float)) \ and 0 <= flip_ratio_bev_vertical <= 1 def transform(self, input_dict: dict) -> dict: """Call function to flip points, values in the ``bbox3d_fields`` and also flip 2D image and its annotations. Args: input_dict (dict): Result dict from loading pipeline. Returns: dict: Flipped results, 'flip', 'flip_direction', 'pcd_horizontal_flip' and 'pcd_vertical_flip' keys are added into result dict. 
""" # flip 2D image and its annotations if self.flip_2d: # only handle the 2D image if 'img' in input_dict: super(RandomFlip3D, self).transform(input_dict) flip = input_dict.get('flip', False) if flip: input_dict = self.random_flip_data_2d(input_dict) if self.flip_3d: # only handle the 3D points if self.sync_2d and 'img' in input_dict: # TODO check if this is necessary in FOCS3D input_dict['pcd_horizontal_flip'] = input_dict['flip'] input_dict['pcd_vertical_flip'] = False else: if 'pcd_horizontal_flip' not in input_dict: if np.random.rand() < self.flip_ratio_bev_horizontal: flip_horizontal = True else: flip_horizontal = False input_dict['pcd_horizontal_flip'] = flip_horizontal if 'pcd_vertical_flip' not in input_dict: if np.random.rand() < self.flip_ratio_bev_vertical: flip_vertical = True else: flip_vertical = False input_dict['pcd_vertical_flip'] = flip_vertical if 'transformation_3d_flow' not in input_dict: input_dict['transformation_3d_flow'] = [] if input_dict['pcd_horizontal_flip']: self.random_flip_data_3d(input_dict, 'horizontal') input_dict['transformation_3d_flow'].extend(['HF']) if input_dict['pcd_vertical_flip']: self.random_flip_data_3d(input_dict, 'vertical') input_dict['transformation_3d_flow'].extend(['VF']) if self.update_lidar2cam: self._transform_lidar2cam(input_dict) return input_dict def random_flip_data_3d(self, input_dict: dict, direction: str = 'horizontal') -> None: """Flip 3D data randomly. `random_flip_data_3d` should take these situations into consideration: - 1. LIDAR-based 3d detection - 2. LIDAR-based 3d segmentation - 3. vision-only detection - 4. multi-modality 3d detection. Args: input_dict (dict): Result dict from loading pipeline. direction (str): Flip direction. Defaults to 'horizontal'. Returns: dict: Flipped results, 'points', 'bbox3d_fields' keys are updated in the result dict. 
""" assert direction in ['horizontal', 'vertical'] if self.flip_box3d: if 'gt_bboxes_3d' in input_dict: if 'points' in input_dict: input_dict['points'] = input_dict['gt_bboxes_3d'].flip( direction, points=input_dict['points']) else: # vision-only detection input_dict['gt_bboxes_3d'].flip(direction) else: input_dict['points'].flip(direction) def random_flip_data_2d(self, input_dict: dict, direction: str = 'horizontal') -> dict: if 'centers_2d' in input_dict: assert self.sync_2d is True and direction == 'horizontal', \ 'Only support sync_2d=True and horizontal flip with images' w = input_dict['img_shape'][1] input_dict['centers_2d'][..., 0] = \ w - input_dict['centers_2d'][..., 0] # need to modify the horizontal position of camera center # along u-axis in the image (flip like centers2d) # ['cam2img'][0][2] = c_u # see more details and examples at # https://github.com/open-mmlab/mmdetection3d/pull/744 input_dict['cam2img'][0][2] = w - input_dict['cam2img'][0][2] if 'fov_ori2aug' not in input_dict: fov_ori2aug = np.eye(4, 4) else: fov_ori2aug = input_dict['fov_ori2aug'] # get the value of w w = input_dict['img_shape'][1] # flip_matrix[0,0] = -1 # flip_matrix[0,3] = w # fov_ori2aug = np.matmul(fov_ori2aug, flip_matrix) fov_ori2aug[0] *= -1 fov_ori2aug[0, 3] += w input_dict['fov_ori2aug'] = fov_ori2aug return input_dict def _flip_on_direction(self, results: dict) -> None: """Function to flip images, bounding boxes, semantic segmentation map and keypoints. Add the override feature that if 'flip' is already in results, use it to do the augmentation. """ if 'flip' not in results: cur_dir = self._choose_direction() else: # `flip_direction` works only when `flip` is True. # For example, in `MultiScaleFlipAug3D`, `flip_direction` is # 'horizontal' but `flip` is False. 
if results['flip']: assert 'flip_direction' in results, 'flip and flip_direction ' 'must exist simultaneously' cur_dir = results['flip_direction'] else: cur_dir = None if cur_dir is None: results['flip'] = False results['flip_direction'] = None else: results['flip'] = True results['flip_direction'] = cur_dir self._flip(results) def _transform_lidar2cam(self, results: dict) -> None: """TODO.""" aug_matrix = np.eye(4) if results.get('pcd_horizontal_flip', False): aug_matrix[1, 1] *= -1 if results.get('pcd_vertical_flip', False): aug_matrix[0, 0] *= -1 lidar2cam_list = [] for lidar2cam in results['lidar2cam']: lidar2cam = np.array(lidar2cam) lidar2cam = np.matmul(lidar2cam, aug_matrix) lidar2cam_list.append(lidar2cam.tolist()) results['lidar2cam'] = lidar2cam_list def __repr__(self) -> str: """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ repr_str += f'(sync_2d={self.sync_2d},' repr_str += f' flip_ratio_bev_vertical={self.flip_ratio_bev_vertical})' return repr_str @TRANSFORMS.register_module() class GlobalRotScaleTrans(BaseTransform): """Apply global rotation, scaling and translation to a 3D scene. Required Keys: - points (np.float32) - gt_bboxes_3d (np.float32) Modified Keys: - points (np.float32) - gt_bboxes_3d (np.float32) Added Keys: - points (np.float32) - pcd_trans (np.float32) - pcd_rotation (np.float32) - pcd_rotation_angle (np.float32) - pcd_scale_factor (np.float32) Args: rot_range (list[float]): Range of rotation angle. Defaults to [-0.78539816, 0.78539816] (close to [-pi/4, pi/4]). rot_dof (int): DoF of rotation noise. Defaults to 1. scale_ratio_range (list[float]): Range of scale ratio. Defaults to [0.95, 1.05]. translation_std (list[float]): The standard deviation of translation noise applied to a scene, which is sampled from a gaussian distribution whose standard deviation is set by ``translation_std``. Defaults to [0, 0, 0]. shift_height (bool): Whether to shift height. 
(the fourth dimension of indoor points) when scaling. Defaults to False. """ def __init__(self, rot_range: Union[List[float], int, float] = [-0.78539816, 0.78539816], rot_dof: int = 1, scale_ratio_range: List[float] = [0.95, 1.05], translation_std: List[int] = [0, 0, 0], shift_height: bool = False, update_lidar2cam: bool = False) -> None: seq_types = (list, tuple, np.ndarray) if not isinstance(rot_range, seq_types): assert isinstance(rot_range, (int, float)), \ f'unsupported rot_range type {type(rot_range)}' rot_range = [-rot_range, rot_range] self.rot_range = rot_range self.rot_dof = rot_dof self.update_lidar2cam = update_lidar2cam assert isinstance(scale_ratio_range, seq_types), \ f'unsupported scale_ratio_range type {type(scale_ratio_range)}' self.scale_ratio_range = scale_ratio_range if not isinstance(translation_std, seq_types): assert isinstance(translation_std, (int, float)), \ f'unsupported translation_std type {type(translation_std)}' translation_std = [ translation_std, translation_std, translation_std ] assert all([std >= 0 for std in translation_std]), \ 'translation_std should be positive' self.translation_std = translation_std self.shift_height = shift_height def transform(self, input_dict: dict) -> dict: """Private function to rotate, scale and translate bounding boxes and points. Args: input_dict (dict): Result dict from loading pipeline. Returns: dict: Results after scaling, 'points', 'pcd_rotation', 'pcd_scale_factor', 'pcd_trans' and `gt_bboxes_3d` are updated in the result dict. 
""" if 'transformation_3d_flow' not in input_dict: input_dict['transformation_3d_flow'] = [] self._rot_bbox_points(input_dict) if 'pcd_scale_factor' not in input_dict: self._random_scale(input_dict) self._scale_bbox_points(input_dict) self._trans_bbox_points(input_dict) input_dict['transformation_3d_flow'].extend(['R', 'S', 'T']) if self.update_lidar2cam: self._transform_lidar2cam(input_dict) return input_dict def _trans_bbox_points(self, input_dict: dict) -> None: """Private function to translate bounding boxes and points. Args: input_dict (dict): Result dict from loading pipeline. Returns: dict: Results after translation, 'points', 'pcd_trans' and `gt_bboxes_3d` is updated in the result dict. """ translation_std = np.array(self.translation_std, dtype=np.float32) trans_factor = np.random.normal(scale=translation_std, size=3).T if 'points' in input_dict: input_dict['points'].translate(trans_factor) input_dict['pcd_trans'] = trans_factor if 'gt_bboxes_3d' in input_dict: input_dict['gt_bboxes_3d'].translate(trans_factor) def _rot_bbox_points(self, input_dict: dict) -> None: """Private function to rotate bounding boxes and points. Args: input_dict (dict): Result dict from loading pipeline. Returns: dict: Results after rotation, 'points', 'pcd_rotation' and `gt_bboxes_3d` is updated in the result dict. """ rotation = self.rot_range if self.rot_dof == 1: noise_rotation = np.random.uniform(rotation[0], rotation[1]) noise_rotation *= -1 elif self.rot_dof > 1: noise_rotation = np.array([ -np.random.uniform(rotation[0], rotation[1]), -np.random.uniform(rotation[0], rotation[1]), -np.random.uniform(rotation[0], rotation[1]) ]) else: raise NotImplementedError # TODO delete this. And -1 is to align the rotation with # the version of 0.17. 
# if 'gt_bboxes_3d' in input_dict and \ # len(input_dict['gt_bboxes_3d'].tensor) != 0: if 'gt_bboxes_3d' in input_dict: # rotate points with bboxes if 'points' in input_dict: points, rot_mat_T = input_dict['gt_bboxes_3d'].rotate( noise_rotation, input_dict['points']) input_dict['points'] = points else: rot_mat_T = input_dict['gt_bboxes_3d'].rotate(noise_rotation) elif 'points' in input_dict: # if no bbox in input_dict, only rotate points rot_mat_T = input_dict['points'].rotate(noise_rotation) input_dict['pcd_rotation'] = rot_mat_T input_dict['pcd_rotation_angle'] = noise_rotation def _scale_bbox_points(self, input_dict: dict) -> None: """Private function to scale bounding boxes and points. Args: input_dict (dict): Result dict from loading pipeline. Returns: dict: Results after scaling, 'points' and `gt_bboxes_3d` is updated in the result dict. """ scale = input_dict['pcd_scale_factor'] if 'points' in input_dict: points = input_dict['points'] points.scale(scale) if self.shift_height: assert 'height' in points.attribute_dims.keys(), \ 'setting shift_height=True \ but points have no height attribute' points.tensor[:, points.attribute_dims['height']] *= scale input_dict['points'] = points if 'gt_bboxes_3d' in input_dict and \ len(input_dict['gt_bboxes_3d'].tensor) != 0: input_dict['gt_bboxes_3d'].scale(scale) def _random_scale(self, input_dict: dict) -> None: """Private function to randomly set the scale factor. Args: input_dict (dict): Result dict from loading pipeline. Returns: dict: Results after scaling, 'pcd_scale_factor' are updated in the result dict. 
""" scale_factor = np.random.uniform(self.scale_ratio_range[0], self.scale_ratio_range[1]) input_dict['pcd_scale_factor'] = scale_factor def _transform_lidar2cam(self, input_dict: dict) -> None: aug_matrix = np.eye(4) if 'pcd_rotation' in input_dict: aug_matrix[:3, :3] = input_dict['pcd_rotation'].T.numpy( ) * input_dict['pcd_scale_factor'] else: aug_matrix[:3, :3] = np.eye(3).view( 1, 3, 3) * input_dict['pcd_scale_factor'] aug_matrix[:3, -1] = input_dict['pcd_trans'].reshape(1, 3) aug_matrix[-1, -1] = 1.0 aug_matrix = np.linalg.inv(aug_matrix) lidar2cam_list = [] for lidar2cam in input_dict["depth2img"]["extrinsic"]: # for lidar2cam in input_dict['lidar2cam']: lidar2cam = np.array(lidar2cam) lidar2cam = np.matmul(lidar2cam, aug_matrix) lidar2cam_list.append(lidar2cam.tolist()) input_dict["depth2img"]["extrinsic"] = lidar2cam_list # input_dict['lidar2cam'] = lidar2cam_list def __repr__(self) -> str: """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ repr_str += f'(rot_range={self.rot_range},' repr_str += f' scale_ratio_range={self.scale_ratio_range},' repr_str += f' translation_std={self.translation_std},' repr_str += f' shift_height={self.shift_height})' return repr_str @TRANSFORMS.register_module() class ResizeCropFlipImage(object): def __init__(self, data_aug_conf, test_mode=False): self.data_aug_conf = data_aug_conf self.test_mode = test_mode def get_augmentation(self): if self.data_aug_conf is None: return None H, W = self.data_aug_conf["H"], self.data_aug_conf["W"] fH, fW = self.data_aug_conf["final_dim"] if not self.test_mode: resize = np.random.uniform(*self.data_aug_conf["resize_lim"]) resize_dims = (int(W * resize), int(H * resize)) newW, newH = resize_dims crop_h = ( int( (1 - np.random.uniform(*self.data_aug_conf["bot_pct_lim"])) * newH ) - fH ) crop_w = int(np.random.uniform(0, max(0, newW - fW))) crop = (crop_w, crop_h, crop_w + fW, crop_h + fH) flip = False if self.data_aug_conf["rand_flip"] and 
np.random.choice([0, 1]): flip = True rotate = np.random.uniform(*self.data_aug_conf["rot_lim"]) # rotate_3d = np.random.uniform(*self.data_aug_conf["rot3d_range"]) rotate_3d = 0 else: resize = max(fH / H, fW / W) resize_dims = (int(W * resize), int(H * resize)) newW, newH = resize_dims crop_h = ( int((1 - np.mean(self.data_aug_conf["bot_pct_lim"])) * newH) - fH ) crop_w = int(max(0, newW - fW) / 2) crop = (crop_w, crop_h, crop_w + fW, crop_h + fH) flip = False rotate = 0 rotate_3d = 0 aug_config = { "resize": resize, "resize_dims": resize_dims, "crop": crop, "flip": flip, "rotate": rotate, "rotate_3d": rotate_3d, } return aug_config def __call__(self, results): img = results["img"] aug_configs = self.get_augmentation() resize = aug_configs.get("resize", 1) results["img"], extend_matrix = self._transform(img, aug_configs) results["trans_mat"] = extend_matrix if "depth_img" in results: results["depth_img"], _ = self._transform(results["depth_img"], aug_configs) return results def _transform(self, img, aug_configs): H, W = img.shape[:2] resize = aug_configs.get("resize", 1) resize_dims = (int(W * resize), int(H * resize)) crop = aug_configs.get("crop", [0, 0, *resize_dims]) flip = aug_configs.get("flip", False) rotate = aug_configs.get("rotate", 0) transform_matrix = np.eye(3) transform_matrix[:2, :2] *= resize transform_matrix[:2, 2] -= np.array(crop[:2]) if flip: flip_matrix = np.array( [[-1, 0, crop[2] - crop[0]], [0, 1, 0], [0, 0, 1]] ) transform_matrix = flip_matrix @ transform_matrix rotate = rotate / 180 * np.pi rot_matrix = np.array( [ [np.cos(rotate), np.sin(rotate), 0], [-np.sin(rotate), np.cos(rotate), 0], [0, 0, 1], ] ) rot_center = np.array([crop[2] - crop[0], crop[3] - crop[1]]) / 2 rot_matrix[:2, 2] = -rot_matrix[:2, :2] @ rot_center + rot_center transform_matrix = rot_matrix @ transform_matrix extend_matrix = np.eye(4) extend_matrix[:3, :3] = transform_matrix img = cv2.warpAffine(img, extend_matrix[:2, :3], img.shape[:2][::-1]) return img, 
extend_matrix ================================================ FILE: bip3d/datasets/transforms/formatting.py ================================================ from typing import List, Sequence, Union import mmengine import numpy as np import torch from mmcv.transforms import BaseTransform from mmengine.structures import InstanceData, PixelData from bip3d.registry import TRANSFORMS from bip3d.structures.bbox_3d import BaseInstance3DBoxes from bip3d.structures.points import BasePoints from bip3d.utils.typing_config import Det3DDataElement, PointData def to_tensor( data: Union[torch.Tensor, np.ndarray, Sequence, int, float]) -> torch.Tensor: """Convert objects of various python types to :obj:`torch.Tensor`. Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`, :class:`Sequence`, :class:`int` and :class:`float`. Args: data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to be converted. Returns: torch.Tensor: the converted data. """ if isinstance(data, torch.Tensor): return data elif isinstance(data, np.ndarray): if data.dtype is np.dtype('float64'): data = data.astype(np.float32) return torch.from_numpy(data) elif isinstance(data, Sequence) and not mmengine.is_str(data): return torch.tensor(data) elif isinstance(data, int): return torch.LongTensor([data]) elif isinstance(data, float): return torch.FloatTensor([data]) else: raise TypeError(f'type {type(data)} cannot be converted to tensor.') @TRANSFORMS.register_module() class Pack3DDetInputs(BaseTransform): INPUTS_KEYS = ['points', 'img', "depth_img"] # to be compatible with depths in bevdepth INSTANCEDATA_3D_KEYS = [ 'gt_bboxes_3d', 'gt_labels_3d', 'attr_labels', 'depths', 'centers_2d' ] INSTANCEDATA_2D_KEYS = [ 'gt_bboxes', 'gt_bboxes_labels', ] SEG_KEYS = [ 'gt_seg_map', 'pts_instance_mask', 'pts_semantic_mask', 'gt_semantic_seg' ] def __init__( self, keys: dict, meta_keys: dict = ( 'img_path', 'ori_shape', 'img_shape', 'lidar2img', 'depth2img', 'cam2img', 'pad_shape', 'depth_map_path', 
'scale_factor', 'flip', 'pcd_horizontal_flip', 'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d', 'img_norm_cfg', 'num_pts_feats', 'pcd_trans', 'sample_idx', 'pcd_scale_factor', 'pcd_rotation', 'pcd_rotation_angle', 'lidar_path', 'transformation_3d_flow', 'trans_mat', 'affine_aug', 'sweep_img_metas', 'ori_cam2img', 'cam2global', 'crop_offset', 'img_crop_offset', 'resize_img_shape', 'lidar2cam', 'ori_lidar2img', 'num_ref_frames', 'num_views', 'ego2global', 'fov_ori2aug', 'ego2cam', 'axis_align_matrix', 'text', 'tokens_positive', 'scan_id', "distractor_bboxes", "anchor_bboxes", "trans_mat", "ignore_mask", )): self.keys = keys self.meta_keys = meta_keys def _remove_prefix(self, key: str) -> str: if key.startswith('gt_'): key = key[3:] return key def transform(self, results: Union[dict, List[dict]]): """Method to pack the input data. when the value in this dict is a list, it usually is in Augmentations Testing. Args: results (dict | list[dict]): Result dict from the data pipeline. Returns: dict | List[dict]: - 'inputs' (dict): The forward data of models. It usually contains following keys: - points - img - 'data_samples' (:obj:`Det3DDataSample`): The annotation info of the sample. """ # augtest if isinstance(results, list): if len(results) == 1: # simple test return self.pack_single_results(results[0]) pack_results = [] for single_result in results: pack_results.append(self.pack_single_results(single_result)) return pack_results # norm training and simple testing elif isinstance(results, dict): return self.pack_single_results(results) else: raise NotImplementedError def pack_single_results(self, results: dict) -> dict: """Method to pack the single input data. when the value in this dict is a list, it usually is in Augmentations Testing. Args: results (dict): Result dict from the data pipeline. Returns: dict: A dict contains - 'inputs' (dict): The forward data of models. 
    def pack_single_results(self, results: dict) -> dict:
        """Method to pack the single input data. when the value in this dict
        is a list, it usually is in Augmentations Testing.

        Args:
            results (dict): Result dict from the data pipeline.

        Returns:
            dict: A dict contains

            - 'inputs' (dict): The forward data of models.
              It usually contains following keys:

                - points
                - img

            - 'data_samples' (:obj:`Det3DDataSample`): The annotation info
              of the sample.
        """
        # Unwrap BasePoints containers to their raw tensors, for both a
        # single point cloud and a list of multi-sweep point clouds.
        if 'points' in results:
            if isinstance(results['points'], BasePoints):
                results['points'] = results['points'].tensor
            # multi-sweep points
            elif isinstance(results['points'], list):
                if isinstance(results['points'][0], BasePoints):
                    for idx in range(len(results['points'])):
                        results['points'][idx] = results['points'][idx].tensor

        # Convert HWC numpy images (color and depth) to CHW tensors; a list
        # of per-view images becomes one stacked (V, C, H, W) tensor.
        for key in ["img", "depth_img"]:
            if key not in results:
                continue
            if isinstance(results[key], list):
                # process multiple imgs in single frame
                imgs = np.stack(results[key], axis=0)
                if len(imgs.shape) <= 3:
                    # Grayscale/depth stacks get a trailing channel axis.
                    imgs = imgs[..., None]
                if imgs.flags.c_contiguous:
                    imgs = to_tensor(imgs).permute(0, 3, 1, 2).contiguous()
                else:
                    imgs = to_tensor(
                        np.ascontiguousarray(imgs.transpose(0, 3, 1, 2)))
                results[key] = imgs
            else:
                img = results[key]
                if len(img.shape) < 3:
                    img = np.expand_dims(img, -1)
                # To improve the computational speed by by 3-5 times, apply:
                # `torch.permute()` rather than `np.transpose()`.
                # Refer to https://github.com/open-mmlab/mmdetection/pull/9533
                # for more details
                if img.flags.c_contiguous:
                    img = to_tensor(img).permute(2, 0, 1).contiguous()
                else:
                    img = to_tensor(
                        np.ascontiguousarray(img.transpose(2, 0, 1)))
                results[key] = img

        # Plain annotation arrays become tensors (element-wise for lists).
        for key in [
                'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels',
                'gt_bboxes_labels', 'attr_labels', 'pts_instance_mask',
                'pts_semantic_mask', 'centers_2d', 'depths', 'gt_labels_3d'
        ]:
            if key not in results:
                continue
            if isinstance(results[key], list):
                results[key] = [to_tensor(res) for res in results[key]]
            else:
                results[key] = to_tensor(results[key])
        # 3D boxes already wrapped in BaseInstance3DBoxes are kept as-is;
        # raw arrays are converted to tensors.
        if 'gt_bboxes_3d' in results:
            # multi-sweep version
            if isinstance(results['gt_bboxes_3d'], list):
                if not isinstance(results['gt_bboxes_3d'][0],
                                  BaseInstance3DBoxes):
                    for idx in range(len(results['gt_bboxes_3d'])):
                        results['gt_bboxes_3d'][idx] = to_tensor(
                            results['gt_bboxes_3d'][idx])
            elif not isinstance(results['gt_bboxes_3d'], BaseInstance3DBoxes):
                results['gt_bboxes_3d'] = to_tensor(results['gt_bboxes_3d'])

        if 'gt_semantic_seg' in results:
            # [None] adds a leading channel dimension.
            results['gt_semantic_seg'] = to_tensor(
                results['gt_semantic_seg'][None])
        if 'gt_seg_map' in results:
            results['gt_seg_map'] = results['gt_seg_map'][None, ...]
        if 'depth_map' in results:
            results['depth_map'] = to_tensor(results['depth_map'])

        data_sample = Det3DDataElement()
        gt_instances_3d = InstanceData()
        gt_instances = InstanceData()
        gt_pts_seg = PointData()
        gt_depth_map = PixelData()

        # Collect metainfo: first from the flat results dict, then from the
        # nested per-camera 'images' dict, then from 'lidar_points'.
        data_metas = {}
        for key in self.meta_keys:
            if key in results:
                data_metas[key] = results[key]
            # TODO: unify ScanNet multi-view info with nuScenes and Waymo
            elif 'images' in results and isinstance(results['images'], dict):
                if len(results['images'].keys()) == 1:
                    cam_type = list(results['images'].keys())[0]
                    # single-view image
                    if key in results['images'][cam_type]:
                        data_metas[key] = results['images'][cam_type][key]
                else:
                    # multi-view image: gather the per-camera values in order.
                    img_metas = []
                    cam_types = list(results['images'].keys())
                    for cam_type in cam_types:
                        if key in results['images'][cam_type]:
                            img_metas.append(results['images'][cam_type][key])
                    if len(img_metas) > 0:
                        data_metas[key] = img_metas
            elif 'lidar_points' in results and isinstance(
                    results['lidar_points'], dict):
                if key in results['lidar_points']:
                    data_metas[key] = results['lidar_points'][key]
        data_sample.set_metainfo(data_metas)

        # Route each requested key to its field on the data sample.
        inputs = {}
        for key in self.keys:
            if key in results:
                if key in self.INPUTS_KEYS:
                    inputs[key] = results[key]
                elif key in self.INSTANCEDATA_3D_KEYS:
                    gt_instances_3d[self._remove_prefix(key)] = results[key]
                elif key in self.INSTANCEDATA_2D_KEYS:
                    if key == 'gt_bboxes_labels':
                        # InstanceData convention stores these as 'labels'.
                        gt_instances['labels'] = results[key]
                    else:
                        gt_instances[self._remove_prefix(key)] = results[key]
                elif key in self.SEG_KEYS:
                    gt_pts_seg[self._remove_prefix(key)] = results[key]
                elif key == 'depth_map':
                    gt_depth_map.set_data(dict(data=results[key]))
                elif key == 'gt_occupancy':
                    # Occupancy lives directly on the sample; its visibility
                    # masks may be per-view (list) or a single array.
                    data_sample.gt_occupancy = to_tensor(
                        results['gt_occupancy'])
                    if isinstance(results['gt_occupancy_masks'], list):
                        data_sample.gt_occupancy_masks = [
                            to_tensor(mask)
                            for mask in results['gt_occupancy_masks']
                        ]
                    else:
                        data_sample.gt_occupancy_masks = to_tensor(
                            results['gt_occupancy_masks'])
                else:
                    raise NotImplementedError(f'Please modified '
                                              f'`Pack3DDetInputs` '
                                              f'to put {key} to '
                                              f'corresponding field')

        data_sample.gt_instances_3d = gt_instances_3d
        data_sample.gt_instances = gt_instances
        data_sample.gt_pts_seg = gt_pts_seg
        data_sample.gt_depth_map = gt_depth_map
        # 'eval_ann_info' is forwarded verbatim for the evaluator; it is set
        # to None (not omitted) when absent.
        if 'eval_ann_info' in results:
            data_sample.eval_ann_info = results['eval_ann_info']
        else:
            data_sample.eval_ann_info = None

        packed_results = dict()
        packed_results['data_samples'] = data_sample
        packed_results['inputs'] = inputs
        return packed_results

    def __repr__(self) -> str:
        """str: Return a string that describes the module."""
        repr_str = self.__class__.__name__
        repr_str += f'(keys={self.keys})'
        repr_str += f'(meta_keys={self.meta_keys})'
        return repr_str
@TRANSFORMS.register_module()
class LoadDepthFromFile(BaseTransform):
    """Load a depth image from file.

    Required Keys:

    - depth_img_path
    - depth_shift

    Modified Keys:

    - depth_img

    Args:
        imdecode_backend (str): The image decoding backend type. The backend
            argument for :func:`mmcv.imfrombytes`. See
            :func:`mmcv.imfrombytes` for details. Defaults to 'cv2'.
        ignore_empty (bool): Whether to allow loading empty image or file
            path not existent. Defaults to False.
        backend_args (dict, optional): Instantiates the corresponding file
            backend for :func:`mmengine.fileio.get`. Defaults to None.
    """

    def __init__(self,
                 imdecode_backend: str = 'cv2',
                 ignore_empty: bool = False,
                 *,
                 backend_args: Optional[dict] = None):
        self.ignore_empty = ignore_empty
        self.imdecode_backend = imdecode_backend
        # Copy so later caller-side mutation cannot change our behavior.
        self.backend_args = None
        if backend_args is not None:
            self.backend_args = backend_args.copy()

    def transform(self, results: dict):
        """Load the depth image at ``results['depth_img_path']``.

        The stored (integer-valued) depth image is divided by
        ``results['depth_shift']`` to obtain metric depth.

        Args:
            results (dict): Result dict from
                :class:`mmengine.dataset.BaseDataset`.

        Returns:
            dict | None: ``results`` with ``'depth_img'`` added, or ``None``
            when loading fails and ``ignore_empty`` is True.
        """
        filename = results['depth_img_path']
        depth_shift = results['depth_shift']
        try:
            depth_img_bytes = mmengine.fileio.get(
                filename, backend_args=self.backend_args)
            depth_img = mmcv.imfrombytes(
                depth_img_bytes,
                flag='unchanged',
                backend=self.imdecode_backend).astype(np.float32) / depth_shift
        except Exception as e:
            # Best-effort mode: swallow read/decode failures on request.
            if self.ignore_empty:
                return None
            else:
                raise e
        results['depth_img'] = depth_img
        return results

    def __repr__(self):
        # BUGFIX: the original had an if/else on ``self.backend_args`` whose
        # two branches emitted identical text; collapsed to one statement.
        return (f'{self.__class__.__name__}('
                f'ignore_empty={self.ignore_empty}, '
                f"imdecode_backend='{self.imdecode_backend}', "
                f'backend_args={self.backend_args})')
    def __init__(self,
                 with_bbox_3d: bool = True,
                 with_label_3d: bool = True,
                 with_depth_map: bool = False,
                 with_attr_label: bool = False,
                 with_mask_3d: bool = False,
                 with_seg_3d: bool = False,
                 with_bbox: bool = False,
                 with_label: bool = False,
                 with_mask: bool = False,
                 with_seg: bool = False,
                 with_bbox_depth: bool = False,
                 with_panoptic_3d: bool = False,
                 with_visible_instance_masks: bool = False,
                 with_occupancy: bool = False,
                 with_visible_occupancy_masks: bool = False,
                 poly2mask: bool = True,
                 seg_3d_dtype: str = 'np.int64',
                 seg_offset: int = None,
                 dataset_type: str = None,
                 backend_args: Optional[dict] = None):
        """Initialize the loader; each ``with_*`` flag enables one
        annotation type in :meth:`transform`."""
        # The 2D flags (bbox/label/mask/seg) are handled by the mmdet base
        # class; only the 3D-specific flags are stored on this subclass.
        super().__init__(with_bbox=with_bbox,
                         with_label=with_label,
                         with_mask=with_mask,
                         with_seg=with_seg,
                         poly2mask=poly2mask,
                         backend_args=backend_args)
        self.with_bbox_3d = with_bbox_3d
        self.with_bbox_depth = with_bbox_depth
        self.with_label_3d = with_label_3d
        self.with_depth_map = with_depth_map
        self.with_attr_label = with_attr_label
        self.with_mask_3d = with_mask_3d
        self.with_seg_3d = with_seg_3d
        self.with_panoptic_3d = with_panoptic_3d
        self.with_visible_instance_masks = with_visible_instance_masks
        self.with_occupancy = with_occupancy
        self.with_visible_occupancy_masks = with_visible_occupancy_masks
        # HACK: eval() turns the config string (e.g. 'np.int64') into the
        # actual dtype. Only safe because configs are trusted input — never
        # feed user-controlled strings here.
        self.seg_3d_dtype = eval(seg_3d_dtype)
        self.seg_offset = seg_offset
        self.dataset_type = dataset_type
    def _load_bboxes_3d(self, results: dict) -> dict:
        """Private function to move the 3D bounding box annotation from
        `ann_info` field to the root of `results`.

        Args:
            results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.

        Returns:
            dict: The dict containing loaded 3D bounding box annotations.
        """
        results['gt_bboxes_3d'] = results['ann_info']['gt_bboxes_3d']
        return results

    def _load_bboxes_depth(self, results: dict) -> dict:
        """Private function to load 2.5D bounding box annotations
        (per-instance depth and projected 2D centers).

        Args:
            results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.

        Returns:
            dict: The dict containing loaded 2.5D bounding box annotations.
        """
        results['depths'] = results['ann_info']['depths']
        results['centers_2d'] = results['ann_info']['centers_2d']
        return results

    def _load_labels_3d(self, results: dict) -> dict:
        """Private function to load 3D label annotations.

        Args:
            results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.

        Returns:
            dict: The dict containing loaded label annotations.
        """
        results['gt_labels_3d'] = results['ann_info']['gt_labels_3d']
        return results
""" results['attr_labels'] = results['ann_info']['attr_labels'] return results def _load_depth_map(self, results: dict): img_filename = results['img_path'] pts_filename = img_filename.replace('samples', 'depth_points') + '.bin' results['depth_map_path'] = pts_filename if self.file_client is None: self.file_client = mmengine.FileClient(**self.backend_args) try: pts_bytes = self.file_client.get(pts_filename) points = np.frombuffer(pts_bytes, dtype=np.float32) except ConnectionError: mmengine.check_file_exist(pts_filename) if pts_filename.endswith('.npy'): points = np.load(pts_filename) else: points = np.fromfile(pts_filename, dtype=np.float32) pts_img = points.reshape(-1, 3) img_shape = results['ori_shape'] depth_img = np.zeros(img_shape, dtype=np.float32) iy = np.round(pts_img[:, 1]).astype(np.int64) ix = np.round(pts_img[:, 0]).astype(np.int64) depth_img[iy, ix] = pts_img[:, 2] results['depth_map'] = depth_img return results def _load_masks_3d(self, results: dict): """Private function to load 3D mask annotations. Args: results (dict): Result dict from :obj:`mmdet3d.CustomDataset`. Returns: dict: The dict containing loaded 3D mask annotations. """ pts_instance_mask_path = results['pts_instance_mask_path'] try: mask_bytes = mmengine.fileio.get(pts_instance_mask_path, backend_args=self.backend_args) pts_instance_mask = np.frombuffer(mask_bytes, dtype=np.int64) except ConnectionError: mmengine.check_file_exist(pts_instance_mask_path) pts_instance_mask = np.fromfile(pts_instance_mask_path, dtype=np.int64) results['pts_instance_mask'] = pts_instance_mask # 'eval_ann_info' will be passed to evaluator if 'eval_ann_info' in results: results['eval_ann_info']['pts_instance_mask'] = pts_instance_mask return results def _load_semantic_seg_3d(self, results: dict): """Private function to load 3D semantic segmentation annotations. Args: results (dict): Result dict from :obj:`mmdet3d.CustomDataset`. Returns: dict: The dict containing the semantic segmentation annotations. 
""" pts_semantic_mask_path = results['pts_semantic_mask_path'] try: mask_bytes = mmengine.fileio.get(pts_semantic_mask_path, backend_args=self.backend_args) # add .copy() to fix read-only bug pts_semantic_mask = np.frombuffer(mask_bytes, dtype=self.seg_3d_dtype).copy() except ConnectionError: mmengine.check_file_exist(pts_semantic_mask_path) pts_semantic_mask = np.fromfile(pts_semantic_mask_path, dtype=np.int64) if self.dataset_type == 'semantickitti': pts_semantic_mask = pts_semantic_mask.astype(np.int64) pts_semantic_mask = pts_semantic_mask % self.seg_offset # nuScenes loads semantic and panoptic labels from different files. results['pts_semantic_mask'] = pts_semantic_mask # 'eval_ann_info' will be passed to evaluator if 'eval_ann_info' in results: results['eval_ann_info']['pts_semantic_mask'] = pts_semantic_mask return results def _load_panoptic_3d(self, results: dict): """Private function to load 3D panoptic segmentation annotations. Args: results (dict): Result dict from :obj:`mmdet3d.CustomDataset`. Returns: dict: The dict containing the panoptic segmentation annotations. """ pts_panoptic_mask_path = results['pts_panoptic_mask_path'] try: mask_bytes = mmengine.fileio.get(pts_panoptic_mask_path, backend_args=self.backend_args) # add .copy() to fix read-only bug pts_panoptic_mask = np.frombuffer(mask_bytes, dtype=self.seg_3d_dtype).copy() except ConnectionError: mmengine.check_file_exist(pts_panoptic_mask_path) pts_panoptic_mask = np.fromfile(pts_panoptic_mask_path, dtype=np.int64) if self.dataset_type == 'semantickitti': pts_semantic_mask = pts_panoptic_mask.astype(np.int64) pts_semantic_mask = pts_semantic_mask % self.seg_offset elif self.dataset_type == 'nuscenes': pts_semantic_mask = pts_semantic_mask // self.seg_offset results['pts_semantic_mask'] = pts_semantic_mask # We can directly take panoptic labels as instance ids. 
pts_instance_mask = pts_panoptic_mask.astype(np.int64) results['pts_instance_mask'] = pts_instance_mask # 'eval_ann_info' will be passed to evaluator if 'eval_ann_info' in results: results['eval_ann_info']['pts_semantic_mask'] = pts_semantic_mask results['eval_ann_info']['pts_instance_mask'] = pts_instance_mask return results def _load_visible_instance_masks(self, results: dict): """Private function to move the 3D bounding box annotation from `ann_info` field to the root of `results`. Args: results (dict): Result dict from :obj:`mmdet3d.CustomDataset`. Returns: dict: The dict containing loaded 3D bounding box annotations. """ results['visible_instance_masks'] = results['ann_info'][ 'visible_instance_masks'] return results def _load_occupancy(self, results: dict): """Private function to move the 3D bounding box annotation from `ann_info` field to the root of `results`. Args: results (dict): Result dict from :obj:`mmdet3d.CustomDataset`. Returns: dict: The dict containing loaded 3D bounding box annotations. """ results['gt_occupancy'] = results['ann_info']['gt_occupancy'] return results def _load_visible_occupancy_masks(self, results: dict): """Private function to move the 3D bounding box annotation from `ann_info` field to the root of `results`. Args: results (dict): Result dict from :obj:`mmdet3d.CustomDataset`. Returns: dict: The dict containing loaded 3D bounding box annotations. """ results['visible_occupancy_masks'] = results['ann_info'][ 'visible_occupancy_masks'] return results def _load_bboxes(self, results: dict): """Private function to load bounding box annotations. The only difference is it remove the proceess for `ignore_flag` Args: results (dict): Result dict from :obj:`mmcv.BaseDataset`. Returns: dict: The dict contains loaded bounding box annotations. """ results['gt_bboxes'] = results['ann_info']['gt_bboxes'] def _load_labels(self, results: dict): """Private function to load label annotations. 
    def transform(self, results: dict) -> dict:
        """Function to load multiple types annotations.

        Args:
            results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.

        Returns:
            dict: The dict containing loaded 3D bounding box, label, mask and
            semantic segmentation annotations.
        """
        # 2D annotations (bbox/label/mask/seg) are loaded by the mmdet base
        # class; each `with_*` flag below adds one 3D annotation type.
        results = super().transform(results)
        if self.with_bbox_3d:
            results = self._load_bboxes_3d(results)
        if self.with_bbox_depth:
            results = self._load_bboxes_depth(results)
        if self.with_label_3d:
            results = self._load_labels_3d(results)
        if self.with_depth_map:
            results = self._load_depth_map(results)
        if self.with_attr_label:
            results = self._load_attr_labels(results)
        # Panoptic runs before mask/seg so its outputs can be overridden by
        # the dedicated mask/seg loaders when both are enabled.
        if self.with_panoptic_3d:
            results = self._load_panoptic_3d(results)
        if self.with_mask_3d:
            results = self._load_masks_3d(results)
        if self.with_seg_3d:
            results = self._load_semantic_seg_3d(results)
        if self.with_visible_instance_masks:
            results = self._load_visible_instance_masks(results)
        if self.with_occupancy:
            results = self._load_occupancy(results)
        if self.with_visible_occupancy_masks:
            results = self._load_visible_occupancy_masks(results)
        return results
@TRANSFORMS.register_module()
class MultiViewPipeline(BaseTransform):
    """Sample a subset of views from a multi-view sample and run an inner
    per-view pipeline on each, then re-assemble the per-view outputs
    (images, depth images, points, intrinsics/extrinsics) into lists.

    Args:
        transforms (list): Inner per-view transform configs.
        n_images (int): Target number of views to sample.
        max_n_images (int, optional): Upper bound on sampled views;
            defaults to ``n_images``.
        ordered (bool): Passed to :func:`sample` as ``fix_interval`` —
            evenly-spaced (deterministic) vs. random sampling.
        rotate_3rscan (bool): Transpose 3RScan images (they are stored
            rotated 90 degrees) and patch the camera matrices accordingly.
    """

    def __init__(
        self,
        transforms,
        n_images,
        max_n_images=None,
        ordered=False,
        rotate_3rscan=False,
    ):
        super().__init__()
        self.transforms = Compose(transforms)
        self.n_images = n_images
        self.max_n_images = (
            max_n_images if max_n_images is not None else n_images
        )
        self.ordered = ordered
        self.rotate_3rscan = rotate_3rscan

    def transform(self, results: dict):
        """Run the inner pipeline per sampled view and merge the outputs.

        Args:
            results (dict): Multi-view result dict from loading pipeline.

        Returns:
            dict: output dict after transformation.
        """
        imgs = []
        img_paths = []
        points = []
        intrinsics = []
        extrinsics = []
        depth_imgs = []
        trans_mat = []
        total_n = len(results["img_path"])
        # NOTE(review): with total_n > n_images this keeps ALL views up to
        # max_n_images rather than exactly n_images — confirm intended.
        sample_n = min(max(self.n_images, total_n), self.max_n_images)
        ids = sample(total_n, sample_n, self.ordered)
        for i in ids.tolist():
            # Build a single-view dict for the inner pipeline.
            _results = dict()
            _results["img_path"] = results["img_path"][i]
            if "depth_img_path" in results:
                _results["depth_img_path"] = results["depth_img_path"][i]
                if isinstance(results["depth_cam2img"], list):
                    _results["depth_cam2img"] = results["depth_cam2img"][i]
                    _results["cam2img"] = results["depth2img"]["intrinsic"][i]
                else:
                    _results["depth_cam2img"] = results["depth_cam2img"]
                    _results["cam2img"] = results["cam2img"]
                _results["depth_shift"] = results["depth_shift"]
            _results = self.transforms(_results)
            if "depth_shift" in _results:
                _results.pop("depth_shift")
            if "img" in _results:
                imgs.append(_results["img"])
                img_paths.append(_results["img_path"])
            if "depth_img" in _results:
                depth_imgs.append(_results["depth_img"])
            if "points" in _results:
                points.append(_results["points"])
            # Prefer an intrinsic rewritten by the inner pipeline
            # (e.g. CamIntrisicStandardization sets 'modify_cam2img').
            if _results.get("modify_cam2img"):
                intrinsics.append(_results["cam2img"])
            elif isinstance(results["depth2img"]["intrinsic"], list):
                intrinsics.append(results["depth2img"]["intrinsic"][i])
            else:
                intrinsics.append(results["depth2img"]["intrinsic"])
            extrinsics.append(results["depth2img"]["extrinsic"][i])
            if "trans_mat" in _results:
                trans_mat.append(_results["trans_mat"])
            # Copy remaining per-view outputs to the top level; later views
            # overwrite earlier ones for these shared keys.
            for key in _results.keys():
                if key not in ["img", "points", "img_path"]:
                    results[key] = _results[key]
        if len(imgs):
            if self.rotate_3rscan and "3rscan" in img_paths[0]:
                # 3RScan frames are stored rotated: transpose H/W and fold
                # the axis swap into the extrinsics via the intrinsics.
                imgs = [np.transpose(x, (1, 0, 2)) for x in imgs]
                rot_mat = np.array(
                    [
                        [0, 1, 0, 0],
                        [1, 0, 0, 0],
                        [0, 0, 1, 0],
                        [0, 0, 0, 1],
                    ]
                )
                rot_mat = [np.linalg.inv(x) @ rot_mat @ x for x in intrinsics]
                extrinsics = [x @ y for x, y in zip(rot_mat, extrinsics)]
                results["scale_factor"] = results["scale_factor"][::-1]
                results["ori_shape"] = results["ori_shape"][::-1]
            results["img"] = imgs
            results["img_path"] = img_paths
        if len(depth_imgs):
            if self.rotate_3rscan and "3rscan" in img_paths[0]:
                depth_imgs = [np.transpose(x, (1, 0)) for x in depth_imgs]
            results["depth_img"] = depth_imgs
        if len(points):
            results["points"] = points
        # Subset the per-instance visibility masks to the sampled views.
        if (
            "ann_info" in results
            and "visible_instance_masks" in results["ann_info"]
        ):
            results["visible_instance_masks"] = [
                results["ann_info"]["visible_instance_masks"][i] for i in ids
            ]
            results["ann_info"]["visible_instance_masks"] = results[
                "visible_instance_masks"
            ]
        elif "visible_instance_masks" in results:
            results["visible_instance_masks"] = [
                results["visible_instance_masks"][i] for i in ids
            ]
        results["depth2img"]["intrinsic"] = intrinsics
        results["depth2img"]["extrinsic"] = extrinsics
        if len(trans_mat) != 0:
            results["depth2img"]["trans_mat"] = trans_mat
        return results
@TRANSFORMS.register_module()
class CategoryGroundingDataPrepare(BaseTransform):
    """Prepare category-level grounding data: filter GT instances, build
    the grounding text prompt (class names joined by ``sep_token``), and
    compute per-instance ``tokens_positive`` character spans into it.

    Args:
        classes (Sequence[str]): Full category vocabulary.
        training (bool): Training mode — enables GT filtering and (when
            ``max_class`` is set) class subsampling.
        max_class (int, optional): Cap on the number of classes in the
            prompt during training.
        sep_token (str): Separator between class names in the prompt.
        filter_others (bool): Drop instances with negative labels.
        z_range (tuple, optional): Keep only boxes whose z lies in range
            (training only).
        filter_invisible (bool): Also drop invisible instances at test time.
        instance_mask_key (str): Key of the per-view visibility masks.
    """

    def __init__(
        self,
        classes,
        training,
        max_class=None,
        sep_token="[SEP]",
        filter_others=True,
        z_range=None,
        filter_invisible=False,
        instance_mask_key="visible_instance_masks",
    ):
        self.classes = list(classes)
        self.training = training
        self.max_class = max_class
        self.sep_token = sep_token
        self.filter_others = filter_others
        self.z_range = z_range
        self.filter_invisible = filter_invisible
        self.instance_mask_key = instance_mask_key

    def transform(self, results):
        visible_instance_masks = results.get(self.instance_mask_key)
        if isinstance(visible_instance_masks, (list, tuple)):
            # Per-view masks -> one per-instance mask: visible in ANY view.
            visible_instance_masks = np.stack(visible_instance_masks).any(
                axis=0)
        ignore_mask = results.get("ignore_mask")
        gt_names = results["ann_info"]["gt_names"]
        # 1) Drop "other" instances (negative labels); keep all parallel
        # arrays (boxes, names, visibility, ignore) in sync.
        if self.filter_others and "gt_labels_3d" in results:
            mask = results["gt_labels_3d"] >= 0
            results["gt_labels_3d"] = results["gt_labels_3d"][mask]
            results["gt_bboxes_3d"] = results["gt_bboxes_3d"][mask]
            visible_instance_masks = visible_instance_masks[mask]
            gt_names = [x for i, x in enumerate(gt_names) if mask[i]]
            if ignore_mask is not None:
                ignore_mask = ignore_mask[mask]
        # 2) Training-only: drop boxes outside the configured z range.
        if (
            self.z_range is not None
            and "gt_labels_3d" in results
            and self.training
        ):
            mask = torch.logical_and(
                results["gt_bboxes_3d"].tensor[..., 2] >= self.z_range[0],
                results["gt_bboxes_3d"].tensor[..., 2] <= self.z_range[1],
            ).numpy()
            results["gt_labels_3d"] = results["gt_labels_3d"][mask]
            results["gt_bboxes_3d"] = results["gt_bboxes_3d"][mask]
            visible_instance_masks = visible_instance_masks[mask]
            gt_names = [x for i, x in enumerate(gt_names) if mask[i]]
            if ignore_mask is not None:
                ignore_mask = ignore_mask[mask]
        # 3) Drop instances not visible in any sampled view.
        if self.training or self.filter_invisible:
            results["gt_labels_3d"] = results["gt_labels_3d"][
                visible_instance_masks
            ]
            results["gt_bboxes_3d"] = results["gt_bboxes_3d"][
                visible_instance_masks
            ]
            gt_names = [
                x
                for i, x in enumerate(gt_names)
                if visible_instance_masks[i]
            ]
            if ignore_mask is not None:
                ignore_mask = ignore_mask[visible_instance_masks]
        results["ignore_mask"] = ignore_mask
        # 4) Choose the classes that make up the text prompt. In training
        # with a capped vocabulary, keep all GT names plus random negatives.
        if self.training:
            if (
                self.max_class is not None
                and len(self.classes) > self.max_class
            ):
                classes = copy.deepcopy(gt_names)
                random.shuffle(self.classes)
                for c in self.classes:
                    if c in classes:
                        continue
                    classes.append(c)
                    if len(classes) >= self.max_class:
                        break
                random.shuffle(classes)
            else:
                classes = copy.deepcopy(self.classes)
        else:
            classes = copy.deepcopy(self.classes)
            # At test time spans are computed per class, not per instance.
            gt_names = classes
        results["text"] = self.sep_token.join(classes)
        # 5) Locate each name's [start, end) character span in the prompt.
        # NOTE(review): substring search — a name that is a substring of an
        # earlier class (e.g. "chair" in "armchair") can match the wrong
        # span; confirm the vocabulary avoids such collisions.
        tokens_positive = []
        for name in gt_names:
            start = results["text"].find(
                self.sep_token + name + self.sep_token
            )
            if start == -1:
                if results["text"].startswith(name + self.sep_token):
                    start = 0
                else:
                    # Name is the last element (no trailing separator).
                    start = results["text"].find(self.sep_token + name) + len(
                        self.sep_token
                    )
            else:
                start += len(self.sep_token)
            end = start + len(name)
            tokens_positive.append([[start, end]])
        results["tokens_positive"] = tokens_positive
        return results
@TRANSFORMS.register_module()
class CamIntrisicStandardization(BaseTransform):
    """Re-warp images so every camera shares one target intrinsic matrix.

    For each destination pixel, a ray is cast through the destination
    intrinsics and projected back through the source intrinsics; the image
    is then resampled at those source coordinates with ``cv2.remap``.

    Args:
        dst_intrinsic (array-like): Target 3x3 (or 4x4) intrinsic matrix.
        dst_wh (tuple): Target (width, height) of the output image.
    """

    def __init__(self, dst_intrinsic, dst_wh):
        if not isinstance(dst_intrinsic, np.ndarray):
            dst_intrinsic = np.array(dst_intrinsic)
        # Promote a 3x3 intrinsic to homogeneous 4x4 form.
        if dst_intrinsic.shape[0] == 3:
            tmp = np.eye(4)
            tmp[:3, :3] = dst_intrinsic
            dst_intrinsic = tmp
        self.dst_intrinsic = dst_intrinsic
        self.dst_wh = dst_wh
        # Precompute the normalized ray (at z=1) of every destination pixel:
        # dst_pts[v, u] = K_dst^-1 @ (u, v, 1).
        u, v = np.arange(dst_wh[0]), np.arange(dst_wh[1])
        u = np.repeat(u[None], dst_wh[1], 0)
        v = np.repeat(v[:, None], dst_wh[0], 1)
        uv = np.stack([u, v, np.ones_like(u)], axis=-1)
        self.dst_pts = uv @ np.linalg.inv(self.dst_intrinsic[:3, :3]).T

    def transform(self, results):
        # Project the precomputed rays into the source image plane.
        src_intrinsic = results["cam2img"][:3, :3]
        src_uv = self.dst_pts @ src_intrinsic.T
        src_uv = src_uv.astype(np.float32)
        # Bring the depth image to the color image's resolution first so
        # both can share the same remap grid.
        if "depth_img" in results and results["img"].shape[:2] != results[
                "depth_img"].shape[:2]:
            results["depth_img"] = cv2.resize(
                results["depth_img"],
                results["img"].shape[:2][::-1],
                interpolation=cv2.INTER_LINEAR,
            )
        for key in ["img", "depth_img"]:
            if key not in results:
                continue
            results[key] = cv2.remap(
                results[key],
                src_uv[..., 0],
                src_uv[..., 1],
                cv2.INTER_NEAREST,
            )
        # warp_mat = self.dst_intrinsic[:3, :3] @ np.linalg.inv(src_intrinsic)
        # cv2.warpAffine(results[key], warp_mat[:2, :3], self.dst_wh)
        # All cameras now share the destination intrinsics; flag this so
        # downstream transforms (e.g. MultiViewPipeline) pick up cam2img.
        results["cam2img"] = copy.deepcopy(self.dst_intrinsic)
        results["depth_cam2img"] = copy.deepcopy(self.dst_intrinsic)
        results["scale"] = self.dst_wh
        results['img_shape'] = results["img"].shape[:2]
        results['scale_factor'] = (1, 1)
        results['keep_ratio'] = False
        results["modify_cam2img"] = True
        return results
@TRANSFORMS.register_module()
class CustomResize(Resize):
    """Resize that additionally handles ``results['depth_img']``.

    Differences from :class:`mmcv.transforms.Resize`:

    - ``_resize_img`` takes a ``key`` argument so the same code path can
      resize both the color image and the depth image.
    - ``transform`` also resizes ``depth_img`` (without touching
      ``img_shape`` / ``scale_factor``, which describe the color image).
    """

    def _resize_img(self, results: dict, key="img"):
        """Resize ``results[key]`` according to ``results['scale']``."""
        if results.get(key, None) is not None:
            if self.keep_ratio:
                img, scale_factor = mmcv.imrescale(
                    results[key],
                    results['scale'],
                    interpolation=self.interpolation,
                    return_scale=True,
                    backend=self.backend)
                # the w_scale and h_scale has minor difference
                # a real fix should be done in the mmcv.imrescale in the future
                new_h, new_w = img.shape[:2]
                h, w = results[key].shape[:2]
                w_scale = new_w / w
                h_scale = new_h / h
            else:
                img, w_scale, h_scale = mmcv.imresize(
                    results[key],
                    results['scale'],
                    interpolation=self.interpolation,
                    return_scale=True,
                    backend=self.backend)
            results[key] = img
            if key == "img":
                # Only the color image defines the sample-level geometry.
                results['img_shape'] = img.shape[:2]
                results['scale_factor'] = (w_scale, h_scale)
                results['keep_ratio'] = self.keep_ratio

    @staticmethod
    def _scale_size(size, scale):
        """Scale a (w, h) size by ``scale`` and round to int.

        BUGFIX: the original called mmcv's private ``_scale_size`` without
        importing it, so the ``scale_factor`` code path raised NameError.
        This re-implements it with the same semantics as
        ``mmcv.transforms.processing._scale_size``.
        """
        if isinstance(scale, (float, int)):
            scale = (scale, scale)
        w, h = size
        return int(w * scale[0] + 0.5), int(h * scale[1] + 0.5)

    def transform(self, results: dict):
        """Resize img (and depth_img), bboxes, seg maps and keypoints."""
        if self.scale:
            results['scale'] = self.scale
        else:
            # Derive the target size from the image and scale_factor.
            img_shape = results['img'].shape[:2]
            results['scale'] = self._scale_size(img_shape[::-1],
                                                self.scale_factor)
        self._resize_img(results)
        self._resize_bboxes(results)
        self._resize_seg(results)
        self._resize_keypoints(results)
        if "depth_img" in results:
            self._resize_img(results, key="depth_img")
        return results
def sample(total_n, sample_n, fix_interval=True):
    """Pick ``sample_n`` indices out of ``range(total_n)``.

    Oversampling (``sample_n > total_n``) wraps around by recursing on the
    remainder; undersampling either strides evenly through the range
    (``fix_interval=True``) or draws uniformly without replacement.

    Args:
        total_n (int): Size of the index pool.
        sample_n (int): Number of indices to return.
        fix_interval (bool): Deterministic even striding vs. random choice.

    Returns:
        np.ndarray: Array of ``sample_n`` indices.
    """
    indices = np.arange(total_n)
    if sample_n == total_n:
        return indices
    if sample_n > total_n:
        # Take the whole pool once, then recurse for the remainder.
        extra = sample(total_n, sample_n - total_n, fix_interval)
        return np.concatenate([indices, extra])
    if fix_interval:
        step = total_n / sample_n
        picked = [indices[int(step * k)] for k in range(sample_n)]
        return np.array(picked)
    return np.random.choice(indices, sample_n, replace=False)


def xyzrpy_to_matrix(input):
    """Build a 4x4 homogeneous transform from ``[x, y, z, roll, pitch, yaw]``.

    Rotation uses the intrinsic-free "xyz" Euler convention of
    ``scipy.spatial.transform.Rotation``.
    """
    mat = np.eye(4)
    rotation = Rotation.from_euler("xyz", np.array(input[3:]))
    mat[:3, :3] = rotation.as_matrix()
    mat[:3, 3] = input[:3]
    return mat
def average_precision(recalls, precisions, mode='area'):
    """Calculate average precision (for single or multiple scales).

    Args:
        recalls (np.ndarray): Recalls with shape of (num_scales, num_dets)
            or (num_dets, ).
        precisions (np.ndarray): Precisions with shape of
            (num_scales, num_dets) or (num_dets, ).
        mode (str): 'area' or '11points', 'area' means calculating the area
            under precision-recall curve, '11points' means calculating
            the average precision of recalls at [0, 0.1, ..., 1]

    Returns:
        float or np.ndarray: Calculated average precision.
    """
    # Promote 1-D inputs to a single-scale 2-D layout.
    if recalls.ndim == 1:
        recalls = recalls[np.newaxis, :]
        precisions = precisions[np.newaxis, :]
    assert recalls.shape == precisions.shape
    assert recalls.ndim == 2

    n_scales = recalls.shape[0]
    ap = np.zeros(n_scales, dtype=np.float32)
    if mode == 'area':
        pad_zero = np.zeros((n_scales, 1), dtype=recalls.dtype)
        pad_one = np.ones((n_scales, 1), dtype=recalls.dtype)
        mrec = np.hstack((pad_zero, recalls, pad_one))
        mpre = np.hstack((pad_zero, precisions, pad_zero))
        # Right-to-left sweep turns precision into its monotone envelope.
        for col in range(mpre.shape[1] - 1, 0, -1):
            mpre[:, col - 1] = np.maximum(mpre[:, col - 1], mpre[:, col])
        for s in range(n_scales):
            # Sum rectangle areas only where recall actually changes.
            changed = np.where(mrec[s, 1:] != mrec[s, :-1])[0]
            ap[s] = np.sum(
                (mrec[s, changed + 1] - mrec[s, changed]) * mpre[s, changed + 1])
    elif mode == '11points':
        for s in range(n_scales):
            for thr in np.arange(0, 1 + 1e-3, 0.1):
                candidates = precisions[s, recalls[s, :] >= thr]
                ap[s] += candidates.max() if candidates.size > 0 else 0
        ap /= 11
    else:
        raise ValueError(
            'Unrecognized mode, only "area" and "11points" are supported')
    return ap
def eval_det_cls(pred, gt, iou_thr=None, area_range=None):
    """Generic functions to compute precision/recall for object detection
    for a single class.

    Args:
        pred (dict): Predictions mapping from image id to bounding boxes
            and scores.
        gt (dict): Ground truths mapping from image id to bounding boxes.
        iou_thr (list[float]): A list of iou thresholds.
        area_range (tuple[float, float], optional): If given, ground truths
            whose box volume falls outside this range are marked "ignore"
            and excluded from the positive count.

    Return:
        tuple (np.ndarray, np.ndarray, float): Recalls, precisions and
            average precision.
    """
    # {img_id: {'bbox': box structure, 'det': matched list}}
    class_recs = {}
    npos = 0
    # figure out the bbox code size first (fallback 9: xyz + whl + 3 Euler)
    gt_bbox_code_size = 9
    pred_bbox_code_size = 9
    for img_id in gt.keys():
        if len(gt[img_id]) != 0:
            gt_bbox_code_size = gt[img_id][0].tensor.shape[1]
            break
    for img_id in pred.keys():
        if len(pred[img_id][0]) != 0:
            pred_bbox_code_size = pred[img_id][0][0].tensor.shape[1]
            break
    assert gt_bbox_code_size == pred_bbox_code_size
    # Batch all GT boxes of one image into a single box structure, and
    # prepare one "already matched" flag list per IoU threshold.
    for img_id in gt.keys():
        cur_gt_num = len(gt[img_id])
        if cur_gt_num != 0:
            gt_cur = torch.zeros([cur_gt_num, gt_bbox_code_size],
                                 dtype=torch.float32)
            for i in range(cur_gt_num):
                gt_cur[i] = gt[img_id][i].tensor
            bbox = gt[img_id][0].new_box(gt_cur)
        else:
            bbox = gt[img_id]
        det = [[False] * len(bbox) for i in iou_thr]
        npos += len(bbox)
        class_recs[img_id] = {'bbox': bbox, 'det': det}
        if area_range is not None and len(bbox) > 0:
            # Box volume = w * l * h (last value of the cumulative product).
            area = bbox.tensor[:, 3:6].cumprod(dim=-1)[..., -1]
            ignore = torch.logical_or(area < area_range[0],
                                      area > area_range[1])
            npos -= ignore.sum().item()
            class_recs[img_id]["ignore"] = ignore

    # construct dets: flatten all predictions, remembering their image and
    # pre-computing IoU rows against that image's ground truths.
    image_ids = []
    confidence = []
    ious = []
    for img_id in pred.keys():
        cur_num = len(pred[img_id])
        if cur_num == 0:
            continue
        pred_cur = torch.zeros((cur_num, pred_bbox_code_size),
                               dtype=torch.float32)
        box_idx = 0
        for box, score in pred[img_id]:
            image_ids.append(img_id)
            confidence.append(score)
            # handle outlier (too thin) predicted boxes: clamp degenerate
            # edges so IoU stays well defined.
            w, l, h = box.tensor[0, 3:6]
            faces = [w * l, w * h, h * l]
            if torch.any(box.tensor.new_tensor(faces) < 2e-4):
                box.tensor[:, 3:6] = torch.clamp(box.tensor[:, 3:6], min=2e-2)
            pred_cur[box_idx] = box.tensor
            box_idx += 1
        pred_cur = box.new_box(pred_cur)
        gt_cur = class_recs[img_id]['bbox']
        if len(gt_cur) > 0:
            # calculate iou in each image
            iou_cur = pred_cur.overlaps(pred_cur, gt_cur)
            for i in range(cur_num):
                ious.append(iou_cur[i])
        else:
            for i in range(cur_num):
                ious.append(np.zeros(1))

    confidence = np.array(confidence)

    # sort by confidence (descending) — standard greedy VOC-style matching.
    sorted_ind = np.argsort(-confidence)
    image_ids = [image_ids[x] for x in sorted_ind]
    ious = [ious[x] for x in sorted_ind]

    # go down dets and mark TPs and FPs
    num_images = len(image_ids)
    tp_thr = [np.zeros(num_images) for i in iou_thr]
    fp_thr = [np.zeros(num_images) for i in iou_thr]
    for d in range(num_images):
        R = class_recs[image_ids[d]]
        iou_max = -np.inf
        BBGT = R['bbox']
        cur_iou = ious[d]

        if len(BBGT) > 0:
            # find the best-matching ground truth for this detection
            for j in range(len(BBGT)):
                iou = cur_iou[j]
                if iou > iou_max:
                    iou_max = iou
                    jmax = j

        for iou_idx, thresh in enumerate(iou_thr):
            if iou_max > thresh:
                # Ignored GTs count as neither TP nor FP.
                if "ignore" in R and R["ignore"][jmax]:
                    continue
                if not R['det'][iou_idx][jmax]:
                    tp_thr[iou_idx][d] = 1.
                    R['det'][iou_idx][jmax] = 1
                else:
                    # GT already matched by a higher-confidence detection.
                    fp_thr[iou_idx][d] = 1.
            else:
                fp_thr[iou_idx][d] = 1.

    ret = []
    for iou_idx, thresh in enumerate(iou_thr):
        # compute precision recall
        fp = np.cumsum(fp_thr[iou_idx])
        tp = np.cumsum(tp_thr[iou_idx])
        recall = tp / float(npos)
        # avoid divide by zero in case the first detection matches a
        # difficult ground truth
        precision = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
        ap = average_precision(recall, precision)
        ret.append((recall, precision, ap))

    return ret


def eval_map_recall(pred, gt, ovthresh=None, area_range=None):
    """Evaluate mAP and recall.

    Generic functions to compute precision/recall for object detection
    for multiple classes.

    Args:
        pred (dict): Information of detection results,
            which maps class_id and predictions.
        gt (dict): Information of ground truths, which maps class_id and
            ground truths.
        ovthresh (list[float], optional): iou threshold. Default: None.
        area_range (tuple[float, float], optional): Forwarded to
            :func:`eval_det_cls` for size-bucketed evaluation.

    Return:
        tuple[dict]: dict results of recall, AP, and precision for all
            classes.
    """
    ret_values = {}
    for classname in gt.keys():
        if classname in pred:
            ret_values[classname] = eval_det_cls(pred[classname],
                                                 gt[classname], ovthresh,
                                                 area_range)
    recall = [{} for i in ovthresh]
    precision = [{} for i in ovthresh]
    ap = [{} for i in ovthresh]

    for label in gt.keys():
        for iou_idx, thresh in enumerate(ovthresh):
            if label in pred:
                recall[iou_idx][label], precision[iou_idx][label], ap[iou_idx][
                    label] = ret_values[label][iou_idx]
            else:
                # Class never predicted: zero recall/precision/AP.
                recall[iou_idx][label] = np.zeros(1)
                precision[iou_idx][label] = np.zeros(1)
                ap[iou_idx][label] = np.zeros(1)

    return recall, precision, ap
def save_pr_curve(recs, precs, metrics):
    """Plot and save one precision-recall curve image per metric.

    Args:
        recs (list[dict]): Per-metric mapping from class label to a recall
            array (monotone, as produced by :func:`eval_map_recall`).
        precs (list[dict]): Per-metric mapping from class label to the
            matching precision array.
        metrics (list): Metric identifiers (e.g. IoU thresholds); each is
            embedded in the output filename.
    """
    # FIX: the original read DEFAULT_COLOR_MAP, whose import is commented
    # out at module top, so any call raised NameError; the colors (and the
    # 'font' local) were never used, so both dead lines are removed.
    fontsize = 50
    for rec, prec, metric in zip(recs, precs, metrics):
        plt.figure(figsize=(20, 18))
        for k in rec.keys():
            # Skip classes whose curve is undefined (NaN recall).
            if np.isnan(rec[k]).any():
                continue
            plt.plot(rec[k], prec[k])
        plt.xlabel('Recall', fontsize=fontsize)
        plt.ylabel('Precision', fontsize=fontsize)
        plt.savefig(f'./pr_curver_{metric}.png',dpi=500.)
        # FIX: close the figure; repeated calls previously accumulated open
        # matplotlib figures and leaked memory.
        plt.close()
def indoor_eval(gt_annos,
                dt_annos,
                metric,
                label2cat,
                logger=None,
                box_mode_3d=None,
                classes_split=None,
                part="Overall"):
    """Indoor Evaluation.

    Evaluate the result of the detection.

    Args:
        gt_annos (list[dict]): Ground truth annotations.
        dt_annos (list[dict]): Detection annotations. the dict
            includes the following keys

            - labels_3d (torch.Tensor): Labels of boxes.
            - bboxes_3d (:obj:`BaseInstance3DBoxes`):
                3D bounding boxes in Depth coordinate.
            - scores_3d (torch.Tensor): Scores of boxes.
        metric (list[float]): IoU thresholds for computing average
            precisions.
        label2cat (tuple): Map from label to category.
        logger (logging.Logger | str, optional): The way to print the mAP
            summary. See `mmdet.utils.print_log()` for details. Default:
            None.
        box_mode_3d (Box3DMode, optional): Target box mode predictions are
            converted into before IoU computation.
        classes_split (list[list], optional): head/common/tail label splits;
            when given, per-split tables are printed in addition.
        part (str): Name used for the summary column (e.g. a sub-dataset).

    Return:
        dict[str, float]: Dict of results.
    """
    assert len(dt_annos) == len(gt_annos)
    pred = {}  # map {class_id: pred}
    gt = {}  # map {class_id: gt}
    # Re-bucket per-image annotations into per-class dicts expected by
    # eval_map_recall.
    for img_id in range(len(dt_annos)):
        # parse detected annotations
        det_anno = dt_annos[img_id]
        for i in range(len(det_anno['labels_3d'])):
            label = det_anno['labels_3d'].numpy()[i]
            bbox = det_anno['bboxes_3d'].convert_to(box_mode_3d)[i]
            score = det_anno['scores_3d'].numpy()[i]
            if label not in pred:
                pred[int(label)] = {}
            if img_id not in pred[label]:
                pred[int(label)][img_id] = []
            if label not in gt:
                gt[int(label)] = {}
            if img_id not in gt[label]:
                gt[int(label)][img_id] = []
            pred[int(label)][img_id].append((bbox, score))

        # parse gt annotations
        gt_anno = gt_annos[img_id]

        gt_boxes = gt_anno['gt_bboxes_3d']
        labels_3d = gt_anno['gt_labels_3d']
        for i in range(len(labels_3d)):
            label = labels_3d[i]
            bbox = gt_boxes[i]
            if label not in gt:
                gt[label] = {}
            if img_id not in gt[label]:
                gt[label][img_id] = []
            gt[label][img_id].append(bbox)

    rec, prec, ap = eval_map_recall(pred, gt, metric)

    # filter nan results (classes with no usable curve)
    ori_keys = list(ap[0].keys())
    for key in ori_keys:
        if np.isnan(ap[0][key][0]):
            for r in rec:
                del r[key]
            for p in prec:
                del p[key]
            for a in ap:
                del a[key]

    # Build the overall per-class AP/AR table, one AP+AR column pair per
    # IoU threshold, with the mean appended as the final row.
    ret_dict = dict()
    header = ['classes']
    table_columns = [[label2cat[label]
                      for label in ap[0].keys()] + [part]]

    for i, iou_thresh in enumerate(metric):
        header.append(f'AP_{iou_thresh:.2f}')
        header.append(f'AR_{iou_thresh:.2f}')
        rec_list = []
        for label in ap[i].keys():
            ret_dict[f'{label2cat[label]}_AP_{iou_thresh:.2f}'] = float(
                ap[i][label][0])
        ret_dict[f'mAP_{iou_thresh:.2f}'] = float(np.mean(list(
            ap[i].values())))

        table_columns.append(list(map(float, list(ap[i].values()))))
        table_columns[-1] += [ret_dict[f'mAP_{iou_thresh:.2f}']]
        table_columns[-1] = [f'{x:.4f}' for x in table_columns[-1]]

        for label in rec[i].keys():
            ret_dict[f'{label2cat[label]}_rec_{iou_thresh:.2f}'] = float(
                rec[i][label][-1])
            rec_list.append(rec[i][label][-1])
        ret_dict[f'mAR_{iou_thresh:.2f}'] = float(np.mean(rec_list))

        table_columns.append(list(map(float, rec_list)))
        table_columns[-1] += [ret_dict[f'mAR_{iou_thresh:.2f}']]
        table_columns[-1] = [f'{x:.4f}' for x in table_columns[-1]]

    table_data = [header]
    table_rows = list(zip(*table_columns))
    table_data += table_rows
    table = AsciiTable(table_data)
    table.inner_footing_row_border = True
    print_log('\n' + table.table, logger=logger)

    # Optional head/common/tail breakdown tables (printed only; not merged
    # into ret_dict).
    if classes_split is not None:
        splits = ['head', 'common', 'tail']
        for idx in range(len(splits)):
            header = [f'{splits[idx]}_classes']
            # init the category list/column
            cat_list = []
            for label in classes_split[idx]:
                if label in ap[0]:
                    cat_list.append(label2cat[label])
            table_columns = [cat_list + [part]]
            for i, iou_thresh in enumerate(metric):
                header.append(f'AP_{iou_thresh:.2f}')
                header.append(f'AR_{iou_thresh:.2f}')
                ap_list = []
                for label in classes_split[idx]:
                    if label in ap[i]:
                        ap_list.append(float(ap[i][label][0]))
                mean_ap = float(np.mean(ap_list))
                table_columns.append(list(map(float, ap_list)))
                table_columns[-1] += [mean_ap]
                table_columns[-1] = [f'{x:.4f}' for x in table_columns[-1]]

                rec_list = []
                for label in classes_split[idx]:
                    if label in rec[i]:
                        rec_list.append(rec[i][label][-1])
                mean_rec = float(np.mean(rec_list))
                table_columns.append(list(map(float, rec_list)))
                table_columns[-1] += [mean_rec]
                table_columns[-1] = [f'{x:.4f}' for x in table_columns[-1]]

            table_data = [header]
            table_rows = list(zip(*table_columns))
            table_data += table_rows
            table = AsciiTable(table_data)
            table.inner_footing_row_border = True
            print_log('\n' + table.table, logger=logger)

    # Size-bucketed re-evaluation (box volume in m^3); results are printed
    # but, like the split tables, not merged into the returned dict.
    area_split = {
        "small": [0, 0.2**3],
        "medium": [0.2**3, 1.0**3],
        "large": [1.0**3, float("inf")],
    }
    if area_split is not None:
        header = ["area"]
        for i, iou_thresh in enumerate(metric):
            header.append(f'AP_{iou_thresh:.2f}')
            header.append(f'AR_{iou_thresh:.2f}')
        table_rows = []
        area_ret_dict = dict()
        for idx, (area, area_range) in enumerate(area_split.items()):
            rec, prec, ap = eval_map_recall(pred, gt, metric, area_range)
            ori_keys = list(ap[0].keys())
            for key in ori_keys:
                if np.isnan(ap[0][key][0]):
                    for r in rec:
                        del r[key]
                    for p in prec:
                        del p[key]
                    for a in ap:
                        del a[key]

            table_rows.append([f"{area}_{part}"])
            for i, iou_thresh in enumerate(metric):
                rec_list = []
                for label in ap[i].keys():
                    area_ret_dict[f'{label2cat[label]}_AP_{iou_thresh:.2f}'] = float(
                        ap[i][label][0])
                area_ret_dict[f'mAP_{iou_thresh:.2f}'] = float(np.mean(list(
                    ap[i].values())))
                table_rows[-1].append(f"{area_ret_dict[f'mAP_{iou_thresh:.2f}']:.4f}")

                for label in rec[i].keys():
                    area_ret_dict[f'{label2cat[label]}_rec_{iou_thresh:.2f}'] = float(
                        rec[i][label][-1])
                    rec_list.append(rec[i][label][-1])
                area_ret_dict[f'mAR_{iou_thresh:.2f}'] = float(np.mean(rec_list))
                table_rows[-1].append(f"{area_ret_dict[f'mAR_{iou_thresh:.2f}']:.4f}")

        table_data = [header]
        table_data += table_rows
        table = AsciiTable(table_data)
        table.inner_footing_row_border = False
        print_log('\n' + table.table, logger=logger)

    return ret_dict
@METRICS.register_module()
class IndoorDetMetric(BaseMetric):
    """Indoor scene evaluation metric.

    Args:
        iou_thr (float or List[float]): List of iou threshold when calculate
            the metric. Defaults to [0.25, 0.5].
        collect_device (str): Device name used for collecting results from
            different ranks during distributed training. Must be 'cpu' or
            'gpu'. Defaults to 'cpu'.
        prefix (str, optional): The prefix that will be added in the metric
            names to disambiguate homonymous metrics of different
            evaluators. If prefix is not provided in the argument,
            self.default_prefix will be used instead. Defaults to None.
        batchwise_anns (bool): If True, collect len(self.results) items
            instead of the dataloader-reported dataset size.
        save_result_path (str, optional): Directory in which to dump raw
            results as ``results.pkl``. Defaults to None (no dump).
        eval_part (list[str], optional): Sub-dataset name fragments, matched
            against each sample's ``scan_id``, that are also evaluated
            separately. Defaults to the four EmbodiedScan sources.
    """

    def __init__(self,
                 iou_thr: List[float] = [0.25, 0.5],
                 collect_device: str = 'cpu',
                 prefix: Optional[str] = None,
                 batchwise_anns: bool = False,
                 save_result_path=None,
                 eval_part=None,
                 **kwargs):
        super(IndoorDetMetric, self).__init__(prefix=prefix,
                                              collect_device=collect_device,
                                              **kwargs)
        self.iou_thr = [iou_thr] if isinstance(iou_thr, float) else iou_thr
        self.batchwise_anns = batchwise_anns
        self.save_result_path = save_result_path
        if eval_part is None:
            eval_part = ["scannet", "3rscan", "matterport3d", "arkit"]
        self.eval_part = eval_part

    def process(self, data_batch: dict, data_samples: Sequence[dict]):
        """Process one batch of data samples and predictions.

        The processed results should be stored in ``self.results``, which
        will be used to compute the metrics when all batches have been
        processed.

        Args:
            data_batch (dict): A batch of data from the dataloader.
            data_samples (Sequence[dict]): A batch of outputs from the model.
        """
        for data_sample in data_samples:
            pred_3d = data_sample['pred_instances_3d']
            eval_ann_info = data_sample['eval_ann_info']
            eval_ann_info["scan_id"] = data_sample["scan_id"]
            # Move everything off the GPU before accumulating.
            cpu_pred_3d = dict()
            for k, v in pred_3d.items():
                if hasattr(v, 'to'):
                    cpu_pred_3d[k] = v.to('cpu')
                else:
                    cpu_pred_3d[k] = v
            self.results.append((eval_ann_info, cpu_pred_3d))

    def compute_metrics(self, results: list, part="Overall", split=True):
        """Compute the metrics from processed results.

        Args:
            results (list): The processed results of each batch.
            part (str): Label for the summary column (sub-dataset name or
                "Overall").
            split (bool): Whether to also print head/common/tail tables.

        Returns:
            Dict[str, float]: The computed metrics. The keys are the names
            of the metrics, and the values are corresponding results.
        """
        logger: MMLogger = MMLogger.get_current_instance()
        ann_infos = []
        pred_results = []

        for eval_ann, sinlge_pred_results in results:
            ann_infos.append(eval_ann)
            pred_results.append(sinlge_pred_results)

        # some checkpoints may not record the key "box_type_3d"
        box_type_3d, box_mode_3d = get_box_type(
            self.dataset_meta.get('box_type_3d', 'depth'))
        print_log(
            f"eval : {len(ann_infos)}, {len(pred_results)}, rank: {get_rank()}")
        ret_dict = indoor_eval(ann_infos,
                               pred_results,
                               self.iou_thr,
                               self.dataset_meta['classes'],
                               logger=logger,
                               box_mode_3d=box_mode_3d,
                               classes_split=self.dataset_meta.get(
                                   'classes_split', None) if split else None,
                               part=part)
        return ret_dict

    def evaluate(self, size: int):
        """Evaluate the model performance of the whole dataset after
        processing all batches.

        Args:
            size (int): Length of the entire validation dataset. When batch
                size > 1, the dataloader may pad some data samples to make
                sure all ranks have the same length of dataset slice. The
                ``collect_results`` function will drop the padded data based
                on this size.

        Returns:
            dict: Evaluation metrics dict on the val dataset. The keys are
            the names of the metrics, and the values are corresponding
            results.
        """
        if len(self.results) == 0:
            print_log(
                f'{self.__class__.__name__} got empty `self.results`. Please '
                'ensure that the processed results are properly added into '
                '`self.results` in `process` method.',
                logger='current',
                level=logging.WARNING)
        print_log(
            f"number of results: "
            f"{len(self.results)}, size: {size}, "
            f"batchwise_anns: {self.batchwise_anns}, rank: {get_rank()}, "
            f"collect_dir: {self.collect_dir}"
        )
        if self.batchwise_anns:
            # the actual dataset length/size is the len(self.results)
            if self.collect_device == 'cpu':
                results = collect_results(self.results,
                                          len(self.results),
                                          self.collect_device,
                                          tmpdir=self.collect_dir)
            else:
                results = collect_results(self.results, len(self.results),
                                          self.collect_device)
        else:
            if self.collect_device == 'cpu':
                results = collect_results(self.results,
                                          size,
                                          self.collect_device,
                                          tmpdir=self.collect_dir)
            else:
                results = collect_results(self.results, size,
                                          self.collect_device)
        print_log(
            f"number of collected results: "
            f"{len(results) if results is not None else 'None'},"
            f"rank: {get_rank()}"
        )

        if is_main_process():
            # cast all tensors in results list to cpu
            results = _to_cpu(results)
            if self.save_result_path is not None:
                # FIX: `os` was used below but never imported anywhere in
                # this module, so enabling save_result_path raised a
                # NameError; import it locally alongside pickle/copy.
                import os
                import pickle
                import copy
                _ret = []
                for x in results:
                    x = list(copy.deepcopy(x))
                    x[0]["gt_bboxes_3d"] = x[0]["gt_bboxes_3d"].cpu().numpy()
                    x[1]["bboxes_3d"] = x[1]["bboxes_3d"].cpu().numpy()
                    x[1]["scores_3d"] = x[1]["scores_3d"].cpu().numpy()
                    x[1]["labels_3d"] = x[1]["labels_3d"].cpu().numpy()
                    x[1]["target_scores_3d"] = x[1]["target_scores_3d"].cpu().numpy()
                    _ret.append(x)
                # FIX: close the file handle deterministically (the original
                # passed a bare open() into pickle.dump).
                with open(os.path.join(self.save_result_path,
                                       "results.pkl"), "wb") as f:
                    pickle.dump(_ret, f)
            _metrics = self.compute_metrics(results)  # type: ignore
            # Additionally evaluate each sub-dataset matched by scan_id.
            for part in self.eval_part:
                print(f"=================== {part} =========================")
                part_ret = [x for x in results if part in x[0]["scan_id"]]
                if len(part_ret) > 0:
                    _metrics_part = self.compute_metrics(part_ret, part=part)
                    _metrics_part = {
                        f"{part}--{k}": v for k, v in _metrics_part.items()
                    }
                    _metrics.update(_metrics_part)
            # Add prefix to metric names
            if self.prefix:
                _metrics = {
                    '/'.join((self.prefix, k)): v
                    for k, v in _metrics.items()
                }
            metrics = [_metrics]
        else:
            metrics = [None]  # type: ignore

        broadcast_object_list(metrics)
        if is_main_process():
            # Compact cross-part summary table of mAP/mAR per IoU threshold.
            table = []
            header = ["Summary"]
            for i, iou_thresh in enumerate(self.iou_thr):
                header.append(f'AP_{iou_thresh:.2f}')
                header.append(f'AR_{iou_thresh:.2f}')
            table.append(header)
            for part in self.eval_part + ["Overall"]:
                table_data = [part]
                for i, iou_thresh in enumerate(self.iou_thr):
                    key = f"{part+'--' if part!='Overall' else ''}mAP_{iou_thresh:.2f}"
                    if key in _metrics:
                        table_data.append(f"{_metrics[key]:.4f}")
                    else:
                        table_data.append(f"0.xxxx")
                    key = f"{part+'--' if part!='Overall' else ''}mAR_{iou_thresh:.2f}"
                    if key in _metrics:
                        table_data.append(f"{_metrics[key]:.4f}")
                    else:
                        table_data.append(f"0.xxxx")
                table.append(table_data)
            table = AsciiTable(table)
            table.inner_footing_row_border = True
            logger: MMLogger = MMLogger.get_current_instance()
            print_log('\n' + table.table, logger=logger)

        # reset the results list
        self.results.clear()
        return metrics[0]
""" def __init__(self, iou_thr: Union[float, List[float]] = [0.5], collect_device: str = 'cpu', prefix: Optional[str] = None): super(Indoor2DMetric, self).__init__(prefix=prefix, collect_device=collect_device) self.iou_thr = [iou_thr] if isinstance(iou_thr, float) else iou_thr def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: """Process one batch of data samples and predictions. The processed results should be stored in ``self.results``, which will be used to compute the metrics when all batches have been processed. Args: data_batch (dict): A batch of data from the dataloader. data_samples (Sequence[dict]): A batch of outputs from the model. """ for data_sample in data_samples: pred = data_sample['pred_instances'] eval_ann_info = data_sample['eval_ann_info'] ann = dict(labels=eval_ann_info['gt_bboxes_labels'], bboxes=eval_ann_info['gt_bboxes']) pred_bboxes = pred['bboxes'].cpu().numpy() pred_scores = pred['scores'].cpu().numpy() pred_labels = pred['labels'].cpu().numpy() dets = [] for label in range(len(self.dataset_meta['classes'])): index = np.where(pred_labels == label)[0] pred_bbox_scores = np.hstack( [pred_bboxes[index], pred_scores[index].reshape((-1, 1))]) dets.append(pred_bbox_scores) self.results.append((ann, dets)) def compute_metrics(self, results: list) -> Dict[str, float]: """Compute the metrics from processed results. Args: results (list): The processed results of each batch. Returns: Dict[str, float]: The computed metrics. The keys are the names of the metrics, and the values are corresponding results. 
""" logger: MMLogger = MMLogger.get_current_instance() annotations, preds = zip(*results) eval_results = OrderedDict() for iou_thr_2d_single in self.iou_thr: mean_ap, _ = eval_map(preds, annotations, scale_ranges=None, iou_thr=iou_thr_2d_single, dataset=self.dataset_meta['classes'], logger=logger) eval_results['mAP_' + str(iou_thr_2d_single)] = mean_ap return eval_results ================================================ FILE: bip3d/eval/metrics/grounding_metric.py ================================================ # Copyright (c) OpenRobotLab. All rights reserved. import os from typing import Dict, List, Optional, Sequence import pickle import mmengine from mmengine.evaluator import BaseMetric from mmengine.logging import MMLogger, print_log from terminaltables import AsciiTable from bip3d.registry import METRICS from bip3d.structures import EulerDepthInstance3DBoxes @METRICS.register_module() class GroundingMetric(BaseMetric): """Lanuage grounding evaluation metric. We calculate the grounding performance based on the alignment score of each bbox with the input prompt. Args: iou_thr (float or List[float]): List of iou threshold when calculate the metric. Defaults to [0.25, 0.5]. collect_device (str): Device name used for collecting results from different ranks during distributed training. Must be 'cpu' or 'gpu'. Defaults to 'cpu'. prefix (str, optional): The prefix that will be added in the metric names to disambiguate homonymous metrics of different evaluators. If prefix is not provided in the argument, self.default_prefix will be used instead. Defaults to None. format_only (bool): Whether to only inference the predictions without evaluation. Defaults to False. result_dir (str): Dir to save results, e.g., if result_dir = './', the result file will be './test_results.json'. Defaults to ''. 
""" def __init__(self, iou_thr: List[float] = [0.25, 0.5], collect_device: str = 'cpu', prefix: Optional[str] = None, format_only=False, submit_info=None, result_dir='', **kwargs) -> None: super(GroundingMetric, self).__init__(prefix=prefix, collect_device=collect_device, **kwargs) self.iou_thr = [iou_thr] if isinstance(iou_thr, float) else iou_thr self.prefix = prefix self.format_only = format_only self.result_dir = result_dir self.submit_info = submit_info if submit_info is not None else {} def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: """Process one batch of data samples and predictions. The processed results should be stored in ``self.results``, which will be used to compute the metrics when all batches have been processed. Args: data_batch (dict): A batch of data from the dataloader. data_samples (Sequence[dict]): A batch of outputs from the model. """ for data_sample in data_samples: pred_3d = data_sample['pred_instances_3d'] eval_ann_info = data_sample['eval_ann_info'] eval_ann_info["scan_id"] = data_sample["scan_id"] cpu_pred_3d = dict() for k, v in pred_3d.items(): if hasattr(v, 'to'): cpu_pred_3d[k] = v.to('cpu') else: cpu_pred_3d[k] = v self.results.append((eval_ann_info, cpu_pred_3d)) def ground_eval(self, gt_annos, det_annos, logger=None, part="Overall"): assert len(det_annos) == len(gt_annos) pred = {} gt = {} object_types = [ 'Easy', 'Hard', 'View-Dep', 'View-Indep', 'Unique', 'Multi', 'Overall' ] for t in self.iou_thr: for object_type in object_types: pred.update({object_type + '@' + str(t): 0}) gt.update({object_type + '@' + str(t): 1e-14}) for sample_id in range(len(det_annos)): det_anno = det_annos[sample_id] gt_anno = gt_annos[sample_id] target_scores = det_anno['target_scores_3d'] # (num_query, ) bboxes = det_anno['bboxes_3d'] gt_bboxes = gt_anno['gt_bboxes_3d'] bboxes = EulerDepthInstance3DBoxes(bboxes.tensor, origin=(0.5, 0.5, 0.5)) gt_bboxes = EulerDepthInstance3DBoxes(gt_bboxes.tensor, origin=(0.5, 0.5, 0.5)) 
view_dep = gt_anno['is_view_dep'] hard = gt_anno['is_hard'] unique = gt_anno['is_unique'] box_index = target_scores.argsort(dim=-1, descending=True)[:10] top_bbox = bboxes[box_index] iou = top_bbox.overlaps(top_bbox, gt_bboxes) # (num_query, 1) for t in self.iou_thr: threshold = iou > t found = int(threshold.any()) if view_dep: gt['View-Dep@' + str(t)] += 1 pred['View-Dep@' + str(t)] += found else: gt['View-Indep@' + str(t)] += 1 pred['View-Indep@' + str(t)] += found if hard: gt['Hard@' + str(t)] += 1 pred['Hard@' + str(t)] += found else: gt['Easy@' + str(t)] += 1 pred['Easy@' + str(t)] += found if unique: gt['Unique@' + str(t)] += 1 pred['Unique@' + str(t)] += found else: gt['Multi@' + str(t)] += 1 pred['Multi@' + str(t)] += found gt['Overall@' + str(t)] += 1 pred['Overall@' + str(t)] += found header = ['Type'] header.extend(object_types) ret_dict = {} for t in self.iou_thr: if part == "Overall": table_columns = [[f'AP@{t:.2f}']] else: table_columns = [[f'{part}_AP@{t:.2f}']] for object_type in object_types: metric = object_type + '@' + str(t) value = pred[metric] / max(gt[metric], 1) ret_dict[metric] = value table_columns.append([f'{value:.4f}']) table_data = [header] table_rows = list(zip(*table_columns)) table_data += table_rows table = AsciiTable(table_data) table.inner_footing_row_border = True print_log('\n' + table.table, logger=logger) return ret_dict def compute_metrics(self, results: list) -> Dict[str, float]: """Compute the metrics from processed results after all batches have been processed. Args: results (list): The processed results of each batch. Returns: Dict[str, float]: The computed metrics. The keys are the names of the metrics, and the values are corresponding results. 
""" logger: MMLogger = MMLogger.get_current_instance() # noqa annotations, preds = zip(*results) ret_dict = {} if self.format_only: # preds is a list of dict results = [] for pred in preds: result = dict() # convert the Euler boxes to the numpy array to save bboxes_3d = pred['bboxes_3d'].tensor scores_3d = pred['scores_3d'] # Note: hard-code save top-20 predictions # eval top-10 predictions during the test phase by default box_index = scores_3d.argsort(dim=-1, descending=True)[:20] top_bboxes_3d = bboxes_3d[box_index] top_scores_3d = scores_3d[box_index] result['bboxes_3d'] = top_bboxes_3d.numpy() result['scores_3d'] = top_scores_3d.numpy() results.append(result) mmengine.dump(results, os.path.join(self.result_dir, 'test_results.json')) for x in results: x["bboxes_3d"] = x["bboxes_3d"].tolist() x["scores_3d"] = x["scores_3d"].tolist() submission = { 'method': 'xxx', 'team': 'xxx', 'authors': ['xxx'], 'e-mail': 'xxx', 'institution': 'xxx', 'country': 'xxx', 'results': results, } submission.update(self.submit_info) pickle.dump(submission, open(os.path.join(self.result_dir, 'submission.pkl'), "wb")) return ret_dict ret_dict = self.ground_eval(annotations, preds) for part in ["scannet", "3rscan", "matterport3d", "arkit"]: ann = [x for x in annotations if part in x["scan_id"]] pred = [y for x,y in zip(annotations, preds) if part in x["scan_id"]] if len(pred) > 0: part_ret_dict= self.ground_eval(ann, pred, part=part) part_ret_dict = {f"{part}--{k}": v for k, v in part_ret_dict.items()} ret_dict.update(part_ret_dict) return ret_dict ================================================ FILE: bip3d/grid_mask.py ================================================ import torch import torch.nn as nn import numpy as np from PIL import Image class Grid(object): def __init__( self, use_h, use_w, rotate=1, offset=False, ratio=0.5, mode=0, prob=1.0 ): self.use_h = use_h self.use_w = use_w self.rotate = rotate self.offset = offset self.ratio = ratio self.mode = mode self.st_prob = prob 
class Grid(object):
    """GridMask augmentation for a single (C, H, W) image tensor.

    Multiplies the image by a grid-shaped binary mask (optionally inverted
    via ``mode=1``) whose period ``d`` and stripe width are randomized per
    call; ``prob`` gates whether the augmentation is applied at all.
    """

    def __init__(
        self, use_h, use_w, rotate=1, offset=False, ratio=0.5, mode=0, prob=1.0
    ):
        self.use_h = use_h
        self.use_w = use_w
        self.rotate = rotate
        self.offset = offset
        self.ratio = ratio
        self.mode = mode
        self.st_prob = prob  # base probability; scaled by set_prob()
        self.prob = prob

    def set_prob(self, epoch, max_epoch):
        # Linearly ramp the apply probability over training.
        self.prob = self.st_prob * epoch / max_epoch

    def __call__(self, img, label):
        if np.random.rand() > self.prob:
            return img, label
        h = img.size(1)
        w = img.size(2)
        self.d1 = 2
        self.d2 = min(h, w)
        # Build an oversized mask so a later rotation still covers the image.
        hh = int(1.5 * h)
        ww = int(1.5 * w)
        d = np.random.randint(self.d1, self.d2)
        if self.ratio == 1:
            self.l = np.random.randint(1, d)
        else:
            self.l = min(max(int(d * self.ratio + 0.5), 1), d - 1)
        mask = np.ones((hh, ww), np.float32)
        st_h = np.random.randint(d)
        st_w = np.random.randint(d)
        if self.use_h:
            for i in range(hh // d):
                s = d * i + st_h
                t = min(s + self.l, hh)
                mask[s:t, :] *= 0
        if self.use_w:
            for i in range(ww // d):
                s = d * i + st_w
                t = min(s + self.l, ww)
                mask[:, s:t] *= 0
        r = np.random.randint(self.rotate)
        mask = Image.fromarray(np.uint8(mask))
        mask = mask.rotate(r)
        mask = np.asarray(mask)
        # Crop the centered (h, w) window out of the oversized mask.
        mask = mask[
            (hh - h) // 2 : (hh - h) // 2 + h,
            (ww - w) // 2 : (ww - w) // 2 + w,
        ]
        mask = torch.from_numpy(mask).float()
        if self.mode == 1:
            mask = 1 - mask
        mask = mask.expand_as(img)
        if self.offset:
            offset = torch.from_numpy(2 * (np.random.rand(h, w) - 0.5)).float()
            offset = (1 - mask) * offset
            img = img * mask + offset
        else:
            img = img * mask
        return img, label


class GridMask(object):
    """Batched GridMask for (N, C, H, W) tensors.

    Same masking scheme as :class:`Grid`, applied to the whole batch with a
    single shared mask; supports blending an externally supplied ``offset``
    tensor into the masked region.
    """

    def __init__(
        self, use_h, use_w, rotate=1, offset=False, ratio=0.5, mode=0, prob=1.0
    ):
        super(GridMask, self).__init__()
        self.use_h = use_h
        self.use_w = use_w
        self.rotate = rotate
        self.offset = offset
        self.ratio = ratio
        self.mode = mode
        self.st_prob = prob
        self.prob = prob

    def set_prob(self, epoch, max_epoch):
        # Linearly ramp the apply probability over training.
        self.prob = self.st_prob * epoch / max_epoch

    def __call__(self, x, offset=None):
        if np.random.rand() > self.prob:
            return x
        n, c, h, w = x.size()
        x = x.view(-1, h, w)
        hh = int(1.5 * h)
        ww = int(1.5 * w)
        # NOTE(review): unlike Grid (which uses min(h, w)), the upper bound
        # here is h only — confirm whether this asymmetry is intentional.
        d = np.random.randint(2, h)
        self.l = min(max(int(d * self.ratio + 0.5), 1), d - 1)
        mask = np.ones((hh, ww), np.float32)
        st_h = np.random.randint(d)
        st_w = np.random.randint(d)
        if self.use_h:
            for i in range(hh // d):
                s = d * i + st_h
                t = min(s + self.l, hh)
                mask[s:t, :] *= 0
        if self.use_w:
            for i in range(ww // d):
                s = d * i + st_w
                t = min(s + self.l, ww)
                mask[:, s:t] *= 0
        r = np.random.randint(self.rotate)
        mask = Image.fromarray(np.uint8(mask))
        mask = mask.rotate(r)
        mask = np.asarray(mask)
        mask = mask[
            (hh - h) // 2 : (hh - h) // 2 + h,
            (ww - w) // 2 : (ww - w) // 2 + w,
        ]
        # FIX: follow the input tensor's device instead of hard-coding
        # .cuda(), which crashed on CPU inputs.
        mask = torch.tensor(mask).float().to(x.device)
        if self.mode == 1:
            mask = 1 - mask
        mask = mask.expand_as(x)
        if offset is not None:
            x = x.view(n, c, h, w)
            mask = mask.view(n, c, h, w)
            x = x * mask + offset * (1 - mask)
            return x
        elif self.offset:
            offset = (
                torch.from_numpy(2 * (np.random.rand(h, w) - 0.5))
                .float()
                .to(x.device)  # FIX: was .cuda()
            )
            x = x * mask + offset * (1 - mask)
        else:
            x = x * mask
        return x.view(n, c, h, w)
""" def get_dn_anchors(self, cls_target, box_target, *args, **kwargs): """ Generate noisy instances for the current frame, with a total of 'self.num_dn_groups' groups. """ return None def update_dn(self, instance_feature, anchor, *args, **kwargs): """ Insert the previously saved 'self.dn_metas' into the noisy instances of the current frame. """ def cache_dn( self, dn_instance_feature, dn_anchor, dn_cls_target, valid_mask, dn_id_target, ): """ Randomly save information for 'self.num_temp_dn_groups' groups of temporal noisy instances to 'self.dn_metas'. """ if self.num_temp_dn_groups < 0: return self.dn_metas = dict(dn_anchor=dn_anchor[:, : self.num_temp_dn_groups]) ================================================ FILE: bip3d/models/bbox3d_decoder.py ================================================ import math from typing import List, Optional, Tuple, Union import numpy as np import torch from torch import nn from torch.cuda.amp.autocast_mode import autocast from mmcv.cnn import Linear, Scale from mmcv.ops import nms3d_normal, nms3d from mmengine.model import BaseModel from mmengine.structures import InstanceData from mmdet.models.layers import SinePositionalEncoding from mmdet.models.layers.transformer.deformable_detr_layers import ( DeformableDetrTransformerEncoder as DDTE, ) from mmdet.models.layers.transformer.utils import MLP from mmdet.utils import reduce_mean from mmdet.models.detectors.glip import create_positive_map_label_to_token from mmdet.models.dense_heads.atss_vlfusion_head import ( convert_grounding_to_cls_scores, ) from bip3d.registry import MODELS, TASK_UTILS from bip3d.structures.bbox_3d.utils import rotation_3d_in_euler from .utils import ( deformable_format, wasserstein_distance, permutation_corner_distance, center_distance, get_positive_map, get_entities, linear_act_ln, ) __all__ = ["BBox3DDecoder"] X, Y, Z, W, L, H, ALPHA, BETA, GAMMA = range(9) def decode_box(box, min_size=None, max_size=None): size = box[..., 3:6].exp() # size = box[..., 3:6] 
if min_size is not None or max_size is not None: size = size.clamp(min=min_size, max=max_size) box = torch.cat( [box[..., :3], size, box[..., 6:]], dim=-1, ) return box @MODELS.register_module() class BBox3DDecoder(BaseModel): def __init__( self, instance_bank: dict, anchor_encoder: dict, graph_model: dict, norm_layer: dict, ffn: dict, deformable_model: dict, refine_layer: dict, num_decoder: int = 6, num_single_frame_decoder: int = -1, temp_graph_model: dict = None, text_cross_attn: dict = None, loss_cls: dict = None, loss_reg: dict = None, post_processor: dict = None, sampler: dict = None, gt_cls_key: str = "gt_labels_3d", gt_reg_key: str = "gt_bboxes_3d", gt_id_key: str = "instance_id", with_instance_id: bool = True, task_prefix: str = "det", reg_weights: List = None, operation_order: Optional[List[str]] = None, cls_threshold_to_reg: float = -1, look_forward_twice: bool = False, init_cfg: dict = None, **kwargs, ): super().__init__(init_cfg) self.num_decoder = num_decoder self.num_single_frame_decoder = num_single_frame_decoder self.gt_cls_key = gt_cls_key self.gt_reg_key = gt_reg_key self.gt_id_key = gt_id_key self.with_instance_id = with_instance_id self.task_prefix = task_prefix self.cls_threshold_to_reg = cls_threshold_to_reg self.look_forward_twice = look_forward_twice if reg_weights is None: self.reg_weights = [1.0] * 9 else: self.reg_weights = reg_weights if operation_order is None: operation_order = [ "gnn", "norm", "text_cross_attn", "norm", "deformable", "norm", "ffn", "norm", "refine", ] * num_decoder self.operation_order = operation_order # =========== build modules =========== def build(cfg, registry=MODELS): if cfg is None: return None return registry.build(cfg) self.instance_bank = build(instance_bank) self.anchor_encoder = build(anchor_encoder) self.sampler = build(sampler, TASK_UTILS) self.post_processor = build(post_processor, TASK_UTILS) self.loss_cls = build(loss_cls) self.loss_reg = build(loss_reg) self.op_config_map = { "temp_gnn": 
temp_graph_model, "gnn": graph_model, "norm": norm_layer, "ffn": ffn, "deformable": deformable_model, "text_cross_attn": text_cross_attn, "refine": refine_layer, } self.layers = nn.ModuleList( [ build(self.op_config_map.get(op, None)) for op in self.operation_order ] ) self.embed_dims = self.instance_bank.embed_dims self.norm = nn.LayerNorm(self.embed_dims) def init_weights(self): from mmengine.model import constant_init for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) for i, op in enumerate(self.operation_order): if op == "refine": m = self.layers[i] constant_init(m.layers[-2], 0, bias=0) constant_init(m.layers[-1], 1) nn.init.constant_(m.layers[-2].bias.data[2:], 0.0) def forward( self, feature_maps, text_dict=None, batch_inputs=None, depth_prob=None, **kwargs, ): batch_size = feature_maps[0].shape[0] feature_maps = list(deformable_format(feature_maps)) # ========= get instance info ============ if ( self.sampler.dn_metas is not None and self.sampler.dn_metas["dn_anchor"].shape[0] != batch_size ): self.sampler.dn_metas = None ( instance_feature, anchor, temp_instance_feature, temp_anchor, time_interval, ) = self.instance_bank.get( batch_size, batch_inputs, dn_metas=self.sampler.dn_metas ) # ========= prepare for denosing training ============ # 1. get dn metas: noisy-anchors and corresponding GT # 2. concat learnable instances and noisy instances # 3. 
get attention mask attn_mask = None dn_metas = None temp_dn_reg_target = None if self.training and hasattr(self.sampler, "get_dn_anchors"): dn_metas = self.sampler.get_dn_anchors( batch_inputs[self.gt_cls_key], batch_inputs[self.gt_reg_key], text_dict=text_dict, label=batch_inputs["gt_labels_3d"], ) if dn_metas is not None: ( dn_anchor, dn_reg_target, dn_cls_target, dn_attn_mask, valid_mask, dn_query, ) = dn_metas num_dn_anchor = dn_anchor.shape[1] if dn_anchor.shape[-1] != anchor.shape[-1]: remain_state_dims = anchor.shape[-1] - dn_anchor.shape[-1] dn_anchor = torch.cat( [ dn_anchor, dn_anchor.new_zeros( batch_size, num_dn_anchor, remain_state_dims ), ], dim=-1, ) anchor = torch.cat([anchor, dn_anchor], dim=1) if dn_query is None: dn_query = instance_feature.new_zeros( batch_size, num_dn_anchor, instance_feature.shape[-1] ), instance_feature = torch.cat( [instance_feature, dn_query], dim=1, ) num_instance = instance_feature.shape[1] num_free_instance = num_instance - num_dn_anchor attn_mask = anchor.new_ones( (num_instance, num_instance), dtype=torch.bool ) attn_mask[:num_free_instance, :num_free_instance] = False attn_mask[num_free_instance:, num_free_instance:] = dn_attn_mask else: num_dn_anchor = None num_free_instance = None anchor_embed = self.anchor_encoder(anchor) if temp_anchor is not None: temp_anchor_embed = self.anchor_encoder(temp_anchor) else: temp_anchor_embed = None # =================== forward the layers ==================== prediction = [] classification = [] quality = [] _anchor = None for i, (op, layer) in enumerate( zip(self.operation_order, self.layers) ): if self.layers[i] is None: continue elif op == "temp_gnn": instance_feature = layer( query=instance_feature, key=temp_instance_feature, value=temp_instance_feature, query_pos=anchor_embed, key_pos=temp_anchor_embed, attn_mask=( attn_mask if temp_instance_feature is None else None ), ) elif op == "gnn": instance_feature = layer( query=instance_feature, key=instance_feature, 
value=instance_feature, query_pos=anchor_embed, key_pos=anchor_embed, attn_mask=attn_mask, ) elif op == "norm" or op == "ffn": instance_feature = layer(instance_feature) elif op == "deformable": instance_feature = layer( instance_feature, anchor, anchor_embed, feature_maps, batch_inputs, depth_prob=depth_prob, ) elif op == "text_cross_attn": text_feature = text_dict["embedded"] instance_feature = layer( query=instance_feature, key=text_feature, value=text_feature, query_pos=anchor_embed, key_padding_mask=~text_dict["text_token_mask"], key_pos=0, ) elif op == "refine": _instance_feature = self.norm(instance_feature) if self.look_forward_twice: if _anchor is None: _anchor = anchor.clone() _anchor, cls, qt = layer( _instance_feature, _anchor, anchor_embed, time_interval=time_interval, text_feature=text_feature, text_token_mask=text_dict["text_token_mask"], ) prediction.append(_anchor) anchor = layer( instance_feature, anchor, anchor_embed, time_interval=time_interval, )[0] anchor_embed = self.anchor_encoder(anchor) _anchor = anchor anchor = anchor.detach() else: anchor, cls, qt = layer( _instance_feature, anchor, anchor_embed, time_interval=time_interval, text_feature=text_feature, text_token_mask=text_dict["text_token_mask"], ) anchor_embed = self.anchor_encoder(anchor) prediction.append(anchor) classification.append(cls) quality.append(qt) if len(prediction) == self.num_single_frame_decoder: instance_feature, anchor = self.instance_bank.update( instance_feature, anchor if _anchor is None else _anchor, cls, num_dn_anchor ) anchor_embed = self.anchor_encoder(anchor) if self.look_forward_twice: _anchor = anchor anchor = anchor.detach() if dn_metas is not None: num_instance = instance_feature.shape[1] attn_mask = anchor.new_ones( (num_instance, num_instance), dtype=torch.bool ) attn_mask[:-num_dn_anchor, :-num_dn_anchor] = False attn_mask[-num_dn_anchor:, -num_dn_anchor:] = dn_attn_mask if ( len(prediction) > self.num_single_frame_decoder and temp_anchor_embed is not 
None ): temp_anchor_embed = anchor_embed[ :, : self.instance_bank.num_temp_instances ] else: raise NotImplementedError(f"{op} is not supported.") output = {} # split predictions of learnable instances and noisy instances if dn_metas is not None: dn_classification = [ x[:, -num_dn_anchor:] for x in classification ] classification = [x[:, :-num_dn_anchor] for x in classification] dn_prediction = [x[:, -num_dn_anchor:] for x in prediction] prediction = [x[:, :-num_dn_anchor] for x in prediction] quality = [ x[:, :-num_dn_anchor] if x is not None else None for x in quality ] output.update( { "dn_prediction": dn_prediction, "dn_classification": dn_classification, "dn_reg_target": dn_reg_target, "dn_cls_target": dn_cls_target, "dn_valid_mask": valid_mask, } ) if temp_dn_reg_target is not None: output.update( { "temp_dn_reg_target": temp_dn_reg_target, "temp_dn_cls_target": temp_dn_cls_target, "temp_dn_valid_mask": temp_valid_mask, # "dn_id_target": dn_id_target, } ) dn_cls_target = temp_dn_cls_target valid_mask = temp_valid_mask dn_instance_feature = instance_feature[:, -num_dn_anchor:] dn_anchor = anchor[:, -num_dn_anchor:] instance_feature = instance_feature[:, :-num_dn_anchor] anchor_embed = anchor_embed[:, :-num_dn_anchor] anchor = anchor[:, :-num_dn_anchor] cls = cls[:, :-num_dn_anchor] output.update( { "classification": classification, "prediction": prediction, "quality": quality, "instance_feature": instance_feature, "anchor_embed": anchor_embed, } ) # cache current instances for temporal modeling self.instance_bank.cache( instance_feature, anchor, cls, batch_inputs, feature_maps ) if self.with_instance_id: instance_id = self.instance_bank.get_instance_id( cls, anchor, self.decoder.score_threshold ) output["instance_id"] = instance_id return output def loss(self, model_outs, data, text_dict=None): # ===================== prediction losses ====================== cls_scores = model_outs["classification"] reg_preds = model_outs["prediction"] quality = 
model_outs["quality"] output = {} for decoder_idx, (cls, reg, qt) in enumerate( zip(cls_scores, reg_preds, quality) ): reg = reg[..., : len(self.reg_weights)] reg = decode_box(reg) cls_target, reg_target, reg_weights, ignore_mask = self.sampler.sample( cls, reg, data[self.gt_cls_key], data[self.gt_reg_key], text_dict=text_dict, ignore_mask=data.get("ignore_mask"), ) reg_target = reg_target[..., : len(self.reg_weights)] reg_target_full = reg_target.clone() mask = torch.logical_not(torch.all(reg_target == 0, dim=-1)) mask = mask.reshape(-1) if ignore_mask is not None: ignore_mask = ~ignore_mask.reshape(-1) mask = torch.logical_and(mask, ignore_mask) ignore_mask = ignore_mask.tile(1, cls.shape[-1]) mask_valid = mask.clone() num_pos = max( reduce_mean(torch.sum(mask).to(dtype=reg.dtype)), 1.0 ) cls = cls.flatten(end_dim=1) cls_target = cls_target.flatten(end_dim=1) token_mask = torch.logical_not(cls.isinf()) cls = cls[token_mask] cls_target = cls_target[token_mask] if ignore_mask is None: cls_loss = self.loss_cls(cls, cls_target, avg_factor=num_pos) else: ignore_mask = ignore_mask[token_mask] cls_loss = self.loss_cls( cls[ignore_mask], cls_target[ignore_mask], avg_factor=num_pos, weight=cls_mask[ignore_mask], ) reg_weights = reg_weights * reg.new_tensor(self.reg_weights) reg_target = reg_target.flatten(end_dim=1)[mask] reg = reg.flatten(end_dim=1)[mask] reg_weights = reg_weights.flatten(end_dim=1)[mask] reg_target = torch.where( reg_target.isnan(), reg.new_tensor(0.0), reg_target ) # cls_target = cls_target[mask] if qt is not None: qt = qt.flatten(end_dim=1)[mask] reg_loss = self.loss_reg( reg, reg_target, weight=reg_weights, avg_factor=num_pos, prefix=f"{self.task_prefix}_", suffix=f"_{decoder_idx}", quality=qt, # cls_target=cls_target, ) output[f"{self.task_prefix}_loss_cls_{decoder_idx}"] = cls_loss output.update(reg_loss) if "dn_prediction" not in model_outs: return output # ===================== denoising losses ====================== dn_cls_scores = 
model_outs["dn_classification"] dn_reg_preds = model_outs["dn_prediction"] ( dn_valid_mask, dn_cls_target, dn_reg_target, dn_pos_mask, reg_weights, num_dn_pos, ) = self.prepare_for_dn_loss(model_outs) for decoder_idx, (cls, reg) in enumerate( zip(dn_cls_scores, dn_reg_preds) ): if ( "temp_dn_valid_mask" in model_outs and decoder_idx == self.num_single_frame_decoder ): ( dn_valid_mask, dn_cls_target, dn_reg_target, dn_pos_mask, reg_weights, num_dn_pos, ) = self.prepare_for_dn_loss(model_outs, prefix="temp_") cls = cls.flatten(end_dim=1)[dn_valid_mask] mask = torch.logical_not(cls.isinf()) cls_loss = self.loss_cls( cls[mask], dn_cls_target[mask], avg_factor=num_dn_pos, ) reg = reg.flatten(end_dim=1)[dn_valid_mask][dn_pos_mask][ ..., : len(self.reg_weights) ] reg = decode_box(reg) reg_loss = self.loss_reg( reg, dn_reg_target, avg_factor=num_dn_pos, weight=reg_weights, prefix=f"{self.task_prefix}_", suffix=f"_dn_{decoder_idx}", ) output[f"{self.task_prefix}_loss_cls_dn_{decoder_idx}"] = cls_loss output.update(reg_loss) return output def prepare_for_dn_loss(self, model_outs, prefix=""): dn_valid_mask = model_outs[f"{prefix}dn_valid_mask"].flatten(end_dim=1) dn_cls_target = model_outs[f"{prefix}dn_cls_target"].flatten( end_dim=1 )[dn_valid_mask] dn_reg_target = model_outs[f"{prefix}dn_reg_target"].flatten( end_dim=1 )[dn_valid_mask][..., : len(self.reg_weights)] dn_pos_mask = dn_cls_target.sum(dim=-1) > 0 dn_reg_target = dn_reg_target[dn_pos_mask] reg_weights = dn_reg_target.new_tensor(self.reg_weights)[None].tile( dn_reg_target.shape[0], 1 ) num_dn_pos = max( reduce_mean(torch.sum(dn_valid_mask).to(dtype=reg_weights.dtype)), 1.0, ) return ( dn_valid_mask, dn_cls_target, dn_reg_target, dn_pos_mask, reg_weights, num_dn_pos, ) def post_process( self, model_outs, text_dict, batch_inputs, batch_data_samples, results_in_data_samples=True, ): results = self.post_processor( model_outs["classification"], model_outs["prediction"], model_outs.get("instance_id"), 
model_outs.get("quality"), text_dict=text_dict, batch_inputs=batch_inputs, ) if results_in_data_samples: for i, ret in enumerate(results): instances = InstanceData() for k, v in ret.items(): if k == "bboxes_3d": type = batch_data_samples[i].metainfo["box_type_3d"] v = type( v, box_dim=v.shape[1], origin=(0.5, 0.5, 0.5), ) instances.__setattr__(k, v) batch_data_samples[i].pred_instances_3d = instances return batch_data_samples return results @MODELS.register_module() class GroundingRefineClsHead(BaseModel): def __init__( self, embed_dims=256, output_dim=9, scale=None, cls_layers=False, cls_bias=True, ): super().__init__() self.embed_dims = embed_dims self.output_dim = output_dim self.refine_state = list(range(output_dim)) self.scale = scale self.layers = nn.Sequential( *linear_act_ln(embed_dims, 2, 2), # MLP(embed_dims, embed_dims, embed_dims, 2), # nn.LayerNorm(self.embed_dims), # MLP(embed_dims, embed_dims, embed_dims, 2), Linear(self.embed_dims, self.output_dim), Scale([1.0] * self.output_dim), ) if cls_layers: self.cls_layers = nn.Sequential( MLP(embed_dims, embed_dims, embed_dims, 2), nn.LayerNorm(self.embed_dims), ) else: self.cls_layers = nn.Identity() if cls_bias: bias_value = -math.log((1 - 0.01) / 0.01) self.bias = nn.Parameter( torch.Tensor([bias_value]), requires_grad=True ) else: self.bias = None def forward( self, instance_feature: torch.Tensor, anchor: torch.Tensor = None, anchor_embed: torch.Tensor = None, time_interval: torch.Tensor = 1.0, text_feature=None, text_token_mask=None, **kwargs, ): if anchor_embed is not None: feature = instance_feature + anchor_embed else: feature = instance_feature output = self.layers(feature) if self.scale is not None: output = output * output.new_tensor(self.scale) if anchor is not None: output = output + anchor if text_feature is not None: cls = self.cls_layers( instance_feature) @ text_feature.transpose(-1, -2) cls = cls / math.sqrt(instance_feature.shape[-1]) if self.bias is not None: cls = cls + self.bias if 
text_token_mask is not None: cls.masked_fill_(~text_token_mask[:, None, :], float("-inf")) else: cls = None return output, cls, None @MODELS.register_module() class DoF9BoxLoss(nn.Module): def __init__( self, loss_weight_wd=1.0, loss_weight_pcd=0.0, loss_weight_cd=0.8, decode_pred=False, ): super().__init__() self.loss_weight_wd = loss_weight_wd self.loss_weight_pcd = loss_weight_pcd self.loss_weight_cd = loss_weight_cd self.decode_pred = decode_pred def forward( self, box, box_target, weight=None, avg_factor=None, prefix="", suffix="", **kwargs, ): if box_target.shape[0] == 0: loss = box.sum() * 0 return {f"{prefix}loss_box{suffix}": loss} if self.decode_pred: box = decode_box(box) loss = 0 if self.loss_weight_wd > 0: loss += self.loss_weight_wd * wasserstein_distance(box, box_target) if self.loss_weight_pcd > 0: loss += self.loss_weight_pcd * permutation_corner_distance( box, box_target ) if self.loss_weight_cd > 0: loss += self.loss_weight_cd * center_distance(box, box_target) if avg_factor is None: loss = loss.mean() else: loss = loss.sum() / avg_factor output = {f"{prefix}loss_box{suffix}": loss} return output @TASK_UTILS.register_module() class GroundingBox3DPostProcess: def __init__( self, num_output: int = 300, score_threshold: Optional[float] = None, sorted: bool = True, ): super(GroundingBox3DPostProcess, self).__init__() self.num_output = num_output self.score_threshold = score_threshold self.sorted = sorted def __call__( self, cls_scores, box_preds, instance_id=None, quality=None, output_idx=-1, text_dict=None, batch_inputs=None, ): cls_scores = cls_scores[output_idx].sigmoid() if "tokens_positive" in batch_inputs: tokens_positive_maps = get_positive_map( batch_inputs["tokens_positive"], text_dict, ) label_to_token = [ create_positive_map_label_to_token(x, plus=1) for x in tokens_positive_maps ] cls_scores = convert_grounding_to_cls_scores( cls_scores, label_to_token ) entities = get_entities( batch_inputs["text"], batch_inputs["tokens_positive"], ) 
else: cls_scores, _ = cls_scores.max(dim=-1, keepdim=True) entities = batch_inputs["text"] # if squeeze_cls: # cls_scores, cls_ids = cls_scores.max(dim=-1) # cls_scores = cls_scores.unsqueeze(dim=-1) box_preds = box_preds[output_idx] bs, num_pred, num_cls = cls_scores.shape num_output = min(self.num_output, num_pred*num_cls) cls_scores, indices = cls_scores.flatten(start_dim=1).topk( num_output, dim=1, sorted=self.sorted ) # if not squeeze_cls: cls_ids = indices % num_cls if self.score_threshold is not None: mask = cls_scores >= self.score_threshold if quality[output_idx] is None: quality = None if quality is not None: centerness = quality[output_idx][..., CNS] centerness = torch.gather(centerness, 1, indices // num_cls) cls_scores_origin = cls_scores.clone() cls_scores *= centerness.sigmoid() cls_scores, idx = torch.sort(cls_scores, dim=1, descending=True) # if not squeeze_cls: cls_ids = torch.gather(cls_ids, 1, idx) if self.score_threshold is not None: mask = torch.gather(mask, 1, idx) indices = torch.gather(indices, 1, idx) output = [] for i in range(bs): category_ids = cls_ids[i] # if squeeze_cls: # category_ids = category_ids[indices[i]] scores = cls_scores[i] box = box_preds[i, indices[i] // num_cls] if self.score_threshold is not None: category_ids = category_ids[mask[i]] scores = scores[mask[i]] box = box[mask[i]] # nms_idx = nms3d( # box[..., :7], # scores, # iou_threshold=0.4 # ) # box = box[nms_idx] # scores = scores[nms_idx] # category_ids = category_ids[nms_idx] if quality is not None: scores_origin = cls_scores_origin[i] if self.score_threshold is not None: scores_origin = scores_origin[mask[i]] box = decode_box(box, 0.1, 20) category_ids = category_ids.cpu() label_names = [] for id in category_ids.tolist(): if isinstance(entities[i], (tuple, list)): label_names.append(entities[i][id]) else: label_names.append(entities[i]) output.append( { "bboxes_3d": box.cpu(), "scores_3d": scores.cpu(), "labels_3d": category_ids, "target_scores_3d": scores.cpu(), 
"label_names": label_names, } ) if quality is not None: output[-1]["cls_scores"] = scores_origin.cpu() if instance_id is not None: ids = instance_id[i, indices[i]] if self.score_threshold is not None: ids = ids[mask[i]] output[-1]["instance_ids"] = ids return output @MODELS.register_module() class DoF9BoxEncoder(nn.Module): def __init__( self, embed_dims, rot_dims=3, output_fc=True, in_loops=1, out_loops=2, ): super().__init__() self.embed_dims = embed_dims def embedding_layer(input_dims, output_dims): return nn.Sequential( *linear_act_ln(output_dims, in_loops, out_loops, input_dims) ) if not isinstance(embed_dims, (list, tuple)): embed_dims = [embed_dims] * 5 self.pos_fc = embedding_layer(3, embed_dims[0]) self.size_fc = embedding_layer(3, embed_dims[1]) self.yaw_fc = embedding_layer(rot_dims, embed_dims[2]) self.rot_dims = rot_dims if output_fc: self.output_fc = embedding_layer(embed_dims[-1], embed_dims[-1]) else: self.output_fc = None def forward(self, box_3d: torch.Tensor): pos_feat = self.pos_fc(box_3d[..., [X, Y, Z]]) if box_3d.shape[-1] == 3: return pos_feat size_feat = self.size_fc(box_3d[..., [W, L, H]]) yaw_feat = self.yaw_fc(box_3d[..., ALPHA : ALPHA + self.rot_dims]) output = pos_feat + size_feat + yaw_feat if self.output_fc is not None: output = self.output_fc(output) return output @MODELS.register_module() class SparseBox3DKeyPointsGenerator(nn.Module): def __init__( self, embed_dims=256, num_learnable_pts=0, fix_scale=None, ): super(SparseBox3DKeyPointsGenerator, self).__init__() self.embed_dims = embed_dims self.num_learnable_pts = num_learnable_pts if fix_scale is None: fix_scale = ((0.0, 0.0, 0.0),) self.fix_scale = torch.tensor(fix_scale) self.num_pts = len(self.fix_scale) + num_learnable_pts if num_learnable_pts > 0: self.learnable_fc = Linear(self.embed_dims, num_learnable_pts * 3) def forward( self, anchor, instance_feature=None, T_cur2temp_list=None, cur_timestamp=None, temp_timestamps=None, ): bs, num_anchor = anchor.shape[:2] size = 
    def forward(
        self,
        anchor,
        instance_feature=None,
        T_cur2temp_list=None,
        cur_timestamp=None,
        temp_timestamps=None,
    ):
        # Generate per-anchor 3D key points: fixed scale offsets (plus
        # optional learnable offsets predicted from instance_feature),
        # scaled by box size, rotated by the anchor's Euler angles, and
        # translated to the anchor center.
        bs, num_anchor = anchor.shape[:2]
        # Size channels are stored as logs; exp() recovers metric sizes.
        size = anchor[..., None, [W, L, H]].exp()
        key_points = self.fix_scale.to(anchor) * size
        if self.num_learnable_pts > 0 and instance_feature is not None:
            # Learnable offsets in [-0.5, 0.5] relative to the box size.
            learnable_scale = (
                self.learnable_fc(instance_feature)
                .reshape(bs, num_anchor, self.num_learnable_pts, 3)
                .sigmoid()
                - 0.5
            )
            key_points = torch.cat(
                [key_points, learnable_scale * size], dim=-2
            )
        key_points = rotation_3d_in_euler(
            key_points.flatten(0, 1),
            anchor[..., [ALPHA, BETA, GAMMA]].flatten(0, 1),
        ).unflatten(0, (bs, num_anchor))
        key_points = key_points + anchor[..., None, [X, Y, Z]]
        # Without temporal info, return only the current-frame key points.
        if (
            cur_timestamp is None
            or temp_timestamps is None
            or len(temp_timestamps) == 0
        ) and T_cur2temp_list is None:
            return key_points
        temp_key_points_list = []
        # NOTE(review): VX is not defined in this module (constants only go
        # up to GAMMA); this temporal path would raise a NameError —
        # presumably inherited from a velocity-carrying anchor layout.
        velocity = anchor[..., VX:]
        for i, t_time in enumerate(temp_timestamps):
            time_interval = cur_timestamp - t_time
            # Ego/object motion compensation: move points back in time.
            translation = (
                velocity
                * time_interval.to(dtype=velocity.dtype)[:, None, None]
            )
            temp_key_points = key_points - translation[:, :, None]
            if T_cur2temp_list is not None:
                # Transform into the temporal frame's coordinate system.
                T_cur2temp = T_cur2temp_list[i].to(dtype=key_points.dtype)
                temp_key_points = T_cur2temp[:, None, None, :3] @ torch.cat(
                    [
                        temp_key_points,
                        torch.ones_like(temp_key_points[..., :1]),
                    ],
                    dim=-1,
                ).unsqueeze(-1)
                temp_key_points = temp_key_points.squeeze(-1)
            temp_key_points_list.append(temp_key_points)
        return key_points, temp_key_points_list

    @staticmethod
    def anchor_projection(
        anchor,
        T_src2dst_list,
        src_timestamp=None,
        dst_timestamps=None,
        time_intervals=None,
    ):
        # Project anchors from the source frame into each destination frame,
        # compensating motion via the velocity channels and applying the
        # rigid transform to center, yaw, and velocity.
        dst_anchors = []
        for i in range(len(T_src2dst_list)):
            # NOTE(review): VX / COS_YAW / SIN_YAW are not defined in this
            # module — this method appears to assume a different (yaw +
            # velocity) anchor layout than the 9-DoF Euler boxes used here;
            # confirm before use.
            vel = anchor[..., VX:]
            vel_dim = vel.shape[-1]
            T_src2dst = torch.unsqueeze(
                T_src2dst_list[i].to(dtype=anchor.dtype), dim=1
            )
            center = anchor[..., [X, Y, Z]]
            if time_intervals is not None:
                time_interval = time_intervals[i]
            elif src_timestamp is not None and dst_timestamps is not None:
                time_interval = (src_timestamp - dst_timestamps[i]).to(
                    dtype=vel.dtype
                )
            else:
                time_interval = None
            if time_interval is not None:
                translation = vel.transpose(0, -1) * time_interval
                translation = translation.transpose(0, -1)
                center = center - translation
            center = (
                torch.matmul(
                    T_src2dst[..., :3, :3], center[..., None]
                ).squeeze(dim=-1)
                + T_src2dst[..., :3, 3]
            )
            size = anchor[..., [W, L, H]]
            # Rotate the yaw encoding by the planar part of the transform.
            yaw = torch.matmul(
                T_src2dst[..., :2, :2],
                anchor[..., [COS_YAW, SIN_YAW], None],
            ).squeeze(-1)
            yaw = yaw[..., [1, 0]]
            vel = torch.matmul(
                T_src2dst[..., :vel_dim, :vel_dim], vel[..., None]
            ).squeeze(-1)
            dst_anchor = torch.cat([center, size, yaw, vel], dim=-1)
            dst_anchors.append(dst_anchor)
        return dst_anchors

    @staticmethod
    def distance(anchor):
        # Planar (x, y) distance of each anchor from the origin.
        return torch.norm(anchor[..., :2], p=2, dim=-1)
0 for normal tokens special_tokens_mask = torch.zeros( (bs, num_token), device=input_ids.device ).bool() for special_token in special_tokens_list: special_tokens_mask |= input_ids == special_token # idxs: each row is a list of indices of special tokens idxs = torch.nonzero(special_tokens_mask) # generate attention mask and positional ids attention_mask = ( torch.eye(num_token, device=input_ids.device) .bool() .unsqueeze(0) .repeat(bs, 1, 1) ) position_ids = torch.zeros((bs, num_token), device=input_ids.device) previous_col = 0 for i in range(idxs.shape[0]): row, col = idxs[i] if col == 0: # if (col == 0) or (col == num_token - 1): attention_mask[row, col, col] = True position_ids[row, col] = 0 else: # attention_mask[row, previous_col + 1:col, # previous_col + 1:col] = True # position_ids[row, previous_col + 1:col] = torch.arange( # 0, col - previous_col - 1, device=input_ids.device) attention_mask[ row, previous_col + 1 : col + 1, previous_col + 1 : col + 1 ] = True position_ids[row, previous_col + 1 : col + 1] = torch.arange( 0, col - previous_col, device=input_ids.device ) previous_col = col return attention_mask, position_ids.to(torch.long) @MODELS.register_module() class BertModel(BaseModel): """BERT model for language embedding only encoder. Args: name (str, optional): name of the pretrained BERT model from HuggingFace. Defaults to bert-base-uncased. max_tokens (int, optional): maximum number of tokens to be used for BERT. Defaults to 256. pad_to_max (bool, optional): whether to pad the tokens to max_tokens. Defaults to True. use_sub_sentence_represent (bool, optional): whether to use sub sentence represent introduced in `Grounding DINO `. Defaults to False. special_tokens_list (list, optional): special tokens used to split subsentence. It cannot be None when `use_sub_sentence_represent` is True. Defaults to None. add_pooling_layer (bool, optional): whether to adding pooling layer in bert encoder. Defaults to False. 
@MODELS.register_module()
class BertModel(BaseModel):
    """BERT language-embedding encoder (text branch only).

    Args:
        name (str, optional): name of the pretrained BERT model from
            HuggingFace. Defaults to bert-base-uncased.
        max_tokens (int, optional): maximum number of tokens to be
            used for BERT. Defaults to 256.
        pad_to_max (bool, optional): whether to pad the tokens to max_tokens.
            Defaults to True.
        use_sub_sentence_represent (bool, optional): whether to use sub
            sentence represent introduced in `Grounding DINO`.
            Defaults to False.
        special_tokens_list (list, optional): special tokens used to split
            subsentence. It cannot be None when `use_sub_sentence_represent`
            is True. Defaults to None.
        add_pooling_layer (bool, optional): whether to add a pooling layer
            in the bert encoder. Defaults to False.
        num_layers_of_embedded (int, optional): number of hidden layers
            averaged into the embedding. Defaults to 1.
        use_checkpoint (bool, optional): whether to use gradient
            checkpointing. Defaults to False.
        return_tokenized (bool, optional): whether to also return the raw
            tokenizer output in the result dict. Defaults to False.
    """

    def __init__(
        self,
        name: str = "bert-base-uncased",
        max_tokens: int = 256,
        pad_to_max: bool = True,
        use_sub_sentence_represent: bool = False,
        special_tokens_list: list = None,
        add_pooling_layer: bool = False,
        num_layers_of_embedded: int = 1,
        use_checkpoint: bool = False,
        return_tokenized: bool = False,
        **kwargs
    ) -> None:
        super().__init__(**kwargs)
        self.max_tokens = max_tokens
        self.pad_to_max = pad_to_max
        self.return_tokenized = return_tokenized

        # `AutoTokenizer` is None when the optional `transformers`
        # dependency failed to import at module level.
        if AutoTokenizer is None:
            raise RuntimeError(
                "transformers is not installed, please install it by: "
                "pip install transformers."
            )

        self.tokenizer = AutoTokenizer.from_pretrained(name)
        # Wrapped in a named Sequential so checkpoints keep the
        # "language_backbone.body.*" key prefix.
        self.language_backbone = nn.Sequential(
            OrderedDict(
                [
                    (
                        "body",
                        BertEncoder(
                            name,
                            add_pooling_layer=add_pooling_layer,
                            num_layers_of_embedded=num_layers_of_embedded,
                            use_checkpoint=use_checkpoint,
                        ),
                    )
                ]
            )
        )

        self.use_sub_sentence_represent = use_sub_sentence_represent
        if self.use_sub_sentence_represent:
            assert (
                special_tokens_list is not None
            ), "special_tokens should not be None \
                if use_sub_sentence_represent is True"
            # Map the configured special-token strings to vocabulary ids.
            self.special_tokens = self.tokenizer.convert_tokens_to_ids(
                special_tokens_list
            )

    def forward(self, captions: Sequence[str], **kwargs) -> dict:
        """Tokenize ``captions`` and encode them with the BERT backbone.

        Returns:
            dict: output of :class:`BertEncoder` (keys ``embedded``,
            ``masks``, ``hidden``); when ``use_sub_sentence_represent`` is
            True, ``position_ids`` and ``text_token_mask`` are added, and
            when ``return_tokenized`` is True the raw tokenizer output is
            attached under ``tokenized``.
        """
        device = next(self.language_backbone.parameters()).device
        tokenized = self.tokenizer.batch_encode_plus(
            captions,
            max_length=self.max_tokens,
            padding="max_length" if self.pad_to_max else "longest",
            return_special_tokens_mask=True,
            return_tensors="pt",
            truncation=True,
        ).to(device)
        input_ids = tokenized.input_ids
        if self.use_sub_sentence_represent:
            # Restrict attention to within sub-sentences delimited by the
            # special tokens, and restart position ids in each sub-sentence.
            attention_mask, position_ids = (
                generate_masks_with_special_tokens_and_transfer_map(
                    tokenized, self.special_tokens
                )
            )
            token_type_ids = tokenized["token_type_ids"]
        else:
            attention_mask = tokenized.attention_mask
            position_ids = None
            token_type_ids = None

        tokenizer_input = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "position_ids": position_ids,
            "token_type_ids": token_type_ids,
        }
        language_dict_features = self.language_backbone(tokenizer_input)
        if self.use_sub_sentence_represent:
            language_dict_features["position_ids"] = position_ids
            # Padding mask from the tokenizer (not the sub-sentence mask).
            language_dict_features["text_token_mask"] = (
                tokenized.attention_mask.bool()
            )
        if self.return_tokenized:
            language_dict_features["tokenized"] = tokenized
        return language_dict_features
) config = BertConfig.from_pretrained(name) config.gradient_checkpointing = use_checkpoint # only encoder self.model = HFBertModel.from_pretrained( name, add_pooling_layer=add_pooling_layer, config=config ) self.language_dim = config.hidden_size self.num_layers_of_embedded = num_layers_of_embedded def forward(self, x) -> dict: mask = x["attention_mask"] outputs = self.model( input_ids=x["input_ids"], attention_mask=mask, position_ids=x["position_ids"], token_type_ids=x["token_type_ids"], output_hidden_states=True, ) # outputs has 13 layers, 1 input layer and 12 hidden layers encoded_layers = outputs.hidden_states[1:] features = torch.stack( encoded_layers[-self.num_layers_of_embedded :], 1 ).mean(1) # language embedding has shape [len(phrase), seq_len, language_dim] features = features / self.num_layers_of_embedded if mask.dim() == 2: embedded = features * mask.unsqueeze(-1).float() else: embedded = features results = { "embedded": embedded, "masks": mask, "hidden": encoded_layers[-1], } return results ================================================ FILE: bip3d/models/data_preprocessors/__init__.py ================================================ from .custom_data_preprocessor import CustomDet3DDataPreprocessor __all__ = ["CustomDet3DDataPreprocessor"] ================================================ FILE: bip3d/models/data_preprocessors/custom_data_preprocessor.py ================================================ import math from numbers import Number from typing import Dict, List, Optional, Sequence, Tuple, Union import numpy as np import torch from mmdet.models import DetDataPreprocessor from mmdet.models.utils.misc import samplelist_boxtype2tensor from mmengine.model import stack_batch from mmengine.structures import InstanceData from mmengine.utils import is_seq_of from torch import Tensor from torch.nn import functional as F from bip3d.registry import MODELS from bip3d.utils.typing_config import ConfigType, SampleList from bip3d.structures.bbox_3d import 
@MODELS.register_module()
class CustomDet3DDataPreprocessor(DetDataPreprocessor):
    """Points / Image pre-processor for point clouds / vision-only /
    multi-modality 3D detection tasks.

    It collates dataloader samples and moves them to the target device,
    then for images: pads to the batch maximum (divisible by
    ``pad_size_divisor``) with ``pad_value``, stacks into a batch tensor,
    optionally converts BGR<->RGB, normalizes with ``mean``/``std`` and
    applies batch augmentations during training. For point clouds it
    either returns the list of clouds directly or voxelizes them
    according to ``voxel_type``.

    Args:
        voxel (bool): Whether to apply voxelization to the point cloud.
            Defaults to False.
        voxel_type (str): Voxelization type, 'hard' or 'dynamic'.
            Defaults to 'hard'.
        voxel_layer (dict or :obj:`ConfigDict`, optional): Voxelization
            layer config. Defaults to None.
        batch_first (bool): Whether to put the batch dimension first when
            getting voxel coordinates. Defaults to True.
        max_voxels (int, optional): Maximum number of voxels per voxel
            grid. Defaults to None.
        mean (Sequence[Number], optional): Pixel mean of R, G, B channels.
            Defaults to None.
        std (Sequence[Number], optional): Pixel std of R, G, B channels.
            Defaults to None.
        pad_size_divisor (int): Padded image sizes are made divisible by
            this value. Defaults to 1.
        pad_value (float or int): Padding pixel value. Defaults to 0.
        pad_mask (bool): Whether to pad instance masks. Defaults to False.
        mask_pad_value (int): Padding value for instance masks.
            Defaults to 0.
        pad_seg (bool): Whether to pad semantic segmentation maps.
            Defaults to False.
        seg_pad_value (int): Padding value for segmentation maps.
            Defaults to 255.
        bgr_to_rgb (bool): Convert images from BGR to RGB.
            Defaults to False.
        rgb_to_bgr (bool): Convert images from RGB to BGR.
            Defaults to False.
        boxtype2tensor (bool): Whether to convert ``BaseBoxes`` bboxes to
            ``Tensor``. Defaults to True.
        non_blocking (bool): Whether to block the current process when
            transferring data to device. Defaults to False.
        batch_augments (List[dict], optional): Batch-level augmentations.
            Defaults to None.
        batchwise_inputs (bool): Pack the input as a batch of samples with
            1-N frames for the continuous 3D perception setting.
            Defaults to False.

    Raises:
        ImportError: if ``voxel=True`` but the voxelization op is not
            available in this module.
    """

    def __init__(
        self,
        voxel: bool = False,
        voxel_type: str = "hard",
        voxel_layer: Optional[ConfigType] = None,
        batch_first: bool = True,
        max_voxels: Optional[int] = None,
        mean: Sequence[Number] = None,
        std: Sequence[Number] = None,
        pad_size_divisor: int = 1,
        pad_value: Union[float, int] = 0,
        pad_mask: bool = False,
        mask_pad_value: int = 0,
        pad_seg: bool = False,
        seg_pad_value: int = 255,
        bgr_to_rgb: bool = False,
        rgb_to_bgr: bool = False,
        boxtype2tensor: bool = True,
        non_blocking: bool = False,
        batch_augments: Optional[List[dict]] = None,
        batchwise_inputs: bool = False,
    ):
        super().__init__(
            mean=mean,
            std=std,
            pad_size_divisor=pad_size_divisor,
            pad_value=pad_value,
            pad_mask=pad_mask,
            mask_pad_value=mask_pad_value,
            pad_seg=pad_seg,
            seg_pad_value=seg_pad_value,
            bgr_to_rgb=bgr_to_rgb,
            rgb_to_bgr=rgb_to_bgr,
            boxtype2tensor=boxtype2tensor,
            non_blocking=non_blocking,
            batch_augments=batch_augments,
        )
        self.voxel = voxel
        self.voxel_type = voxel_type
        self.batch_first = batch_first
        self.max_voxels = max_voxels
        self.batchwise_inputs = batchwise_inputs
        if voxel:
            # Bug fix: `VoxelizationByGridShape` is never imported in this
            # module, so voxel=True previously crashed with an opaque
            # NameError; surface an actionable error instead while keeping
            # the original call path for environments that provide the op.
            try:
                self.voxel_layer = VoxelizationByGridShape(**voxel_layer)
            except NameError as err:
                raise ImportError(
                    "voxel=True requires `VoxelizationByGridShape` "
                    "(e.g. from mmdet3d), which is not available in "
                    "this module."
                ) from err
    def forward(self, data: Union[dict, List[dict]], training: bool = False):
        """Perform normalization, padding and bgr2rgb conversion based on
        ``BaseDataPreprocessor``.

        Args:
            data (dict or List[dict]): Data from dataloader. The dict
                contains the whole batch data; when it is a list[dict], the
                list indicates test time augmentation.
            training (bool): Whether to enable training time augmentation.
                Defaults to False.

        Returns:
            dict or List[dict]: Data in the same format as the model input.
        """
        if isinstance(data, list):
            # Test-time augmentation: process each augmented batch
            # independently and return the list of results.
            num_augs = len(data)
            aug_batch_data = []
            for aug_id in range(num_augs):
                single_aug_batch_data = self.simple_process(
                    data[aug_id], training
                )
                aug_batch_data.append(single_aug_batch_data)
            return aug_batch_data
        else:
            return self.simple_process(data, training)

    def simple_process(self, data: dict, training: bool = False):
        """Process one (non-TTA) batch: collate, normalize/pad images,
        gather camera parameters and ground-truth fields into the
        ``inputs`` dict expected by the model.

        Args:
            data (dict): Data sampled from dataloader.
            training (bool): Whether to enable training time augmentation.
                Defaults to False.

        Returns:
            dict: ``{"inputs": ..., "data_samples": ...}`` in the format
            the model consumes.
        """
        if "img" in data["inputs"]:
            # Pad shapes must be computed before collate_data pads the
            # images, from the original per-image sizes.
            batch_pad_shape = self._get_pad_shape(data)

        if self.batchwise_inputs:
            # Continuous-perception mode: the single data_sample carries
            # lists of per-frame annotations; expand it into one
            # data_sample per frame.
            data_samples = data["data_samples"]
            batchwise_data_samples = []
            if "bboxes_3d" in data_samples[0].gt_instances_3d:
                assert isinstance(
                    data_samples[0].gt_instances_3d.labels_3d, list
                )
                bboxes_3d = data_samples[0].gt_instances_3d.bboxes_3d
                labels_3d = data_samples[0].gt_instances_3d.labels_3d
            if "gt_occupancy_masks" in data_samples[0]:
                # Clone so per-frame samples do not alias the same masks.
                gt_occupancy_masks = [
                    mask.clone()
                    for mask in data_samples[0].gt_occupancy_masks
                ]
            if (
                "eval_ann_info" in data_samples[0]
                and data_samples[0].eval_ann_info is not None
            ):
                eval_ann_info = data_samples[0].eval_ann_info
            for idx in range(len(labels_3d)):
                data_sample = data_samples[0].clone()
                if "bboxes_3d" in data_sample.gt_instances_3d:
                    data_sample.gt_instances_3d = InstanceData()
                    data_sample.gt_instances_3d.bboxes_3d = bboxes_3d[idx]
                    data_sample.gt_instances_3d.labels_3d = labels_3d[idx]
                if "gt_occupancy_masks" in data_sample:
                    data_sample.gt_occupancy_masks = gt_occupancy_masks[idx]
                if "eval_ann_info" in data_sample:
                    if data_sample.eval_ann_info is not None:
                        data_sample.eval_ann_info = dict()
                        data_sample.eval_ann_info["gt_bboxes_3d"] = (
                            eval_ann_info["gt_bboxes_3d"][idx]
                        )
                        data_sample.eval_ann_info["gt_labels_3d"] = (
                            eval_ann_info["gt_labels_3d"][idx]
                        )
                batchwise_data_samples.append(data_sample)
            data["data_samples"] = batchwise_data_samples

        data = self.collate_data(data)
        inputs, data_samples = data["inputs"], data["data_samples"]
        batch_inputs = dict()
        # Camera projection / intrinsic / extrinsic matrices per sample.
        batch_inputs.update(self.process_camera_params(data_samples))

        for key in ["depth_img", "depth_prob_gt"]:
            if key not in inputs:
                continue
            batch_inputs[key] = torch.stack(inputs[key])
        # Per-sample metadata forwarded to the model as plain lists.
        for key in ["text", "scan_id", "tokens_positive", "ignore_mask"]:
            if not hasattr(data_samples[0], key):
                continue
            batch_inputs[key] = [getattr(x, key) for x in data_samples]
        if hasattr(data_samples[0], "gt_instances_3d"):
            batch_inputs["gt_bboxes_3d"] = [
                x.gt_instances_3d.bboxes_3d for x in data_samples
            ]
            batch_inputs["gt_labels_3d"] = [
                x.gt_instances_3d.labels_3d for x in data_samples
            ]

        if "imgs" in inputs:
            imgs = inputs["imgs"]
            if data_samples is not None:
                # NOTE the batched image size information may be useful,
                # e.g. in DETR, this is needed for the construction of
                # masks, which is then used for the transformer_head.
                batch_input_shape = tuple(imgs[0].size()[-2:])
                for data_sample, pad_shape in zip(
                    data_samples, batch_pad_shape
                ):
                    data_sample.set_metainfo(
                        {
                            "batch_input_shape": batch_input_shape,
                            "pad_shape": pad_shape,
                        }
                    )
                if self.boxtype2tensor:
                    samplelist_boxtype2tensor(data_samples)
                if self.pad_mask:
                    self.pad_gt_masks(data_samples)
                if self.pad_seg:
                    self.pad_gt_sem_seg(data_samples)
            if training and self.batch_augments is not None:
                for batch_aug in self.batch_augments:
                    imgs, data_samples = batch_aug(imgs, data_samples)
            batch_inputs["imgs"] = imgs

        return {"inputs": batch_inputs, "data_samples": data_samples}
(proj_mat.shape[0], 1, 1)) ) proj_mat = trans_mat @ proj_mat projection_mat.append(proj_mat) image_wh.append(data_sample.metainfo["img_shape"][:2]) to_tensor = lambda x: torch.from_numpy(x).cuda().to(torch.float32) projection_mat = to_tensor(np.stack(projection_mat)) image_wh = to_tensor(np.array(image_wh)) image_wh = image_wh[:, None].tile(1, projection_mat.shape[1], 1) extrinsic = to_tensor(np.stack(extrinsic_list)) intrinsic = to_tensor(np.stack(intrinsic_list)) return { "projection_mat": projection_mat, "image_wh": image_wh, "extrinsic": extrinsic, "intrinsic": intrinsic, } def preprocess_img(self, _batch_img: Tensor): # channel transform if self._channel_conversion: _batch_img = _batch_img[[2, 1, 0], ...] # Convert to float after channel conversion to ensure # efficiency _batch_img = _batch_img.float() # Normalization. if self._enable_normalize: if self.mean.shape[0] == 3: assert _batch_img.dim() == 3 and _batch_img.shape[0] == 3, ( "If the mean has 3 values, the input tensor " "should in shape of (3, H, W), but got the " f"tensor with shape {_batch_img.shape}" ) _batch_img = (_batch_img - self.mean) / self.std return _batch_img def collate_data(self, data: dict): """Copy data to the target device and perform normalization, padding and bgr2rgb conversion and stack based on ``BaseDataPreprocessor``. Collates the data sampled from dataloader into a list of dict and list of labels, and then copies tensor to the target device. Args: data (dict): Data sampled from dataloader. Returns: dict: Data in the same format as the model input. """ data = self.cast_data(data) # type: ignore if "img" in data["inputs"]: _batch_imgs = data["inputs"]["img"] # Process data with `pseudo_collate`. 
if is_seq_of(_batch_imgs, torch.Tensor): batch_imgs = [] img_dim = _batch_imgs[0].dim() for _batch_img in _batch_imgs: if img_dim == 3: # standard img _batch_img = self.preprocess_img(_batch_img) elif img_dim == 4: _batch_img = [ self.preprocess_img(_img) for _img in _batch_img ] _batch_img = torch.stack(_batch_img, dim=0) batch_imgs.append(_batch_img) # Pad and stack Tensor. if img_dim == 3: batch_imgs = stack_batch( batch_imgs, self.pad_size_divisor, self.pad_value ) elif img_dim == 4: batch_imgs = multiview_img_stack_batch( batch_imgs, self.pad_size_divisor, self.pad_value ) # Process data with `default_collate`. elif isinstance(_batch_imgs, torch.Tensor): assert _batch_imgs.dim() == 4, ( "The input of `ImgDataPreprocessor` should be a NCHW " "tensor or a list of tensor, but got a tensor with " f"shape: {_batch_imgs.shape}" ) if self._channel_conversion: _batch_imgs = _batch_imgs[:, [2, 1, 0], ...] # Convert to float after channel conversion to ensure # efficiency _batch_imgs = _batch_imgs.float() if self._enable_normalize: _batch_imgs = (_batch_imgs - self.mean) / self.std h, w = _batch_imgs.shape[2:] target_h = ( math.ceil(h / self.pad_size_divisor) * self.pad_size_divisor ) target_w = ( math.ceil(w / self.pad_size_divisor) * self.pad_size_divisor ) pad_h = target_h - h pad_w = target_w - w batch_imgs = F.pad( _batch_imgs, (0, pad_w, 0, pad_h), "constant", self.pad_value, ) else: raise TypeError( "Output of `cast_data` should be a list of dict " "or a tuple with inputs and data_samples, but got " f"{type(data)}: {data}" ) data["inputs"]["imgs"] = batch_imgs data.setdefault("data_samples", None) return data def _get_pad_shape(self, data: dict): """Get the pad_shape of each image based on data and pad_size_divisor.""" # rewrite `_get_pad_shape` for obtaining image inputs. _batch_inputs = data["inputs"]["img"] # Process data with `pseudo_collate`. 
if is_seq_of(_batch_inputs, torch.Tensor): batch_pad_shape = [] for ori_input in _batch_inputs: if ori_input.dim() == 4: # mean multiview input, select one of the # image to calculate the pad shape ori_input = ori_input[0] pad_h = ( int(np.ceil(ori_input.shape[1] / self.pad_size_divisor)) * self.pad_size_divisor ) pad_w = ( int(np.ceil(ori_input.shape[2] / self.pad_size_divisor)) * self.pad_size_divisor ) batch_pad_shape.append((pad_h, pad_w)) # Process data with `default_collate`. elif isinstance(_batch_inputs, torch.Tensor): assert _batch_inputs.dim() == 4, ( "The input of `ImgDataPreprocessor` should be a NCHW tensor " "or a list of tensor, but got a tensor with shape: " f"{_batch_inputs.shape}" ) pad_h = ( int(np.ceil(_batch_inputs.shape[1] / self.pad_size_divisor)) * self.pad_size_divisor ) pad_w = ( int(np.ceil(_batch_inputs.shape[2] / self.pad_size_divisor)) * self.pad_size_divisor ) batch_pad_shape = [(pad_h, pad_w)] * _batch_inputs.shape[0] else: raise TypeError( "Output of `cast_data` should be a list of dict " "or a tuple with inputs and data_samples, but got " f"{type(data)}: {data}" ) return batch_pad_shape ================================================ FILE: bip3d/models/data_preprocessors/utils.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from typing import List, Union import torch import torch.nn.functional as F from torch import Tensor def multiview_img_stack_batch( tensor_list: List[Tensor], pad_size_divisor: int = 1, pad_value: Union[int, float] = 0, ) -> Tensor: """Compared to the ``stack_batch`` in `mmengine.model.utils`, multiview_img_stack_batch further handle the multiview images. See diff of padded_sizes[:, :-2] = 0 vs padded_sizes[:, 0] = 0 in line 47. Stack multiple tensors to form a batch and pad the tensor to the max shape use the right bottom padding mode in these images. 
If ``pad_size_divisor > 0``, add padding to ensure the shape of each dim is divisible by ``pad_size_divisor``. Args: tensor_list (List[Tensor]): A list of tensors with the same dim. pad_size_divisor (int): If ``pad_size_divisor > 0``, add padding to ensure the shape of each dim is divisible by ``pad_size_divisor``. This depends on the model, and many models need to be divisible by 32. Defaults to 1. pad_value (int or float): The padding value. Defaults to 0. Returns: Tensor: The n dim tensor. """ assert isinstance( tensor_list, list ), f"Expected input type to be list, but got {type(tensor_list)}" assert tensor_list, "`tensor_list` could not be an empty list" assert len({tensor.ndim for tensor in tensor_list}) == 1, ( "Expected the dimensions of all tensors must be the same, " f"but got {[tensor.ndim for tensor in tensor_list]}" ) dim = tensor_list[0].dim() num_img = len(tensor_list) all_sizes: torch.Tensor = torch.Tensor( [tensor.shape for tensor in tensor_list] ) max_sizes = ( torch.ceil(torch.max(all_sizes, dim=0)[0] / pad_size_divisor) * pad_size_divisor ) padded_sizes = max_sizes - all_sizes # The first dim normally means channel, which should not be padded. padded_sizes[:, :-2] = 0 if padded_sizes.sum() == 0: return torch.stack(tensor_list) # `pad` is the second arguments of `F.pad`. If pad is (1, 2, 3, 4), # it means that padding the last dim with 1(left) 2(right), padding the # penultimate dim to 3(top) 4(bottom). The order of `pad` is opposite of # the `padded_sizes`. Therefore, the `padded_sizes` needs to be reversed, # and only odd index of pad should be assigned to keep padding "right" and # "bottom". 
    def __init__(
        self,
        embed_dims: int = 256,
        num_groups: int = 8,
        num_levels: int = 4,
        num_cams: int = 6,
        proj_drop: float = 0.0,
        attn_drop: float = 0.0,
        kps_generator: dict = None,
        temporal_fusion_module=None,
        use_temporal_anchor_embed=True,
        use_deformable_func=False,
        use_camera_embed=False,
        residual_mode="add",
        batch_first=True,
        ffn_cfg=None,
        with_value_proj=False,
        filter_outlier=False,
        with_depth=False,
        min_depth=None,
        max_depth=None,
    ):
        """Deformable multi-view / multi-level feature aggregation.

        Args:
            embed_dims (int): Feature dimension; must be divisible by
                ``num_groups``. Defaults to 256.
            num_groups (int): Number of attention groups. Defaults to 8.
            num_levels (int): Number of feature-pyramid levels.
                Defaults to 4.
            num_cams (int): Number of cameras (used to size the weight
                head when no camera encoder is used). Defaults to 6.
            proj_drop (float): Dropout prob on the output projection.
            attn_drop (float): Dropout prob on sampling weights
                (training only).
            kps_generator (dict, optional): Config of the key-points
                generator; if None a single point per anchor is used.
            temporal_fusion_module (dict, optional): Optional temporal
                fusion module config.
            use_deformable_func (bool): Use the custom CUDA deformable
                aggregation op.
            use_camera_embed (bool): Encode per-camera projection
                matrices into the weight computation.
            residual_mode (str): 'add' or 'cat' residual with the input
                instance feature.
            ffn_cfg (dict, optional): If given, append FFN + LayerNorms.
            with_value_proj (bool): Apply a linear projection to the
                value feature map.
            filter_outlier (bool): Mask weights of points projecting
                outside the images.
            with_depth (bool): Weight sampling by a depth distribution;
                requires ``min_depth`` and ``max_depth``.
        """
        super(DeformableFeatureAggregation, self).__init__()
        assert batch_first
        if embed_dims % num_groups != 0:
            raise ValueError(
                f"embed_dims must be divisible by num_groups, "
                f"but got {embed_dims} and {num_groups}"
            )
        self.group_dims = int(embed_dims / num_groups)
        self.embed_dims = embed_dims
        self.num_levels = num_levels
        self.num_groups = num_groups
        self.num_cams = num_cams
        self.use_temporal_anchor_embed = use_temporal_anchor_embed
        self.use_deformable_func = use_deformable_func
        self.attn_drop = attn_drop
        self.residual_mode = residual_mode
        self.proj_drop = nn.Dropout(proj_drop)
        if kps_generator is not None:
            kps_generator["embed_dims"] = embed_dims
            self.kps_generator = MODELS.build(kps_generator)
            self.num_pts = self.kps_generator.num_pts
        else:
            self.kps_generator = None
            self.num_pts = 1
        if temporal_fusion_module is not None:
            if "embed_dims" not in temporal_fusion_module:
                temporal_fusion_module["embed_dims"] = embed_dims
            self.temp_module = MODELS.build(temporal_fusion_module)
        else:
            self.temp_module = None
        self.output_proj = Linear(embed_dims, embed_dims)
        if use_camera_embed:
            # With a camera encoder, the weight head is shared across
            # cameras, so the camera axis is dropped from its output.
            self.camera_encoder = Sequential(
                *linear_act_ln(embed_dims, 1, 2, 12)
            )
            self.weights_fc = Linear(
                embed_dims, num_groups * num_levels * self.num_pts
            )
        else:
            self.camera_encoder = None
            self.weights_fc = Linear(
                embed_dims, num_groups * num_cams * num_levels * self.num_pts
            )
        if ffn_cfg is not None:
            ffn_cfg["embed_dims"] = embed_dims
            self.ffn = FFN(**ffn_cfg)
            self.norms = ModuleList(
                build_norm_layer(dict(type="LN"), embed_dims)[1]
                for _ in range(2)
            )
        else:
            self.ffn = None
        self.with_value_proj = with_value_proj
        self.filter_outlier = filter_outlier
        if self.with_value_proj:
            self.value_proj = Linear(embed_dims, embed_dims)
        self.with_depth = with_depth
        if self.with_depth:
            assert min_depth is not None and max_depth is not None
            self.min_depth = min_depth
            self.max_depth = max_depth

    def init_weight(self):
        # Zero-init the weight head so initial sampling weights are
        # uniform after softmax.
        constant_init(self.weights_fc, val=0.0, bias=0.0)
        xavier_init(self.output_proj, distribution="uniform", bias=0.0)

    def get_spatial_shape_3D(self, spatial_shape, depth_dim):
        """Append a constant depth dimension to 2D spatial shapes.

        Args:
            spatial_shape (Tensor): ``(..., 2)`` per-level (H, W) shapes.
            depth_dim (int): number of depth bins to append.

        Returns:
            Tensor: contiguous ``(..., 3)`` shapes with depth_dim last.
        """
        spatial_shape_depth = spatial_shape.new_ones(
            *spatial_shape.shape[:-1], 1
        ) * depth_dim
        spatial_shape_3D = torch.cat(
            [spatial_shape, spatial_shape_depth], dim=-1
        )
        return spatial_shape_3D.contiguous()
self.kps_generator(anchor, instance_feature) else: key_points = anchor[:, :, None] points_2d, depth, mask = self.project_points( key_points, metas["projection_mat"], metas.get("image_wh"), ) weights = self._get_weights( instance_feature, anchor_embed, metas, mask ) if self.use_deformable_func: if self.with_value_proj: feature_maps[0] = self.value_proj(feature_maps[0]) points_2d = points_2d.permute(0, 2, 3, 1, 4).reshape( bs, num_anchor * self.num_pts, -1, 2 ) weights = ( weights.permute(0, 1, 4, 2, 3, 5) .contiguous() .reshape( bs, num_anchor * self.num_pts, -1, self.num_levels, self.num_groups, ) ) if self.with_depth: depth = depth.permute(0, 2, 3, 1).reshape( bs, num_anchor * self.num_pts, -1, 1 ) # normalize depth to [0, depth_prob.shape[-1]-1] depth = (depth - self.min_depth) / (self.max_depth - self.min_depth) depth = depth * (depth_prob.shape[-1] - 1) features = deformable_aggregation_func( *feature_maps, points_2d, weights, depth_prob, depth ) else: features = deformable_aggregation_func(*feature_maps, points_2d, weights) features = features.reshape(bs, num_anchor, self.num_pts, self.embed_dims) features = features.sum(dim=2) else: assert False assert not self.with_value_proj features = self.feature_sampling( feature_maps, key_points, metas["projection_mat"], metas.get("image_wh"), ) features = self.multi_view_level_fusion(features, weights) features = features.sum(dim=2) # fuse multi-point features output = self.proj_drop(self.output_proj(features)) if self.residual_mode == "add": output = output + instance_feature elif self.residual_mode == "cat": output = torch.cat([output, instance_feature], dim=-1) if self.ffn is not None: output = self.norms[0](output) output = self.ffn(output) output = self.norms[1](output) return output def _get_weights( self, instance_feature, anchor_embed, metas=None, mask=None ): bs, num_anchor = instance_feature.shape[:2] feature = instance_feature + anchor_embed if self.camera_encoder is not None: num_cams = 
metas["projection_mat"].shape[1] camera_embed = self.camera_encoder( metas["projection_mat"][:, :, :3].reshape(bs, num_cams, -1) ) feature = feature[:, :, None] + camera_embed[:, None] weights = self.weights_fc(feature) if mask is not None and self.filter_outlier: num_cams = weights.shape[2] mask = mask.permute(0, 2, 1, 3)[..., None, :, None] weights = weights.reshape( bs, num_anchor, num_cams, self.num_levels, self.num_pts, self.num_groups, ) weights = weights.masked_fill( torch.logical_and(~mask, mask.sum(dim=2, keepdim=True) != 0), float("-inf"), ) weights = ( weights.reshape(bs, num_anchor, -1, self.num_groups) .softmax(dim=-2) .reshape( bs, num_anchor, -1, self.num_levels, self.num_pts, self.num_groups, ) ) if self.training and self.attn_drop > 0: mask = torch.rand(bs, num_anchor, -1, 1, self.num_pts, 1) mask = mask.to(device=weights.device, dtype=weights.dtype) weights = ((mask > self.attn_drop) * weights) / ( 1 - self.attn_drop ) return weights @staticmethod def project_points(key_points, projection_mat, image_wh=None): bs, num_anchor, num_pts = key_points.shape[:3] pts_extend = torch.cat( [key_points, torch.ones_like(key_points[..., :1])], dim=-1 ) points_2d = torch.matmul( projection_mat[:, :, None, None], pts_extend[:, None, ..., None] ).squeeze(-1) depth = points_2d[..., 2] mask = depth > 1e-5 points_2d = points_2d[..., :2] / torch.clamp( points_2d[..., 2:3], min=1e-5 ) mask = mask & (points_2d[..., 0] > 0) & (points_2d[..., 1] > 0) if image_wh is not None: points_2d = points_2d / image_wh[:, :, None, None] mask = mask & (points_2d[..., 0] < 1) & (points_2d[..., 1] < 1) return points_2d, depth, mask @staticmethod def feature_sampling( feature_maps: List[torch.Tensor], key_points: torch.Tensor, projection_mat: torch.Tensor, image_wh: Optional[torch.Tensor] = None, ) -> torch.Tensor: num_levels = len(feature_maps) num_cams = feature_maps[0].shape[1] bs, num_anchor, num_pts = key_points.shape[:3] points_2d = DeformableFeatureAggregation.project_points( 
key_points, projection_mat, image_wh ) points_2d = points_2d * 2 - 1 points_2d = points_2d.flatten(end_dim=1) features = [] for fm in feature_maps: features.append( torch.nn.functional.grid_sample( fm.flatten(end_dim=1), points_2d ) ) features = torch.stack(features, dim=1) features = features.reshape( bs, num_cams, num_levels, -1, num_anchor, num_pts ).permute( 0, 4, 1, 2, 5, 3 ) # bs, num_anchor, num_cams, num_levels, num_pts, embed_dims return features def multi_view_level_fusion( self, features: torch.Tensor, weights: torch.Tensor, ): bs, num_anchor = weights.shape[:2] features = weights[..., None] * features.reshape( features.shape[:-1] + (self.num_groups, self.group_dims) ) features = features.sum(dim=2).sum(dim=2) features = features.reshape( bs, num_anchor, self.num_pts, self.embed_dims ) return features ================================================ FILE: bip3d/models/feature_enhancer.py ================================================ import torch from torch import nn from mmengine.model import BaseModel from mmdet.models.layers.transformer.utils import get_text_sine_pos_embed from mmdet.models.layers import SinePositionalEncoding from mmdet.models.layers.transformer.deformable_detr_layers import ( DeformableDetrTransformerEncoder as DDTE, ) from mmdet.models.layers.transformer.deformable_detr_layers import ( DeformableDetrTransformerEncoderLayer, ) from mmdet.models.layers.transformer.detr_layers import ( DetrTransformerEncoderLayer, ) from mmdet.models.utils.vlfuse_helper import SingleScaleBiAttentionBlock from bip3d.registry import MODELS from .utils import deformable_format @MODELS.register_module() class TextImageDeformable2DEnhancer(BaseModel): def __init__( self, num_layers, text_img_attn_block, img_attn_block, text_attn_block=None, embed_dims=256, num_feature_levels=4, positional_encoding=None, **kwargs, ): super().__init__(**kwargs) self.num_layers = num_layers self.num_feature_levels = num_feature_levels self.embed_dims = embed_dims 
self.positional_encoding = positional_encoding self.text_img_attn_blocks = nn.ModuleList() self.img_attn_blocks = nn.ModuleList() self.text_attn_blocks = nn.ModuleList() for _ in range(self.num_layers): self.text_img_attn_blocks.append( SingleScaleBiAttentionBlock(**text_img_attn_block) ) self.img_attn_blocks.append( DeformableDetrTransformerEncoderLayer(**img_attn_block) ) self.text_attn_blocks.append( DetrTransformerEncoderLayer(**text_attn_block) ) self.positional_encoding = SinePositionalEncoding( **self.positional_encoding ) self.level_embed = nn.Parameter( torch.Tensor(self.num_feature_levels, self.embed_dims) ) def forward( self, feature_maps, text_dict=None, **kwargs, ): with_cams = feature_maps[0].dim() == 5 if with_cams: bs, num_cams = feature_maps[0].shape[:2] feature_maps = [x.flatten(0, 1) for x in feature_maps] else: bs = feature_maps[0].shape[0] num_cams = 1 pos_2d = self.get_2d_position_embed(feature_maps) feature_2d, spatial_shapes, level_start_index = deformable_format( feature_maps ) reference_points = DDTE.get_encoder_reference_points( spatial_shapes, valid_ratios=feature_2d.new_ones( [bs * num_cams, self.num_feature_levels, 2] ), device=feature_2d.device, ) text_feature = text_dict["embedded"] pos_text = get_text_sine_pos_embed( text_dict["position_ids"][..., None], num_pos_feats=self.embed_dims, exchange_xy=False, ) for layer_id in range(self.num_layers): feature_2d_fused = feature_2d[:, level_start_index[-1] :] if with_cams: feature_2d_fused = feature_2d_fused.unflatten( 0, (bs, num_cams) ) feature_2d_fused = feature_2d_fused.flatten(1, 2) feature_2d_fused, text_feature = self.text_img_attn_blocks[ layer_id ]( feature_2d_fused, text_feature, attention_mask_l=text_dict["text_token_mask"], ) if with_cams: feature_2d_fused = feature_2d_fused.unflatten( 1, (num_cams, -1) ) feature_2d_fused = feature_2d_fused.flatten(0, 1) feature_2d = torch.cat( [feature_2d[:, : level_start_index[-1]], feature_2d_fused], dim=1, ) feature_2d = 
self.img_attn_blocks[layer_id](
                query=feature_2d,
                query_pos=pos_2d,
                reference_points=reference_points,
                spatial_shapes=spatial_shapes,
                level_start_index=level_start_index,
                key_padding_mask=None,
            )

            text_attn_mask = text_dict.get("masks")
            if text_attn_mask is not None:
                text_num_heads = self.text_attn_blocks[
                    layer_id
                ].self_attn_cfg.num_heads
                # Expand per attention head and invert: True marks
                # positions that must NOT be attended to.
                text_attn_mask = ~text_attn_mask.repeat(text_num_heads, 1, 1)
            text_feature = self.text_attn_blocks[layer_id](
                query=text_feature,
                query_pos=pos_text,
                attn_mask=text_attn_mask,
                key_padding_mask=None,
            )
        # Restore the per-level (bs[, cams], C, H, W) layout.
        feature_2d = deformable_format(
            feature_2d, spatial_shapes, batch_size=bs if with_cams else None
        )
        return feature_2d, text_feature

    def get_2d_position_embed(self, feature_maps):
        """Per-level sine positional encoding, flattened to token order and
        offset by the learnable level embedding."""
        pos_2d = []
        for lvl, feat in enumerate(feature_maps):
            batch_size, c, h, w = feat.shape
            pos = self.positional_encoding(None, feat)
            pos = pos.view(batch_size, c, h * w).permute(0, 2, 1)
            pos = pos + self.level_embed[lvl]
            pos_2d.append(pos)
        pos_2d = torch.cat(pos_2d, 1)
        return pos_2d


================================================
FILE: bip3d/models/instance_bank.py
================================================
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np

from bip3d.registry import MODELS

__all__ = ["InstanceBank"]


def topk(confidence, k, *inputs):
    """Select the top-k entries along dim 1 of `confidence` and gather the
    corresponding rows from each tensor in `inputs`."""
    bs, N = confidence.shape[:2]
    confidence, indices = torch.topk(confidence, k, dim=1)
    # Offset per-sample indices so they address the flattened (bs*N) dim.
    indices = (
        indices + torch.arange(bs, device=indices.device)[:, None] * N
    ).reshape(-1)
    outputs = []
    for input in inputs:
        outputs.append(input.flatten(end_dim=1)[indices].reshape(bs, k, -1))
    return confidence, outputs


@MODELS.register_module()
class InstanceBank(nn.Module):
    """Maintains anchors and instance features across decoder iterations,
    with optional temporal caching between frames of the same scan."""

    def __init__(
        self,
        num_anchor,
        embed_dims,
        anchor,
        anchor_handler=None,
        num_current_instance=None,
        num_temp_instances=0,
        default_time_interval=0.5,
        confidence_decay=0.6,
        anchor_grad=True,
        feat_grad=True,
        max_time_interval=2,
        anchor_in_camera=True,
    ):
        super(InstanceBank, self).__init__()
        self.embed_dims = embed_dims
        self.num_current_instance = num_current_instance
        self.num_temp_instances = num_temp_instances
        self.default_time_interval = default_time_interval
        self.confidence_decay = confidence_decay
        self.max_time_interval = max_time_interval
        if anchor_handler is not None:
            anchor_handler = MODELS.build(anchor_handler)
            assert hasattr(anchor_handler, "anchor_projection")
        self.anchor_handler = anchor_handler
        if isinstance(anchor, str):
            # Path to a .npy file of pre-computed anchors.
            anchor = np.load(anchor)
        elif isinstance(anchor, (list, tuple)):
            anchor = np.array(anchor)
        if len(anchor.shape) == 3:  # for map
            anchor = anchor.reshape(anchor.shape[0], -1)
        self.num_anchor = min(len(anchor), num_anchor)
        self.anchor = anchor[:num_anchor]
        # self.anchor = nn.Parameter(
        #     torch.tensor(anchor, dtype=torch.float32),
        #     requires_grad=anchor_grad,
        # )
        self.anchor_init = anchor
        # A single shared learnable feature, broadcast to all anchors in get().
        self.instance_feature = nn.Parameter(
            torch.zeros([1, self.embed_dims]),
            # torch.zeros([self.anchor.shape[0], self.embed_dims]),
            requires_grad=feat_grad,
        )
        self.anchor_in_camera = anchor_in_camera
        self.reset()

    def init_weight(self):
        # NOTE(review): self.anchor is a numpy array here (the Parameter
        # version above is commented out), so `.data.new_tensor` would fail
        # if this method is called — confirm this path is unused.
        self.anchor.data = self.anchor.data.new_tensor(self.anchor_init)
        if self.instance_feature.requires_grad:
            torch.nn.init.xavier_uniform_(self.instance_feature.data, gain=1)

    def reset(self):
        """Clear all temporal state cached from previous frames."""
        self.cached_feature = None
        self.cached_anchor = None
        self.metas = None
        self.mask = None
        self.confidence = None
        self.temp_confidence = None
        self.instance_id = None
        self.prev_id = 0

    def bbox_transform(self, bbox, matrix):
        # bbox: bs, n, 9
        # matrix: bs, cam, 4, 4
        # output: bs, n*cam, 9
        bbox = bbox.unsqueeze(dim=2)
        matrix = matrix.unsqueeze(dim=1)
        points = bbox[..., :3]
        points_extend = torch.concat(
            [points, torch.ones_like(points[..., :1])], dim=-1
        )
        # Homogeneous transform of the centers; sizes and angles are tiled
        # unchanged per camera.
        points_trans = torch.matmul(matrix, points_extend[..., None])[
            ..., :3, 0
        ]
        size = bbox[..., 3:6].tile(1, 1, points_trans.shape[2], 1)
        angle = bbox[..., 6:].tile(1, 1, points_trans.shape[2], 1)
        bbox = torch.cat([points_trans, size, angle], dim=-1)
        bbox = bbox.flatten(1, 2)
        return bbox

    def get(self, batch_size,
            metas=None, dn_metas=None):
        """Return per-frame instance features and anchors, plus any cached
        temporal instances still valid for this batch."""
        instance_feature = torch.tile(
            self.instance_feature[None], (batch_size, self.anchor.shape[0], 1)
        )
        anchor = torch.tile(
            instance_feature.new_tensor(self.anchor)[None], (batch_size, 1, 1)
        )
        if self.anchor_in_camera:
            # Anchors are defined in camera frame; map them to the global
            # frame via the inverse extrinsics (one copy per camera).
            cam2global = np.linalg.inv(metas["extrinsic"].cpu().numpy())
            cam2global = torch.from_numpy(cam2global).to(anchor)
            anchor = self.bbox_transform(anchor, cam2global)
            instance_feature = instance_feature.tile(1, cam2global.shape[1], 1)
        if (
            self.cached_anchor is not None
            and batch_size == self.cached_anchor.shape[0]
        ):
            # assert False, "TODO: linxuewu"
            # history_time = self.metas["timestamp"]
            # time_interval = metas["timestamp"] - history_time
            # time_interval = time_interval.to(dtype=instance_feature.dtype)
            # self.mask = torch.abs(time_interval) <= self.max_time_interval
            # The temporal cache is only valid within the same scan.
            last_scan_id = self.metas["scan_id"]
            current_scan_id = metas["scan_id"]
            self.mask = torch.tensor(
                [x==y for x,y in zip(last_scan_id, current_scan_id)],
                device=anchor.device,
            )
            # The scan-id check only supports batch size 1.
            assert self.mask.shape[0] == 1
            if not self.mask:
                self.reset()
            time_interval = instance_feature.new_tensor(
                [self.default_time_interval] * batch_size
            )
        else:
            self.reset()
            time_interval = instance_feature.new_tensor(
                [self.default_time_interval] * batch_size
            )
        return (
            instance_feature,
            anchor,
            self.cached_feature,
            self.cached_anchor,
            time_interval,
        )

    def update(self, instance_feature, anchor, confidence, num_dn=None):
        """Prepend cached temporal instances to the current top instances;
        denoising (dn) queries at the tail are carved off and re-appended."""
        if self.cached_feature is None:
            return instance_feature, anchor

        if num_dn is not None and num_dn > 0:
            dn_instance_feature = instance_feature[:, -num_dn:]
            dn_anchor = anchor[:, -num_dn:]
            instance_feature = instance_feature[:, : -num_dn]
            anchor = anchor[:, : -num_dn]
            confidence = confidence[:, : -num_dn]

        N = self.num_current_instance
        if N is not None and N < confidence.shape[1]:
            confidence = confidence.max(dim=-1).values
            _, (selected_feature, selected_anchor) = topk(
                confidence, N, instance_feature, anchor
            )
        else:
            selected_feature, selected_anchor = instance_feature, anchor
        instance_feature = torch.cat(
            [self.cached_feature, selected_feature], dim=1
        )
        anchor = torch.cat(
            [self.cached_anchor, selected_anchor], dim=1
        )
        if num_dn is not None and num_dn > 0:
            instance_feature = torch.cat(
                [instance_feature, dn_instance_feature], dim=1
            )
            anchor = torch.cat([anchor, dn_anchor], dim=1)
        return instance_feature, anchor

    def cache(
        self,
        instance_feature,
        anchor,
        confidence,
        metas=None,
        feature_maps=None,
    ):
        """Cache the top temporal instances (detached) for the next frame."""
        if self.num_temp_instances <= 0:
            return
        instance_feature = instance_feature.detach()
        anchor = anchor.detach()
        confidence = confidence.detach()
        self.metas = metas

        confidence = confidence.max(dim=-1).values.sigmoid()
        if self.confidence is not None:
            # Keep the decayed confidence of previously cached instances if
            # it is still higher than the fresh score.
            N = self.confidence.shape[1]
            confidence[:, : N] = torch.maximum(
                self.confidence * self.confidence_decay,
                confidence[:, : N],
            )
        self.temp_confidence = confidence

        if self.num_temp_instances < confidence.shape[1]:
            (
                self.confidence,
                (self.cached_feature, self.cached_anchor),
            ) = topk(
                confidence, self.num_temp_instances, instance_feature, anchor
            )
        else:
            self.confidence, self.cached_feature, self.cached_anchor = (
                confidence, instance_feature, anchor
            )

    def get_instance_id(self, confidence, anchor=None, threshold=None):
        """Assign persistent ids: carry over cached ids, then mint fresh ids
        for unassigned instances (optionally above a confidence threshold)."""
        confidence = confidence.max(dim=-1).values.sigmoid()
        instance_id = confidence.new_full(confidence.shape, -1).long()

        if (
            self.instance_id is not None
            and self.instance_id.shape[0] == instance_id.shape[0]
        ):
            instance_id[:, : self.instance_id.shape[1]] = self.instance_id

        mask = instance_id < 0
        if threshold is not None:
            mask = mask & (confidence >= threshold)
        num_new_instance = mask.sum()
        new_ids = torch.arange(num_new_instance).to(instance_id) + self.prev_id
        instance_id[torch.where(mask)] = new_ids
        self.prev_id += num_new_instance
        self.update_instance_id(instance_id, confidence)
        return instance_id

    def update_instance_id(self, instance_id=None, confidence=None):
        if self.temp_confidence is None:
            if confidence.dim() == 3:  # bs, num_anchor, num_cls
                temp_conf = confidence.max(dim=-1).values
            else:  # bs, num_anchor
                temp_conf = confidence
        else:
            temp_conf = self.temp_confidence

        # Keep the ids of the top temporal instances; pad the rest with -1.
        instance_id = topk(temp_conf, self.num_temp_instances, instance_id)[1][
            0
        ]
        instance_id = instance_id.squeeze(dim=-1)
        self.instance_id = F.pad(
            instance_id,
            (0, self.num_anchor - self.num_temp_instances),
            value=-1,
        )


================================================
FILE: bip3d/models/spatial_enhancer.py
================================================
import torch
from torch import nn
from mmengine.model import BaseModel
from mmdet.models.layers.transformer.utils import MLP
from mmcv.cnn.bricks.transformer import FFN

from .utils import deformable_format
from bip3d.registry import MODELS


@MODELS.register_module()
class DepthFusionSpatialEnhancer(BaseModel):
    """Fuses 2D image features with back-projected 3D point features
    weighted by a predicted per-pixel depth distribution, optionally
    conditioned on 3D (depth-branch) features."""

    def __init__(
        self,
        embed_dims=256,
        feature_3d_dim=32,
        num_depth_layers=2,
        min_depth=0.25,
        max_depth=10,
        num_depth=64,
        with_feature_3d=True,
        loss_depth_weight=-1,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.embed_dims = embed_dims
        self.feature_3d_dim = feature_3d_dim
        self.num_depth_layers = num_depth_layers
        self.min_depth = min_depth
        self.max_depth = max_depth
        self.num_depth = num_depth
        self.with_feature_3d = with_feature_3d
        # <= 0 disables the auxiliary depth loss.
        self.loss_depth_weight = loss_depth_weight

        fusion_dim = self.embed_dims + self.feature_3d_dim
        if self.with_feature_3d:
            self.pts_prob_pre_fc = nn.Linear(
                self.embed_dims, self.feature_3d_dim
            )
            dim = self.feature_3d_dim * 2
            fusion_dim += self.feature_3d_dim
        else:
            dim = self.embed_dims
        # Predicts a distribution over self.num_depth depth bins per pixel.
        self.pts_prob_fc = MLP(
            dim,
            dim,
            self.num_depth,
            self.num_depth_layers,
        )
        self.pts_fc = nn.Linear(3, self.feature_3d_dim)
        self.fusion_fc = nn.Sequential(
            FFN(embed_dims=fusion_dim, feedforward_channels=1024),
            nn.Linear(fusion_dim, self.embed_dims),
        )
        self.fusion_norm = nn.LayerNorm(self.embed_dims)

    def forward(
        self,
        feature_maps,
        feature_3d=None,
        batch_inputs=None,
        **kwargs,
    ):
        with_cams = feature_maps[0].dim() == 5
        if with_cams:
            bs, num_cams = feature_maps[0].shape[:2]
        else:
            bs = feature_maps[0].shape[0]
            num_cams = 1
        feature_2d, spatial_shapes, _ = deformable_format(feature_maps)
        pts = self.get_pts(
            spatial_shapes,
            batch_inputs["image_wh"][0, 0],
            batch_inputs["projection_mat"],
            feature_2d.device,
            feature_2d.dtype,
        )
        if self.with_feature_3d:
            # Condition the depth distribution on the 3D branch features.
            feature_3d = deformable_format(feature_3d)[0]
            depth_prob_feat = self.pts_prob_pre_fc(feature_2d)
            depth_prob_feat = torch.cat([depth_prob_feat, feature_3d], dim=-1)
            depth_prob = self.pts_prob_fc(depth_prob_feat).softmax(dim=-1)
            feature_fused = [feature_2d, feature_3d]
        else:
            depth_prob = self.pts_prob_fc(feature_2d).softmax(dim=-1)
            feature_fused = [feature_2d]
        # Expected 3D point feature per pixel under the predicted depth
        # distribution (soft argmax over depth bins).
        pts_feature = self.pts_fc(pts)
        pts_feature = (depth_prob.unsqueeze(dim=-1) * pts_feature).sum(dim=-2)
        feature_fused.append(pts_feature)
        feature_fused = torch.cat(feature_fused, dim=-1)
        # Residual fusion back into the 2D feature space.
        feature_fused = self.fusion_fc(feature_fused) + feature_2d
        feature_fused = self.fusion_norm(feature_fused)
        feature_fused = deformable_format(feature_fused, spatial_shapes)
        if self.loss_depth_weight > 0 and self.training:
            loss_depth = self.depth_prob_loss(depth_prob, batch_inputs)
        else:
            loss_depth = None
        return feature_fused, depth_prob, loss_depth

    def get_pts(self, spatial_shapes, image_wh, projection_mat, device, dtype):
        """Back-project every feature-map pixel at self.num_depth candidate
        depths into 3D via the inverse projection matrix."""
        pixels = []
        for i, shape in enumerate(spatial_shapes):
            # Pixel centers of this level, in input-image coordinates.
            stride = image_wh[0] / shape[1]
            u = torch.linspace(
                0, image_wh[0] - stride, shape[1], device=device, dtype=dtype
            )
            v = torch.linspace(
                0, image_wh[1] - stride, shape[0], device=device, dtype=dtype
            )
            u = u[None].tile(shape[0], 1)
            v = v[:, None].tile(1, shape[1])
            uv = torch.stack([u, v], dim=-1).flatten(0, 1)
            pixels.append(uv)
        pixels = torch.cat(pixels, dim=0)[:, None]
        depths = torch.linspace(
            self.min_depth,
            self.max_depth,
            self.num_depth,
            device=device,
            dtype=dtype,
        )
        depths = depths[None, :, None]
        pts = pixels * depths
        depths = depths.tile(pixels.shape[0], 1, 1)
        # Homogeneous coords (u*d, v*d, d, 1) for the inverse projection.
        pts = torch.cat([pts, depths, torch.ones_like(depths)], dim=-1)
        # Solve x @ P^T = pts for x, i.e. apply the inverse projection.
        pts = torch.linalg.solve(
            projection_mat.mT.unsqueeze(dim=2), pts, left=False
        )[
            ..., :3
        ]  # b,cam,N,3
        return
pts

    def depth_prob_loss(self, depth_prob, batch_inputs):
        """BCE between predicted and ground-truth depth distributions,
        scaled by loss_depth_weight."""
        # NOTE(review): assumes depth_prob_gt[..., 0] == 1 marks invalid
        # pixels — confirm against the GT generation code.
        mask = batch_inputs["depth_prob_gt"][..., 0] != 1
        loss_depth = (
            torch.nn.functional.binary_cross_entropy(
                depth_prob[mask], batch_inputs["depth_prob_gt"][mask]
            )
            * self.loss_depth_weight
        )
        return loss_depth


================================================
FILE: bip3d/models/structure.py
================================================
from torch import nn
from mmdet.models.detectors import BaseDetector
from mmdet.models.detectors.deformable_detr import (
    MultiScaleDeformableAttention,
)

from bip3d.registry import MODELS


@MODELS.register_module()
class BIP3D(BaseDetector):
    """BIP3D detector: 2D backbone (+ optional depth/3D backbone), text
    encoder, feature/spatial enhancers, and a 3D decoder head."""

    def __init__(
        self,
        backbone,
        decoder,
        neck=None,
        text_encoder=None,
        feature_enhancer=None,
        spatial_enhancer=None,
        data_preprocessor=None,
        backbone_3d=None,
        neck_3d=None,
        init_cfg=None,
        input_2d="imgs",
        input_3d="depth_img",
        embed_dims=256,
        use_img_grid_mask=False,
        use_depth_grid_mask=False,
    ):
        super().__init__(data_preprocessor, init_cfg)

        def build(cfg):
            # Build a sub-module from config; None passes through as None.
            if cfg is None:
                return None
            return MODELS.build(cfg)

        self.backbone = build(backbone)
        self.decoder = build(decoder)
        self.neck = build(neck)
        self.text_encoder = build(text_encoder)
        self.feature_enhancer = build(feature_enhancer)
        self.spatial_enhancer = build(spatial_enhancer)
        self.backbone_3d = build(backbone_3d)
        self.neck_3d = build(neck_3d)
        # Keys in batch_inputs_dict holding the 2D and 3D inputs.
        self.input_2d = input_2d
        self.input_3d = input_3d
        self.embed_dims = embed_dims
        if text_encoder is not None:
            # Project language-backbone output to the detector's embed dim.
            self.text_feat_map = nn.Linear(
                self.text_encoder.language_backbone.body.language_dim,
                self.embed_dims,
                bias=True,
            )
        self.use_img_grid_mask = use_img_grid_mask
        self.use_depth_grid_mask = use_depth_grid_mask
        if use_depth_grid_mask or use_img_grid_mask:
            from ..grid_mask import GridMask

            self.grid_mask = GridMask(
                True, True, rotate=1, offset=False, ratio=0.5, mode=1, prob=0.7
            )

    def init_weights(self):
        """Initialize weights for Transformer and other components."""
        for p in self.feature_enhancer.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
self.decoder.init_weights() for m in self.modules(): if isinstance(m, MultiScaleDeformableAttention): m.init_weights() nn.init.normal_(self.feature_enhancer.level_embed) nn.init.constant_(self.text_feat_map.bias.data, 0) nn.init.xavier_uniform_(self.text_feat_map.weight.data) def extract_feat(self, batch_inputs_dict, batch_data_samples): imgs = batch_inputs_dict.get(self.input_2d) if imgs.dim() == 5: bs, num_cams = imgs.shape[:2] imgs = imgs.flatten(end_dim=1) else: bs = imgs.shape[0] num_cams = 1 if self.use_img_grid_mask and self.training: img = self.grid_mask( img, offset=-self.data_preprocessor.mean / self.data_preprocessor.std, ) feature_maps = self.backbone(imgs) if self.neck is not None: feature_maps = self.neck(feature_maps) feature_maps = [x.unflatten(0, (bs, num_cams)) for x in feature_maps] input_3d = batch_inputs_dict.get(self.input_3d) if self.backbone_3d is not None and input_3d is not None: if self.input_3d == "depth_img" and input_3d.dim() == 5: assert input_3d.shape[1] == num_cams input_3d = input_3d.flatten(end_dim=1) if self.use_depth_grid_mask and self.training: input_3d = self.grid_mask(input_3d) feature_3d = self.backbone_3d(input_3d) if self.neck_3d is not None: feature_3d = self.neck_3d(feature_3d) feature_3d = [x.unflatten(0, (bs, num_cams)) for x in feature_3d] else: feature_3d = None return feature_maps, feature_3d def extract_text_feature(self, batch_inputs_dict): if self.text_encoder is not None: text_dict = self.text_encoder(batch_inputs_dict["text"]) text_dict["embedded"] = self.text_feat_map(text_dict["embedded"]) else: text_dict = None return text_dict def loss(self, batch_inputs, batch_data_samples): model_outs, text_dict, loss_depth = self._forward( batch_inputs, batch_data_samples ) loss = self.decoder.loss(model_outs, batch_inputs, text_dict=text_dict) if loss_depth is not None: loss["loss_depth"] = loss_depth return loss def predict(self, batch_inputs, batch_data_samples): model_outs, text_dict = self._forward(batch_inputs, 
                                               batch_data_samples)
        results = self.decoder.post_process(
            model_outs, text_dict, batch_inputs, batch_data_samples
        )
        return results

    def _forward(self, batch_inputs, batch_data_samples):
        """Shared forward path for loss() and predict(): feature extraction,
        text encoding, enhancers, decoder."""
        feature_maps, feature_3d = self.extract_feat(
            batch_inputs, batch_data_samples
        )
        text_dict = self.extract_text_feature(batch_inputs)
        if self.feature_enhancer is not None:
            feature_maps, text_feature = self.feature_enhancer(
                feature_maps=feature_maps,
                feature_3d=feature_3d,
                text_dict=text_dict,
                batch_inputs=batch_inputs,
                batch_data_samples=batch_data_samples,
            )
            text_dict["embedded"] = text_feature
        if self.spatial_enhancer is not None:
            feature_maps, depth_prob, loss_depth = self.spatial_enhancer(
                feature_maps=feature_maps,
                feature_3d=feature_3d,
                text_dict=text_dict,
                batch_inputs=batch_inputs,
                batch_data_samples=batch_data_samples,
            )
        else:
            loss_depth = depth_prob = None
        model_outs = self.decoder(
            feature_maps=feature_maps,
            feature_3d=feature_3d,
            text_dict=text_dict,
            batch_inputs=batch_inputs,
            batch_data_samples=batch_data_samples,
            depth_prob=depth_prob,
        )
        # Training additionally returns the auxiliary depth loss.
        if self.training:
            return model_outs, text_dict, loss_depth
        return model_outs, text_dict


================================================
FILE: bip3d/models/target.py
================================================
import torch
from torch import nn
import numpy as np
import torch.nn.functional as F
from scipy.optimize import linear_sum_assignment

from .base_target import BaseTargetWithDenoising
from .utils import (
    wasserstein_distance,
    permutation_corner_distance,
    center_distance,
    get_positive_map,
)
from bip3d.registry import TASK_UTILS

__all__ = ["Grounding3DTarget"]

# Index constants into the 9-DoF box encoding (center, size, euler angles).
X, Y, Z, W, L, H, ALPHA, BETA, GAMMA = range(9)


@TASK_UTILS.register_module()
class Grounding3DTarget(BaseTargetWithDenoising):
    """Hungarian target assigner for 3D grounding, with optional denoising
    (dn) anchor/query generation."""

    def __init__(
        self,
        cls_weight=1.0,
        alpha=0.25,
        gamma=2,
        eps=1e-12,
        box_weight=1.0,
        cls_wise_reg_weights=None,
        num_dn=0,
        dn_noise_scale=0.5,
        add_neg_dn=True,
        num_temp_dn_groups=0,
        with_dn_query=False,
        num_classes=None,
        embed_dims=256,
        label_noise_scale=0.5,
        cost_weight_wd=1.0,
        cost_weight_pcd=0.0,
        cost_weight_cd=0.8,
        use_ignore_mask=False,
    ):
        super(Grounding3DTarget, self).__init__(num_dn, num_temp_dn_groups)
        self.cls_weight = cls_weight
        self.box_weight = box_weight
        self.alpha = alpha
        self.gamma = gamma
        self.eps = eps
        self.cls_wise_reg_weights = cls_wise_reg_weights
        self.dn_noise_scale = dn_noise_scale
        self.add_neg_dn = add_neg_dn
        self.with_dn_query = with_dn_query
        self.cost_weight_wd = cost_weight_wd
        self.cost_weight_pcd = cost_weight_pcd
        self.cost_weight_cd = cost_weight_cd
        self.use_ignore_mask = use_ignore_mask
        if self.with_dn_query:
            # Learnable label embeddings used as content for dn queries.
            self.num_classes = num_classes
            self.embed_dims = embed_dims
            self.label_noise_scale = label_noise_scale
            self.label_embedding = nn.Embedding(
                self.num_classes, self.embed_dims
            )

    def encode_reg_target(self, box_target, device=None):
        """Convert box structures to plain (N, 9) tensors
        [center, size, euler angles], optionally moved to `device`."""
        outputs = []
        for box in box_target:
            if not isinstance(box, torch.Tensor):
                # Project box type: gravity center + remaining dims.
                box = torch.cat(
                    [box.gravity_center, box.tensor[..., 3:]], dim=-1
                )
            output = torch.cat(
                [box[..., [X, Y, Z]], box[..., [W, L, H]], box[..., ALPHA:]],
                dim=-1,
            )
            if device is not None:
                output = output.to(device=device)
            outputs.append(output)
        return outputs

    @torch.no_grad()
    def sample(
        self,
        cls_pred,
        box_pred,
        char_positives,
        box_target,
        text_dict,
        ignore_mask=None,
    ):
        """Hungarian matching between predictions and grounding targets."""
        bs, num_pred, num_cls = cls_pred.shape
        token_positive_maps = get_positive_map(char_positives, text_dict)
        # Binarize the (normalized) positive maps before the cls cost.
        token_positive_maps = [
            x.to(cls_pred).bool().float() for x in token_positive_maps
        ]
        cls_cost = self._cls_cost(
            cls_pred, token_positive_maps, text_dict["text_token_mask"]
        )
        box_target = self.encode_reg_target(box_target, box_pred.device)
        box_cost = self._box_cost(box_pred, box_target)

        indices = []
        for i in range(bs):
            if cls_cost[i] is not None and box_cost[i] is not None:
                cost = (cls_cost[i] + box_cost[i]).detach().cpu().numpy()
                # Guard the LAP solver against -inf / NaN entries.
                cost = np.where(np.isneginf(cost) | np.isnan(cost), 1e8, cost)
                assign = linear_sum_assignment(cost)
                indices.append(
                    [cls_pred.new_tensor(x, dtype=torch.int64) for x in assign]
                )
            else:
                indices.append([None, None])

        output_cls_target = torch.zeros_like(cls_pred)
        output_box_target = torch.zeros_like(box_pred)
        output_reg_weights = torch.ones_like(box_pred)
        if self.use_ignore_mask:
            output_ignore_mask = torch.zeros_like(
                cls_pred[..., 0], dtype=torch.bool
            )
            ignore_mask = [
                output_ignore_mask.new_tensor(x) for x in ignore_mask
            ]
        else:
            output_ignore_mask = None
        # Scatter matched targets onto the assigned prediction slots.
        for i, (pred_idx, target_idx) in enumerate(indices):
            if len(box_target[i]) == 0:
                continue
            output_cls_target[i, pred_idx] = token_positive_maps[i][target_idx]
            output_box_target[i, pred_idx] = box_target[i][target_idx]
            if self.use_ignore_mask:
                output_ignore_mask[i, pred_idx] = ignore_mask[i][target_idx]
        self.indices = indices
        return output_cls_target, output_box_target, output_reg_weights, output_ignore_mask

    def _cls_cost(self, cls_pred, token_positive_maps, text_token_mask):
        """Focal-style classification cost per sample; None for samples
        without targets."""
        bs = cls_pred.shape[0]
        cls_pred = cls_pred.sigmoid()
        cost = []
        for i in range(bs):
            if len(token_positive_maps[i]) > 0:
                # Restrict to valid (non-padding) text tokens.
                pred = cls_pred[i][:, text_token_mask[i]]
                neg_cost = (
                    -(1 - pred + self.eps).log()
                    * (1 - self.alpha)
                    * pred.pow(self.gamma)
                )
                pos_cost = (
                    -(pred + self.eps).log()
                    * self.alpha
                    * (1 - pred).pow(self.gamma)
                )
                gt = token_positive_maps[i][:, text_token_mask[i]]
                cls_cost = torch.einsum(
                    "nc,mc->nm", pos_cost, gt
                ) + torch.einsum("nc,mc->nm", neg_cost, (1 - gt))
                cost.append(cls_cost)
            else:
                cost.append(None)
        return cost

    def _box_cost(self, box_pred, box_target):
        """Weighted sum of box distance terms per sample; None for samples
        without targets."""
        bs = box_pred.shape[0]
        cost = []
        for i in range(bs):
            if len(box_target[i]) > 0:
                pred = box_pred[i].unsqueeze(dim=-2)
                gt = box_target[i].unsqueeze(dim=-3)
                _cost = 0
                if self.cost_weight_wd > 0:
                    _cost += self.cost_weight_wd * wasserstein_distance(
                        pred, gt
                    )
                if self.cost_weight_pcd > 0:
                    _cost += (
                        self.cost_weight_pcd
                        * permutation_corner_distance(pred, gt)
                    )
                if self.cost_weight_cd > 0:
                    _cost += self.cost_weight_cd * center_distance(pred, gt)
                _cost *= self.box_weight
                cost.append(_cost)
            else:
                cost.append(None)
        return cost

    def get_dn_anchors(self,
                       char_positives, box_target, text_dict,
                       label=None, ignore_mask=None):
        """Build noised denoising (dn) anchors, their matched targets and
        the inter-group attention mask."""
        if self.num_dn <= 0:
            return None
        if self.num_temp_dn_groups <= 0:
            # NOTE(review): gt_instance_id is set but never used in the
            # visible code — possibly leftover from a temporal-dn feature.
            gt_instance_id = None
        char_positives = [x[: self.num_dn] for x in char_positives]
        box_target = [x[: self.num_dn] for x in box_target]
        # Pad every sample to the batch-max number of GTs; padded rows get
        # a token map of -1 so they can be identified later.
        max_dn_gt = max([len(x) for x in char_positives] + [1])
        token_positive_maps = get_positive_map(char_positives, text_dict)
        token_positive_maps = torch.stack(
            [
                F.pad(x, (0, 0, 0, max_dn_gt - x.shape[0]), value=-1)
                for x in token_positive_maps
            ]
        )
        box_target = self.encode_reg_target(box_target, box_target[0].device)
        box_target = torch.stack(
            [F.pad(x, (0, 0, 0, max_dn_gt - x.shape[0])) for x in box_target]
        )
        token_positive_maps = token_positive_maps.to(box_target)
        # Zero out boxes that came purely from padding.
        box_target = torch.where(
            (token_positive_maps == -1).all(dim=-1, keepdim=True),
            box_target.new_tensor(0),
            box_target,
        )
        bs, num_gt, state_dims = box_target.shape
        num_dn_groups = self.num_dn // max(num_gt, 1)
        if num_dn_groups > 1:
            token_positive_maps = token_positive_maps.tile(num_dn_groups, 1, 1)
            box_target = box_target.tile(num_dn_groups, 1, 1)

        # Positive dn anchors: GT boxes jittered proportionally to box size.
        noise = torch.rand_like(box_target) * 2 - 1
        noise *= box_target.new_tensor(self.dn_noise_scale)
        noise[..., :3] *= box_target[..., 3:6]
        noise[..., 3:6] *= box_target[..., 3:6]
        dn_anchor = box_target + noise
        if self.add_neg_dn:
            # Negative dn anchors: larger offsets with magnitude in [1, 2).
            noise_neg = torch.rand_like(box_target) + 1
            flag = torch.where(
                torch.rand_like(box_target) > 0.5,
                noise_neg.new_tensor(1),
                noise_neg.new_tensor(-1),
            )
            noise_neg *= flag
            noise_neg *= box_target.new_tensor(self.dn_noise_scale)
            noise_neg[..., :3] *= box_target[..., 3:6]
            noise_neg[..., 3:6] *= box_target[..., 3:6]
            dn_anchor = torch.cat([dn_anchor, box_target + noise_neg], dim=1)
            num_gt *= 2

        # Re-match the noised anchors to GTs by box cost; -3 marks anchors
        # left unmatched (negatives).
        box_cost = self._box_cost(dn_anchor, box_target)
        dn_box_target = torch.zeros_like(dn_anchor)
        dn_token_positive_maps = -torch.ones_like(token_positive_maps) * 3
        if self.add_neg_dn:
            dn_token_positive_maps = torch.cat(
                [dn_token_positive_maps, dn_token_positive_maps], dim=1
            )
        for i in
range(dn_anchor.shape[0]):
            if box_cost[i] is None:
                continue
            cost = box_cost[i].cpu().numpy()
            anchor_idx, gt_idx = linear_sum_assignment(cost)
            anchor_idx = dn_anchor.new_tensor(anchor_idx, dtype=torch.int64)
            gt_idx = dn_anchor.new_tensor(gt_idx, dtype=torch.int64)
            dn_box_target[i, anchor_idx] = box_target[i, gt_idx]
            dn_token_positive_maps[i, anchor_idx] = token_positive_maps[
                i, gt_idx
            ]

        # Reshape from (groups*bs, num_gt, ...) to (bs, groups*num_gt, ...).
        dn_anchor = (
            dn_anchor.reshape(num_dn_groups, bs, num_gt, state_dims)
            .permute(1, 0, 2, 3)
            .flatten(1, 2)
        )
        dn_box_target = (
            dn_box_target.reshape(num_dn_groups, bs, num_gt, state_dims)
            .permute(1, 0, 2, 3)
            .flatten(1, 2)
        )
        text_length = dn_token_positive_maps.shape[-1]
        dn_token_positive_maps = (
            dn_token_positive_maps.reshape(
                num_dn_groups, bs, num_gt, text_length
            )
            .permute(1, 0, 2, 3)
            .flatten(1, 2)
        )
        valid_mask = (dn_token_positive_maps >= 0).all(dim=-1)
        if self.add_neg_dn:
            token_positive_maps = (
                torch.cat([token_positive_maps, token_positive_maps], dim=1)
                .reshape(num_dn_groups, bs, num_gt, text_length)
                .permute(1, 0, 2, 3)
                .flatten(1, 2)
            )
            # Unmatched (-3) anchors of real GTs also count as valid
            # negatives.
            valid_mask = torch.logical_or(
                valid_mask,
                (
                    (token_positive_maps >= 0).all(dim=-1)
                    & (dn_token_positive_maps == -3).all(dim=-1)
                ),
            )
        # "valid" marks items that do not come from padding.
        # Block attention across dn groups (True = masked position).
        attn_mask = dn_box_target.new_ones(
            num_gt * num_dn_groups, num_gt * num_dn_groups
        )
        dn_token_positive_maps = torch.clamp(dn_token_positive_maps, min=0)
        for i in range(num_dn_groups):
            start = num_gt * i
            end = start + num_gt
            attn_mask[start:end, start:end] = 0
        attn_mask = attn_mask == 1
        # Sizes are stored in log space for the decoder; clamp avoids log(0).
        dn_anchor[..., 3:6] = torch.clamp(dn_anchor[..., 3:6], min=0.01).log()

        if label is not None and self.with_dn_query:
            # Build noised label embeddings as dn query content.
            label = torch.stack(
                [F.pad(x, (0, max_dn_gt - x.shape[0]), value=-1) for x in label]
            )
            label = label.tile(num_dn_groups, 1)
            if self.add_neg_dn:
                label = torch.cat([label, label], dim=1)
            label = label.reshape(num_dn_groups, bs, num_gt).permute(1, 0, 2).flatten(1, 2)
            # Randomly corrupt a fraction of labels (and all padded ones).
            label_noise_mask = torch.logical_or(
                torch.rand_like(label.float()) < self.label_noise_scale * 0.5,
                label == -1
            )
            label = torch.where(
                label_noise_mask,
                torch.randint_like(label, 0, self.num_classes),
                label,
            )
            dn_query = self.label_embedding(label)
        else:
            dn_query = None
        return (
            dn_anchor,
            dn_box_target,
            dn_token_positive_maps,
            attn_mask,
            valid_mask,
            dn_query,
        )


================================================
FILE: bip3d/models/utils.py
================================================
import torch
from torch import nn
from pytorch3d.transforms import euler_angles_to_matrix


def deformable_format(
    feature_maps,
    spatial_shapes=None,
    level_start_index=None,
    flat_batch=False,
    batch_size=None,
):
    """Flatten a list of feature maps to deformable-attention token format,
    or (when spatial_shapes is given) invert that flattening."""
    if spatial_shapes is None:
        if flat_batch and feature_maps[0].dim() > 4:
            feature_maps = [x.flatten(end_dim=-4) for x in feature_maps]
        feat_flatten = []
        spatial_shapes = []
        for lvl, feat in enumerate(feature_maps):
            spatial_shape = torch._shape_as_tensor(feat)[-2:].to(feat.device)
            feat = feat.flatten(start_dim=-2).transpose(-1, -2)
            feat_flatten.append(feat)
            spatial_shapes.append(spatial_shape)
        # (bs, num_feat_points, dim)
        feat_flatten = torch.cat(feat_flatten, -2)
        spatial_shapes = torch.cat(spatial_shapes).view(-1, 2)
        level_start_index = torch.cat(
            (
                spatial_shapes.new_zeros((1,)),  # (num_level)
                spatial_shapes.prod(1).cumsum(0)[:-1],
            )
        )
        return feat_flatten, spatial_shapes, level_start_index
    else:
        # Inverse: split the flat token dim back into per-level (H, W) maps.
        split_size = (spatial_shapes[:, 0] * spatial_shapes[:, 1]).tolist()
        feature_maps = feature_maps.transpose(-1, -2)
        feature_maps = list(torch.split(feature_maps, split_size, dim=-1))
        for i, feat in enumerate(feature_maps):
            feature_maps[i] = feature_maps[i].unflatten(
                -1, (spatial_shapes[i, 0], spatial_shapes[i, 1])
            )
            if batch_size is not None:
                # Optionally re-split a flattened batch dim, e.g. (bs, cams).
                if isinstance(batch_size, int):
                    feature_maps[i] = feature_maps[i].unflatten(
                        0, (batch_size, -1)
                    )
                else:
                    feature_maps[i] = feature_maps[i].unflatten(
                        0, batch_size + (-1,)
                    )
        return feature_maps


def bbox_to_corners(bbox, permutation=False):
    """Convert (N, 6/7/9) boxes [center, size(, yaw / euler angles)] to
    their 8 corner points, shape (N, 8, 3)."""
    assert (
        len(bbox.shape) == 2
    ), "bbox must be 2D tensor of shape (N, 6) or (N, 7) or (N, 9)"
    if bbox.shape[-1] == 6:
        # Axis-aligned box: identity rotation.
        rot_mat = (
            torch.eye(3, device=bbox.device)
            .unsqueeze(0)
            .repeat(bbox.shape[0], 1, 1)
        )
    elif bbox.shape[-1] == 7:
        # Yaw only: pad the remaining two euler angles with zeros.
        angles = bbox[:, 6:]
        fake_angles = torch.zeros_like(angles).repeat(1, 2)
        angles = torch.cat((angles, fake_angles), dim=1)
        rot_mat = euler_angles_to_matrix(angles, "ZXY")
    elif bbox.shape[-1] == 9:
        rot_mat = euler_angles_to_matrix(bbox[:, 6:], "ZXY")
    else:
        raise NotImplementedError
    centers = bbox[:, :3].unsqueeze(1).repeat(1, 8, 1)  # shape (N, 8, 3)
    half_sizes = (
        bbox[:, 3:6].unsqueeze(1).repeat(1, 8, 1) / 2
    )  # shape (N, 8, 3)
    # Sign pattern enumerating the 8 corners of a unit box.
    eight_corners_x = (
        torch.tensor([1, 1, 1, 1, -1, -1, -1, -1], device=bbox.device)
        .unsqueeze(0)
        .repeat(bbox.shape[0], 1)
    )  # shape (N, 8)
    eight_corners_y = (
        torch.tensor([1, 1, -1, -1, 1, 1, -1, -1], device=bbox.device)
        .unsqueeze(0)
        .repeat(bbox.shape[0], 1)
    )  # shape (N, 8)
    eight_corners_z = (
        torch.tensor([1, -1, 1, -1, 1, -1, 1, -1], device=bbox.device)
        .unsqueeze(0)
        .repeat(bbox.shape[0], 1)
    )  # shape (N, 8)
    eight_corners = torch.stack(
        (eight_corners_x, eight_corners_y, eight_corners_z), dim=-1
    )  # shape (N, 8, 3)
    eight_corners = eight_corners * half_sizes  # shape (N, 8, 3)
    # rot_mat: (N, 3, 3), eight_corners: (N, 8, 3)
rotated_corners = torch.matmul( eight_corners, rot_mat.transpose(1, 2) ) # shape (N, 8, 3) corners = rotated_corners + centers if permutation: corners = corners[:, PERMUTE_INDEX] return corners def wasserstein_distance(source, target): rot_mat_src = euler_angles_to_matrix(source[..., 6:9], "ZXY") sqrt_sigma_src = rot_mat_src @ ( source[..., 3:6, None] * rot_mat_src.transpose(-2, -1) ) rot_mat_tgt = euler_angles_to_matrix(target[..., 6:9], "ZXY") sqrt_sigma_tgt = rot_mat_tgt @ ( target[..., 3:6, None] * rot_mat_tgt.transpose(-2, -1) ) sigma_distance = sqrt_sigma_src - sqrt_sigma_tgt sigma_distance = sigma_distance.pow(2).sum(dim=-1).sum(dim=-1) center_distance = ((source[..., :3] - target[..., :3]) ** 2).sum(dim=-1) distance = sigma_distance + center_distance distance = distance.clamp(1e-7).sqrt() return distance def permutation_corner_distance(source, target): source_corners = bbox_to_corners(source) # N, 8, 3 target_corners = bbox_to_corners(target, True) # N, 48, 8, 3 distance = torch.norm( source_corners.unsqueeze(dim=-2) - target_corners, p=2, dim=-1 ) distance = distance.mean(dim=-1).min(dim=-1).values return distance def center_distance(source, target): return torch.norm(source[..., :3] - target[..., :3], p=2, dim=-1) def get_positive_map(char_positive, text_dict): bs, text_length = text_dict["embedded"].shape[:2] tokenized = text_dict["tokenized"] positive_maps = [] for i in range(bs): num_target = len(char_positive[i]) positive_map = torch.zeros( (num_target, text_length), dtype=torch.float ) for j, tok_list in enumerate(char_positive[i]): for beg, end in tok_list: try: beg_pos = tokenized.char_to_token(i, beg) end_pos = tokenized.char_to_token(i, end - 1) except Exception as e: print("beg:", beg, "end:", end) print("token_positive:", char_positive[i]) raise e if beg_pos is None: try: beg_pos = tokenized.char_to_token(i, beg + 1) if beg_pos is None: beg_pos = tokenized.char_to_token(i, beg + 2) except Exception: beg_pos = None if end_pos is None: try: 
                        end_pos = tokenized.char_to_token(i, end - 2)
                        if end_pos is None:
                            end_pos = tokenized.char_to_token(i, end - 3)
                    except Exception:
                        end_pos = None
                if beg_pos is None or end_pos is None:
                    continue
                assert beg_pos is not None and end_pos is not None
                positive_map[j, beg_pos : end_pos + 1].fill_(1)
        # Normalize each row to a distribution over tokens.
        positive_map /= (positive_map.sum(-1)[:, None] + 1e-6)
        positive_maps.append(positive_map)
    return positive_maps


def get_entities(text, char_positive, sep_token="[SEP]"):
    """Extract the entity phrase for each target by slicing its character
    spans out of the text; multiple spans are joined with sep_token."""
    batch_entities = []
    for bs_idx in range(len(char_positive)):
        entities = []
        for obj_idx in range(len(char_positive[bs_idx])):
            entity = ""
            for beg, end in char_positive[bs_idx][obj_idx]:
                if len(entity) == 0:
                    entity = text[bs_idx][beg:end]
                else:
                    entity += sep_token + text[bs_idx][beg:end]
            entities.append(entity)
        batch_entities.append(entities)
    return batch_entities


def linear_act_ln(
    embed_dims,
    in_loops,
    out_loops,
    input_dims=None,
    act_cfg=None,
):
    """Build a list of [Linear + activation] * in_loops layers repeated
    out_loops times, closing each outer loop with a LayerNorm."""
    if act_cfg is None:
        act_cfg = dict(type='ReLU', inplace=True)
    from mmcv.cnn import build_activation_layer
    if input_dims is None:
        input_dims = embed_dims
    layers = []
    for _ in range(out_loops):
        for _ in range(in_loops):
            layers.append(nn.Linear(input_dims, embed_dims))
            layers.append(build_activation_layer(act_cfg))
            # After the first Linear the width is embed_dims.
            input_dims = embed_dims
        layers.append(nn.LayerNorm(embed_dims))
    return layers


================================================
FILE: bip3d/ops/__init__.py
================================================
from .deformable_aggregation import (
    deformable_aggregation_func,
    feature_maps_format
)


================================================
FILE: bip3d/ops/deformable_aggregation.py
================================================
import torch
from torch.autograd.function import Function, once_differentiable
from torch.cuda.amp import custom_fwd, custom_bwd

from . import deformable_aggregation_ext as da
from . 
from . import deformable_aggregation_with_depth_ext as dad


class DeformableAggregationFunction(Function):
    """Autograd wrapper around the compiled deformable-aggregation op.

    The extension samples multi-camera, multi-scale feature maps at the
    given locations and blends them with per-group weights.
    """

    @staticmethod
    @custom_fwd
    def forward(
        ctx,
        mc_ms_feat,
        spatial_shape,
        scale_start_index,
        sampling_location,
        weights,
    ):
        # output: [bs, num_pts, num_embeds]
        # The CUDA extension requires contiguous float32 / int32 buffers.
        mc_ms_feat = mc_ms_feat.contiguous().float()
        spatial_shape = spatial_shape.contiguous().int()
        scale_start_index = scale_start_index.contiguous().int()
        sampling_location = sampling_location.contiguous().float()
        weights = weights.contiguous().float()
        output = da.deformable_aggregation_forward(
            mc_ms_feat,
            spatial_shape,
            scale_start_index,
            sampling_location,
            weights,
        )
        # Save everything backward needs to recompute gradients.
        ctx.save_for_backward(
            mc_ms_feat,
            spatial_shape,
            scale_start_index,
            sampling_location,
            weights,
        )
        return output

    @staticmethod
    @once_differentiable
    @custom_bwd
    def backward(ctx, grad_output):
        (
            mc_ms_feat,
            spatial_shape,
            scale_start_index,
            sampling_location,
            weights,
        ) = ctx.saved_tensors
        mc_ms_feat = mc_ms_feat.contiguous().float()
        spatial_shape = spatial_shape.contiguous().int()
        scale_start_index = scale_start_index.contiguous().int()
        sampling_location = sampling_location.contiguous().float()
        weights = weights.contiguous().float()
        # Gradients are accumulated in-place (atomicAdd) by the kernel,
        # so the buffers must start zeroed.
        grad_mc_ms_feat = torch.zeros_like(mc_ms_feat)
        grad_sampling_location = torch.zeros_like(sampling_location)
        grad_weights = torch.zeros_like(weights)
        da.deformable_aggregation_backward(
            mc_ms_feat,
            spatial_shape,
            scale_start_index,
            sampling_location,
            weights,
            grad_output.contiguous(),
            grad_mc_ms_feat,
            grad_sampling_location,
            grad_weights,
        )
        # print(grad_mc_ms_feat.abs().mean(), grad_sampling_location.abs().mean(), grad_weights.abs().mean())
        # spatial_shape / scale_start_index are index tensors: no gradient.
        return (
            grad_mc_ms_feat,
            None,
            None,
            grad_sampling_location,
            grad_weights,
        )


class DeformableAggregationWithDepthFunction(Function):
    """Depth-aware variant: the feature tensor carries ``num_depths``
    extra depth-probability channels and sampling locations carry an
    extra (continuous) depth coordinate.
    """

    @staticmethod
    @custom_fwd
    def forward(
        ctx,
        mc_ms_feat,
        spatial_shape,
        scale_start_index,
        sampling_location,
        weights,
        num_depths,
    ):
        # output: [bs, num_pts, num_embeds]
        mc_ms_feat = mc_ms_feat.contiguous().float()
        spatial_shape = spatial_shape.contiguous().int()
        scale_start_index = scale_start_index.contiguous().int()
        sampling_location = sampling_location.contiguous().float()
        weights = weights.contiguous().float()
        output = dad.deformable_aggregation_with_depth_forward(
            mc_ms_feat,
            spatial_shape,
            scale_start_index,
            sampling_location,
            weights,
            num_depths,
        )
        ctx.save_for_backward(
            mc_ms_feat,
            spatial_shape,
            scale_start_index,
            sampling_location,
            weights,
        )
        # Plain int, so it is stashed on ctx rather than save_for_backward.
        ctx._num_depths = num_depths
        return output

    @staticmethod
    @once_differentiable
    @custom_bwd
    def backward(ctx, grad_output):
        (
            mc_ms_feat,
            spatial_shape,
            scale_start_index,
            sampling_location,
            weights,
        ) = ctx.saved_tensors
        num_depths = ctx._num_depths
        mc_ms_feat = mc_ms_feat.contiguous().float()
        spatial_shape = spatial_shape.contiguous().int()
        scale_start_index = scale_start_index.contiguous().int()
        sampling_location = sampling_location.contiguous().float()
        weights = weights.contiguous().float()
        grad_mc_ms_feat = torch.zeros_like(mc_ms_feat)
        grad_sampling_location = torch.zeros_like(sampling_location)
        grad_weights = torch.zeros_like(weights)
        dad.deformable_aggregation_with_depth_backward(
            mc_ms_feat,
            spatial_shape,
            scale_start_index,
            sampling_location,
            weights,
            num_depths,
            grad_output.contiguous(),
            grad_mc_ms_feat,
            grad_sampling_location,
            grad_weights,
        )
        # print(grad_mc_ms_feat.abs().mean(), grad_sampling_location.abs().mean(), grad_weights.abs().mean())
        # print(grad_mc_ms_feat.abs().max(), grad_sampling_location.abs().max(), grad_weights.abs().max())
        # print("")
        return (
            grad_mc_ms_feat,
            None,
            None,
            grad_sampling_location,
            grad_weights,
            None,
        )
def deformable_aggregation_func(
    mc_ms_feat,
    spatial_shape,
    scale_start_index,
    sampling_location,
    weights,
    depth_prob=None,
    depth=None,
):
    """Dispatch to the plain or depth-aware aggregation autograd op.

    When both ``depth_prob`` and ``depth`` are supplied, the depth
    probabilities are appended to the feature channels and the depth
    coordinate to the sampling locations before calling the depth-aware
    kernel.
    """
    use_depth = depth_prob is not None and depth is not None
    if not use_depth:
        return DeformableAggregationFunction.apply(
            mc_ms_feat,
            spatial_shape,
            scale_start_index,
            sampling_location,
            weights,
        )
    feat_with_depth = torch.cat([mc_ms_feat, depth_prob], dim=-1)
    loc_with_depth = torch.cat([sampling_location, depth], dim=-1)
    return DeformableAggregationWithDepthFunction.apply(
        feat_with_depth,
        spatial_shape,
        scale_start_index,
        loc_with_depth,
        weights,
        depth_prob.shape[-1],
    )


def feature_maps_format(feature_maps, inverse=False):
    """Pack per-scale feature maps into one flattened column tensor.

    Forward: list of (bs, cams, C, H_i, W_i) maps -> ``[packed,
    spatial_shape, scale_start_index]`` where ``packed`` is
    (bs, cams, sum(H_i * W_i), C).  With ``inverse=True`` the packed
    triplet is expanded back into the per-scale list.
    """
    bs, num_cams = feature_maps[0].shape[:2]

    if inverse:
        spatial_shape = feature_maps[1].int()
        sizes = (spatial_shape[:, 0] * spatial_shape[:, 1]).tolist()
        flat = feature_maps[0].permute(0, 1, 3, 2)
        chunks = list(torch.split(flat, sizes, dim=-1))
        # Restore each scale's (H, W) footprint.
        return [
            chunk.reshape(
                chunk.shape[:3] + (spatial_shape[i, 0], spatial_shape[i, 1])
            )
            for i, chunk in enumerate(chunks)
        ]

    shapes = []
    starts = [0]
    columns = []
    for feat in feature_maps:
        shapes.append(feat.shape[-2:])
        starts.append(starts[-1] + feat.shape[-1] * feat.shape[-2])
        columns.append(
            torch.reshape(feat, (bs, num_cams, feat.shape[2], -1))
        )
    # Last entry is the total size, not a start offset.
    starts.pop()
    packed = torch.cat(columns, dim=-1).permute(0, 1, 3, 2)
    device = packed.device
    return [
        packed,
        torch.tensor(shapes, dtype=torch.int64, device=device),
        torch.tensor(starts, dtype=torch.int64, device=device),
    ]
================================================ FILE: bip3d/ops/setup.py ================================================ import os import torch from setuptools import setup from torch.utils.cpp_extension import ( BuildExtension, CppExtension, CUDAExtension, ) def make_cuda_ext( name, module, sources, sources_cuda=[], extra_args=[], extra_include_path=[], ): define_macros = [] extra_compile_args = {"cxx": [] + extra_args} if torch.cuda.is_available() or os.getenv("FORCE_CUDA", "0") == "1": define_macros += [("WITH_CUDA", None)] extension = CUDAExtension extra_compile_args["nvcc"] = extra_args + [ "-D__CUDA_NO_HALF_OPERATORS__", "-D__CUDA_NO_HALF_CONVERSIONS__", "-D__CUDA_NO_HALF2_OPERATORS__", ] sources += sources_cuda else: print("Compiling {} without CUDA".format(name)) extension = CppExtension return extension( name="{}.{}".format(module, name), sources=[os.path.join(*module.split("."), p) for p in sources], include_dirs=extra_include_path, define_macros=define_macros, extra_compile_args=extra_compile_args, ) if __name__ == "__main__": setup( name="deformable_aggregation_with_depth_ext", ext_modules=[ make_cuda_ext( "deformable_aggregation_with_depth_ext", module=".", sources=[ f"src/deformable_aggregation_with_depth.cpp", f"src/deformable_aggregation_with_depth_cuda.cu", ], ), ], cmdclass={"build_ext": BuildExtension}, ) setup( name="deformable_aggregation_ext", ext_modules=[ make_cuda_ext( "deformable_aggregation_ext", module=".", sources=[ f"src/deformable_aggregation.cpp", f"src/deformable_aggregation_cuda.cu", ], ), ], cmdclass={"build_ext": BuildExtension}, ) ================================================ FILE: bip3d/ops/src/deformable_aggregation.cpp ================================================ #include #include void deformable_aggregation( float* output, const float* mc_ms_feat, const int* spatial_shape, const int* scale_start_index, const float* sample_location, const float* weights, int batch_size, int num_cams, int num_feat, int num_embeds, int 
/*
 * C++/pybind bindings for the deformable-aggregation CUDA operator.
 *
 * NOTE(review): the #include directive contents and the data_ptr
 * template arguments were stripped by text extraction (angle-bracket
 * contents lost).  They are reconstructed below as
 * <torch/extension.h> / <c10/cuda/CUDAGuard.h> and
 * data_ptr<float>() / data_ptr<int>() -- confirm against the repository.
 */
#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>

/* Kernel launchers implemented in deformable_aggregation_cuda.cu. */
void deformable_aggregation(
    float* output,
    const float* mc_ms_feat,
    const int* spatial_shape,
    const int* scale_start_index,
    const float* sample_location,
    const float* weights,
    int batch_size, int num_cams, int num_feat, int num_embeds,
    int num_scale, int num_pts, int num_groups
);
void deformable_aggregation_grad(
    const float* mc_ms_feat,
    const int* spatial_shape,
    const int* scale_start_index,
    const float* sample_location,
    const float* weights,
    const float* grad_output,
    float* grad_mc_ms_feat,
    float* grad_sampling_location,
    float* grad_weights,
    int batch_size, int num_cams, int num_feat, int num_embeds,
    int num_scale, int num_pts, int num_groups
);

/* _mc_ms_feat: b, cam, feat, C */
/* _spatial_shape: scale, 2 */
/* _scale_start_index: scale */
/* _sampling_location: bs, pts, */
at::Tensor deformable_aggregation_forward(
    const at::Tensor &_mc_ms_feat,
    const at::Tensor &_spatial_shape,
    const at::Tensor &_scale_start_index,
    const at::Tensor &_sampling_location,
    const at::Tensor &_weights
) {
    // Run on the device holding the feature tensor.
    at::DeviceGuard guard(_mc_ms_feat.device());
    const at::cuda::OptionalCUDAGuard device_guard(device_of(_mc_ms_feat));
    // Sizes come straight from tensor shapes; the Python wrapper
    // guarantees contiguity and dtypes (float32 / int32).
    int batch_size = _mc_ms_feat.size(0);
    int num_cams = _mc_ms_feat.size(1);
    int num_feat = _mc_ms_feat.size(2);
    int num_embeds = _mc_ms_feat.size(3);
    int num_scale = _spatial_shape.size(0);
    int num_pts = _sampling_location.size(1);
    int num_groups = _weights.size(4);
    const float* mc_ms_feat = _mc_ms_feat.data_ptr<float>();
    const int* spatial_shape = _spatial_shape.data_ptr<int>();
    const int* scale_start_index = _scale_start_index.data_ptr<int>();
    const float* sampling_location = _sampling_location.data_ptr<float>();
    const float* weights = _weights.data_ptr<float>();
    // Kernel accumulates into a zero-initialized output buffer.
    auto output = at::zeros({batch_size, num_pts, num_embeds},
                            _mc_ms_feat.options());
    deformable_aggregation(
        output.data_ptr<float>(),
        mc_ms_feat,
        spatial_shape,
        scale_start_index,
        sampling_location,
        weights,
        batch_size, num_cams, num_feat, num_embeds,
        num_scale, num_pts, num_groups
    );
    return output;
}

/* Backward pass: fills the caller-provided (zeroed) gradient tensors. */
void deformable_aggregation_backward(
    const at::Tensor &_mc_ms_feat,
    const at::Tensor &_spatial_shape,
    const at::Tensor &_scale_start_index,
    const at::Tensor &_sampling_location,
    const at::Tensor &_weights,
    const at::Tensor &_grad_output,
    at::Tensor &_grad_mc_ms_feat,
    at::Tensor &_grad_sampling_location,
    at::Tensor &_grad_weights
) {
    at::DeviceGuard guard(_mc_ms_feat.device());
    const at::cuda::OptionalCUDAGuard device_guard(device_of(_mc_ms_feat));
    int batch_size = _mc_ms_feat.size(0);
    int num_cams = _mc_ms_feat.size(1);
    int num_feat = _mc_ms_feat.size(2);
    int num_embeds = _mc_ms_feat.size(3);
    int num_scale = _spatial_shape.size(0);
    int num_pts = _sampling_location.size(1);
    int num_groups = _weights.size(4);
    const float* mc_ms_feat = _mc_ms_feat.data_ptr<float>();
    const int* spatial_shape = _spatial_shape.data_ptr<int>();
    const int* scale_start_index = _scale_start_index.data_ptr<int>();
    const float* sampling_location = _sampling_location.data_ptr<float>();
    const float* weights = _weights.data_ptr<float>();
    const float* grad_output = _grad_output.data_ptr<float>();
    float* grad_mc_ms_feat = _grad_mc_ms_feat.data_ptr<float>();
    float* grad_sampling_location = _grad_sampling_location.data_ptr<float>();
    float* grad_weights = _grad_weights.data_ptr<float>();
    deformable_aggregation_grad(
        mc_ms_feat,
        spatial_shape,
        scale_start_index,
        sampling_location,
        weights,
        grad_output,
        grad_mc_ms_feat,
        grad_sampling_location,
        grad_weights,
        batch_size, num_cams, num_feat, num_embeds,
        num_scale, num_pts, num_groups
    );
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def(
        "deformable_aggregation_forward",
        &deformable_aggregation_forward,
        "deformable_aggregation_forward"
    );
    m.def(
        "deformable_aggregation_backward",
        &deformable_aggregation_backward,
        "deformable_aggregation_backward"
    );
}
/*
 * CUDA kernels for deformable aggregation.
 *
 * NOTE(review): #include contents were stripped by extraction; typical
 * requirements are <cuda_runtime.h> and math helpers -- confirm against
 * the repository.
 */
#include <cuda_runtime.h>

/* Bilinearly sample one channel at (h_im, w_im) from a (H, W, num_embeds)
 * slab starting at base_ptr.  Out-of-bounds corners contribute zero. */
__device__ float bilinear_sampling(
    const float *&bottom_data,
    const int &height, const int &width, const int &num_embeds,
    const float &h_im, const float &w_im,
    const int &base_ptr
) {
    const int h_low = floorf(h_im);
    const int w_low = floorf(w_im);
    const int h_high = h_low + 1;
    const int w_high = w_low + 1;

    // Fractional offsets and their complements.
    const float lh = h_im - h_low;
    const float lw = w_im - w_low;
    const float hh = 1 - lh, hw = 1 - lw;

    // Channel-last layout: one step in w advances num_embeds floats.
    const int w_stride = num_embeds;
    const int h_stride = width * w_stride;
    const int h_low_ptr_offset = h_low * h_stride;
    const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
    const int w_low_ptr_offset = w_low * w_stride;
    const int w_high_ptr_offset = w_low_ptr_offset + w_stride;

    float v1 = 0;
    if (h_low >= 0 && w_low >= 0) {
        const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
        v1 = bottom_data[ptr1];
    }
    float v2 = 0;
    if (h_low >= 0 && w_high <= width - 1) {
        const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
        v2 = bottom_data[ptr2];
    }
    float v3 = 0;
    if (h_high <= height - 1 && w_low >= 0) {
        const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
        v3 = bottom_data[ptr3];
    }
    float v4 = 0;
    if (h_high <= height - 1 && w_high <= width - 1) {
        const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
        v4 = bottom_data[ptr4];
    }
    const float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
    const float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
    return val;
}

/* Backward of bilinear_sampling: scatters gradients to the four feature
 * corners, the sampling location, and the blend weight via atomicAdd. */
__device__ void bilinear_sampling_grad(
    const float *&bottom_data, const float &weight,
    const int &height, const int &width, const int &num_embeds,
    const float &h_im, const float &w_im,
    const int &base_ptr,
    const float &grad_output,
    float *&grad_mc_ms_feat,
    float *grad_sampling_location,
    float *grad_weights
) {
    const int h_low = floorf(h_im);
    const int w_low = floorf(w_im);
    const int h_high = h_low + 1;
    const int w_high = w_low + 1;

    const float lh = h_im - h_low;
    const float lw = w_im - w_low;
    const float hh = 1 - lh, hw = 1 - lw;

    const int w_stride = num_embeds;
    const int h_stride = width * w_stride;
    const int h_low_ptr_offset = h_low * h_stride;
    const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
    const int w_low_ptr_offset = w_low * w_stride;
    const int w_high_ptr_offset = w_low_ptr_offset + w_stride;

    const float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
    // Upstream gradient already scaled by the blend weight.
    const float top_grad_mc_ms_feat = grad_output * weight;
    float grad_h_weight = 0, grad_w_weight = 0;

    float v1 = 0;
    if (h_low >= 0 && w_low >= 0) {
        const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
        v1 = bottom_data[ptr1];
        grad_h_weight -= hw * v1;
        grad_w_weight -= hh * v1;
        atomicAdd(grad_mc_ms_feat + ptr1, w1 * top_grad_mc_ms_feat);
    }
    float v2 = 0;
    if (h_low >= 0 && w_high <= width - 1) {
        const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
        v2 = bottom_data[ptr2];
        grad_h_weight -= lw * v2;
        grad_w_weight += hh * v2;
        atomicAdd(grad_mc_ms_feat + ptr2, w2 * top_grad_mc_ms_feat);
    }
    float v3 = 0;
    if (h_high <= height - 1 && w_low >= 0) {
        const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
        v3 = bottom_data[ptr3];
        grad_h_weight += hw * v3;
        grad_w_weight -= lh * v3;
        atomicAdd(grad_mc_ms_feat + ptr3, w3 * top_grad_mc_ms_feat);
    }
    float v4 = 0;
    if (h_high <= height - 1 && w_high <= width - 1) {
        const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
        v4 = bottom_data[ptr4];
        grad_h_weight += lw * v4;
        grad_w_weight += lh * v4;
        atomicAdd(grad_mc_ms_feat + ptr4, w4 * top_grad_mc_ms_feat);
    }
    const float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
    // d(out)/d(weight) is the sampled value itself.
    atomicAdd(grad_weights, grad_output * val);
    // Chain rule through the [0,1] -> pixel mapping (scaled by W / H).
    atomicAdd(grad_sampling_location,
              width * grad_w_weight * top_grad_mc_ms_feat);
    atomicAdd(grad_sampling_location + 1,
              height * grad_h_weight * top_grad_mc_ms_feat);
}

/* One thread per (batch, point, channel) output element: sums weighted
 * bilinear samples over all cameras and scales. */
__global__ void deformable_aggregation_kernel(
    const int num_kernels,
    float* output,
    const float* mc_ms_feat,
    const int* spatial_shape,
    const int* scale_start_index,
    const float* sample_location,
    const float* weights,
    int batch_size, int num_cams, int num_feat, int num_embeds,
    int num_scale, int num_pts, int num_groups
) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= num_kernels) return;

    float *output_ptr = output + idx;
    // Decompose flat index into (batch, point, channel).
    const int channel_index = idx % num_embeds;
    const int groups_index = channel_index / (num_embeds / num_groups);
    idx /= num_embeds;
    const int pts_index = idx % num_pts;
    idx /= num_pts;
    const int batch_index = idx;

    const int value_cam_stride = num_feat * num_embeds;
    const int weight_cam_stride = num_scale * num_groups;
    // Locations are stored as (w, h) pairs per camera.
    int loc_offset = (batch_index * num_pts + pts_index) * num_cams << 1;
    int value_offset =
        batch_index * num_cams * value_cam_stride + channel_index;
    int weight_offset = (
        (batch_index * num_pts + pts_index) * num_cams * weight_cam_stride
        + groups_index
    );

    float result = 0;
    for (int cam_index = 0; cam_index < num_cams;
         ++cam_index, loc_offset += 2) {
        const float loc_w = sample_location[loc_offset];
        const float loc_h = sample_location[loc_offset + 1];
        // Skip points projected outside the (normalized, exclusive) image.
        if (loc_w > 0 && loc_w < 1 && loc_h > 0 && loc_h < 1) {
            for (int scale_index = 0; scale_index < num_scale;
                 ++scale_index) {
                const int scale_offset =
                    scale_start_index[scale_index] * num_embeds;
                const int spatial_shape_ptr = scale_index << 1;
                const int h = spatial_shape[spatial_shape_ptr];
                const int w = spatial_shape[spatial_shape_ptr + 1];
                // Align normalized coords with pixel centers.
                const float h_im = loc_h * h - 0.5;
                const float w_im = loc_w * w - 0.5;
                const int value_ptr = value_offset + scale_offset
                    + value_cam_stride * cam_index;
                const float *weights_ptr = (
                    weights + weight_offset + scale_index * num_groups
                    + weight_cam_stride * cam_index
                );
                result += bilinear_sampling(
                    mc_ms_feat, h, w, num_embeds, h_im, w_im, value_ptr
                ) * *weights_ptr;
            }
        }
    }
    *output_ptr = result;
}

/* Gradient kernel: same index decomposition as the forward kernel. */
__global__ void deformable_aggregation_grad_kernel(
    const int num_kernels,
    const float* mc_ms_feat,
    const int* spatial_shape,
    const int* scale_start_index,
    const float* sample_location,
    const float* weights,
    const float* grad_output,
    float* grad_mc_ms_feat,
    float* grad_sampling_location,
    float* grad_weights,
    int batch_size, int num_cams, int num_feat, int num_embeds,
    int num_scale, int num_pts, int num_groups
) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= num_kernels) return;

    const float grad = grad_output[idx];
    const int channel_index = idx % num_embeds;
    const int groups_index = channel_index / (num_embeds / num_groups);
    idx /= num_embeds;
    const int pts_index = idx % num_pts;
    idx /= num_pts;
    const int batch_index = idx;

    const int value_cam_stride = num_feat * num_embeds;
    const int weight_cam_stride = num_scale * num_groups;
    int loc_offset = (batch_index * num_pts + pts_index) * num_cams << 1;
    int value_offset =
        batch_index * num_cams * value_cam_stride + channel_index;
    int weight_offset = (
        (batch_index * num_pts + pts_index) * num_cams * weight_cam_stride
        + groups_index
    );

    for (int cam_index = 0; cam_index < num_cams;
         ++cam_index, loc_offset += 2) {
        const float loc_w = sample_location[loc_offset];
        const float loc_h = sample_location[loc_offset + 1];
        if (loc_w > 0 && loc_w < 1 && loc_h > 0 && loc_h < 1) {
            for (int scale_index = 0; scale_index < num_scale;
                 ++scale_index) {
                const int scale_offset =
                    scale_start_index[scale_index] * num_embeds;
                const int spatial_shape_ptr = scale_index << 1;
                const int h = spatial_shape[spatial_shape_ptr];
                const int w = spatial_shape[spatial_shape_ptr + 1];
                const float h_im = loc_h * h - 0.5;
                const float w_im = loc_w * w - 0.5;
                const int value_ptr = value_offset + scale_offset
                    + value_cam_stride * cam_index;
                const int weights_ptr = weight_offset
                    + scale_index * num_groups
                    + weight_cam_stride * cam_index;
                const float weight = weights[weights_ptr];
                float *grad_location_ptr =
                    grad_sampling_location + loc_offset;
                float *grad_weights_ptr = grad_weights + weights_ptr;
                bilinear_sampling_grad(
                    mc_ms_feat, weight, h, w, num_embeds,
                    h_im, w_im, value_ptr,
                    grad, grad_mc_ms_feat,
                    grad_location_ptr, grad_weights_ptr
                );
            }
        }
    }
}

/* Host-side launcher: one thread per output scalar, 512 threads/block. */
void deformable_aggregation(
    float* output,
    const float* mc_ms_feat,
    const int* spatial_shape,
    const int* scale_start_index,
    const float* sample_location,
    const float* weights,
    int batch_size, int num_cams, int num_feat, int num_embeds,
    int num_scale, int num_pts, int num_groups
) {
    const int num_kernels = batch_size * num_pts * num_embeds;
    deformable_aggregation_kernel
        <<<(int)ceil(((double)num_kernels/512)), 512>>>(
        num_kernels, output, mc_ms_feat,
        spatial_shape, scale_start_index, sample_location, weights,
        batch_size, num_cams, num_feat, num_embeds,
        num_scale, num_pts, num_groups
    );
}

/* Host-side launcher for the backward kernel. */
void deformable_aggregation_grad(
    const float* mc_ms_feat,
    const int* spatial_shape,
    const int* scale_start_index,
    const float* sample_location,
    const float* weights,
    const float* grad_output,
    float* grad_mc_ms_feat,
    float* grad_sampling_location,
    float* grad_weights,
    int batch_size, int num_cams, int num_feat, int num_embeds,
    int num_scale, int num_pts, int num_groups
) {
    const int num_kernels = batch_size * num_pts * num_embeds;
    deformable_aggregation_grad_kernel
        <<<(int)ceil(((double)num_kernels/512)), 512>>>(
        num_kernels, mc_ms_feat,
        spatial_shape, scale_start_index, sample_location, weights,
        grad_output, grad_mc_ms_feat, grad_sampling_location, grad_weights,
        batch_size, num_cams, num_feat, num_embeds,
        num_scale, num_pts, num_groups
    );
}
/*
 * C++/pybind bindings for the depth-aware deformable-aggregation op.
 *
 * NOTE(review): #include contents and data_ptr template arguments were
 * stripped by text extraction; reconstructed as in the non-depth
 * bindings -- confirm against the repository.
 */
#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>

/* Kernel launchers implemented in
 * deformable_aggregation_with_depth_cuda.cu. */
void deformable_aggregation_with_depth(
    float* output,
    const float* mc_ms_feat,
    const int* spatial_shape,
    const int* scale_start_index,
    const float* sample_location,
    const float* weights,
    int batch_size, int num_cams, int num_feat, int num_embeds,
    int num_scale, int num_pts, int num_groups, int num_depths
);
void deformable_aggregation_with_depth_grad(
    const float* mc_ms_feat,
    const int* spatial_shape,
    const int* scale_start_index,
    const float* sample_location,
    const float* weights,
    const float* grad_output,
    float* grad_mc_ms_feat,
    float* grad_sampling_location,
    float* grad_weights,
    int batch_size, int num_cams, int num_feat, int num_embeds,
    int num_scale, int num_pts, int num_groups, int num_depths
);

/* _mc_ms_feat: b, cam, feat, C+D */
/* _spatial_shape: scale, 2 */
/* _scale_start_index: scale */
/* _sampling_location: bs, pts, */
at::Tensor deformable_aggregation_with_depth_forward(
    const at::Tensor &_mc_ms_feat,
    const at::Tensor &_spatial_shape,
    const at::Tensor &_scale_start_index,
    const at::Tensor &_sampling_location,
    const at::Tensor &_weights,
    const int num_depths
) {
    at::DeviceGuard guard(_mc_ms_feat.device());
    const at::cuda::OptionalCUDAGuard device_guard(device_of(_mc_ms_feat));
    int batch_size = _mc_ms_feat.size(0);
    int num_cams = _mc_ms_feat.size(1);
    int num_feat = _mc_ms_feat.size(2);
    // Last dim packs features + depth-probability channels;
    // only the feature part contributes to the output width.
    int num_embeds = _mc_ms_feat.size(3) - num_depths;
    int num_scale = _spatial_shape.size(0);
    int num_pts = _sampling_location.size(1);
    int num_groups = _weights.size(4);
    const float* mc_ms_feat = _mc_ms_feat.data_ptr<float>();
    const int* spatial_shape = _spatial_shape.data_ptr<int>();
    const int* scale_start_index = _scale_start_index.data_ptr<int>();
    const float* sampling_location = _sampling_location.data_ptr<float>();
    const float* weights = _weights.data_ptr<float>();
    auto output = at::zeros({batch_size, num_pts, num_embeds},
                            _mc_ms_feat.options());
    deformable_aggregation_with_depth(
        output.data_ptr<float>(),
        mc_ms_feat,
        spatial_shape,
        scale_start_index,
        sampling_location,
        weights,
        batch_size, num_cams, num_feat, num_embeds,
        num_scale, num_pts, num_groups, num_depths
    );
    return output;
}

/* Backward pass: fills the caller-provided (zeroed) gradient tensors. */
void deformable_aggregation_with_depth_backward(
    const at::Tensor &_mc_ms_feat,
    const at::Tensor &_spatial_shape,
    const at::Tensor &_scale_start_index,
    const at::Tensor &_sampling_location,
    const at::Tensor &_weights,
    const int num_depths,
    const at::Tensor &_grad_output,
    at::Tensor &_grad_mc_ms_feat,
    at::Tensor &_grad_sampling_location,
    at::Tensor &_grad_weights
) {
    at::DeviceGuard guard(_mc_ms_feat.device());
    const at::cuda::OptionalCUDAGuard device_guard(device_of(_mc_ms_feat));
    int batch_size = _mc_ms_feat.size(0);
    int num_cams = _mc_ms_feat.size(1);
    int num_feat = _mc_ms_feat.size(2);
    int num_embeds = _mc_ms_feat.size(3) - num_depths;
    int num_scale = _spatial_shape.size(0);
    int num_pts = _sampling_location.size(1);
    int num_groups = _weights.size(4);
    const float* mc_ms_feat = _mc_ms_feat.data_ptr<float>();
    const int* spatial_shape = _spatial_shape.data_ptr<int>();
    const int* scale_start_index = _scale_start_index.data_ptr<int>();
    const float* sampling_location = _sampling_location.data_ptr<float>();
    const float* weights = _weights.data_ptr<float>();
    const float* grad_output = _grad_output.data_ptr<float>();
    float* grad_mc_ms_feat = _grad_mc_ms_feat.data_ptr<float>();
    float* grad_sampling_location = _grad_sampling_location.data_ptr<float>();
    float* grad_weights = _grad_weights.data_ptr<float>();
    deformable_aggregation_with_depth_grad(
        mc_ms_feat,
        spatial_shape,
        scale_start_index,
        sampling_location,
        weights,
        grad_output,
        grad_mc_ms_feat,
        grad_sampling_location,
        grad_weights,
        batch_size, num_cams, num_feat, num_embeds,
        num_scale, num_pts, num_groups, num_depths
    );
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def(
        "deformable_aggregation_with_depth_forward",
        &deformable_aggregation_with_depth_forward,
        "deformable_aggregation_with_depth_forward"
    );
    m.def(
        "deformable_aggregation_with_depth_backward",
        &deformable_aggregation_with_depth_backward,
        "deformable_aggregation_with_depth_backward"
    );
}
/* Depth-aware bilinear sampling: each of the four spatial corners
 * contributes its feature value multiplied by a depth probability that
 * is itself linearly interpolated between the two nearest depth bins
 * (d_low / d_high) -- a bilinear-in-space, linear-in-depth product.
 * base_ptr addresses the feature channel, depth_ptr the first depth
 * channel within the packed (C + D)-wide last dimension. */
__device__ float bilinear_sampling(
    const float *&bottom_data,
    const int &height, const int &width,
    const int &num_embeds_with_depth, const int &num_depths,
    const float &h_im, const float &w_im, const float &loc_d,
    const int &base_ptr, const int &depth_ptr
) {
    const int h_low = floorf(h_im);
    const int w_low = floorf(w_im);
    const int h_high = h_low + 1;
    const int w_high = w_low + 1;
    const int d_low = floorf(loc_d);
    const int d_high = d_low + 1;

    const float lh = h_im - h_low;
    const float lw = w_im - w_low;
    const float hh = 1 - lh, hw = 1 - lw;
    const float ld = loc_d - d_low;
    const float hd = 1 - ld;

    // Channel-last layout over the packed C + D channels.
    const int w_stride = num_embeds_with_depth;
    const int h_stride = width * w_stride;
    const int h_low_ptr_offset = h_low * h_stride;
    const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
    const int w_low_ptr_offset = w_low * w_stride;
    const int w_high_ptr_offset = w_low_ptr_offset + w_stride;

    float v1 = 0;
    float dp_low1 = 0;
    float dp_high1 = 0;
    // Depth-bin validity is shared by all four spatial corners.
    const bool flag_d_low = d_low >= 0 && d_low < num_depths;
    const bool flag_d_high = d_high >= 0 && d_high < num_depths;
    if (h_low >= 0 && w_low >= 0) {
        const int ptr1 = h_low_ptr_offset + w_low_ptr_offset;
        v1 = bottom_data[ptr1 + base_ptr];
        const int ptr_d1 = ptr1 + depth_ptr + d_low;
        if (flag_d_low) { dp_low1 = bottom_data[ptr_d1]; }
        if (flag_d_high) { dp_high1 = bottom_data[ptr_d1 + 1]; }
    }
    float v2 = 0;
    float dp_low2 = 0;
    float dp_high2 = 0;
    if (h_low >= 0 && w_high <= width - 1) {
        const int ptr2 = h_low_ptr_offset + w_high_ptr_offset;
        v2 = bottom_data[ptr2 + base_ptr];
        const int ptr_d2 = ptr2 + depth_ptr + d_low;
        if (flag_d_low) { dp_low2 = bottom_data[ptr_d2]; }
        if (flag_d_high) { dp_high2 = bottom_data[ptr_d2 + 1]; }
    }
    float v3 = 0;
    float dp_low3 = 0;
    float dp_high3 = 0;
    if (h_high <= height - 1 && w_low >= 0) {
        const int ptr3 = h_high_ptr_offset + w_low_ptr_offset;
        v3 = bottom_data[ptr3 + base_ptr];
        const int ptr_d3 = ptr3 + depth_ptr + d_low;
        if (flag_d_low) { dp_low3 = bottom_data[ptr_d3]; }
        if (flag_d_high) { dp_high3 = bottom_data[ptr_d3 + 1]; }
    }
    float v4 = 0;
    float dp_low4 = 0;
    float dp_high4 = 0;
    if (h_high <= height - 1 && w_high <= width - 1) {
        const int ptr4 = h_high_ptr_offset + w_high_ptr_offset;
        v4 = bottom_data[ptr4 + base_ptr];
        const int ptr_d4 = ptr4 + depth_ptr + d_low;
        if (flag_d_low) { dp_low4 = bottom_data[ptr_d4]; }
        if (flag_d_high) { dp_high4 = bottom_data[ptr_d4 + 1]; }
    }
    const float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
    // Blend the depth-weighted corner values across the two depth bins.
    const float val =
        hd * (w1 * v1 * dp_low1 + w2 * v2 * dp_low2
              + w3 * v3 * dp_low3 + w4 * v4 * dp_low4)
        + ld * (w1 * v1 * dp_high1 + w2 * v2 * dp_high2
                + w3 * v3 * dp_high3 + w4 * v4 * dp_high4);
    return val;
}
const int &width, const int &num_embeds_with_depth, const int &num_depths,
    const float &h_im, const float &w_im, const float &loc_d,
    const int &base_ptr, const int &depth_ptr, const float &grad_output,
    float *&grad_mc_ms_feat, float *grad_sampling_location,
    float *grad_weights) {
    // Same corner/weight setup as the forward bilinear_sampling.
    const int h_low = floorf(h_im);
    const int w_low = floorf(w_im);
    const int h_high = h_low + 1;
    const int w_high = w_low + 1;
    const int d_low = floorf(loc_d);
    const int d_high = d_low + 1;

    const float lh = h_im - h_low;
    const float lw = w_im - w_low;
    const float hh = 1 - lh, hw = 1 - lw;
    const float ld = loc_d - d_low;
    const float hd = 1 - ld;

    const int w_stride = num_embeds_with_depth;
    const int h_stride = width * w_stride;
    const int h_low_ptr_offset = h_low * h_stride;
    const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
    const int w_low_ptr_offset = w_low * w_stride;
    const int w_high_ptr_offset = w_low_ptr_offset + w_stride;

    const float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
    // Upstream gradient scaled by this sample's aggregation weight.
    const float top_grad_mc_ms_feat = grad_output * weight;

    const bool flag_d_low = d_low >= 0 && d_low < num_depths;
    const bool flag_d_high = d_high >= 0 && d_high < num_depths;
    float v1 = 0;
    float dp_low1 = 0;
    float dp_high1 = 0;
    // Corner 1: (h_low, w_low) — gradient w.r.t. the two depth bins and the
    // feature value (product rule on v * dp).
    if (h_low >= 0 && w_low >= 0) {
        const int ptr1 = h_low_ptr_offset + w_low_ptr_offset;
        v1 = bottom_data[ptr1 + base_ptr];
        const int ptr_d1 = ptr1 + depth_ptr + d_low;
        if (flag_d_low) {
            dp_low1 = bottom_data[ptr_d1];
            atomicAdd(grad_mc_ms_feat + ptr_d1,
                      w1 * top_grad_mc_ms_feat * v1 * hd);
        }
        if (flag_d_high) {
            dp_high1 = bottom_data[ptr_d1 + 1];
            atomicAdd(grad_mc_ms_feat + ptr_d1 + 1,
                      w1 * top_grad_mc_ms_feat * v1 * ld);
        }
        atomicAdd(grad_mc_ms_feat + ptr1 + base_ptr,
                  w1 * top_grad_mc_ms_feat * (dp_low1 * hd + dp_high1 * ld));
    }
    float v2 = 0;
    float dp_low2 = 0;
    float dp_high2 = 0;
    // Corner 2: (h_low, w_high).
    if (h_low >= 0 && w_high <= width - 1) {
        const int ptr2 = h_low_ptr_offset + w_high_ptr_offset;
        v2 = bottom_data[ptr2 + base_ptr];
        const int ptr_d2 = ptr2 + depth_ptr + d_low;
        if (flag_d_low) {
            dp_low2 = bottom_data[ptr_d2];
            atomicAdd(grad_mc_ms_feat + ptr_d2,
                      w2 * top_grad_mc_ms_feat * v2 * hd);
        }
        if (flag_d_high) {
            dp_high2 = bottom_data[ptr_d2 + 1];
            atomicAdd(grad_mc_ms_feat + ptr_d2 + 1,
                      w2 * top_grad_mc_ms_feat * v2 * ld);
        }
        atomicAdd(grad_mc_ms_feat + ptr2 + base_ptr,
                  w2 * top_grad_mc_ms_feat * (dp_low2 * hd + dp_high2 * ld));
    }
    float v3 = 0;
    float dp_low3 = 0;
    float dp_high3 = 0;
    // Corner 3: (h_high, w_low).
    if (h_high <= height - 1 && w_low >= 0) {
        const int ptr3 = h_high_ptr_offset + w_low_ptr_offset;
        v3 = bottom_data[ptr3 + base_ptr];
        const int ptr_d3 = ptr3 + depth_ptr + d_low;
        if (flag_d_low) {
            dp_low3 = bottom_data[ptr_d3];
            atomicAdd(grad_mc_ms_feat + ptr_d3,
                      w3 * top_grad_mc_ms_feat * v3 * hd);
        }
        if (flag_d_high) {
            dp_high3 = bottom_data[ptr_d3 + 1];
            atomicAdd(grad_mc_ms_feat + ptr_d3 + 1,
                      w3 * top_grad_mc_ms_feat * v3 * ld);
        }
        atomicAdd(grad_mc_ms_feat + ptr3 + base_ptr,
                  w3 * top_grad_mc_ms_feat * (dp_low3 * hd + dp_high3 * ld));
    }
    float v4 = 0;
    float dp_low4 = 0;
    float dp_high4 = 0;
    // Corner 4: (h_high, w_high).
    if (h_high <= height - 1 && w_high <= width - 1) {
        const int ptr4 = h_high_ptr_offset + w_high_ptr_offset;
        v4 = bottom_data[ptr4 + base_ptr];
        const int ptr_d4 = ptr4 + depth_ptr + d_low;
        if (flag_d_low) {
            dp_low4 = bottom_data[ptr_d4];
            atomicAdd(grad_mc_ms_feat + ptr_d4,
                      w4 * top_grad_mc_ms_feat * v4 * hd);
        }
        if (flag_d_high) {
            dp_high4 = bottom_data[ptr_d4 + 1];
            atomicAdd(grad_mc_ms_feat + ptr_d4 + 1,
                      w4 * top_grad_mc_ms_feat * v4 * ld);
        }
        atomicAdd(grad_mc_ms_feat + ptr4 + base_ptr,
                  w4 * top_grad_mc_ms_feat * (dp_low4 * hd + dp_high4 * ld));
    }
    // val1/val2: interpolated values at the low/high depth bins; their
    // hd/ld blend is the forward output, so grad w.r.t. the aggregation
    // weight is grad_output * val.
    const float val1 = w1 * v1 * dp_low1 + w2 * v2 * dp_low2 +
                       w3 * v3 * dp_low3 + w4 * v4 * dp_low4;
    const float val2 = w1 * v1 * dp_high1 + w2 * v2 * dp_high2 +
                       w3 * v3 * dp_high3 + w4 * v4 * dp_high4;
    const float val = hd * val1 + ld * val2;
    atomicAdd(grad_weights, grad_output * val);

    // d(val)/d(w_im) and d(val)/d(h_im): signs follow the derivatives of
    // the bilinear corner weights w1..w4.
    const float grad_w_weight =
        hd * (-hh * v1 * dp_low1 + hh * v2 * dp_low2 - lh * v3 * dp_low3 +
              lh * v4 * dp_low4) +
        ld * (-hh * v1 * dp_high1 + hh * v2 * dp_high2 - lh * v3 * dp_high3 +
              lh * v4 * dp_high4);
    const float grad_h_weight =
        hd * (-hw * v1 * dp_low1 - lw * v2 * dp_low2 + hw * v3 * dp_low3 +
              lw * v4 * dp_low4) +
        ld * (-hw * v1 * dp_high1 - lw * v2 * dp_high2 + hw * v3 * dp_high3 +
              lw * v4 * dp_high4);
    /* const float grad_d_weight = -1 * (w1 * v1 * dp_low1 + w2 * v2 * dp_low2 +
     * w3 * v3 * dp_low3 + w4 * v4 * dp_low4) */
    /* + (w1 * v1 * dp_high1 + w2 * v2 * dp_high2 + w3 * v3 * dp_high3 + w4 *
     * v4 * dp_high4); */
    // d(val)/d(loc_d) reduces to the difference of the two depth slices.
    const float grad_d_weight = val2 - val1;
    /* const float grad_d_weight = w1 * v1 * (dp_high1 - dp_low1) */
    /* + w2 * v2 * (dp_high2 - dp_low2) */
    /* + w3 * v3 * (dp_high3 - dp_low3) */
    /* + w4 * v4 * (dp_high4 - dp_low4); */
    // Chain rule back to normalized coords: w/h were scaled by width/height
    // in the kernel, so the spatial grads are rescaled here.
    atomicAdd(grad_sampling_location,
              width * top_grad_mc_ms_feat * grad_w_weight);
    atomicAdd(grad_sampling_location + 1,
              height * top_grad_mc_ms_feat * grad_h_weight);
    atomicAdd(grad_sampling_location + 2,
              top_grad_mc_ms_feat * grad_d_weight);
}

// Forward kernel: one thread per (batch, point, channel) output element.
// Each thread loops over cameras and scales, accumulating depth-modulated
// bilinear samples weighted by the per-(cam, scale, group) weights.
__global__ void deformable_aggregation_with_depth_kernel(
    const int num_kernels, float *output, const float *mc_ms_feat,
    const int *spatial_shape, const int *scale_start_index,
    const float *sample_location, const float *weights, int batch_size,
    int num_cams, int num_feat, int num_embeds, int num_scale, int num_pts,
    int num_groups, int num_depths) {
    long int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= num_kernels) return;

    float *output_ptr = output + idx;
    // Decompose the flat index into (batch, point, channel); the group
    // index selects which aggregation weight applies to this channel.
    const int channel_index = idx % num_embeds;
    const int groups_index = channel_index / (num_embeds / num_groups);
    idx /= num_embeds;
    const int pts_index = idx % num_pts;
    idx /= num_pts;
    const int batch_index = idx;

    // Each pixel stores num_embeds feature channels followed by num_depths
    // depth-distribution values.
    const int num_embeds_with_depth = num_embeds + num_depths;
    const int value_cam_stride = num_feat * num_embeds_with_depth;
    const int weight_cam_stride = num_scale * num_groups;
    int loc_offset = (batch_index * num_pts + pts_index) * num_cams * 3;
    int value_offset = batch_index * num_cams * value_cam_stride;
    int depth_offset = value_offset + num_embeds;
    value_offset = value_offset +
channel_index;
    int weight_offset =
        ((batch_index * num_pts + pts_index) * num_cams * weight_cam_stride +
         groups_index);
    float result = 0;
    // Accumulate over all cameras; loc_offset advances by 3 (w, h, d).
    for (int cam_index = 0; cam_index < num_cams;
         ++cam_index, loc_offset += 3) {
        const float loc_w = sample_location[loc_offset];
        const float loc_h = sample_location[loc_offset + 1];
        const float loc_d = sample_location[loc_offset + 2];
        // Skip samples outside the normalized image plane / depth range.
        if (loc_w > 0 && loc_w < 1 && loc_h > 0 && loc_h < 1 && loc_d > -1 &&
            loc_d < num_depths) {
            for (int scale_index = 0; scale_index < num_scale;
                 ++scale_index) {
                const int scale_offset =
                    scale_start_index[scale_index] * num_embeds_with_depth;

                const int spatial_shape_ptr = scale_index << 1;
                const int h = spatial_shape[spatial_shape_ptr];
                const int w = spatial_shape[spatial_shape_ptr + 1];

                // Normalized -> pixel coordinates for this scale.
                const float h_im = loc_h * h - 0.5;
                const float w_im = loc_w * w - 0.5;
                const int value_ptr =
                    value_offset + scale_offset + value_cam_stride * cam_index;
                const int depth_ptr =
                    depth_offset + scale_offset + value_cam_stride * cam_index;
                const float *weights_ptr =
                    (weights + weight_offset + scale_index * num_groups +
                     weight_cam_stride * cam_index);
                result += bilinear_sampling(mc_ms_feat, h, w,
                                            num_embeds_with_depth, num_depths,
                                            h_im, w_im, loc_d, value_ptr,
                                            depth_ptr) *
                          *weights_ptr;
            }
        }
    }
    *output_ptr = result;
}

// Backward kernel: mirrors the forward index decomposition and delegates
// the per-sample gradient math to bilinear_sampling_grad.
__global__ void deformable_aggregation_with_depth_grad_kernel(
    const int num_kernels, const float *mc_ms_feat, const int *spatial_shape,
    const int *scale_start_index, const float *sample_location,
    const float *weights, const float *grad_output, float *grad_mc_ms_feat,
    float *grad_sampling_location, float *grad_weights, int batch_size,
    int num_cams, int num_feat, int num_embeds, int num_scale, int num_pts,
    int num_groups, int num_depths) {
    long int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= num_kernels) return;

    const float grad = grad_output[idx];
    const int channel_index = idx % num_embeds;
    const int groups_index = channel_index / (num_embeds / num_groups);
    idx /= num_embeds;
    const int pts_index = idx % num_pts;
    idx /= num_pts;
    const int batch_index = idx;

    const int num_embeds_with_depth = num_embeds + num_depths;
    const int value_cam_stride = num_feat * num_embeds_with_depth;
    const int weight_cam_stride = num_scale * num_groups;
    int loc_offset = (batch_index * num_pts + pts_index) * num_cams * 3;
    int value_offset = batch_index * num_cams * value_cam_stride;
    int depth_offset = value_offset + num_embeds;
    value_offset = value_offset + channel_index;
    int weight_offset =
        ((batch_index * num_pts + pts_index) * num_cams * weight_cam_stride +
         groups_index);
    for (int cam_index = 0; cam_index < num_cams;
         ++cam_index, loc_offset += 3) {
        const float loc_w = sample_location[loc_offset];
        const float loc_h = sample_location[loc_offset + 1];
        const float loc_d = sample_location[loc_offset + 2];
        // Same in-bounds gate as the forward pass: out-of-range samples
        // contributed nothing, so they receive no gradient.
        if (loc_w > 0 && loc_w < 1 && loc_h > 0 && loc_h < 1 && loc_d > -1 &&
            loc_d < num_depths) {
            for (int scale_index = 0; scale_index < num_scale;
                 ++scale_index) {
                const int scale_offset =
                    scale_start_index[scale_index] * num_embeds_with_depth;
                const int spatial_shape_ptr = scale_index << 1;
                const int h = spatial_shape[spatial_shape_ptr];
                const int w = spatial_shape[spatial_shape_ptr + 1];
                const float h_im = loc_h * h - 0.5;
                const float w_im = loc_w * w - 0.5;
                const int value_ptr =
                    value_offset + scale_offset + value_cam_stride * cam_index;
                const int depth_ptr =
                    depth_offset + scale_offset + value_cam_stride * cam_index;
                const int weights_ptr =
                    weight_offset + scale_index * num_groups +
                    weight_cam_stride * cam_index;
                const float weight = weights[weights_ptr];
                float *grad_location_ptr = grad_sampling_location + loc_offset;
                float *grad_weights_ptr = grad_weights + weights_ptr;
                bilinear_sampling_grad(mc_ms_feat, weight, h, w,
                                       num_embeds_with_depth, num_depths,
                                       h_im, w_im, loc_d, value_ptr,
                                       depth_ptr, grad, grad_mc_ms_feat,
                                       grad_location_ptr, grad_weights_ptr);
            }
        }
    }
}

// Host-side launcher for the forward kernel: one thread per output
// element, 512 threads per block.
void deformable_aggregation_with_depth(
    float *output, const float *mc_ms_feat, const int *spatial_shape,
    const int
*scale_start_index, const float *sample_location, const float *weights,
    int batch_size, int num_cams, int num_feat, int num_embeds, int num_scale,
    int num_pts, int num_groups, int num_depths) {
    // One kernel thread per (batch, point, channel) output element.
    const long int num_kernels = batch_size * num_pts * num_embeds;
    deformable_aggregation_with_depth_kernel<<<
        (int)ceil(((double)num_kernels / 512)), 512>>>(
        num_kernels, output, mc_ms_feat, spatial_shape, scale_start_index,
        sample_location, weights, batch_size, num_cams, num_feat, num_embeds,
        num_scale, num_pts, num_groups, num_depths);
}

// Host-side launcher for the backward kernel; grid/block sizing matches the
// forward launcher.
void deformable_aggregation_with_depth_grad(
    const float *mc_ms_feat, const int *spatial_shape,
    const int *scale_start_index, const float *sample_location,
    const float *weights, const float *grad_output, float *grad_mc_ms_feat,
    float *grad_sampling_location, float *grad_weights, int batch_size,
    int num_cams, int num_feat, int num_embeds, int num_scale, int num_pts,
    int num_groups, int num_depths) {
    const long int num_kernels = batch_size * num_pts * num_embeds;
    deformable_aggregation_with_depth_grad_kernel<<<
        (int)ceil(((double)num_kernels / 512)), 512>>>(
        num_kernels, mc_ms_feat, spatial_shape, scale_start_index,
        sample_location, weights, grad_output, grad_mc_ms_feat,
        grad_sampling_location, grad_weights, batch_size, num_cams, num_feat,
        num_embeds, num_scale, num_pts, num_groups, num_depths);
}


================================================
FILE: bip3d/registry.py
================================================
# Project-scoped mmengine registries. Each registry inherits from the
# corresponding mmengine root registry; `locations` tells mmengine which
# modules to import lazily so that components register themselves.
from mmengine import DATASETS as MMENGINE_DATASETS
from mmengine import METRICS as MMENGINE_METRICS
from mmengine import MODELS as MMENGINE_MODELS
from mmengine import TASK_UTILS as MMENGINE_TASK_UTILS
from mmengine import TRANSFORMS as MMENGINE_TRANSFORMS
from mmengine import VISBACKENDS as MMENGINE_VISBACKENDS
from mmengine import VISUALIZERS as MMENGINE_VISUALIZERS
from mmengine import Registry

MODELS = Registry('model', parent=MMENGINE_MODELS, locations=['bip3d.models'])
DATASETS = Registry('dataset', parent=MMENGINE_DATASETS,
                    locations=['bip3d.datasets'])
TRANSFORMS = Registry('transform', parent=MMENGINE_TRANSFORMS,
                      locations=['bip3d.datasets.transforms'])
METRICS = Registry('metric', parent=MMENGINE_METRICS, locations=['bip3d.eval'])
TASK_UTILS = Registry('task util', parent=MMENGINE_TASK_UTILS,
                      locations=['bip3d.models'])


================================================
FILE: bip3d/structures/__init__.py
================================================
# Copyright (c) OpenRobotLab. All rights reserved.
from .bbox_3d import (BaseInstance3DBoxes, Box3DMode, Coord3DMode,
                      EulerDepthInstance3DBoxes, EulerInstance3DBoxes,
                      get_box_type, get_proj_mat_by_coord_type, limit_period,
                      mono_cam_box2vis, points_cam2img, points_img2cam,
                      rotation_3d_in_axis, rotation_3d_in_euler, xywhr2xyxyr)

__all__ = [
    'BaseInstance3DBoxes', 'Box3DMode', 'Coord3DMode', 'EulerInstance3DBoxes',
    'EulerDepthInstance3DBoxes', 'get_box_type', 'get_proj_mat_by_coord_type',
    'limit_period', 'mono_cam_box2vis', 'points_cam2img', 'points_img2cam',
    'rotation_3d_in_axis', 'rotation_3d_in_euler', 'xywhr2xyxyr'
]


================================================
FILE: bip3d/structures/bbox_3d/__init__.py
================================================
# Copyright (c) OpenRobotLab. All rights reserved.
from .base_box3d import BaseInstance3DBoxes
from .box_3d_mode import Box3DMode
from .coord_3d_mode import Coord3DMode
from .euler_box3d import EulerInstance3DBoxes
from .euler_depth_box3d import EulerDepthInstance3DBoxes
from .utils import (batch_points_cam2img, get_box_type,
                    get_proj_mat_by_coord_type, limit_period,
                    mono_cam_box2vis, points_cam2img, points_img2cam,
                    rotation_3d_in_axis, rotation_3d_in_euler, xywhr2xyxyr)

__all__ = [
    'Box3DMode', 'BaseInstance3DBoxes', 'EulerInstance3DBoxes',
    'EulerDepthInstance3DBoxes', 'xywhr2xyxyr', 'get_box_type',
    'rotation_3d_in_axis', 'rotation_3d_in_euler', 'limit_period',
    'points_cam2img', 'points_img2cam', 'Coord3DMode', 'mono_cam_box2vis',
    'batch_points_cam2img', 'get_proj_mat_by_coord_type'
]


================================================
FILE: bip3d/structures/bbox_3d/base_box3d.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from abc import abstractmethod
from typing import Iterator, Optional, Sequence, Tuple, Union

import numpy as np
import torch
from torch import Tensor

from bip3d.structures.points.base_points import BasePoints

from .utils import limit_period


class BaseInstance3DBoxes:
    """Base class for 3D Boxes.

    Note:
        The box is bottom centered, i.e. the relative position of origin in
        the box is (0.5, 0.5, 0).

    Args:
        tensor (Tensor or np.ndarray or Sequence[Sequence[float]]): The boxes
            data with shape (N, box_dim).
        box_dim (int): Number of the dimension of a box. Each row is
            (x, y, z, x_size, y_size, z_size, yaw). Defaults to 7.
        with_yaw (bool): Whether the box is with yaw rotation. If False, the
            value of yaw will be set to 0 as minmax boxes. Defaults to True.
        origin (Tuple[float]): Relative position of the box origin.
            Defaults to (0.5, 0.5, 0). This will guide the box be converted
            to (0.5, 0.5, 0) mode.

    Attributes:
        tensor (Tensor): Float matrix with shape (N, box_dim).
        box_dim (int): Integer indicating the dimension of a box. Each row is
            (x, y, z, x_size, y_size, z_size, yaw, ...).
        with_yaw (bool): If True, the value of yaw will be set to 0 as minmax
            boxes.
    """

    YAW_AXIS: int = 0

    def __init__(
        self,
        tensor: Union[Tensor, np.ndarray, Sequence[Sequence[float]]],
        box_dim: int = 7,
        with_yaw: bool = True,
        origin: Tuple[float, float, float] = (0.5, 0.5, 0)
    ) -> None:
        if isinstance(tensor, Tensor):
            device = tensor.device
        else:
            device = torch.device('cpu')
        tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
        if tensor.numel() == 0:
            # Use reshape, so we don't end up creating a new tensor that does
            # not depend on the inputs (and consequently confuses jit)
            tensor = tensor.reshape((-1, box_dim))
        assert tensor.dim() == 2 and tensor.size(-1) == box_dim, \
            ('The box dimension must be 2 and the length of the last '
             f'dimension must be {box_dim}, but got boxes with shape '
             f'{tensor.shape}.')

        if tensor.shape[-1] == 6:
            # If the dimension of boxes is 6, we expand box_dim by padding 0
            # as a fake yaw and set with_yaw to False
            assert box_dim == 6
            fake_rot = tensor.new_zeros(tensor.shape[0], 1)
            tensor = torch.cat((tensor, fake_rot), dim=-1)
            self.box_dim = box_dim + 1
            self.with_yaw = False
        else:
            self.box_dim = box_dim
            self.with_yaw = with_yaw
        self.tensor = tensor.clone()

        # Shift the stored centers so internal storage is always in
        # bottom-center (0.5, 0.5, 0) convention.
        if origin != (0.5, 0.5, 0):
            dst = self.tensor.new_tensor((0.5, 0.5, 0))
            src = self.tensor.new_tensor(origin)
            self.tensor[:, :3] += self.tensor[:, 3:6] * (dst - src)

    @property
    def shape(self) -> torch.Size:
        """torch.Size: Shape of boxes."""
        return self.tensor.shape

    @property
    def volume(self) -> Tensor:
        """Tensor: A vector with volume of each box in shape (N, )."""
        return self.tensor[:, 3] * self.tensor[:, 4] * self.tensor[:, 5]

    @property
    def dims(self) -> Tensor:
        """Tensor: Size dimensions of each box in shape (N, 3)."""
        return self.tensor[:, 3:6]

    @property
    def yaw(self) -> Tensor:
        """Tensor: A vector with yaw of each box in shape (N, )."""
        return self.tensor[:, 6]

    @property
    def height(self) -> Tensor:
        """Tensor: A vector with height of each box in shape (N, )."""
        return self.tensor[:, 5]

    @property
    def top_height(self) -> Tensor:
        """Tensor: A vector with top height of each box in shape (N, )."""
        return self.bottom_height + self.height

    @property
    def bottom_height(self) -> Tensor:
        """Tensor: A vector with bottom height of each box in shape (N, )."""
        return self.tensor[:, 2]

    @property
    def center(self) -> Tensor:
        """Calculate the center of all the boxes.

        Note:
            In MMDetection3D's convention, the bottom center is usually taken
            as the default center.

            The relative position of the centers in different kinds of boxes
            are different, e.g., the relative center of a boxes is
            (0.5, 1.0, 0.5) in camera and (0.5, 0.5, 0) in lidar. It is
            recommended to use ``bottom_center`` or ``gravity_center`` for
            clearer usage.

        Returns:
            Tensor: A tensor with center of each box in shape (N, 3).
        """
        return self.bottom_center

    @property
    def bottom_center(self) -> Tensor:
        """Tensor: A tensor with center of each box in shape (N, 3)."""
        return self.tensor[:, :3]

    @property
    def gravity_center(self) -> Tensor:
        """Tensor: A tensor with center of each box in shape (N, 3)."""
        bottom_center = self.bottom_center
        gravity_center = torch.zeros_like(bottom_center)
        gravity_center[:, :2] = bottom_center[:, :2]
        # The gravity center sits half the box height above the bottom.
        gravity_center[:, 2] = bottom_center[:, 2] + self.tensor[:, 5] * 0.5
        return gravity_center

    @property
    def corners(self) -> Tensor:
        """Tensor: A tensor with 8 corners of each box in shape (N, 8, 3)."""
        # Not implemented here; subclasses provide the corner geometry.
        pass

    @property
    def bev(self) -> Tensor:
        """Tensor: 2D BEV box of each box with rotation in XYWHR format, in
        shape (N, 5)."""
        return self.tensor[:, [0, 1, 3, 4, 6]]

    @property
    def nearest_bev(self) -> Tensor:
        """Tensor: A tensor of 2D BEV box of each box without rotation."""
        # Obtain BEV boxes with rotation in XYWHR format
        bev_rotated_boxes = self.bev
        # convert the rotation to a valid range
        rotations = bev_rotated_boxes[:, -1]
        normed_rotations = torch.abs(limit_period(rotations, 0.5, np.pi))

        # find the center of boxes; swap w/l where the box is rotated more
        # than 45 degrees so the axis-aligned box is the nearest one
        conditions = (normed_rotations > np.pi / 4)[..., None]
        bboxes_xywh = torch.where(conditions,
                                  bev_rotated_boxes[:, [0, 1, 3, 2]],
                                  bev_rotated_boxes[:, :4])

        centers = bboxes_xywh[:, :2]
        dims = bboxes_xywh[:, 2:]
        bev_boxes = torch.cat([centers - dims / 2, centers + dims / 2],
                              dim=-1)
        return bev_boxes

    def in_range_bev(
        self, box_range: Union[Tensor, np.ndarray,
                               Sequence[float]]) -> Tensor:
        """Check whether the boxes are in the given range.

        Args:
            box_range (Tensor or np.ndarray or Sequence[float]): The range of
                box in order of (x_min, y_min, x_max, y_max).

        Note:
            The original implementation of SECOND checks whether boxes in a
            range by checking whether the points are in a convex polygon, we
            reduce the burden for simpler cases.

        Returns:
            Tensor: A binary vector indicating whether each box is inside the
            reference range.
        """
        in_range_flags = ((self.bev[:, 0] > box_range[0])
                          & (self.bev[:, 1] > box_range[1])
                          & (self.bev[:, 0] < box_range[2])
                          & (self.bev[:, 1] < box_range[3]))
        return in_range_flags

    @abstractmethod
    def rotate(
        self,
        angle: Union[Tensor, np.ndarray, float],
        points: Optional[Union[Tensor, np.ndarray, BasePoints]] = None
    ) -> Union[Tuple[Tensor, Tensor], Tuple[np.ndarray, np.ndarray], Tuple[
            BasePoints, Tensor], None]:
        """Rotate boxes with points (optional) with the given angle or
        rotation matrix.

        Args:
            angle (Tensor or np.ndarray or float): Rotation angle or rotation
                matrix.
            points (Tensor or np.ndarray or :obj:`BasePoints`, optional):
                Points to rotate. Defaults to None.

        Returns:
            tuple or None: When ``points`` is None, the function returns
            None, otherwise it returns the rotated points and the rotation
            matrix ``rot_mat_T``.
        """
        pass

    @abstractmethod
    def flip(
        self,
        bev_direction: str = 'horizontal',
        points: Optional[Union[Tensor, np.ndarray, BasePoints]] = None
    ) -> Union[Tensor, np.ndarray, BasePoints, None]:
        """Flip the boxes in BEV along given BEV direction.

        Args:
            bev_direction (str): Direction by which to flip. Can be chosen
                from 'horizontal' and 'vertical'. Defaults to 'horizontal'.
            points (Tensor or np.ndarray or :obj:`BasePoints`, optional):
                Points to flip. Defaults to None.

        Returns:
            Tensor or np.ndarray or :obj:`BasePoints` or None: When
            ``points`` is None, the function returns None, otherwise it
            returns the flipped points.
        """
        pass

    def translate(self, trans_vector: Union[Tensor, np.ndarray]) -> None:
        """Translate boxes with the given translation vector.

        Args:
            trans_vector (Tensor or np.ndarray): Translation vector of size
                1x3.
        """
        if not isinstance(trans_vector, Tensor):
            trans_vector = self.tensor.new_tensor(trans_vector)
        self.tensor[:, :3] += trans_vector

    def in_range_3d(
        self, box_range: Union[Tensor, np.ndarray,
                               Sequence[float]]) -> Tensor:
        """Check whether the boxes are in the given range.

        Args:
            box_range (Tensor or np.ndarray or Sequence[float]): The range of
                box (x_min, y_min, z_min, x_max, y_max, z_max).

        Note:
            In the original implementation of SECOND, checking whether a box
            in the range checks whether the points are in a convex polygon,
            we try to reduce the burden for simpler cases.

        Returns:
            Tensor: A binary vector indicating whether each point is inside
            the reference range.
        """
        in_range_flags = ((self.tensor[:, 0] > box_range[0])
                          & (self.tensor[:, 1] > box_range[1])
                          & (self.tensor[:, 2] > box_range[2])
                          & (self.tensor[:, 0] < box_range[3])
                          & (self.tensor[:, 1] < box_range[4])
                          & (self.tensor[:, 2] < box_range[5]))
        return in_range_flags

    @abstractmethod
    def convert_to(self,
                   dst: int,
                   rt_mat: Optional[Union[Tensor, np.ndarray]] = None,
                   correct_yaw: bool = False) -> 'BaseInstance3DBoxes':
        """Convert self to ``dst`` mode.

        Args:
            dst (int): The target Box mode.
            rt_mat (Tensor or np.ndarray, optional): The rotation and
                translation matrix between different coordinates.
                Defaults to None. The conversion from ``src`` coordinates to
                ``dst`` coordinates usually comes along the change of
                sensors, e.g., from camera to LiDAR. This requires a
                transformation matrix.
            correct_yaw (bool): Whether to convert the yaw angle to the
                target coordinate. Defaults to False.

        Returns:
            :obj:`BaseInstance3DBoxes`: The converted box of the same type in
            the ``dst`` mode.
        """
        pass

    def scale(self, scale_factor: float) -> None:
        """Scale the box with horizontal and vertical scaling factors.

        Args:
            scale_factor (float): Scale factor to scale the boxes.
        """
        self.tensor[:, :6] *= scale_factor
        self.tensor[:, 7:] *= scale_factor  # velocity

    def limit_yaw(self, offset: float = 0.5, period: float = np.pi) -> None:
        """Limit the yaw to a given period and offset.

        Args:
            offset (float): The offset of the yaw. Defaults to 0.5.
            period (float): The expected period. Defaults to np.pi.
        """
        self.tensor[:, 6] = limit_period(self.tensor[:, 6], offset, period)

    def nonempty(self, threshold: float = 0.0) -> Tensor:
        """Find boxes that are non-empty.

        A box is considered empty if either of its side is no larger than
        threshold.

        Args:
            threshold (float): The threshold of minimal sizes. Defaults to
                0.0.

        Returns:
            Tensor: A binary vector which represents whether each box is
            empty (False) or non-empty (True).
        """
        box = self.tensor
        size_x = box[..., 3]
        size_y = box[..., 4]
        size_z = box[..., 5]
        keep = ((size_x > threshold)
                & (size_y > threshold) & (size_z > threshold))
        return keep

    def __getitem__(
            self, item: Union[int, slice, np.ndarray,
                              Tensor]) -> 'BaseInstance3DBoxes':
        """
        Args:
            item (int or slice or np.ndarray or Tensor): Index of boxes.

        Note:
            The following usage are allowed:

            1. `new_boxes = boxes[3]`: Return a `Boxes` that contains only
               one box.
            2. `new_boxes = boxes[2:10]`: Return a slice of boxes.
            3. `new_boxes = boxes[vector]`: Where vector is a
               torch.BoolTensor with `length = len(boxes)`. Nonzero elements
               in the vector will be selected.

            Note that the returned Boxes might share storage with this Boxes,
            subject to PyTorch's indexing semantics.

        Returns:
            :obj:`BaseInstance3DBoxes`: A new object of
            :class:`BaseInstance3DBoxes` after indexing.
        """
        original_type = type(self)
        if isinstance(item, int):
            return original_type(self.tensor[item].view(1, -1),
                                 box_dim=self.box_dim,
                                 with_yaw=self.with_yaw)
        b = self.tensor[item]
        assert b.dim() == 2, \
            f'Indexing on Boxes with {item} failed to return a matrix!'
        return original_type(b, box_dim=self.box_dim, with_yaw=self.with_yaw)

    def __len__(self) -> int:
        """int: Number of boxes in the current object."""
        return self.tensor.shape[0]

    def __repr__(self) -> str:
        """str: Return a string that describes the object."""
        return self.__class__.__name__ + '(\n    ' + str(self.tensor) + ')'

    @classmethod
    def cat(cls, boxes_list: Sequence['BaseInstance3DBoxes']
            ) -> 'BaseInstance3DBoxes':
        """Concatenate a list of Boxes into a single Boxes.

        Args:
            boxes_list (Sequence[:obj:`BaseInstance3DBoxes`]): List of boxes.

        Returns:
            :obj:`BaseInstance3DBoxes`: The concatenated boxes.
        """
        assert isinstance(boxes_list, (list, tuple))
        if len(boxes_list) == 0:
            return cls(torch.empty(0))
        assert all(isinstance(box, cls) for box in boxes_list)

        # use torch.cat (v.s. layers.cat)
        # so the returned boxes never share storage with input
        cat_boxes = cls(torch.cat([b.tensor for b in boxes_list], dim=0),
                        box_dim=boxes_list[0].box_dim,
                        with_yaw=boxes_list[0].with_yaw)
        return cat_boxes

    def numpy(self) -> np.ndarray:
        """Reload ``numpy`` from self.tensor."""
        return self.tensor.numpy()

    def to(self, device: Union[str, torch.device], *args,
           **kwargs) -> 'BaseInstance3DBoxes':
        """Convert current boxes to a specific device.

        Args:
            device (str or :obj:`torch.device`): The name of the device.

        Returns:
            :obj:`BaseInstance3DBoxes`: A new boxes object on the specific
            device.
        """
        original_type = type(self)
        return original_type(self.tensor.to(device, *args, **kwargs),
                             box_dim=self.box_dim,
                             with_yaw=self.with_yaw)

    def cpu(self) -> 'BaseInstance3DBoxes':
        """Convert current boxes to cpu device.

        Returns:
            :obj:`BaseInstance3DBoxes`: A new boxes object on the cpu device.
        """
        original_type = type(self)
        return original_type(self.tensor.cpu(),
                             box_dim=self.box_dim,
                             with_yaw=self.with_yaw)

    def cuda(self, *args, **kwargs) -> 'BaseInstance3DBoxes':
        """Convert current boxes to cuda device.

        Returns:
            :obj:`BaseInstance3DBoxes`: A new boxes object on the cuda
            device.
        """
        original_type = type(self)
        return original_type(self.tensor.cuda(*args, **kwargs),
                             box_dim=self.box_dim,
                             with_yaw=self.with_yaw)

    def clone(self) -> 'BaseInstance3DBoxes':
        """Clone the boxes.

        Returns:
            :obj:`BaseInstance3DBoxes`: Box object with the same properties
            as self.
        """
        original_type = type(self)
        return original_type(self.tensor.clone(),
                             box_dim=self.box_dim,
                             with_yaw=self.with_yaw)

    def detach(self) -> 'BaseInstance3DBoxes':
        """Detach the boxes.

        Returns:
            :obj:`BaseInstance3DBoxes`: Box object with the same properties
            as self.
        """
        original_type = type(self)
        return original_type(self.tensor.detach(),
                             box_dim=self.box_dim,
                             with_yaw=self.with_yaw)

    @property
    def device(self) -> torch.device:
        """torch.device: The device of the boxes are on."""
        return self.tensor.device

    def __iter__(self) -> Iterator[Tensor]:
        """Yield a box as a Tensor at a time.

        Returns:
            Iterator[Tensor]: A box of shape (box_dim, ).
        """
        yield from self.tensor

    @classmethod
    def height_overlaps(cls, boxes1: 'BaseInstance3DBoxes',
                        boxes2: 'BaseInstance3DBoxes') -> Tensor:
        """Calculate height overlaps of two boxes.

        Note:
            This function calculates the height overlaps between ``boxes1``
            and ``boxes2``, ``boxes1`` and ``boxes2`` should be in the same
            type.

        Args:
            boxes1 (:obj:`BaseInstance3DBoxes`): Boxes 1 contain N boxes.
            boxes2 (:obj:`BaseInstance3DBoxes`): Boxes 2 contain M boxes.

        Returns:
            Tensor: Calculated height overlap of the boxes.
        """
        assert isinstance(boxes1, BaseInstance3DBoxes)
        assert isinstance(boxes2, BaseInstance3DBoxes)
        assert type(boxes1) == type(boxes2), \
            '"boxes1" and "boxes2" should be in the same type, ' \
            f'but got {type(boxes1)} and {type(boxes2)}.'

        # Broadcast to an (N, M) pairwise comparison.
        boxes1_top_height = boxes1.top_height.view(-1, 1)
        boxes1_bottom_height = boxes1.bottom_height.view(-1, 1)
        boxes2_top_height = boxes2.top_height.view(1, -1)
        boxes2_bottom_height = boxes2.bottom_height.view(1, -1)

        heighest_of_bottom = torch.max(boxes1_bottom_height,
                                       boxes2_bottom_height)
        lowest_of_top = torch.min(boxes1_top_height, boxes2_top_height)
        overlaps_h = torch.clamp(lowest_of_top - heighest_of_bottom, min=0)
        return overlaps_h

    def new_box(
        self, data: Union[Tensor, np.ndarray, Sequence[Sequence[float]]]
    ) -> 'BaseInstance3DBoxes':
        """Create a new box object with data.

        The new box and its tensor has the similar properties as self and
        self.tensor, respectively.

        Args:
            data (Tensor or np.ndarray or Sequence[Sequence[float]]): Data to
                be copied.

        Returns:
            :obj:`BaseInstance3DBoxes`: A new bbox object with ``data``, the
            object's other properties are similar to ``self``.
        """
        new_tensor = self.tensor.new_tensor(data) \
            if not isinstance(data, Tensor) else data.to(self.device)
        original_type = type(self)
        return original_type(new_tensor,
                             box_dim=self.box_dim,
                             with_yaw=self.with_yaw)


================================================
FILE: bip3d/structures/bbox_3d/box_3d_mode.py
================================================
# Copyright (c) OpenRobotLab. All rights reserved.
from enum import IntEnum, unique
from typing import Optional, Sequence, Union

import numpy as np
import torch
from torch import Tensor

from .base_box3d import BaseInstance3DBoxes
from .utils import limit_period


@unique
class Box3DMode(IntEnum):
    """Enum of different ways to represent a box.

    Coordinates in LiDAR:

    .. code-block:: none

                    up z    ^ x front
                       |   /
                       |  /
        left y <------ 0

    The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0),
    and the yaw is around the z axis, thus the rotation axis=2.

    Coordinates in Camera:

    ..
code-block:: none z front / / 0 ------> x right | | v down y The relative coordinate of bottom center in a CAM box is (0.5, 1.0, 0.5), and the yaw is around the y axis, thus the rotation axis=1. Coordinates in Depth: .. code-block:: none up z ^ y front | / | / 0 ------> x right The relative coordinate of bottom center in a DEPTH box is (0.5, 0.5, 0), and the yaw is around the z axis, thus the rotation axis=2. """ LIDAR = 0 CAM = 1 DEPTH = 2 EULER_CAM = 3 EULER_DEPTH = 4 @staticmethod def convert( box: Union[Sequence[float], np.ndarray, Tensor, BaseInstance3DBoxes], src: 'Box3DMode', dst: 'Box3DMode', rt_mat: Optional[Union[np.ndarray, Tensor]] = None, with_yaw: bool = True, correct_yaw: bool = False ) -> Union[Sequence[float], np.ndarray, Tensor, BaseInstance3DBoxes]: """Convert boxes from ``src`` mode to ``dst`` mode. Args: box (Sequence[float] or np.ndarray or Tensor or :obj:`BaseInstance3DBoxes`): Can be a k-tuple, k-list or an Nxk array/tensor. src (:obj:`Box3DMode`): The source box mode. dst (:obj:`Box3DMode`): The target box mode. rt_mat (np.ndarray or Tensor, optional): The rotation and translation matrix between different coordinates. Defaults to None. The conversion from ``src`` coordinates to ``dst`` coordinates usually comes along the change of sensors, e.g., from camera to LiDAR. This requires a transformation matrix. with_yaw (bool): If ``box`` is an instance of :obj:`BaseInstance3DBoxes`, whether or not it has a yaw angle. Defaults to True. correct_yaw (bool): If the yaw is rotated by rt_mat. Defaults to False. Returns: Sequence[float] or np.ndarray or Tensor or :obj:`BaseInstance3DBoxes`: The converted box of the same type. 
""" if src == dst: return box is_numpy = isinstance(box, np.ndarray) is_Instance3DBoxes = isinstance(box, BaseInstance3DBoxes) single_box = isinstance(box, (list, tuple)) if single_box: assert len(box) >= 7, ( 'Box3DMode.convert takes either a k-tuple/list or ' 'an Nxk array/tensor, where k >= 7') arr = torch.tensor(box)[None, :] else: # avoid modifying the input box if is_numpy: arr = torch.from_numpy(np.asarray(box)).clone() elif is_Instance3DBoxes: arr = box.tensor.clone() else: arr = box.clone() if is_Instance3DBoxes: with_yaw = box.with_yaw # convert box from `src` mode to `dst` mode. x_size, y_size, z_size = arr[..., 3:4], arr[..., 4:5], arr[..., 5:6] if with_yaw: yaw = arr[..., 6:7] if src == Box3DMode.LIDAR and dst == Box3DMode.CAM: if rt_mat is None: rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]]) xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) if with_yaw: if correct_yaw: yaw_vector = torch.cat([ torch.cos(yaw), torch.sin(yaw), torch.zeros_like(yaw) ], dim=1) else: yaw = -yaw - np.pi / 2 yaw = limit_period(yaw, period=np.pi * 2) elif src == Box3DMode.CAM and dst == Box3DMode.LIDAR: if rt_mat is None: rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]]) xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) if with_yaw: if correct_yaw: yaw_vector = torch.cat([ torch.cos(-yaw), torch.zeros_like(yaw), torch.sin(-yaw) ], dim=1) else: yaw = -yaw - np.pi / 2 yaw = limit_period(yaw, period=np.pi * 2) elif src == Box3DMode.DEPTH and dst == Box3DMode.CAM: if rt_mat is None: rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]]) xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) if with_yaw: if correct_yaw: yaw_vector = torch.cat([ torch.cos(yaw), torch.sin(yaw), torch.zeros_like(yaw) ], dim=1) else: yaw = -yaw elif src == Box3DMode.CAM and dst == Box3DMode.DEPTH: if rt_mat is None: rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]]) xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) if with_yaw: if correct_yaw: 
yaw_vector = torch.cat([ torch.cos(-yaw), torch.zeros_like(yaw), torch.sin(-yaw) ], dim=1) else: yaw = -yaw elif src == Box3DMode.LIDAR and dst == Box3DMode.DEPTH: if rt_mat is None: rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]]) xyz_size = torch.cat([x_size, y_size, z_size], dim=-1) if with_yaw: if correct_yaw: yaw_vector = torch.cat([ torch.cos(yaw), torch.sin(yaw), torch.zeros_like(yaw) ], dim=1) else: yaw = yaw + np.pi / 2 yaw = limit_period(yaw, period=np.pi * 2) elif src == Box3DMode.DEPTH and dst == Box3DMode.LIDAR: if rt_mat is None: rt_mat = arr.new_tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]]) xyz_size = torch.cat([x_size, y_size, z_size], dim=-1) if with_yaw: if correct_yaw: yaw_vector = torch.cat([ torch.cos(yaw), torch.sin(yaw), torch.zeros_like(yaw) ], dim=1) else: yaw = yaw - np.pi / 2 yaw = limit_period(yaw, period=np.pi * 2) else: # TODO: add transformation between euler boxes raise NotImplementedError( f'Conversion from Box3DMode {src} to {dst} ' 'is not supported yet') if not isinstance(rt_mat, Tensor): rt_mat = arr.new_tensor(rt_mat) if rt_mat.size(1) == 4: extended_xyz = torch.cat( [arr[..., :3], arr.new_ones(arr.size(0), 1)], dim=-1) xyz = extended_xyz @ rt_mat.t() else: xyz = arr[..., :3] @ rt_mat.t() # Note: we only use rotation in rt_mat # so don't need to extend yaw_vector if with_yaw and correct_yaw: rot_yaw_vector = yaw_vector @ rt_mat[:3, :3].t() if dst == Box3DMode.CAM: yaw = torch.atan2(-rot_yaw_vector[:, [2]], rot_yaw_vector[:, [0]]) elif dst in [Box3DMode.LIDAR, Box3DMode.DEPTH]: yaw = torch.atan2(rot_yaw_vector[:, [1]], rot_yaw_vector[:, [0]]) yaw = limit_period(yaw, period=np.pi * 2) if with_yaw: remains = arr[..., 7:] arr = torch.cat([xyz[..., :3], xyz_size, yaw, remains], dim=-1) else: remains = arr[..., 6:] arr = torch.cat([xyz[..., :3], xyz_size, remains], dim=-1) # convert arr to the original type original_type = type(box) if single_box: return original_type(arr.flatten().tolist()) if is_numpy: return arr.numpy() 
elif is_Instance3DBoxes: raise NotImplementedError( f'Conversion to {dst} through {original_type} ' 'is not supported yet') else: return arr ================================================ FILE: bip3d/structures/bbox_3d/coord_3d_mode.py ================================================ # Copyright (c) OpenRobotLab. All rights reserved. from enum import IntEnum, unique from typing import Optional, Sequence, Union import numpy as np import torch from torch import Tensor from bip3d.structures.points import (BasePoints, CameraPoints, DepthPoints, LiDARPoints) from .base_box3d import BaseInstance3DBoxes from .box_3d_mode import Box3DMode @unique class Coord3DMode(IntEnum): """Enum of different ways to represent a box and point cloud. Coordinates in LiDAR: .. code-block:: none up z ^ x front | / | / left y <------ 0 The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0), and the yaw is around the z axis, thus the rotation axis=2. Coordinates in Camera: .. code-block:: none z front / / 0 ------> x right | | v down y The relative coordinate of bottom center in a CAM box is (0.5, 1.0, 0.5), and the yaw is around the y axis, thus the rotation axis=1. Coordinates in Depth: .. code-block:: none up z ^ y front | / | / 0 ------> x right The relative coordinate of bottom center in a DEPTH box is (0.5, 0.5, 0), and the yaw is around the z axis, thus the rotation axis=2. """ LIDAR = 0 CAM = 1 DEPTH = 2 @staticmethod def convert(input: Union[Sequence[float], np.ndarray, Tensor, BaseInstance3DBoxes, BasePoints], src: Union[Box3DMode, 'Coord3DMode'], dst: Union[Box3DMode, 'Coord3DMode'], rt_mat: Optional[Union[np.ndarray, Tensor]] = None, with_yaw: bool = True, correct_yaw: bool = False, is_point: bool = True): """Convert boxes or points from ``src`` mode to ``dst`` mode. Args: input (Sequence[float] or np.ndarray or Tensor or :obj:`BaseInstance3DBoxes` or :obj:`BasePoints`): Can be a k-tuple, k-list or an Nxk array/tensor. 
src (:obj:`Box3DMode` or :obj:`Coord3DMode`): The source mode. dst (:obj:`Box3DMode` or :obj:`Coord3DMode`): The target mode. rt_mat (np.ndarray or Tensor, optional): The rotation and translation matrix between different coordinates. Defaults to None. The conversion from ``src`` coordinates to ``dst`` coordinates usually comes along the change of sensors, e.g., from camera to LiDAR. This requires a transformation matrix. with_yaw (bool): If ``box`` is an instance of :obj:`BaseInstance3DBoxes`, whether or not it has a yaw angle. Defaults to True. correct_yaw (bool): If the yaw is rotated by rt_mat. Defaults to False. is_point (bool): If ``input`` is neither an instance of :obj:`BaseInstance3DBoxes` nor an instance of :obj:`BasePoints`, whether or not it is point data. Defaults to True. Returns: Sequence[float] or np.ndarray or Tensor or :obj:`BaseInstance3DBoxes` or :obj:`BasePoints`: The converted box or points of the same type. """ if isinstance(input, BaseInstance3DBoxes): return Coord3DMode.convert_box(input, src, dst, rt_mat=rt_mat, with_yaw=with_yaw, correct_yaw=correct_yaw) elif isinstance(input, BasePoints): return Coord3DMode.convert_point(input, src, dst, rt_mat=rt_mat) elif isinstance(input, (tuple, list, np.ndarray, Tensor)): if is_point: return Coord3DMode.convert_point(input, src, dst, rt_mat=rt_mat) else: return Coord3DMode.convert_box(input, src, dst, rt_mat=rt_mat, with_yaw=with_yaw, correct_yaw=correct_yaw) else: raise NotImplementedError @staticmethod def convert_box( box: Union[Sequence[float], np.ndarray, Tensor, BaseInstance3DBoxes], src: Box3DMode, dst: Box3DMode, rt_mat: Optional[Union[np.ndarray, Tensor]] = None, with_yaw: bool = True, correct_yaw: bool = False ) -> Union[Sequence[float], np.ndarray, Tensor, BaseInstance3DBoxes]: """Convert boxes from ``src`` mode to ``dst`` mode. Args: box (Sequence[float] or np.ndarray or Tensor or :obj:`BaseInstance3DBoxes`): Can be a k-tuple, k-list or an Nxk array/tensor. 
src (:obj:`Box3DMode`): The source box mode. dst (:obj:`Box3DMode`): The target box mode. rt_mat (np.ndarray or Tensor, optional): The rotation and translation matrix between different coordinates. Defaults to None. The conversion from ``src`` coordinates to ``dst`` coordinates usually comes along the change of sensors, e.g., from camera to LiDAR. This requires a transformation matrix. with_yaw (bool): If ``box`` is an instance of :obj:`BaseInstance3DBoxes`, whether or not it has a yaw angle. Defaults to True. correct_yaw (bool): If the yaw is rotated by rt_mat. Defaults to False. Returns: Sequence[float] or np.ndarray or Tensor or :obj:`BaseInstance3DBoxes`: The converted box of the same type. """ return Box3DMode.convert(box, src, dst, rt_mat=rt_mat, with_yaw=with_yaw, correct_yaw=correct_yaw) @staticmethod def convert_point( point: Union[Sequence[float], np.ndarray, Tensor, BasePoints], src: 'Coord3DMode', dst: 'Coord3DMode', rt_mat: Optional[Union[np.ndarray, Tensor]] = None, ) -> Union[Sequence[float], np.ndarray, Tensor, BasePoints]: """Convert points from ``src`` mode to ``dst`` mode. Args: box (Sequence[float] or np.ndarray or Tensor or :obj:`BasePoints`): Can be a k-tuple, k-list or an Nxk array/tensor. src (:obj:`Coord3DMode`): The source point mode. dst (:obj:`Coord3DMode`): The target point mode. rt_mat (np.ndarray or Tensor, optional): The rotation and translation matrix between different coordinates. Defaults to None. The conversion from ``src`` coordinates to ``dst`` coordinates usually comes along the change of sensors, e.g., from camera to LiDAR. This requires a transformation matrix. Returns: Sequence[float] or np.ndarray or Tensor or :obj:`BasePoints`: The converted point of the same type. 
""" if src == dst: return point is_numpy = isinstance(point, np.ndarray) is_InstancePoints = isinstance(point, BasePoints) single_point = isinstance(point, (list, tuple)) if single_point: assert len(point) >= 3, ( 'Coord3DMode.convert takes either a k-tuple/list or ' 'an Nxk array/tensor, where k >= 3') arr = torch.tensor(point)[None, :] else: # avoid modifying the input point if is_numpy: arr = torch.from_numpy(np.asarray(point)).clone() elif is_InstancePoints: arr = point.tensor.clone() else: arr = point.clone() # convert point from `src` mode to `dst` mode. if src == Coord3DMode.LIDAR and dst == Coord3DMode.CAM: if rt_mat is None: rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]]) elif src == Coord3DMode.CAM and dst == Coord3DMode.LIDAR: if rt_mat is None: rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]]) elif src == Coord3DMode.DEPTH and dst == Coord3DMode.CAM: if rt_mat is None: rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]]) elif src == Coord3DMode.CAM and dst == Coord3DMode.DEPTH: if rt_mat is None: rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]]) elif src == Coord3DMode.LIDAR and dst == Coord3DMode.DEPTH: if rt_mat is None: rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]]) elif src == Coord3DMode.DEPTH and dst == Coord3DMode.LIDAR: if rt_mat is None: rt_mat = arr.new_tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]]) else: raise NotImplementedError( f'Conversion from Coord3DMode {src} to {dst} ' 'is not supported yet') if not isinstance(rt_mat, Tensor): rt_mat = arr.new_tensor(rt_mat) if rt_mat.size(1) == 4: extended_xyz = torch.cat( [arr[..., :3], arr.new_ones(arr.size(0), 1)], dim=-1) xyz = extended_xyz @ rt_mat.t() else: xyz = arr[..., :3] @ rt_mat.t() remains = arr[..., 3:] arr = torch.cat([xyz[..., :3], remains], dim=-1) # convert arr to the original type original_type = type(point) if single_point: return original_type(arr.flatten().tolist()) if is_numpy: return arr.numpy() elif is_InstancePoints: if dst 
== Coord3DMode.CAM: target_type = CameraPoints elif dst == Coord3DMode.LIDAR: target_type = LiDARPoints elif dst == Coord3DMode.DEPTH: target_type = DepthPoints else: raise NotImplementedError( f'Conversion to {dst} through {original_type} ' 'is not supported yet') return target_type(arr, points_dim=arr.size(-1), attribute_dims=point.attribute_dims) else: return arr ================================================ FILE: bip3d/structures/bbox_3d/euler_box3d.py ================================================ # Copyright (c) OpenRobotLab. All rights reserved. import numpy as np import torch from pytorch3d.ops import box3d_overlap from pytorch3d.transforms import euler_angles_to_matrix, matrix_to_euler_angles from ..points.base_points import BasePoints from .base_box3d import BaseInstance3DBoxes from .utils import rotation_3d_in_euler class EulerInstance3DBoxes(BaseInstance3DBoxes): """3D boxes with 1-D orientation represented by three Euler angles. See https://en.wikipedia.org/wiki/Euler_angles for regarding the definition of Euler angles. Attributes: tensor (torch.Tensor): Float matrix of N x box_dim. box_dim (int): Integer indicates the dimension of a box Each row is (x, y, z, x_size, y_size, z_size, alpha, beta, gamma). """ def __init__(self, tensor, box_dim=9, origin=(0.5, 0.5, 0.5)): if isinstance(tensor, torch.Tensor): device = tensor.device else: device = torch.device('cpu') tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device) if tensor.numel() == 0: # Use reshape, so we don't end up creating a new tensor that # does not depend on the inputs (and consequently confuses jit) tensor = tensor.reshape((0, box_dim)).to(dtype=torch.float32, device=device) assert tensor.dim() == 2 and tensor.size(-1) == box_dim, tensor.size() if tensor.shape[-1] == 6: # If the dimension of boxes is 6, we expand box_dim by padding # (0, 0, 0) as a fake euler angle. 
assert box_dim == 6 fake_rot = tensor.new_zeros(tensor.shape[0], 3) tensor = torch.cat((tensor, fake_rot), dim=-1) self.box_dim = box_dim + 3 elif tensor.shape[-1] == 7: assert box_dim == 7 fake_euler = tensor.new_zeros(tensor.shape[0], 2) tensor = torch.cat((tensor, fake_euler), dim=-1) self.box_dim = box_dim + 2 else: assert tensor.shape[-1] == 9 self.box_dim = box_dim self.tensor = tensor.clone() self.origin = origin if origin != (0.5, 0.5, 0.5): dst = self.tensor.new_tensor((0.5, 0.5, 0.5)) src = self.tensor.new_tensor(origin) self.tensor[:, :3] += self.tensor[:, 3:6] * (dst - src) def get_corners(self, tensor1): """torch.Tensor: Coordinates of corners of all the boxes in shape (N, 8, 3). Convert the boxes to corners in clockwise order, in form of ``(x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0)`` .. code-block:: none up z front y ^ / | / | (x0, y1, z1) + ----------- + (x1, y1, z1) /| / | / | / | (x0, y0, z1) + ----------- + + (x1, y1, z0) | / . | / | / origin | / (x0, y0, z0) + ----------- + --------> right x (x1, y0, z0) """ if tensor1.numel() == 0: return torch.empty([0, 8, 3], device=tensor1.device) dims = tensor1[:, 3:6] corners_norm = torch.from_numpy( np.stack(np.unravel_index(np.arange(8), [2] * 3), axis=1)).to(device=dims.device, dtype=dims.dtype) corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] # use relative origin assert self.origin == (0.5, 0.5, 0.5), \ 'self.origin != (0.5, 0.5, 0.5) needs to be checked!' corners_norm = corners_norm - dims.new_tensor(self.origin) corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3]) # rotate corners = rotation_3d_in_euler(corners, tensor1[:, 6:]) corners += tensor1[:, :3].view(-1, 1, 3) return corners @classmethod def overlaps(cls, boxes1, boxes2, mode='iou', eps=1e-4): """Calculate 3D overlaps of two boxes. Note: This function calculates the overlaps between ``boxes1`` and ``boxes2``, ``boxes1`` and ``boxes2`` should be in the same type. 
Args: boxes1 (:obj:`EulerInstance3DBoxes`): Boxes 1 contain N boxes. boxes2 (:obj:`EulerInstance3DBoxes`): Boxes 2 contain M boxes. mode (str): Mode of iou calculation. Defaults to 'iou'. eps (bool): Epsilon. Defaults to 1e-4. Returns: torch.Tensor: Calculated 3D overlaps of the boxes. """ assert isinstance(boxes1, EulerInstance3DBoxes) assert isinstance(boxes2, EulerInstance3DBoxes) assert type(boxes1) == type(boxes2), '"boxes1" and "boxes2" should' \ f'be in the same type, got {type(boxes1)} and {type(boxes2)}.' assert mode in ['iou'] rows = len(boxes1) cols = len(boxes2) if rows * cols == 0: return boxes1.tensor.new(rows, cols) corners1 = boxes1.corners corners2 = boxes2.corners _, iou3d = box3d_overlap(corners1, corners2, eps=eps) return iou3d @property def gravity_center(self): """torch.Tensor: A tensor with center of each box in shape (N, 3).""" return self.tensor[:, :3] @property def corners(self): """torch.Tensor: Coordinates of corners of all the boxes in shape (N, 8, 3). Convert the boxes to corners in clockwise order, in form of ``(x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0)`` .. code-block:: none up z front y ^ / | / | (x0, y1, z1) + ----------- + (x1, y1, z1) /| / | / | / | (x0, y0, z1) + ----------- + + (x1, y1, z0) | / . | / | / origin | / (x0, y0, z0) + ----------- + --------> right x (x1, y0, z0) """ if self.tensor.numel() == 0: return torch.empty([0, 8, 3], device=self.tensor.device) dims = self.dims corners_norm = torch.from_numpy( np.stack(np.unravel_index(np.arange(8), [2] * 3), axis=1)).to(device=dims.device, dtype=dims.dtype) corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] # use relative origin assert self.origin == (0.5, 0.5, 0.5), \ 'self.origin != (0.5, 0.5, 0.5) needs to be checked!' 
corners_norm = corners_norm - dims.new_tensor(self.origin) corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3]) # rotate corners = rotation_3d_in_euler(corners, self.tensor[:, 6:]) corners += self.tensor[:, :3].view(-1, 1, 3) return corners def transform(self, matrix): if self.tensor.shape[0] == 0: return if not isinstance(matrix, torch.Tensor): matrix = self.tensor.new_tensor(matrix) points = self.tensor[:, :3] constant = points.new_ones(points.shape[0], 1) points_extend = torch.concat([points, constant], dim=-1) points_trans = torch.matmul(points_extend, matrix.transpose(-2, -1))[:, :3] size = self.tensor[:, 3:6] # angle_delta = matrix_to_euler_angles(matrix[:3,:3], 'ZXY') # angle = self.tensor[:,6:] + angle_delta ori_matrix = euler_angles_to_matrix(self.tensor[:, 6:], 'ZXY') rot_matrix = matrix[:3, :3].expand_as(ori_matrix) final = torch.bmm(rot_matrix, ori_matrix) angle = matrix_to_euler_angles(final, 'ZXY') self.tensor = torch.cat([points_trans, size, angle], dim=-1) def scale(self, scale_factor: float) -> None: """Scale the box with horizontal and vertical scaling factors. Args: scale_factors (float): Scale factors to scale the boxes. """ self.tensor[:, :6] *= scale_factor def rotate(self, angle, points=None): """Rotate boxes with points (optional) with the given angle or rotation matrix. Args: angle (float | torch.Tensor | np.ndarray): Rotation angle or rotation matrix. points (torch.Tensor | np.ndarray | :obj:`BasePoints`, optional): Points to rotate. Defaults to None. Returns: tuple or None: When ``points`` is None, the function returns None, otherwise it returns the rotated points and the rotation matrix ``rot_mat_T``. 
""" if not isinstance(angle, torch.Tensor): angle = self.tensor.new_tensor(angle) if angle.numel() == 1: # only given yaw angle for rotation angle = self.tensor.new_tensor([angle, 0., 0.]) rot_matrix = euler_angles_to_matrix(angle, 'ZXY') elif angle.numel() == 3: rot_matrix = euler_angles_to_matrix(angle, 'ZXY') elif angle.shape == torch.Size([3, 3]): rot_matrix = angle else: raise NotImplementedError rot_mat_T = rot_matrix.T transform_matrix = torch.eye(4) transform_matrix[:3, :3] = rot_matrix self.transform(transform_matrix) if points is not None: if isinstance(points, torch.Tensor): points[:, :3] = points[:, :3] @ rot_mat_T elif isinstance(points, np.ndarray): rot_mat_T = rot_mat_T.cpu().numpy() points[:, :3] = np.dot(points[:, :3], rot_mat_T) elif isinstance(points, BasePoints): points.rotate(rot_mat_T) else: raise ValueError return points, rot_mat_T else: return rot_mat_T def flip(self, direction='X'): """Flip the boxes along the corresponding axis. Args: direction (str, optional): Flip axis. Defaults to 'X'. """ assert direction in ['X', 'Y', 'Z'] if direction == 'X': self.tensor[:, 0] = -self.tensor[:, 0] self.tensor[:, 6] = -self.tensor[:, 6] + np.pi self.tensor[:, 8] = -self.tensor[:, 8] elif direction == 'Y': self.tensor[:, 1] = -self.tensor[:, 1] self.tensor[:, 6] = -self.tensor[:, 6] self.tensor[:, 7] = -self.tensor[:, 7] + np.pi elif direction == 'Z': self.tensor[:, 2] = -self.tensor[:, 2] self.tensor[:, 7] = -self.tensor[:, 7] self.tensor[:, 8] = -self.tensor[:, 8] + np.pi ================================================ FILE: bip3d/structures/bbox_3d/euler_depth_box3d.py ================================================ # Copyright (c) OpenRobotLab. All rights reserved. import numpy as np import torch from mmcv.ops import points_in_boxes_all, points_in_boxes_part from ..points.base_points import BasePoints from .euler_box3d import EulerInstance3DBoxes class EulerDepthInstance3DBoxes(EulerInstance3DBoxes): """3D boxes of instances in Depth coordinates. 
We keep the "Depth" coordinate system definition in MMDet3D just for clarification of the points coordinates and the flipping augmentation. Coordinates in Depth: .. code-block:: none up z y front (alpha=0.5*pi) ^ ^ | / | / 0 ------> x right (alpha=0) The relative coordinate of bottom center in a Depth box is (0.5, 0.5, 0), and the yaw is around the z axis, thus the rotation axis=2. The yaw is 0 at the positive direction of x axis, and decreases from the positive direction of x to the positive direction of y. Also note that rotation of DepthInstance3DBoxes is counterclockwise, which is reverse to the definition of the yaw angle (clockwise). Attributes: tensor (torch.Tensor): Float matrix of N x box_dim. box_dim (int): Integer indicates the dimension of a box Each row is (x, y, z, x_size, y_size, z_size, alpha, beta, gamma). with_yaw (bool): If True, the value of yaw will be set to 0 as minmax boxes. """ def __init__(self, tensor, box_dim=9, with_yaw=True, origin=(0.5, 0.5, 0.5)): super().__init__(tensor, box_dim, origin) self.with_yaw = with_yaw def flip(self, bev_direction='horizontal', points=None): """Flip the boxes in BEV along given BEV direction. In Depth coordinates, it flips x (horizontal) or y (vertical) axis. Args: bev_direction (str, optional): Flip direction (horizontal or vertical). Defaults to 'horizontal'. points (torch.Tensor | np.ndarray | :obj:`BasePoints`, optional): Points to flip. Defaults to None. Returns: torch.Tensor, numpy.ndarray or None: Flipped points. 
""" assert bev_direction in ('horizontal', 'vertical') if bev_direction == 'horizontal': super().flip(direction='X') elif bev_direction == 'vertical': super().flip(direction='Y') if points is not None: assert isinstance(points, (torch.Tensor, np.ndarray, BasePoints)) if isinstance(points, (torch.Tensor, np.ndarray)): if bev_direction == 'horizontal': points[:, 0] = -points[:, 0] elif bev_direction == 'vertical': points[:, 1] = -points[:, 1] elif isinstance(points, BasePoints): points.flip(bev_direction) return points def convert_to(self, dst, rt_mat=None): """Convert self to ``dst`` mode. Args: dst (:obj:`Box3DMode`): The target Box mode. rt_mat (np.ndarray | torch.Tensor, optional): The rotation and translation matrix between different coordinates. Defaults to None. The conversion from ``src`` coordinates to ``dst`` coordinates usually comes along the change of sensors, e.g., from camera to LiDAR. This requires a transformation matrix. Returns: :obj:`DepthInstance3DBoxes`: The converted box of the same type in the ``dst`` mode. """ from .box_3d_mode import Box3DMode assert dst == Box3DMode.EULER_DEPTH return self def points_in_boxes_part(self, points, boxes_override=None): """Find the box in which each point is. Args: points (torch.Tensor): Points in shape (1, M, 3) or (M, 3), 3 dimensions are (x, y, z) in LiDAR or depth coordinate. boxes_override (torch.Tensor, optional): Boxes to override `self.tensor`. Defaults to None. Returns: torch.Tensor: The index of the first box that each point is in, in shape (M, ). Default value is -1 (if the point is not enclosed by any box). Note: If a point is enclosed by multiple boxes, the index of the first box will be returned. 
""" if boxes_override is not None: boxes = boxes_override else: boxes = self.tensor if points.dim() == 2: points = points.unsqueeze(0) # TODO: take euler angles into consideration aligned_boxes = boxes[..., :7].clone() aligned_boxes[..., 6] = 0 box_idx = points_in_boxes_part( points, aligned_boxes.unsqueeze(0).to(points.device)).squeeze(0) return box_idx def points_in_boxes_all(self, points, boxes_override=None): """Find all boxes in which each point is. Args: points (torch.Tensor): Points in shape (1, M, 3) or (M, 3), 3 dimensions are (x, y, z) in LiDAR or depth coordinate. boxes_override (torch.Tensor, optional): Boxes to override `self.tensor`. Defaults to None. Returns: torch.Tensor: A tensor indicating whether a point is in a box, in shape (M, T). T is the number of boxes. Denote this tensor as A, if the m^th point is in the t^th box, then `A[m, t] == 1`, elsewise `A[m, t] == 0`. """ if boxes_override is not None: boxes = boxes_override else: boxes = self.tensor points_clone = points.clone()[..., :3] if points_clone.dim() == 2: points_clone = points_clone.unsqueeze(0) else: assert points_clone.dim() == 3 and points_clone.shape[0] == 1 boxes = boxes.to(points_clone.device).unsqueeze(0) # TODO: take euler angles into consideration aligned_boxes = boxes[..., :7].clone() aligned_boxes[..., 6] = 0 box_idxs_of_pts = points_in_boxes_all(points_clone, aligned_boxes) return box_idxs_of_pts.squeeze(0) ================================================ FILE: bip3d/structures/bbox_3d/utils.py ================================================ # Copyright (c) OpenRobotLab. All rights reserved. 
from logging import warning
from typing import Tuple, Union

import numpy as np
import torch
from pytorch3d.transforms import euler_angles_to_matrix, quaternion_to_matrix
from torch import Tensor

from bip3d.utils.array_converter import array_converter


@array_converter(apply_to=('val', ))
def limit_period(val: Union[np.ndarray, Tensor],
                 offset: float = 0.5,
                 period: float = np.pi) -> Union[np.ndarray, Tensor]:
    """Limit the value into a period for periodic function.

    Args:
        val (np.ndarray or Tensor): The value to be converted.
        offset (float): Offset to set the value range. Defaults to 0.5.
        period (float): Period of the value. Defaults to np.pi.

    Returns:
        np.ndarray or Tensor: Value in the range of
        [-offset * period, (1-offset) * period].
    """
    # floor(val / period + offset) counts the whole (offset-shifted) periods
    # contained in `val`; subtracting them wraps `val` into a single period.
    limited_val = val - torch.floor(val / period + offset) * period
    return limited_val


@array_converter(apply_to=('points', 'angles'))
def rotation_3d_in_euler(points, angles, return_mat=False, clockwise=False):
    """Rotate points by Euler angles (ZXY convention) or quaternions.

    Args:
        points (np.ndarray | torch.Tensor | list | tuple ):
            Points of shape (N, M, 3).
        angles (np.ndarray | torch.Tensor | list | tuple):
            Euler angles of shape (N, 3), or quaternions of shape (N, 4).
        return_mat: Whether or not return the rotation matrix (transposed).
            Defaults to False.
        clockwise: Whether the rotation is clockwise. Defaults to False.
            Clockwise rotation is currently not implemented and raises
            NotImplementedError.

    Returns:
        (torch.Tensor | np.ndarray): Rotated points in shape (N, M, 3).
""" batch_free = len(points.shape) == 2 if batch_free: points = points[None] if len(angles.shape) == 1: angles = angles.expand(points.shape[:1] + (3, )) # angles = torch.full(points.shape[:1], angles) assert len(points.shape) == 3 and len(angles.shape) == 2 \ and points.shape[0] == angles.shape[0], f'Incorrect shape of points ' \ f'angles: {points.shape}, {angles.shape}' assert points.shape[-1] in [2, 3], \ f'Points size should be 2 or 3 instead of {points.shape[-1]}' if angles.shape[1] == 3: rot_mat_T = euler_angles_to_matrix(angles, 'ZXY') # N, 3,3 else: rot_mat_T = quaternion_to_matrix(angles) # N, 3,3 rot_mat_T = rot_mat_T.transpose(-2, -1) if clockwise: raise NotImplementedError('clockwise') if points.shape[0] == 0: points_new = points else: points_new = torch.bmm(points, rot_mat_T) if batch_free: points_new = points_new.squeeze(0) if return_mat: if batch_free: rot_mat_T = rot_mat_T.squeeze(0) return points_new, rot_mat_T else: return points_new @array_converter(apply_to=('points', 'angles')) def rotation_3d_in_axis( points: Union[np.ndarray, Tensor], angles: Union[np.ndarray, Tensor, float], axis: int = 0, return_mat: bool = False, clockwise: bool = False ) -> Union[Tuple[np.ndarray, np.ndarray], Tuple[Tensor, Tensor], np.ndarray, Tensor]: """Rotate points by angles according to axis. Args: points (np.ndarray or Tensor): Points with shape (N, M, 3). angles (np.ndarray or Tensor or float): Vector of angles with shape (N, ). axis (int): The axis to be rotated. Defaults to 0. return_mat (bool): Whether or not to return the rotation matrix (transposed). Defaults to False. clockwise (bool): Whether the rotation is clockwise. Defaults to False. Raises: ValueError: When the axis is not in range [-3, -2, -1, 0, 1, 2], it will raise ValueError. Returns: Tuple[np.ndarray, np.ndarray] or Tuple[Tensor, Tensor] or np.ndarray or Tensor: Rotated points with shape (N, M, 3) and rotation matrix with shape (N, 3, 3). 
""" batch_free = len(points.shape) == 2 if batch_free: points = points[None] if isinstance(angles, float) or len(angles.shape) == 0: angles = torch.full(points.shape[:1], angles) assert len(points.shape) == 3 and len(angles.shape) == 1 and \ points.shape[0] == angles.shape[0], 'Incorrect shape of points ' \ f'angles: {points.shape}, {angles.shape}' assert points.shape[-1] in [2, 3], \ f'Points size should be 2 or 3 instead of {points.shape[-1]}' rot_sin = torch.sin(angles) rot_cos = torch.cos(angles) ones = torch.ones_like(rot_cos) zeros = torch.zeros_like(rot_cos) if points.shape[-1] == 3: if axis == 1 or axis == -2: rot_mat_T = torch.stack([ torch.stack([rot_cos, zeros, -rot_sin]), torch.stack([zeros, ones, zeros]), torch.stack([rot_sin, zeros, rot_cos]) ]) elif axis == 2 or axis == -1: rot_mat_T = torch.stack([ torch.stack([rot_cos, rot_sin, zeros]), torch.stack([-rot_sin, rot_cos, zeros]), torch.stack([zeros, zeros, ones]) ]) elif axis == 0 or axis == -3: rot_mat_T = torch.stack([ torch.stack([ones, zeros, zeros]), torch.stack([zeros, rot_cos, rot_sin]), torch.stack([zeros, -rot_sin, rot_cos]) ]) else: raise ValueError( f'axis should in range [-3, -2, -1, 0, 1, 2], got {axis}') else: rot_mat_T = torch.stack([ torch.stack([rot_cos, rot_sin]), torch.stack([-rot_sin, rot_cos]) ]) if clockwise: rot_mat_T = rot_mat_T.transpose(0, 1) if points.shape[0] == 0: points_new = points else: points_new = torch.einsum('aij,jka->aik', points, rot_mat_T) if batch_free: points_new = points_new.squeeze(0) if return_mat: rot_mat_T = torch.einsum('jka->ajk', rot_mat_T) if batch_free: rot_mat_T = rot_mat_T.squeeze(0) return points_new, rot_mat_T else: return points_new @array_converter(apply_to=('boxes_xywhr', )) def xywhr2xyxyr( boxes_xywhr: Union[Tensor, np.ndarray]) -> Union[Tensor, np.ndarray]: """Convert a rotated boxes in XYWHR format to XYXYR format. Args: boxes_xywhr (Tensor or np.ndarray): Rotated boxes in XYWHR format. 
Returns: Tensor or np.ndarray: Converted boxes in XYXYR format. """ boxes = torch.zeros_like(boxes_xywhr) half_w = boxes_xywhr[..., 2] / 2 half_h = boxes_xywhr[..., 3] / 2 boxes[..., 0] = boxes_xywhr[..., 0] - half_w boxes[..., 1] = boxes_xywhr[..., 1] - half_h boxes[..., 2] = boxes_xywhr[..., 0] + half_w boxes[..., 3] = boxes_xywhr[..., 1] + half_h boxes[..., 4] = boxes_xywhr[..., 4] return boxes def get_box_type(box_type: str) -> Tuple[type, int]: """Get the type and mode of box structure. We temporarily only support EulerDepthInstance3DBoxes to support 9-DoF box operations and will consider refactoring this class with further experience. Args: box_type (str): The type of box structure. The valid value are "LiDAR", "Camera" and "Depth". Raises: ValueError: A ValueError is raised when ``box_type`` does not belong to the three valid types. Returns: tuple: Box type and box mode. """ from .box_3d_mode import Box3DMode from .euler_depth_box3d import EulerDepthInstance3DBoxes box_type_lower = box_type.lower() if box_type_lower == 'euler-depth': box_type_3d = EulerDepthInstance3DBoxes box_mode_3d = Box3DMode.EULER_DEPTH # elif box_type_lower == 'euler-camera': # box_type_3d = EulerCameraInstance3DBoxes # box_mode_3d = Box3DMode.EULER_CAM else: raise ValueError( 'Only "box_type" of "camera", "lidar", "depth", "euler"' f' are supported, got {box_type}') return box_type_3d, box_mode_3d @array_converter(apply_to=('points_3d', 'proj_mat')) def points_cam2img(points_3d: Union[Tensor, np.ndarray], proj_mat: Union[Tensor, np.ndarray], with_depth: bool = False) -> Union[Tensor, np.ndarray]: """Project points in camera coordinates to image coordinates. Args: points_3d (Tensor or np.ndarray): Points in shape (N, 3). proj_mat (Tensor or np.ndarray): Transformation matrix between coordinates. with_depth (bool): Whether to keep depth in the output. Defaults to False. Returns: Tensor or np.ndarray: Points in image coordinates with shape [N, 2] if ``with_depth=False``, else [N, 3]. 
""" points_shape = list(points_3d.shape) points_shape[-1] = 1 assert len(proj_mat.shape) == 2, \ 'The dimension of the projection matrix should be 2 ' \ f'instead of {len(proj_mat.shape)}.' d1, d2 = proj_mat.shape[:2] assert (d1 == 3 and d2 == 3) or (d1 == 3 and d2 == 4) or \ (d1 == 4 and d2 == 4), 'The shape of the projection matrix ' \ f'({d1}*{d2}) is not supported.' if d1 == 3: proj_mat_expanded = torch.eye(4, device=proj_mat.device, dtype=proj_mat.dtype) proj_mat_expanded[:d1, :d2] = proj_mat proj_mat = proj_mat_expanded # previous implementation use new_zeros, new_one yields better results points_4 = torch.cat([points_3d, points_3d.new_ones(points_shape)], dim=-1) point_2d = points_4 @ proj_mat.T point_2d_res = point_2d[..., :2] / point_2d[..., 2:3] if with_depth: point_2d_res = torch.cat([point_2d_res, point_2d[..., 2:3]], dim=-1) return point_2d_res @array_converter(apply_to=('points_3d', 'proj_mat')) def batch_points_cam2img(points_3d, proj_mat, with_depth=False): """Project points in camera coordinates to image coordinates. Args: points_3d (torch.Tensor | np.ndarray): Points in shape (N, D, 3) proj_mat (torch.Tensor | np.ndarray): Transformation matrix between coordinates. with_depth (bool, optional): Whether to keep depth in the output. Defaults to False. Returns: (torch.Tensor | np.ndarray): Points in image coordinates, with shape [N, D, 2] if `with_depth=False`, else [N, D, 3]. """ points_shape = list(points_3d.shape) points_shape[-1] = 1 assert len(proj_mat.shape) == 3, 'The dimension of the projection'\ f' matrix should be 2 instead of {len(proj_mat.shape)}.' d0, d1, d2 = proj_mat.shape[:3] assert (d1 == 3 and d2 == 3) or (d1 == 3 and d2 == 4) or ( d1 == 4 and d2 == 4), 'The shape of the projection matrix'\ f' ({d1}*{d2}) is not supported.' 
if d1 == 3: proj_mat_expanded = torch.eye(4, device=proj_mat.device, dtype=proj_mat.dtype) proj_mat_expanded = proj_mat_expanded[None, :, :].expand(d0, -1, -1) proj_mat_expanded[:, :d1, :d2] = proj_mat proj_mat = proj_mat_expanded # previous implementation use new_zeros, new_one yields better results points_4 = torch.cat([points_3d, points_3d.new_ones(points_shape)], dim=-1) # do the batch wise operation point_2d = torch.bmm(points_4, proj_mat.permute(0, 2, 1)) # point_2d = points_4 @ proj_mat.T point_2d_res = point_2d[..., :2] / point_2d[..., 2:3].clamp(min=1e-3) if with_depth: point_2d_res = torch.cat([point_2d_res, point_2d[..., 2:3]], dim=-1) return point_2d_res @array_converter(apply_to=('points', 'cam2img')) def points_img2cam( points: Union[Tensor, np.ndarray], cam2img: Union[Tensor, np.ndarray]) -> Union[Tensor, np.ndarray]: """Project points in image coordinates to camera coordinates. Args: points (Tensor or np.ndarray): 2.5D points in 2D images with shape [N, 3], 3 corresponds with x, y in the image and depth. cam2img (Tensor or np.ndarray): Camera intrinsic matrix. The shape can be [3, 3], [3, 4] or [4, 4]. Returns: Tensor or np.ndarray: Points in 3D space with shape [N, 3], 3 corresponds with x, y, z in 3D space. """ assert cam2img.shape[0] <= 4 assert cam2img.shape[1] <= 4 assert points.shape[1] == 3 xys = points[:, :2] depths = points[:, 2].view(-1, 1) unnormed_xys = torch.cat([xys * depths, depths], dim=1) pad_cam2img = torch.eye(4, dtype=xys.dtype, device=xys.device) pad_cam2img[:cam2img.shape[0], :cam2img.shape[1]] = cam2img inv_pad_cam2img = torch.inverse(pad_cam2img).transpose(0, 1) # Do operation in homogeneous coordinates. num_points = unnormed_xys.shape[0] homo_xys = torch.cat([unnormed_xys, xys.new_ones((num_points, 1))], dim=1) points3D = torch.mm(homo_xys, inv_pad_cam2img)[:, :3] return points3D def mono_cam_box2vis(cam_box): """This is a post-processing function on the bboxes from Mono-3D task. 
If we want to perform projection visualization, we need to: 1. rotate the box along x-axis for np.pi / 2 (roll) 2. change orientation from local yaw to global yaw 3. convert yaw by (np.pi / 2 - yaw) After applying this function, we can project and draw it on 2D images. Args: cam_box (:obj:`CameraInstance3DBoxes`): 3D bbox in camera coordinate system before conversion. Could be gt bbox loaded from dataset or network prediction output. Returns: :obj:`CameraInstance3DBoxes`: Box after conversion. """ warning.warn('DeprecationWarning: The hack of yaw and dimension in the ' 'monocular 3D detection on nuScenes has been removed. The ' 'function mono_cam_box2vis will be deprecated.') from .cam_box3d import CameraInstance3DBoxes assert isinstance(cam_box, CameraInstance3DBoxes), \ 'input bbox should be CameraInstance3DBoxes!' loc = cam_box.gravity_center dim = cam_box.dims yaw = cam_box.yaw feats = cam_box.tensor[:, 7:] # rotate along x-axis for np.pi / 2 # see also here: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/nuscenes_mono_dataset.py#L557 # noqa dim[:, [1, 2]] = dim[:, [2, 1]] # change local yaw to global yaw for visualization # refer to https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/nuscenes_mono_dataset.py#L164-L166 # noqa yaw += torch.atan2(loc[:, 0], loc[:, 2]) # convert yaw by (-yaw - np.pi / 2) # this is because mono 3D box class such as `NuScenesBox` has different # definition of rotation with our `CameraInstance3DBoxes` yaw = -yaw - np.pi / 2 cam_box = torch.cat([loc, dim, yaw[:, None], feats], dim=1) cam_box = CameraInstance3DBoxes(cam_box, box_dim=cam_box.shape[-1], origin=(0.5, 0.5, 0.5)) return cam_box def get_proj_mat_by_coord_type(img_meta: dict, coord_type: str) -> Tensor: """Obtain image features using points. Args: img_meta (dict): Meta information. coord_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'. Can be case- insensitive. Returns: Tensor: Transformation matrix. 
""" coord_type = coord_type.upper() mapping = {'LIDAR': 'lidar2img', 'DEPTH': 'depth2img', 'CAMERA': 'cam2img'} assert coord_type in mapping.keys() return img_meta[mapping[coord_type]] def yaw2local(yaw: Tensor, loc: Tensor) -> Tensor: """Transform global yaw to local yaw (alpha in kitti) in camera coordinates, ranges from -pi to pi. Args: yaw (Tensor): A vector with local yaw of each box in shape (N, ). loc (Tensor): Gravity center of each box in shape (N, 3). Returns: Tensor: Local yaw (alpha in kitti). """ local_yaw = yaw - torch.atan2(loc[:, 0], loc[:, 2]) larger_idx = (local_yaw > np.pi).nonzero(as_tuple=False) small_idx = (local_yaw < -np.pi).nonzero(as_tuple=False) if len(larger_idx) != 0: local_yaw[larger_idx] -= 2 * np.pi if len(small_idx) != 0: local_yaw[small_idx] += 2 * np.pi return local_yaw def get_lidar2img(cam2img: Tensor, lidar2cam: Tensor) -> Tensor: """Get the projection matrix of lidar2img. Args: cam2img (torch.Tensor): A 3x3 or 4x4 projection matrix. lidar2cam (torch.Tensor): A 3x3 or 4x4 projection matrix. Returns: Tensor: Transformation matrix with shape 4x4. """ if cam2img.shape == (3, 3): temp = cam2img.new_zeros(4, 4) temp[:3, :3] = cam2img temp[3, 3] = 1 cam2img = temp if lidar2cam.shape == (3, 3): temp = lidar2cam.new_zeros(4, 4) temp[:3, :3] = lidar2cam temp[3, 3] = 1 lidar2cam = temp return torch.matmul(cam2img, lidar2cam) ================================================ FILE: bip3d/structures/ops/__init__.py ================================================ # Copyright (c) OpenRobotLab. All rights reserved. 
# yapf:disable from .box_np_ops import (box2d_to_corner_jit, box3d_to_bbox, box_camera_to_lidar, boxes3d_to_corners3d_lidar, camera_to_lidar, center_to_corner_box2d, center_to_corner_box3d, center_to_minmax_2d, corner_to_standup_nd_jit, corner_to_surfaces_3d, corner_to_surfaces_3d_jit, corners_nd, create_anchors_3d_range, depth_to_lidar_points, depth_to_points, get_frustum, iou_jit, minmax_to_corner_2d, points_in_convex_polygon_3d_jit, points_in_convex_polygon_jit, points_in_rbbox, projection_matrix_to_CRT_kitti, rbbox2d_to_near_bbox, remove_outside_points, rotation_points_single_angle, surface_equ_3d) # yapf:enable from .iou3d_calculator import (AxisAlignedBboxOverlaps3D, BboxOverlaps3D, BboxOverlapsNearest3D, axis_aligned_bbox_overlaps_3d, bbox_overlaps_3d, bbox_overlaps_nearest_3d) from .transforms import bbox3d2result, bbox3d2roi, bbox3d_mapping_back __all__ = [ 'box2d_to_corner_jit', 'box3d_to_bbox', 'box_camera_to_lidar', 'boxes3d_to_corners3d_lidar', 'camera_to_lidar', 'center_to_corner_box2d', 'center_to_corner_box3d', 'center_to_minmax_2d', 'corner_to_standup_nd_jit', 'corner_to_surfaces_3d', 'corner_to_surfaces_3d_jit', 'corners_nd', 'create_anchors_3d_range', 'depth_to_lidar_points', 'depth_to_points', 'get_frustum', 'iou_jit', 'minmax_to_corner_2d', 'points_in_convex_polygon_3d_jit', 'points_in_convex_polygon_jit', 'points_in_rbbox', 'projection_matrix_to_CRT_kitti', 'rbbox2d_to_near_bbox', 'remove_outside_points', 'rotation_points_single_angle', 'surface_equ_3d', 'BboxOverlapsNearest3D', 'BboxOverlaps3D', 'bbox_overlaps_nearest_3d', 'bbox_overlaps_3d', 'AxisAlignedBboxOverlaps3D', 'axis_aligned_bbox_overlaps_3d', 'bbox3d_mapping_back', 'bbox3d2roi', 'bbox3d2result' ] ================================================ FILE: bip3d/structures/ops/box_np_ops.py ================================================ # Copyright (c) OpenRobotLab. All rights reserved. 
# TODO: clean the functions in this file and move the APIs into box bbox_3d # in the future # NOTICE: All functions in this file are valid for LiDAR or depth boxes only # if we use default parameters. import numba import numpy as np from bip3d.structures.bbox_3d import (limit_period, points_cam2img, rotation_3d_in_axis) def camera_to_lidar(points, r_rect, velo2cam): """Convert points in camera coordinate to lidar coordinate. Note: This function is for KITTI only. Args: points (np.ndarray, shape=[N, 3]): Points in camera coordinate. r_rect (np.ndarray, shape=[4, 4]): Matrix to project points in specific camera coordinate (e.g. CAM2) to CAM0. velo2cam (np.ndarray, shape=[4, 4]): Matrix to project points in camera coordinate to lidar coordinate. Returns: np.ndarray, shape=[N, 3]: Points in lidar coordinate. """ points_shape = list(points.shape[0:-1]) if points.shape[-1] == 3: points = np.concatenate([points, np.ones(points_shape + [1])], axis=-1) lidar_points = points @ np.linalg.inv((r_rect @ velo2cam).T) return lidar_points[..., :3] def box_camera_to_lidar(data, r_rect, velo2cam): """Convert boxes in camera coordinate to lidar coordinate. Note: This function is for KITTI only. Args: data (np.ndarray, shape=[N, 7]): Boxes in camera coordinate. r_rect (np.ndarray, shape=[4, 4]): Matrix to project points in specific camera coordinate (e.g. CAM2) to CAM0. velo2cam (np.ndarray, shape=[4, 4]): Matrix to project points in camera coordinate to lidar coordinate. Returns: np.ndarray, shape=[N, 3]: Boxes in lidar coordinate. """ xyz = data[:, 0:3] x_size, y_size, z_size = data[:, 3:4], data[:, 4:5], data[:, 5:6] r = data[:, 6:7] xyz_lidar = camera_to_lidar(xyz, r_rect, velo2cam) # yaw and dims also needs to be converted r_new = -r - np.pi / 2 r_new = limit_period(r_new, period=np.pi * 2) return np.concatenate([xyz_lidar, x_size, z_size, y_size, r_new], axis=1) def corners_nd(dims, origin=0.5): """Generate relative box corners based on length per dim and origin point. 
Args: dims (np.ndarray, shape=[N, ndim]): Array of length per dim origin (list or array or float, optional): origin point relate to smallest point. Defaults to 0.5 Returns: np.ndarray, shape=[N, 2 ** ndim, ndim]: Returned corners. point layout example: (2d) x0y0, x0y1, x1y0, x1y1; (3d) x0y0z0, x0y0z1, x0y1z0, x0y1z1, x1y0z0, x1y0z1, x1y1z0, x1y1z1 where x0 < x1, y0 < y1, z0 < z1. """ ndim = int(dims.shape[1]) corners_norm = np.stack(np.unravel_index(np.arange(2**ndim), [2] * ndim), axis=1).astype(dims.dtype) # now corners_norm has format: (2d) x0y0, x0y1, x1y0, x1y1 # (3d) x0y0z0, x0y0z1, x0y1z0, x0y1z1, x1y0z0, x1y0z1, x1y1z0, x1y1z1 # so need to convert to a format which is convenient to do other computing. # for 2d boxes, format is clockwise start with minimum point # for 3d boxes, please draw lines by your hand. if ndim == 2: # generate clockwise box corners corners_norm = corners_norm[[0, 1, 3, 2]] elif ndim == 3: corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] corners_norm = corners_norm - np.array(origin, dtype=dims.dtype) corners = dims.reshape([-1, 1, ndim]) * corners_norm.reshape( [1, 2**ndim, ndim]) return corners def center_to_corner_box2d(centers, dims, angles=None, origin=0.5): """Convert kitti locations, dimensions and angles to corners. format: center(xy), dims(xy), angles(counterclockwise when positive) Args: centers (np.ndarray): Locations in kitti label file with shape (N, 2). dims (np.ndarray): Dimensions in kitti label file with shape (N, 2). angles (np.ndarray, optional): Rotation_y in kitti label file with shape (N). Defaults to None. origin (list or array or float, optional): origin point relate to smallest point. Defaults to 0.5. Returns: np.ndarray: Corners with the shape of (N, 4, 2). """ # 'length' in kitti format is in x axis. # xyz(hwl)(kitti label file)<->xyz(lhw)(camera)<->z(-x)(-y)(wlh)(lidar) # center in kitti format is [0.5, 1.0, 0.5] in xyz. 
corners = corners_nd(dims, origin=origin) # corners: [N, 4, 2] if angles is not None: corners = rotation_3d_in_axis(corners, angles) corners += centers.reshape([-1, 1, 2]) return corners @numba.jit(nopython=True) def depth_to_points(depth, trunc_pixel): """Convert depth map to points. Args: depth (np.array, shape=[H, W]): Depth map which the row of [0~`trunc_pixel`] are truncated. trunc_pixel (int): The number of truncated row. Returns: np.ndarray: Points in camera coordinates. """ num_pts = np.sum(depth[trunc_pixel:, ] > 0.1) points = np.zeros((num_pts, 3), dtype=depth.dtype) x = np.array([0, 0, 1], dtype=depth.dtype) k = 0 for i in range(trunc_pixel, depth.shape[0]): for j in range(depth.shape[1]): if depth[i, j] > 0.1: x = np.array([j, i, 1], dtype=depth.dtype) points[k] = x * depth[i, j] k += 1 return points def depth_to_lidar_points(depth, trunc_pixel, P2, r_rect, velo2cam): """Convert depth map to points in lidar coordinate. Args: depth (np.array, shape=[H, W]): Depth map which the row of [0~`trunc_pixel`] are truncated. trunc_pixel (int): The number of truncated row. P2 (p.array, shape=[4, 4]): Intrinsics of Camera2. r_rect (np.ndarray, shape=[4, 4]): Matrix to project points in specific camera coordinate (e.g. CAM2) to CAM0. velo2cam (np.ndarray, shape=[4, 4]): Matrix to project points in camera coordinate to lidar coordinate. Returns: np.ndarray: Points in lidar coordinates. """ pts = depth_to_points(depth, trunc_pixel) points_shape = list(pts.shape[0:-1]) points = np.concatenate([pts, np.ones(points_shape + [1])], axis=-1) points = points @ np.linalg.inv(P2.T) lidar_points = camera_to_lidar(points, r_rect, velo2cam) return lidar_points def center_to_corner_box3d(centers, dims, angles=None, origin=(0.5, 1.0, 0.5), axis=1): """Convert kitti locations, dimensions and angles to corners. Args: centers (np.ndarray): Locations in kitti label file with shape (N, 3). dims (np.ndarray): Dimensions in kitti label file with shape (N, 3). 
angles (np.ndarray, optional): Rotation_y in kitti label file with shape (N). Defaults to None. origin (list or array or float, optional): Origin point relate to smallest point. Use (0.5, 1.0, 0.5) in camera and (0.5, 0.5, 0) in lidar. Defaults to (0.5, 1.0, 0.5). axis (int, optional): Rotation axis. 1 for camera and 2 for lidar. Defaults to 1. Returns: np.ndarray: Corners with the shape of (N, 8, 3). """ # 'length' in kitti format is in x axis. # yzx(hwl)(kitti label file)<->xyz(lhw)(camera)<->z(-x)(-y)(lwh)(lidar) # center in kitti format is [0.5, 1.0, 0.5] in xyz. corners = corners_nd(dims, origin=origin) # corners: [N, 8, 3] if angles is not None: corners = rotation_3d_in_axis(corners, angles, axis=axis) corners += centers.reshape([-1, 1, 3]) return corners @numba.jit(nopython=True) def box2d_to_corner_jit(boxes): """Convert box2d to corner. Args: boxes (np.ndarray, shape=[N, 5]): Boxes2d with rotation. Returns: box_corners (np.ndarray, shape=[N, 4, 2]): Box corners. """ num_box = boxes.shape[0] corners_norm = np.zeros((4, 2), dtype=boxes.dtype) corners_norm[1, 1] = 1.0 corners_norm[2] = 1.0 corners_norm[3, 0] = 1.0 corners_norm -= np.array([0.5, 0.5], dtype=boxes.dtype) corners = boxes.reshape(num_box, 1, 5)[:, :, 2:4] * corners_norm.reshape( 1, 4, 2) rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype) box_corners = np.zeros((num_box, 4, 2), dtype=boxes.dtype) for i in range(num_box): rot_sin = np.sin(boxes[i, -1]) rot_cos = np.cos(boxes[i, -1]) rot_mat_T[0, 0] = rot_cos rot_mat_T[0, 1] = rot_sin rot_mat_T[1, 0] = -rot_sin rot_mat_T[1, 1] = rot_cos box_corners[i] = corners[i] @ rot_mat_T + boxes[i, :2] return box_corners @numba.njit def corner_to_standup_nd_jit(boxes_corner): """Convert boxes_corner to aligned (min-max) boxes. Args: boxes_corner (np.ndarray, shape=[N, 2**dim, dim]): Boxes corners. Returns: np.ndarray, shape=[N, dim*2]: Aligned (min-max) boxes. 
""" num_boxes = boxes_corner.shape[0] ndim = boxes_corner.shape[-1] result = np.zeros((num_boxes, ndim * 2), dtype=boxes_corner.dtype) for i in range(num_boxes): for j in range(ndim): result[i, j] = np.min(boxes_corner[i, :, j]) for j in range(ndim): result[i, j + ndim] = np.max(boxes_corner[i, :, j]) return result @numba.jit(nopython=True) def corner_to_surfaces_3d_jit(corners): """Convert 3d box corners from corner function above to surfaces that normal vectors all direct to internal. Args: corners (np.ndarray): 3d box corners with the shape of (N, 8, 3). Returns: np.ndarray: Surfaces with the shape of (N, 6, 4, 3). """ # box_corners: [N, 8, 3], must from corner functions in this module num_boxes = corners.shape[0] surfaces = np.zeros((num_boxes, 6, 4, 3), dtype=corners.dtype) corner_idxes = np.array([ 0, 1, 2, 3, 7, 6, 5, 4, 0, 3, 7, 4, 1, 5, 6, 2, 0, 4, 5, 1, 3, 2, 6, 7 ]).reshape(6, 4) for i in range(num_boxes): for j in range(6): for k in range(4): surfaces[i, j, k] = corners[i, corner_idxes[j, k]] return surfaces def rotation_points_single_angle(points, angle, axis=0): """Rotate points with a single angle. Args: points (np.ndarray, shape=[N, 3]]): angle (np.ndarray, shape=[1]]): axis (int, optional): Axis to rotate at. Defaults to 0. Returns: np.ndarray: Rotated points. """ # points: [N, 3] rot_sin = np.sin(angle) rot_cos = np.cos(angle) if axis == 1: rot_mat_T = np.array( [[rot_cos, 0, rot_sin], [0, 1, 0], [-rot_sin, 0, rot_cos]], dtype=points.dtype) elif axis == 2 or axis == -1: rot_mat_T = np.array( [[rot_cos, rot_sin, 0], [-rot_sin, rot_cos, 0], [0, 0, 1]], dtype=points.dtype) elif axis == 0: rot_mat_T = np.array( [[1, 0, 0], [0, rot_cos, rot_sin], [0, -rot_sin, rot_cos]], dtype=points.dtype) else: raise ValueError('axis should in range') return points @ rot_mat_T, rot_mat_T def box3d_to_bbox(box3d, P2): """Convert box3d in camera coordinates to bbox in image coordinates. Args: box3d (np.ndarray, shape=[N, 7]): Boxes in camera coordinate. 
P2 (np.array, shape=[4, 4]): Intrinsics of Camera2. Returns: np.ndarray, shape=[N, 4]: Boxes 2d in image coordinates. """ box_corners = center_to_corner_box3d(box3d[:, :3], box3d[:, 3:6], box3d[:, 6], [0.5, 1.0, 0.5], axis=1) box_corners_in_image = points_cam2img(box_corners, P2) # box_corners_in_image: [N, 8, 2] minxy = np.min(box_corners_in_image, axis=1) maxxy = np.max(box_corners_in_image, axis=1) bbox = np.concatenate([minxy, maxxy], axis=1) return bbox def corner_to_surfaces_3d(corners): """convert 3d box corners from corner function above to surfaces that normal vectors all direct to internal. Args: corners (np.ndarray): 3D box corners with shape of (N, 8, 3). Returns: np.ndarray: Surfaces with the shape of (N, 6, 4, 3). """ # box_corners: [N, 8, 3], must from corner functions in this module surfaces = np.array([ [corners[:, 0], corners[:, 1], corners[:, 2], corners[:, 3]], [corners[:, 7], corners[:, 6], corners[:, 5], corners[:, 4]], [corners[:, 0], corners[:, 3], corners[:, 7], corners[:, 4]], [corners[:, 1], corners[:, 5], corners[:, 6], corners[:, 2]], [corners[:, 0], corners[:, 4], corners[:, 5], corners[:, 1]], [corners[:, 3], corners[:, 2], corners[:, 6], corners[:, 7]], ]).transpose([2, 0, 1, 3]) return surfaces def points_in_rbbox(points, rbbox, z_axis=2, origin=(0.5, 0.5, 0)): """Check points in rotated bbox and return indices. Note: This function is for counterclockwise boxes. Args: points (np.ndarray, shape=[N, 3+dim]): Points to query. rbbox (np.ndarray, shape=[M, 7]): Boxes3d with rotation. z_axis (int, optional): Indicate which axis is height. Defaults to 2. origin (tuple[int], optional): Indicate the position of box center. Defaults to (0.5, 0.5, 0). Returns: np.ndarray, shape=[N, M]: Indices of points in each box. 
""" # TODO: this function is different from PointCloud3D, be careful # when start to use nuscene, check the input rbbox_corners = center_to_corner_box3d(rbbox[:, :3], rbbox[:, 3:6], rbbox[:, 6], origin=origin, axis=z_axis) surfaces = corner_to_surfaces_3d(rbbox_corners) indices = points_in_convex_polygon_3d_jit(points[:, :3], surfaces) return indices def minmax_to_corner_2d(minmax_box): """Convert minmax box to corners2d. Args: minmax_box (np.ndarray, shape=[N, dims]): minmax boxes. Returns: np.ndarray: 2d corners of boxes """ ndim = minmax_box.shape[-1] // 2 center = minmax_box[..., :ndim] dims = minmax_box[..., ndim:] - center return center_to_corner_box2d(center, dims, origin=0.0) def create_anchors_3d_range(feature_size, anchor_range, sizes=((3.9, 1.6, 1.56), ), rotations=(0, np.pi / 2), dtype=np.float32): """Create anchors 3d by range. Args: feature_size (list[float] | tuple[float]): Feature map size. It is either a list of a tuple of [D, H, W](in order of z, y, and x). anchor_range (torch.Tensor | list[float]): Range of anchors with shape [6]. The order is consistent with that of anchors, i.e., (x_min, y_min, z_min, x_max, y_max, z_max). sizes (list[list] | np.ndarray | torch.Tensor, optional): Anchor size with shape [N, 3], in order of x, y, z. Defaults to ((3.9, 1.6, 1.56), ). rotations (list[float] | np.ndarray | torch.Tensor, optional): Rotations of anchors in a single feature grid. Defaults to (0, np.pi / 2). dtype (type, optional): Data type. Defaults to np.float32. Returns: np.ndarray: Range based anchors with shape of (*feature_size, num_sizes, num_rots, 7). 
""" anchor_range = np.array(anchor_range, dtype) z_centers = np.linspace(anchor_range[2], anchor_range[5], feature_size[0], dtype=dtype) y_centers = np.linspace(anchor_range[1], anchor_range[4], feature_size[1], dtype=dtype) x_centers = np.linspace(anchor_range[0], anchor_range[3], feature_size[2], dtype=dtype) sizes = np.reshape(np.array(sizes, dtype=dtype), [-1, 3]) rotations = np.array(rotations, dtype=dtype) rets = np.meshgrid(x_centers, y_centers, z_centers, rotations, indexing='ij') tile_shape = [1] * 5 tile_shape[-2] = int(sizes.shape[0]) for i in range(len(rets)): rets[i] = np.tile(rets[i][..., np.newaxis, :], tile_shape) rets[i] = rets[i][..., np.newaxis] # for concat sizes = np.reshape(sizes, [1, 1, 1, -1, 1, 3]) tile_size_shape = list(rets[0].shape) tile_size_shape[3] = 1 sizes = np.tile(sizes, tile_size_shape) rets.insert(3, sizes) ret = np.concatenate(rets, axis=-1) return np.transpose(ret, [2, 1, 0, 3, 4, 5]) def center_to_minmax_2d(centers, dims, origin=0.5): """Center to minmax. Args: centers (np.ndarray): Center points. dims (np.ndarray): Dimensions. origin (list or array or float, optional): Origin point relate to smallest point. Defaults to 0.5. Returns: np.ndarray: Minmax points. """ if origin == 0.5: return np.concatenate([centers - dims / 2, centers + dims / 2], axis=-1) corners = center_to_corner_box2d(centers, dims, origin=origin) return corners[:, [0, 2]].reshape([-1, 4]) def rbbox2d_to_near_bbox(rbboxes): """convert rotated bbox to nearest 'standing' or 'lying' bbox. Args: rbboxes (np.ndarray): Rotated bboxes with shape of (N, 5(x, y, xdim, ydim, rad)). Returns: np.ndarray: Bounding boxes with the shape of (N, 4(xmin, ymin, xmax, ymax)). 
""" rots = rbboxes[..., -1] rots_0_pi_div_2 = np.abs(limit_period(rots, 0.5, np.pi)) cond = (rots_0_pi_div_2 > np.pi / 4)[..., np.newaxis] bboxes_center = np.where(cond, rbboxes[:, [0, 1, 3, 2]], rbboxes[:, :4]) bboxes = center_to_minmax_2d(bboxes_center[:, :2], bboxes_center[:, 2:]) return bboxes @numba.jit(nopython=True) def iou_jit(boxes, query_boxes, mode='iou', eps=0.0): """Calculate box iou. Note that jit version runs ~10x faster than the box_overlaps function in mmdet3d.core.evaluation. Note: This function is for counterclockwise boxes. Args: boxes (np.ndarray): Input bounding boxes with shape of (N, 4). query_boxes (np.ndarray): Query boxes with shape of (K, 4). mode (str, optional): IoU mode. Defaults to 'iou'. eps (float, optional): Value added to denominator. Defaults to 0. Returns: np.ndarray: Overlap between boxes and query_boxes with the shape of [N, K]. """ N = boxes.shape[0] K = query_boxes.shape[0] overlaps = np.zeros((N, K), dtype=boxes.dtype) for k in range(K): box_area = ((query_boxes[k, 2] - query_boxes[k, 0] + eps) * (query_boxes[k, 3] - query_boxes[k, 1] + eps)) for n in range(N): iw = (min(boxes[n, 2], query_boxes[k, 2]) - max(boxes[n, 0], query_boxes[k, 0]) + eps) if iw > 0: ih = (min(boxes[n, 3], query_boxes[k, 3]) - max(boxes[n, 1], query_boxes[k, 1]) + eps) if ih > 0: if mode == 'iou': ua = ((boxes[n, 2] - boxes[n, 0] + eps) * (boxes[n, 3] - boxes[n, 1] + eps) + box_area - iw * ih) else: ua = ((boxes[n, 2] - boxes[n, 0] + eps) * (boxes[n, 3] - boxes[n, 1] + eps)) overlaps[n, k] = iw * ih / ua return overlaps def projection_matrix_to_CRT_kitti(proj): """Split projection matrix of KITTI. Note: This function is for KITTI only. P = C @ [R|T] C is upper triangular matrix, so we need to inverse CR and use QR stable for all kitti camera projection matrix. Args: proj (p.array, shape=[4, 4]): Intrinsics of camera. Returns: tuple[np.ndarray]: Splited matrix of C, R and T. 
""" CR = proj[0:3, 0:3] CT = proj[0:3, 3] RinvCinv = np.linalg.inv(CR) Rinv, Cinv = np.linalg.qr(RinvCinv) C = np.linalg.inv(Cinv) R = np.linalg.inv(Rinv) T = Cinv @ CT return C, R, T def remove_outside_points(points, rect, Trv2c, P2, image_shape): """Remove points which are outside of image. Note: This function is for KITTI only. Args: points (np.ndarray, shape=[N, 3+dims]): Total points. rect (np.ndarray, shape=[4, 4]): Matrix to project points in specific camera coordinate (e.g. CAM2) to CAM0. Trv2c (np.ndarray, shape=[4, 4]): Matrix to project points in camera coordinate to lidar coordinate. P2 (p.array, shape=[4, 4]): Intrinsics of Camera2. image_shape (list[int]): Shape of image. Returns: np.ndarray, shape=[N, 3+dims]: Filtered points. """ # 5x faster than remove_outside_points_v1(2ms vs 10ms) C, R, T = projection_matrix_to_CRT_kitti(P2) image_bbox = [0, 0, image_shape[1], image_shape[0]] frustum = get_frustum(image_bbox, C) frustum -= T frustum = np.linalg.inv(R) @ frustum.T frustum = camera_to_lidar(frustum.T, rect, Trv2c) frustum_surfaces = corner_to_surfaces_3d_jit(frustum[np.newaxis, ...]) indices = points_in_convex_polygon_3d_jit(points[:, :3], frustum_surfaces) points = points[indices.reshape([-1])] return points def get_frustum(bbox_image, C, near_clip=0.001, far_clip=100): """Get frustum corners in camera coordinates. Args: bbox_image (list[int]): box in image coordinates. C (np.ndarray): Intrinsics. near_clip (float, optional): Nearest distance of frustum. Defaults to 0.001. far_clip (float, optional): Farthest distance of frustum. Defaults to 100. Returns: np.ndarray, shape=[8, 3]: coordinates of frustum corners. 
""" fku = C[0, 0] fkv = -C[1, 1] u0v0 = C[0:2, 2] z_points = np.array([near_clip] * 4 + [far_clip] * 4, dtype=C.dtype)[:, np.newaxis] b = bbox_image box_corners = np.array( [[b[0], b[1]], [b[0], b[3]], [b[2], b[3]], [b[2], b[1]]], dtype=C.dtype) near_box_corners = (box_corners - u0v0) / np.array( [fku / near_clip, -fkv / near_clip], dtype=C.dtype) far_box_corners = (box_corners - u0v0) / np.array( [fku / far_clip, -fkv / far_clip], dtype=C.dtype) ret_xy = np.concatenate([near_box_corners, far_box_corners], axis=0) # [8, 2] ret_xyz = np.concatenate([ret_xy, z_points], axis=1) return ret_xyz def surface_equ_3d(polygon_surfaces): """ Args: polygon_surfaces (np.ndarray): Polygon surfaces with shape of [num_polygon, max_num_surfaces, max_num_points_of_surface, 3]. All surfaces' normal vector must direct to internal. Max_num_points_of_surface must at least 3. Returns: tuple: normal vector and its direction. """ # return [a, b, c], d in ax+by+cz+d=0 # polygon_surfaces: [num_polygon, num_surfaces, num_points_of_polygon, 3] surface_vec = polygon_surfaces[:, :, :2, :] - \ polygon_surfaces[:, :, 1:3, :] # normal_vec: [..., 3] normal_vec = np.cross(surface_vec[:, :, 0, :], surface_vec[:, :, 1, :]) # print(normal_vec.shape, points[..., 0, :].shape) # d = -np.inner(normal_vec, points[..., 0, :]) d = np.einsum('aij, aij->ai', normal_vec, polygon_surfaces[:, :, 0, :]) return normal_vec, -d @numba.njit def _points_in_convex_polygon_3d_jit(points, polygon_surfaces, normal_vec, d, num_surfaces): """ Args: points (np.ndarray): Input points with shape of (num_points, 3). polygon_surfaces (np.ndarray): Polygon surfaces with shape of (num_polygon, max_num_surfaces, max_num_points_of_surface, 3). All surfaces' normal vector must direct to internal. Max_num_points_of_surface must at least 3. normal_vec (np.ndarray): Normal vector of polygon_surfaces. d (int): Directions of normal vector. num_surfaces (np.ndarray): Number of surfaces a polygon contains shape of (num_polygon). 
Returns: np.ndarray: Result matrix with the shape of [num_points, num_polygon]. """ max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3] num_points = points.shape[0] num_polygons = polygon_surfaces.shape[0] ret = np.ones((num_points, num_polygons), dtype=np.bool_) sign = 0.0 for i in range(num_points): for j in range(num_polygons): for k in range(max_num_surfaces): if k > num_surfaces[j]: break sign = (points[i, 0] * normal_vec[j, k, 0] + points[i, 1] * normal_vec[j, k, 1] + points[i, 2] * normal_vec[j, k, 2] + d[j, k]) if sign >= 0: ret[i, j] = False break return ret def points_in_convex_polygon_3d_jit(points, polygon_surfaces, num_surfaces=None): """Check points is in 3d convex polygons. Args: points (np.ndarray): Input points with shape of (num_points, 3). polygon_surfaces (np.ndarray): Polygon surfaces with shape of (num_polygon, max_num_surfaces, max_num_points_of_surface, 3). All surfaces' normal vector must direct to internal. Max_num_points_of_surface must at least 3. num_surfaces (np.ndarray, optional): Number of surfaces a polygon contains shape of (num_polygon). Defaults to None. Returns: np.ndarray: Result matrix with the shape of [num_points, num_polygon]. """ max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3] # num_points = points.shape[0] num_polygons = polygon_surfaces.shape[0] if num_surfaces is None: num_surfaces = np.full((num_polygons, ), 9999999, dtype=np.int64) normal_vec, d = surface_equ_3d(polygon_surfaces[:, :, :3, :]) # normal_vec: [num_polygon, max_num_surfaces, 3] # d: [num_polygon, max_num_surfaces] return _points_in_convex_polygon_3d_jit(points, polygon_surfaces, normal_vec, d, num_surfaces) @numba.njit def points_in_convex_polygon_jit(points, polygon, clockwise=False): """Check points is in 2d convex polygons. True when point in polygon. Args: points (np.ndarray): Input points with the shape of [num_points, 2]. 
        polygon (np.ndarray): Input polygon with the shape of
            [num_polygon, num_points_of_polygon, 2].
        clockwise (bool, optional): Indicate polygon is clockwise. Defaults
            to False.

    Returns:
        np.ndarray: Result matrix with the shape of [num_points, num_polygon].
    """
    # first convert polygon to directed lines
    num_points_of_polygon = polygon.shape[1]
    num_points = points.shape[0]
    num_polygons = polygon.shape[0]
    # vec for all the polygons
    # Each vertex is paired with its predecessor (wrapping around) so vec1
    # holds one directed edge per polygon vertex; the winding order decides
    # the subtraction direction.
    if clockwise:
        vec1 = polygon - polygon[:,
                                 np.array([num_points_of_polygon - 1] +
                                          list(range(num_points_of_polygon -
                                                     1))), :]
    else:
        vec1 = polygon[:,
                       np.array([num_points_of_polygon - 1] +
                                list(range(num_points_of_polygon -
                                           1))), :] - polygon
    ret = np.zeros((num_points, num_polygons), dtype=np.bool_)
    success = True
    cross = 0.0
    for i in range(num_points):
        for j in range(num_polygons):
            success = True
            for k in range(num_points_of_polygon):
                # 2D cross product of the edge with the vector from the edge
                # start to the query point; a non-negative sign means the
                # point lies on the outward side of this edge, so it cannot
                # be strictly inside the convex polygon.
                vec = vec1[j, k]
                cross = vec[1] * (polygon[j, k, 0] - points[i, 0])
                cross -= vec[0] * (polygon[j, k, 1] - points[i, 1])
                if cross >= 0:
                    success = False
                    break
            ret[i, j] = success
    return ret


def boxes3d_to_corners3d_lidar(boxes3d, bottom_center=True):
    """Convert kitti center boxes to corners.

        7 -------- 4
       /|         /|
      6 -------- 5 .
      | |        | |
      . 3 -------- 0
      |/         |/
      2 -------- 1

    Note:
        This function is for LiDAR boxes only.

    Args:
        boxes3d (np.ndarray): Boxes with shape of (N, 7)
            [x, y, z, x_size, y_size, z_size, ry] in LiDAR coords,
            see the definition of ry in KITTI dataset.
        bottom_center (bool, optional): Whether z is on the bottom center
            of object. Defaults to True.

    Returns:
        np.ndarray: Box corners with the shape of [N, 8, 3].
    """
    boxes_num = boxes3d.shape[0]
    x_size, y_size, z_size = boxes3d[:, 3], boxes3d[:, 4], boxes3d[:, 5]
    x_corners = np.array([
        x_size / 2., -x_size / 2., -x_size / 2., x_size / 2., x_size / 2.,
        -x_size / 2., -x_size / 2., x_size / 2.
    ],
                         dtype=np.float32).T
    y_corners = np.array([
        -y_size / 2., -y_size / 2., y_size / 2., y_size / 2., -y_size / 2.,
        -y_size / 2., y_size / 2., y_size / 2.
def boxes3d_to_corners3d_lidar(boxes3d, bottom_center=True):
    """Convert kitti center boxes to corners.

        7 -------- 4
       /|         /|
      6 -------- 5 .
      | |        | |
      . 3 -------- 0
      |/         |/
      2 -------- 1

    Note:
        This function is for LiDAR boxes only.

    Args:
        boxes3d (np.ndarray): Boxes with shape of (N, 7)
            [x, y, z, x_size, y_size, z_size, ry] in LiDAR coords,
            see the definition of ry in KITTI dataset.
        bottom_center (bool, optional): Whether z is on the bottom center
            of object. Defaults to True.

    Returns:
        np.ndarray: Box corners with the shape of [N, 8, 3].
    """
    num_boxes = boxes3d.shape[0]
    dx, dy, dz = boxes3d[:, 3], boxes3d[:, 4], boxes3d[:, 5]
    # Corner offsets in the canonical (unrotated) box frame, each (N, 8).
    xc = np.array([
        dx / 2., -dx / 2., -dx / 2., dx / 2., dx / 2., -dx / 2., -dx / 2.,
        dx / 2.
    ], dtype=np.float32).T
    yc = np.array([
        -dy / 2., -dy / 2., dy / 2., dy / 2., -dy / 2., -dy / 2., dy / 2.,
        dy / 2.
    ], dtype=np.float32).T
    if bottom_center:
        # Bottom face sits at z = 0, top face at z = z_size.
        zc = np.zeros((num_boxes, 8), dtype=np.float32)
        zc[:, 4:8] = dz.reshape(num_boxes, 1).repeat(4, axis=1)  # (N, 8)
    else:
        zc = np.array([
            -dz / 2., -dz / 2., -dz / 2., -dz / 2., dz / 2., dz / 2.,
            dz / 2., dz / 2.
        ], dtype=np.float32).T

    # Per-box yaw rotation, assembled as (3, 3, N) then moved to (N, 3, 3).
    ry = boxes3d[:, 6]
    zeros = np.zeros(ry.size, dtype=np.float32)
    ones = np.ones(ry.size, dtype=np.float32)
    rot = np.array([[np.cos(ry), np.sin(ry), zeros],
                    [-np.sin(ry), np.cos(ry), zeros],
                    [zeros, zeros, ones]])
    rot = np.transpose(rot, (2, 0, 1))  # (N, 3, 3)

    local_corners = np.stack((xc, yc, zc), axis=2)  # (N, 8, 3)
    rotated = np.matmul(local_corners, rot)  # (N, 8, 3)
    # Translate rotated corners to each box's center location.
    corners = rotated + boxes3d[:, :3].reshape(-1, 1, 3)
    return corners.astype(np.float32)
""" def __init__(self, coordinate='lidar'): assert coordinate in ['camera', 'lidar', 'depth'] self.coordinate = coordinate def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False): """Calculate nearest 3D IoU. Note: If ``is_aligned`` is ``False``, then it calculates the ious between each bbox of bboxes1 and bboxes2, otherwise it calculates the ious between each aligned pair of bboxes1 and bboxes2. Args: bboxes1 (torch.Tensor): shape (N, 7+N) [x, y, z, x_size, y_size, z_size, ry, v]. bboxes2 (torch.Tensor): shape (M, 7+N) [x, y, z, x_size, y_size, z_size, ry, v]. mode (str): "iou" (intersection over union) or iof (intersection over foreground). is_aligned (bool): Whether the calculation is aligned. Return: torch.Tensor: If ``is_aligned`` is ``True``, return ious between bboxes1 and bboxes2 with shape (M, N). If ``is_aligned`` is ``False``, return shape is M. """ return bbox_overlaps_nearest_3d(bboxes1, bboxes2, mode, is_aligned, self.coordinate) def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ repr_str += f'(coordinate={self.coordinate}' return repr_str @TASK_UTILS.register_module() class BboxOverlaps3D(object): """3D IoU Calculator. Args: coordinate (str): The coordinate system, valid options are 'camera', 'lidar', and 'depth'. """ def __init__(self, coordinate): assert coordinate in ['camera', 'lidar', 'depth'] self.coordinate = coordinate def __call__(self, bboxes1, bboxes2, mode='iou'): """Calculate 3D IoU using cuda implementation. Note: This function calculate the IoU of 3D boxes based on their volumes. IoU calculator ``:class:BboxOverlaps3D`` uses this function to calculate the actual 3D IoUs of boxes. Args: bboxes1 (torch.Tensor): with shape (N, 7+C), (x, y, z, x_size, y_size, z_size, ry, v*). bboxes2 (torch.Tensor): with shape (M, 7+C), (x, y, z, x_size, y_size, z_size, ry, v*). mode (str): "iou" (intersection over union) or iof (intersection over foreground). 
def bbox_overlaps_nearest_3d(bboxes1,
                             bboxes2,
                             mode='iou',
                             is_aligned=False,
                             coordinate='lidar'):
    """Calculate nearest 3D IoU.

    Note:
        This function first finds the nearest 2D boxes in bird eye view
        (BEV), and then calculates the 2D IoU using :meth:`bbox_overlaps`.
        This IoU calculator :class:`BboxOverlapsNearest3D` uses this
        function to calculate IoUs of boxes.

        If ``is_aligned`` is ``False``, then it calculates the ious between
        each bbox of bboxes1 and bboxes2, otherwise the ious between each
        aligned pair of bboxes1 and bboxes2.

    Args:
        bboxes1 (torch.Tensor): with shape (N, 7+C),
            (x, y, z, x_size, y_size, z_size, ry, v*).
        bboxes2 (torch.Tensor): with shape (M, 7+C),
            (x, y, z, x_size, y_size, z_size, ry, v*).
        mode (str): "iou" (intersection over union) or iof
            (intersection over foreground).
        is_aligned (bool): Whether the calculation is aligned.

    Return:
        torch.Tensor: IoU matrix between ``bboxes1`` and ``bboxes2``; one
        value per aligned pair when ``is_aligned`` is ``True``.
    """
    assert bboxes1.size(-1) == bboxes2.size(-1) >= 7

    box_cls, _ = get_box_type(coordinate)
    boxes_a = box_cls(bboxes1, box_dim=bboxes1.shape[-1])
    boxes_b = box_cls(bboxes2, box_dim=bboxes2.shape[-1])

    # Project to nearest axis-aligned BEV boxes and reuse the 2D IoU op.
    # Box conversion and IoU calculation in torch on CUDA is ~10x faster
    # than the numpy version.
    return bbox_overlaps(boxes_a.nearest_bev,
                         boxes_b.nearest_bev,
                         mode=mode,
                         is_aligned=is_aligned)


def bbox_overlaps_3d(bboxes1, bboxes2, mode='iou', coordinate='camera'):
    """Calculate 3D IoU using cuda implementation.

    Note:
        This function calculates the IoU of 3D boxes based on their
        volumes. IoU calculator :class:`BboxOverlaps3D` uses this function
        to calculate the actual IoUs of boxes.

    Args:
        bboxes1 (torch.Tensor): with shape (N, 7+C),
            (x, y, z, x_size, y_size, z_size, ry, v*).
        bboxes2 (torch.Tensor): with shape (M, 7+C),
            (x, y, z, x_size, y_size, z_size, ry, v*).
        mode (str): "iou" (intersection over union) or iof
            (intersection over foreground).
        coordinate (str): 'camera' or 'lidar' coordinate system.

    Return:
        torch.Tensor: Bbox overlaps results of bboxes1 and bboxes2 with
        shape (M, N) (aligned mode is not supported currently).
    """
    assert bboxes1.size(-1) == bboxes2.size(-1) >= 7

    box_cls, _ = get_box_type(coordinate)
    boxes_a = box_cls(bboxes1, box_dim=bboxes1.shape[-1])
    boxes_b = box_cls(bboxes2, box_dim=bboxes2.shape[-1])

    # `overlaps` is exposed on the box class; pass both box objects
    # explicitly (same call shape as upstream).
    return boxes_a.overlaps(boxes_a, boxes_b, mode=mode)
@TASK_UTILS.register_module()
class AxisAlignedBboxOverlaps3D(object):
    """Axis-aligned 3D Overlaps (IoU) Calculator."""

    def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False):
        """Calculate IoU between 2D bboxes.

        Args:
            bboxes1 (Tensor): shape (B, m, 6) in format or empty.
            bboxes2 (Tensor): shape (B, n, 6) in format or empty.
                B indicates the batch dim, in shape (B1, B2, ..., Bn).
                If ``is_aligned`` is ``True``, then m and n must be equal.
            mode (str): "iou" (intersection over union) or "giou"
                (generalized intersection over union).
            is_aligned (bool, optional): If True, then m and n must be
                equal. Defaults to False.

        Returns:
            Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,)
        """
        # Axis-aligned boxes carry exactly 6 values: min and max corners.
        assert bboxes1.size(-1) == bboxes2.size(-1) == 6
        return axis_aligned_bbox_overlaps_3d(bboxes1, bboxes2, mode,
                                             is_aligned)

    def __repr__(self):
        """str: a string describing the module"""
        return self.__class__.__name__ + '()'
def axis_aligned_bbox_overlaps_3d(bboxes1,
                                  bboxes2,
                                  mode='iou',
                                  is_aligned=False,
                                  eps=1e-6):
    """Calculate overlap between two set of axis aligned 3D bboxes.

    If ``is_aligned`` is ``False``, then calculate the overlaps between
    each bbox of bboxes1 and bboxes2, otherwise the overlaps between each
    aligned pair of bboxes1 and bboxes2.

    Args:
        bboxes1 (Tensor): shape (B, m, 6) in format or empty.
        bboxes2 (Tensor): shape (B, n, 6) in format or empty.
            B indicates the batch dim, in shape (B1, B2, ..., Bn).
            If ``is_aligned`` is ``True``, then m and n must be equal.
        mode (str): "iou" (intersection over union) or "giou" (generalized
            intersection over union).
        is_aligned (bool, optional): If True, then m and n must be equal.
            Defaults to False.
        eps (float, optional): A value added to the denominator for
            numerical stability. Defaults to 1e-6.

    Returns:
        Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,)

    Example:
        >>> bboxes1 = torch.FloatTensor([
        >>>     [0, 0, 0, 10, 10, 10],
        >>>     [10, 10, 10, 20, 20, 20],
        >>>     [32, 32, 32, 38, 40, 42],
        >>> ])
        >>> bboxes2 = torch.FloatTensor([
        >>>     [0, 0, 0, 10, 20, 20],
        >>>     [0, 10, 10, 10, 19, 20],
        >>>     [10, 10, 10, 20, 20, 20],
        >>> ])
        >>> overlaps = axis_aligned_bbox_overlaps_3d(bboxes1, bboxes2)
        >>> assert overlaps.shape == (3, 3)
        >>> overlaps = bbox_overlaps(bboxes1, bboxes2, is_aligned=True)
        >>> assert overlaps.shape == (3, )

    Example:
        >>> empty = torch.empty(0, 6)
        >>> nonempty = torch.FloatTensor([[0, 0, 0, 10, 9, 10]])
        >>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1)
        >>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0)
        >>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0)
    """
    assert mode in ['iou', 'giou'], f'Unsupported mode {mode}'
    # Either the boxes are empty or the last dimension holds the 6
    # corner coordinates (xmin, ymin, zmin, xmax, ymax, zmax).
    assert (bboxes1.size(-1) == 6 or bboxes1.size(0) == 0)
    assert (bboxes2.size(-1) == 6 or bboxes2.size(0) == 0)
    # Leading (batch) dims must agree: (B1, B2, ..., Bn).
    assert bboxes1.shape[:-2] == bboxes2.shape[:-2]
    batch_shape = bboxes1.shape[:-2]

    rows = bboxes1.size(-2)
    cols = bboxes2.size(-2)
    if is_aligned:
        assert rows == cols
    # Early exit for empty inputs; keep the documented result shape.
    if rows * cols == 0:
        out_shape = batch_shape + ((rows, ) if is_aligned else (rows, cols))
        return bboxes1.new(out_shape)

    vol1 = ((bboxes1[..., 3] - bboxes1[..., 0]) *
            (bboxes1[..., 4] - bboxes1[..., 1]) *
            (bboxes1[..., 5] - bboxes1[..., 2]))
    vol2 = ((bboxes2[..., 3] - bboxes2[..., 0]) *
            (bboxes2[..., 4] - bboxes2[..., 1]) *
            (bboxes2[..., 5] - bboxes2[..., 2]))

    if is_aligned:
        # Pairwise intersection box of each aligned pair.
        corner_min = torch.max(bboxes1[..., :3], bboxes2[..., :3])
        corner_max = torch.min(bboxes1[..., 3:], bboxes2[..., 3:])
        whd = (corner_max - corner_min).clamp(min=0)  # [B, rows, 3]
        overlap = whd[..., 0] * whd[..., 1] * whd[..., 2]
        union = vol1 + vol2 - overlap
        if mode == 'giou':
            enclosed_min = torch.min(bboxes1[..., :3], bboxes2[..., :3])
            enclosed_max = torch.max(bboxes1[..., 3:], bboxes2[..., 3:])
    else:
        # Broadcast to all (row, col) pairs: [B, rows, cols, 3].
        corner_min = torch.max(bboxes1[..., :, None, :3],
                               bboxes2[..., None, :, :3])
        corner_max = torch.min(bboxes1[..., :, None, 3:],
                               bboxes2[..., None, :, 3:])
        whd = (corner_max - corner_min).clamp(min=0)
        overlap = whd[..., 0] * whd[..., 1] * whd[..., 2]
        union = vol1[..., None] + vol2[..., None, :] - overlap
        if mode == 'giou':
            enclosed_min = torch.min(bboxes1[..., :, None, :3],
                                     bboxes2[..., None, :, :3])
            enclosed_max = torch.max(bboxes1[..., :, None, 3:],
                                     bboxes2[..., None, :, 3:])

    # Clamp the denominator away from zero for degenerate boxes.
    eps_t = union.new_tensor([eps])
    union = torch.max(union, eps_t)
    ious = overlap / union
    if mode == 'iou':
        return ious
    # Generalized IoU: penalize by the empty part of the enclosing box.
    enclose_whd = (enclosed_max - enclosed_min).clamp(min=0)
    enclose_vol = (enclose_whd[..., 0] * enclose_whd[..., 1] *
                   enclose_whd[..., 2])
    enclose_vol = torch.max(enclose_vol, eps_t)
    return ious - (enclose_vol - union) / enclose_vol
def bbox3d_mapping_back(bboxes, scale_factor, flip_horizontal, flip_vertical):
    """Map bboxes from testing scale to original image scale.

    Args:
        bboxes (:obj:`BaseInstance3DBoxes`): Boxes to be mapped back.
        scale_factor (float): Scale factor.
        flip_horizontal (bool): Whether to flip horizontally.
        flip_vertical (bool): Whether to flip vertically.

    Returns:
        :obj:`BaseInstance3DBoxes`: Boxes mapped back.
    """
    # Work on a copy: flip/scale mutate the box object in place.
    new_bboxes = bboxes.clone()
    if flip_horizontal:
        new_bboxes.flip('horizontal')
    if flip_vertical:
        new_bboxes.flip('vertical')
    new_bboxes.scale(1 / scale_factor)
    return new_bboxes


def bbox3d2roi(bbox_list):
    """Convert a list of bounding boxes to roi format.

    Args:
        bbox_list (list[torch.Tensor]): A list of bounding boxes
            corresponding to a batch of images.

    Returns:
        torch.Tensor: Region of interests in shape (n, c), where
            the channels are in order of [batch_ind, x, y ...].
    """
    rois_list = []
    for img_id, bboxes in enumerate(bbox_list):
        if bboxes.size(0) > 0:
            # Prepend the sample index as the first channel.
            img_inds = bboxes.new_full((bboxes.size(0), 1), img_id)
            rois = torch.cat([img_inds, bboxes], dim=-1)
        else:
            # Bug fix: an empty entry must still carry the batch-index
            # column. The previous `torch.zeros_like(bboxes)` kept only
            # `c` columns while populated rois have `c + 1`, so
            # `torch.cat` below failed whenever a batch mixed empty and
            # non-empty samples.
            rois = bboxes.new_zeros((0, bboxes.size(-1) + 1))
        rois_list.append(rois)
    rois = torch.cat(rois_list, 0)
    return rois
""" result_dict = dict(bboxes_3d=bboxes.to('cpu'), scores_3d=scores.cpu(), labels_3d=labels.cpu()) if attrs is not None: result_dict['attr_labels'] = attrs.cpu() return result_dict ================================================ FILE: bip3d/structures/points/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .base_points import BasePoints from .cam_points import CameraPoints from .depth_points import DepthPoints from .lidar_points import LiDARPoints __all__ = ['BasePoints', 'CameraPoints', 'DepthPoints', 'LiDARPoints'] def get_points_type(points_type: str) -> type: """Get the class of points according to coordinate type. Args: points_type (str): The type of points coordinate. The valid value are "CAMERA", "LIDAR" and "DEPTH". Returns: type: Points type. """ points_type_upper = points_type.upper() if points_type_upper == 'CAMERA': points_cls = CameraPoints elif points_type_upper == 'LIDAR': points_cls = LiDARPoints elif points_type_upper == 'DEPTH': points_cls = DepthPoints else: raise ValueError('Only "points_type" of "CAMERA", "LIDAR" and "DEPTH" ' f'are supported, got {points_type}') return points_cls ================================================ FILE: bip3d/structures/points/base_points.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import warnings from abc import abstractmethod from typing import Iterator, Optional, Sequence, Union import numpy as np import torch from torch import Tensor from bip3d.structures.bbox_3d.utils import (rotation_3d_in_axis, rotation_3d_in_euler) class BasePoints: """Base class for Points. Args: tensor (Tensor or np.ndarray or Sequence[Sequence[float]]): The points data with shape (N, points_dim). points_dim (int): Integer indicating the dimension of a point. Each row is (x, y, z, ...). Defaults to 3. attribute_dims (dict, optional): Dictionary to indicate the meaning of extra dimension. Defaults to None. 
class BasePoints:
    """Base class for Points.

    Args:
        tensor (Tensor or np.ndarray or Sequence[Sequence[float]]): The
            points data with shape (N, points_dim).
        points_dim (int): Integer indicating the dimension of a point. Each
            row is (x, y, z, ...). Defaults to 3.
        attribute_dims (dict, optional): Dictionary to indicate the meaning
            of extra dimension. Defaults to None.

    Attributes:
        tensor (Tensor): Float matrix with shape (N, points_dim).
        points_dim (int): Integer indicating the dimension of a point. Each
            row is (x, y, z, ...).
        attribute_dims (dict, optional): Dictionary to indicate the meaning
            of extra dimension. Defaults to None.
        rotation_axis (int): Default rotation axis for points rotation.
    """

    def __init__(self,
                 tensor: Union[Tensor, np.ndarray, Sequence[Sequence[float]]],
                 points_dim: int = 3,
                 attribute_dims: Optional[dict] = None) -> None:
        if isinstance(tensor, Tensor):
            device = tensor.device
        else:
            device = torch.device('cpu')
        # Always store a float32 copy on the source device.
        tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
        if tensor.numel() == 0:
            # Use reshape, so we don't end up creating a new tensor that
            # does not depend on the inputs (and consequently confuses jit)
            tensor = tensor.reshape((-1, points_dim))
        assert tensor.dim() == 2 and tensor.size(-1) == points_dim, \
            ('The points dimension must be 2 and the length of the last '
             f'dimension must be {points_dim}, but got points with shape '
             f'{tensor.shape}.')
        self.tensor = tensor.clone()
        self.points_dim = points_dim
        self.attribute_dims = attribute_dims
        # Subclasses override this (e.g. 1 for camera, 2 for lidar/depth).
        self.rotation_axis = 0

    @property
    def coord(self) -> Tensor:
        """Tensor: Coordinates of each point in shape (N, 3)."""
        return self.tensor[:, :3]

    @coord.setter
    def coord(self, tensor: Union[Tensor, np.ndarray]) -> None:
        """Set the coordinates of each point.

        Args:
            tensor (Tensor or np.ndarray): Coordinates of each point with
                shape (N, 3).
        """
        try:
            tensor = tensor.reshape(self.shape[0], 3)
        except (RuntimeError, ValueError):  # for torch.Tensor and np.ndarray
            raise ValueError(f'got unexpected shape {tensor.shape}')
        if not isinstance(tensor, Tensor):
            tensor = self.tensor.new_tensor(tensor)
        self.tensor[:, :3] = tensor

    @property
    def height(self) -> Union[Tensor, None]:
        """Tensor or None: Returns a vector with height of each point in
        shape (N, )."""
        if self.attribute_dims is not None and \
                'height' in self.attribute_dims.keys():
            return self.tensor[:, self.attribute_dims['height']]
        else:
            return None

    @height.setter
    def height(self, tensor: Union[Tensor, np.ndarray]) -> None:
        """Set the height of each point.

        Args:
            tensor (Tensor or np.ndarray): Height of each point with shape
                (N, ).
        """
        try:
            tensor = tensor.reshape(self.shape[0])
        except (RuntimeError, ValueError):  # for torch.Tensor and np.ndarray
            raise ValueError(f'got unexpected shape {tensor.shape}')
        if not isinstance(tensor, Tensor):
            tensor = self.tensor.new_tensor(tensor)
        if self.attribute_dims is not None and \
                'height' in self.attribute_dims.keys():
            self.tensor[:, self.attribute_dims['height']] = tensor
        else:
            # add height attribute: append a new column and record its
            # index in attribute_dims.
            if self.attribute_dims is None:
                self.attribute_dims = dict()
            attr_dim = self.shape[1]
            self.tensor = torch.cat([self.tensor, tensor.unsqueeze(1)],
                                    dim=1)
            self.attribute_dims.update(dict(height=attr_dim))
            self.points_dim += 1

    @property
    def color(self) -> Union[Tensor, None]:
        """Tensor or None: Returns a vector with color of each point in
        shape (N, 3)."""
        if self.attribute_dims is not None and \
                'color' in self.attribute_dims.keys():
            return self.tensor[:, self.attribute_dims['color']]
        else:
            return None

    @color.setter
    def color(self, tensor: Union[Tensor, np.ndarray]) -> None:
        """Set the color of each point.

        Args:
            tensor (Tensor or np.ndarray): Color of each point with shape
                (N, 3).
        """
        try:
            tensor = tensor.reshape(self.shape[0], 3)
        except (RuntimeError, ValueError):  # for torch.Tensor and np.ndarray
            raise ValueError(f'got unexpected shape {tensor.shape}')
        if tensor.max() >= 256 or tensor.min() < 0:
            warnings.warn('point got color value beyond [0, 255]')
        if not isinstance(tensor, Tensor):
            tensor = self.tensor.new_tensor(tensor)
        if self.attribute_dims is not None and \
                'color' in self.attribute_dims.keys():
            self.tensor[:, self.attribute_dims['color']] = tensor
        else:
            # add color attribute: append three new columns (r, g, b) and
            # record their indices in attribute_dims.
            if self.attribute_dims is None:
                self.attribute_dims = dict()
            attr_dim = self.shape[1]
            self.tensor = torch.cat([self.tensor, tensor], dim=1)
            self.attribute_dims.update(
                dict(color=[attr_dim, attr_dim + 1, attr_dim + 2]))
            self.points_dim += 3

    @property
    def shape(self) -> torch.Size:
        """torch.Size: Shape of points."""
        return self.tensor.shape

    def shuffle(self) -> Tensor:
        """Shuffle the points.

        Returns:
            Tensor: The shuffled index.
        """
        idx = torch.randperm(self.__len__(), device=self.tensor.device)
        self.tensor = self.tensor[idx]
        return idx

    def rotate(self,
               rotation: Union[Tensor, np.ndarray, float],
               axis: Optional[int] = None) -> Tensor:
        """Rotate points with the given rotation matrix or angle.

        Args:
            rotation (Tensor or np.ndarray or float): Rotation matrix or
                angle.
            axis (int, optional): Axis to rotate at. Defaults to None.

        Returns:
            Tensor: Rotation matrix.
        """
        if not isinstance(rotation, Tensor):
            rotation = self.tensor.new_tensor(rotation)
        assert rotation.shape == torch.Size([3, 3]) or \
            rotation.numel() == 1, \
            f'invalid rotation shape {rotation.shape}'

        if axis is None:
            axis = self.rotation_axis

        if rotation.numel() == 1:
            # Scalar angle: rotate around the chosen axis.
            rotated_points, rot_mat_T = rotation_3d_in_axis(
                self.tensor[:, :3][None], rotation, axis=axis,
                return_mat=True)
            self.tensor[:, :3] = rotated_points.squeeze(0)
            rot_mat_T = rot_mat_T.squeeze(0)
        elif rotation.numel() == 3:
            # NOTE(review): this euler branch looks unreachable — the
            # assert above only admits shape (3, 3) or numel() == 1.
            # Confirm whether 3-element euler angles were meant to pass.
            rotated_points, rot_mat_T = rotation_3d_in_euler(
                self.tensor[:, :3][None], rotation, return_mat=True)
            self.tensor[:, :3] = rotated_points.squeeze(0)
            rot_mat_T = rot_mat_T.squeeze(0)
        else:  # rotation.numel() == 9
            # Full 3x3 matrix: apply it directly.
            self.tensor[:, :3] = self.tensor[:, :3] @ rotation
            rot_mat_T = rotation

        return rot_mat_T

    @abstractmethod
    def flip(self, bev_direction: str = 'horizontal') -> None:
        """Flip the points along given BEV direction.

        Args:
            bev_direction (str): Flip direction (horizontal or vertical).
                Defaults to 'horizontal'.
        """
        pass

    def translate(self, trans_vector: Union[Tensor, np.ndarray]) -> None:
        """Translate points with the given translation vector.

        Args:
            trans_vector (Tensor or np.ndarray): Translation vector of size
                3 or nx3.
        """
        if not isinstance(trans_vector, Tensor):
            trans_vector = self.tensor.new_tensor(trans_vector)
        trans_vector = trans_vector.squeeze(0)
        if trans_vector.dim() == 1:
            assert trans_vector.shape[0] == 3
        elif trans_vector.dim() == 2:
            assert trans_vector.shape[0] == self.tensor.shape[0] and \
                trans_vector.shape[1] == 3
        else:
            raise NotImplementedError(
                f'Unsupported translation vector of shape '
                f'{trans_vector.shape}'
            )
        self.tensor[:, :3] += trans_vector

    def in_range_3d(
        self, point_range: Union[Tensor, np.ndarray,
                                 Sequence[float]]) -> Tensor:
        """Check whether the points are in the given range.

        Args:
            point_range (Tensor or np.ndarray or Sequence[float]): The
                range of point (x_min, y_min, z_min, x_max, y_max, z_max).

        Note:
            In the original implementation of SECOND, checking whether a
            box in the range checks whether the points are in a convex
            polygon, we try to reduce the burden for simpler cases.

        Returns:
            Tensor: A binary vector indicating whether each point is inside
            the reference range.
        """
        in_range_flags = ((self.tensor[:, 0] > point_range[0])
                          & (self.tensor[:, 1] > point_range[1])
                          & (self.tensor[:, 2] > point_range[2])
                          & (self.tensor[:, 0] < point_range[3])
                          & (self.tensor[:, 1] < point_range[4])
                          & (self.tensor[:, 2] < point_range[5]))
        return in_range_flags

    @property
    def bev(self) -> Tensor:
        """Tensor: BEV of the points in shape (N, 2)."""
        return self.tensor[:, [0, 1]]

    def in_range_bev(
        self, point_range: Union[Tensor, np.ndarray,
                                 Sequence[float]]) -> Tensor:
        """Check whether the points are in the given range.

        Args:
            point_range (Tensor or np.ndarray or Sequence[float]): The
                range of point in order of (x_min, y_min, x_max, y_max).

        Returns:
            Tensor: A binary vector indicating whether each point is inside
            the reference range.
        """
        in_range_flags = ((self.bev[:, 0] > point_range[0])
                          & (self.bev[:, 1] > point_range[1])
                          & (self.bev[:, 0] < point_range[2])
                          & (self.bev[:, 1] < point_range[3]))
        return in_range_flags

    @abstractmethod
    def convert_to(
            self,
            dst: int,
            rt_mat: Optional[Union[Tensor,
                                   np.ndarray]] = None) -> 'BasePoints':
        """Convert self to ``dst`` mode.

        Args:
            dst (int): The target Point mode.
            rt_mat (Tensor or np.ndarray, optional): The rotation and
                translation matrix between different coordinates.
                Defaults to None. The conversion from ``src`` coordinates
                to ``dst`` coordinates usually comes along the change of
                sensors, e.g., from camera to LiDAR. This requires a
                transformation matrix.

        Returns:
            :obj:`BasePoints`: The converted point of the same type in the
            ``dst`` mode.
        """
        pass

    def scale(self, scale_factor: float) -> None:
        """Scale the points with horizontal and vertical scaling factors.

        Args:
            scale_factors (float): Scale factors to scale the points.
        """
        self.tensor[:, :3] *= scale_factor

    def __getitem__(
        self, item: Union[int, tuple, slice, np.ndarray,
                          Tensor]) -> 'BasePoints':
        """
        Args:
            item (int or tuple or slice or np.ndarray or Tensor): Index of
                points.

        Note:
            The following usage are allowed:

            1. `new_points = points[3]`: Return a `Points` that contains
               only one point.
            2. `new_points = points[2:10]`: Return a slice of points.
            3. `new_points = points[vector]`: Whether vector is a
               torch.BoolTensor with `length = len(points)`. Nonzero
               elements in the vector will be selected.
            4. `new_points = points[3:11, vector]`: Return a slice of
               points and attribute dims.
            5. `new_points = points[4:12, 2]`: Return a slice of points
               with single attribute.

            Note that the returned Points might share storage with this
            Points, subject to PyTorch's indexing semantics.

        Returns:
            :obj:`BasePoints`: A new object of :class:`BasePoints` after
            indexing.
        """
        original_type = type(self)
        if isinstance(item, int):
            return original_type(self.tensor[item].view(1, -1),
                                 points_dim=self.points_dim,
                                 attribute_dims=self.attribute_dims)
        elif isinstance(item, tuple) and len(item) == 2:
            # Normalize the column selector to an explicit list of column
            # indices so attribute bookkeeping below can intersect it.
            if isinstance(item[1], slice):
                start = 0 if item[1].start is None else item[1].start
                stop = self.tensor.shape[1] \
                    if item[1].stop is None else item[1].stop
                step = 1 if item[1].step is None else item[1].step
                item = list(item)
                item[1] = list(range(start, stop, step))
                item = tuple(item)
            elif isinstance(item[1], int):
                item = list(item)
                item[1] = [item[1]]
                item = tuple(item)
            p = self.tensor[item[0], item[1]]

            # Columns >= 3 are attribute columns; keep only attribute
            # entries whose columns survive the selection.
            keep_dims = list(
                set(item[1]).intersection(set(range(3,
                                                    self.tensor.shape[1]))))
            if self.attribute_dims is not None:
                attribute_dims = self.attribute_dims.copy()
                for key in self.attribute_dims.keys():
                    cur_attribute_dims = attribute_dims[key]
                    if isinstance(cur_attribute_dims, int):
                        cur_attribute_dims = [cur_attribute_dims]
                    intersect_attr = list(
                        set(cur_attribute_dims).intersection(
                            set(keep_dims)))
                    if len(intersect_attr) == 1:
                        attribute_dims[key] = intersect_attr[0]
                    elif len(intersect_attr) > 1:
                        attribute_dims[key] = intersect_attr
                    else:
                        attribute_dims.pop(key)
            else:
                attribute_dims = None
        elif isinstance(item, (slice, np.ndarray, Tensor)):
            p = self.tensor[item]
            attribute_dims = self.attribute_dims
        else:
            raise NotImplementedError(f'Invalid slice {item}!')

        assert p.dim() == 2, \
            f'Indexing on Points with {item} failed to return a matrix!'
        return original_type(p,
                             points_dim=p.shape[1],
                             attribute_dims=attribute_dims)

    def __len__(self) -> int:
        """int: Number of points in the current object."""
        return self.tensor.shape[0]

    def __repr__(self) -> str:
        """str: Return a string that describes the object."""
        return self.__class__.__name__ + '(\n    ' + str(self.tensor) + ')'

    @classmethod
    def cat(cls, points_list: Sequence['BasePoints']) -> 'BasePoints':
        """Concatenate a list of Points into a single Points.

        Args:
            points_list (Sequence[:obj:`BasePoints`]): List of points.

        Returns:
            :obj:`BasePoints`: The concatenated points.
        """
        assert isinstance(points_list, (list, tuple))
        if len(points_list) == 0:
            return cls(torch.empty(0))
        assert all(isinstance(points, cls) for points in points_list)

        # use torch.cat (v.s. layers.cat)
        # so the returned points never share storage with input
        cat_points = cls(torch.cat([p.tensor for p in points_list], dim=0),
                         points_dim=points_list[0].points_dim,
                         attribute_dims=points_list[0].attribute_dims)
        return cat_points

    def numpy(self) -> np.ndarray:
        """Reload ``numpy`` from self.tensor."""
        return self.tensor.numpy()

    def to(self, device: Union[str, torch.device], *args,
           **kwargs) -> 'BasePoints':
        """Convert current points to a specific device.

        Args:
            device (str or :obj:`torch.device`): The name of the device.

        Returns:
            :obj:`BasePoints`: A new points object on the specific device.
        """
        original_type = type(self)
        return original_type(self.tensor.to(device, *args, **kwargs),
                             points_dim=self.points_dim,
                             attribute_dims=self.attribute_dims)

    def cpu(self) -> 'BasePoints':
        """Convert current points to cpu device.

        Returns:
            :obj:`BasePoints`: A new points object on the cpu device.
        """
        original_type = type(self)
        return original_type(self.tensor.cpu(),
                             points_dim=self.points_dim,
                             attribute_dims=self.attribute_dims)

    def cuda(self, *args, **kwargs) -> 'BasePoints':
        """Convert current points to cuda device.

        Returns:
            :obj:`BasePoints`: A new points object on the cuda device.
        """
        original_type = type(self)
        return original_type(self.tensor.cuda(*args, **kwargs),
                             points_dim=self.points_dim,
                             attribute_dims=self.attribute_dims)

    def clone(self) -> 'BasePoints':
        """Clone the points.

        Returns:
            :obj:`BasePoints`: Point object with the same properties as
            self.
        """
        original_type = type(self)
        return original_type(self.tensor.clone(),
                             points_dim=self.points_dim,
                             attribute_dims=self.attribute_dims)

    def detach(self) -> 'BasePoints':
        """Detach the points.

        Returns:
            :obj:`BasePoints`: Point object with the same properties as
            self.
        """
        original_type = type(self)
        return original_type(self.tensor.detach(),
                             points_dim=self.points_dim,
                             attribute_dims=self.attribute_dims)

    @property
    def device(self) -> torch.device:
        """torch.device: The device of the points are on."""
        return self.tensor.device

    def __iter__(self) -> Iterator[Tensor]:
        """Yield a point as a Tensor at a time.

        Returns:
            Iterator[Tensor]: A point of shape (points_dim, ).
        """
        yield from self.tensor

    def new_point(
        self, data: Union[Tensor, np.ndarray, Sequence[Sequence[float]]]
    ) -> 'BasePoints':
        """Create a new point object with data.

        The new point and its tensor has the similar properties as self and
        self.tensor, respectively.

        Args:
            data (Tensor or np.ndarray or Sequence[Sequence[float]]): Data
                to be copied.

        Returns:
            :obj:`BasePoints`: A new point object with ``data``, the
            object's other properties are similar to ``self``.
        """
        new_tensor = self.tensor.new_tensor(data) \
            if not isinstance(data, Tensor) else data.to(self.device)
        original_type = type(self)
        return original_type(new_tensor,
                             points_dim=self.points_dim,
                             attribute_dims=self.attribute_dims)
""" assert bev_direction in ('horizontal', 'vertical') if bev_direction == 'horizontal': self.tensor[:, 0] = -self.tensor[:, 0] elif bev_direction == 'vertical': self.tensor[:, 2] = -self.tensor[:, 2] @property def bev(self) -> Tensor: """Tensor: BEV of the points in shape (N, 2).""" return self.tensor[:, [0, 2]] def convert_to(self, dst: int, rt_mat: Optional[Union[Tensor, np.ndarray]] = None) -> 'BasePoints': """Convert self to ``dst`` mode. Args: dst (int): The target Point mode. rt_mat (Tensor or np.ndarray, optional): The rotation and translation matrix between different coordinates. Defaults to None. The conversion from ``src`` coordinates to ``dst`` coordinates usually comes along the change of sensors, e.g., from camera to LiDAR. This requires a transformation matrix. Returns: :obj:`BasePoints`: The converted point of the same type in the ``dst`` mode. """ from embodiedscan.structures.bbox_3d import Coord3DMode return Coord3DMode.convert_point(point=self, src=Coord3DMode.CAM, dst=dst, rt_mat=rt_mat) ================================================ FILE: bip3d/structures/points/depth_points.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from typing import Optional, Sequence, Union import numpy as np from torch import Tensor from .base_points import BasePoints class DepthPoints(BasePoints): """Points of instances in DEPTH coordinates. Args: tensor (Tensor or np.ndarray or Sequence[Sequence[float]]): The points data with shape (N, points_dim). points_dim (int): Integer indicating the dimension of a point. Each row is (x, y, z, ...). Defaults to 3. attribute_dims (dict, optional): Dictionary to indicate the meaning of extra dimension. Defaults to None. Attributes: tensor (Tensor): Float matrix with shape (N, points_dim). points_dim (int): Integer indicating the dimension of a point. Each row is (x, y, z, ...). attribute_dims (dict, optional): Dictionary to indicate the meaning of extra dimension. 
Defaults to None. rotation_axis (int): Default rotation axis for points rotation. """ def __init__(self, tensor: Union[Tensor, np.ndarray, Sequence[Sequence[float]]], points_dim: int = 3, attribute_dims: Optional[dict] = None) -> None: super(DepthPoints, self).__init__(tensor, points_dim=points_dim, attribute_dims=attribute_dims) self.rotation_axis = 2 def flip(self, bev_direction: str = 'horizontal') -> None: """Flip the points along given BEV direction. Args: bev_direction (str): Flip direction (horizontal or vertical). Defaults to 'horizontal'. """ assert bev_direction in ('horizontal', 'vertical') if bev_direction == 'horizontal': self.tensor[:, 0] = -self.tensor[:, 0] elif bev_direction == 'vertical': self.tensor[:, 1] = -self.tensor[:, 1] def convert_to(self, dst: int, rt_mat: Optional[Union[Tensor, np.ndarray]] = None) -> 'BasePoints': """Convert self to ``dst`` mode. Args: dst (int): The target Point mode. rt_mat (Tensor or np.ndarray, optional): The rotation and translation matrix between different coordinates. Defaults to None. The conversion from ``src`` coordinates to ``dst`` coordinates usually comes along the change of sensors, e.g., from camera to LiDAR. This requires a transformation matrix. Returns: :obj:`BasePoints`: The converted point of the same type in the ``dst`` mode. """ from embodiedscan.structures.bbox_3d import Coord3DMode return Coord3DMode.convert_point(point=self, src=Coord3DMode.DEPTH, dst=dst, rt_mat=rt_mat) ================================================ FILE: bip3d/structures/points/lidar_points.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from typing import Optional, Sequence, Union import numpy as np from torch import Tensor from .base_points import BasePoints class LiDARPoints(BasePoints): """Points of instances in LIDAR coordinates. Args: tensor (Tensor or np.ndarray or Sequence[Sequence[float]]): The points data with shape (N, points_dim). 
points_dim (int): Integer indicating the dimension of a point. Each row is (x, y, z, ...). Defaults to 3. attribute_dims (dict, optional): Dictionary to indicate the meaning of extra dimension. Defaults to None. Attributes: tensor (Tensor): Float matrix with shape (N, points_dim). points_dim (int): Integer indicating the dimension of a point. Each row is (x, y, z, ...). attribute_dims (dict, optional): Dictionary to indicate the meaning of extra dimension. Defaults to None. rotation_axis (int): Default rotation axis for points rotation. """ def __init__(self, tensor: Union[Tensor, np.ndarray, Sequence[Sequence[float]]], points_dim: int = 3, attribute_dims: Optional[dict] = None) -> None: super(LiDARPoints, self).__init__(tensor, points_dim=points_dim, attribute_dims=attribute_dims) self.rotation_axis = 2 def flip(self, bev_direction: str = 'horizontal') -> None: """Flip the points along given BEV direction. Args: bev_direction (str): Flip direction (horizontal or vertical). Defaults to 'horizontal'. """ assert bev_direction in ('horizontal', 'vertical') if bev_direction == 'horizontal': self.tensor[:, 1] = -self.tensor[:, 1] elif bev_direction == 'vertical': self.tensor[:, 0] = -self.tensor[:, 0] def convert_to(self, dst: int, rt_mat: Optional[Union[Tensor, np.ndarray]] = None) -> 'BasePoints': """Convert self to ``dst`` mode. Args: dst (int): The target Point mode. rt_mat (Tensor or np.ndarray, optional): The rotation and translation matrix between different coordinates. Defaults to None. The conversion from ``src`` coordinates to ``dst`` coordinates usually comes along the change of sensors, e.g., from camera to LiDAR. This requires a transformation matrix. Returns: :obj:`BasePoints`: The converted point of the same type in the ``dst`` mode. 
""" from embodiedscan.structures.bbox_3d import Coord3DMode return Coord3DMode.convert_point(point=self, src=Coord3DMode.LIDAR, dst=dst, rt_mat=rt_mat) ================================================ FILE: bip3d/utils/__init__.py ================================================ from .array_converter import ArrayConverter, array_converter from .typing_config import ConfigType, OptConfigType, OptMultiConfig __all__ = [ 'ConfigType', 'OptConfigType', 'OptMultiConfig', 'ArrayConverter', 'array_converter' ] ================================================ FILE: bip3d/utils/array_converter.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import functools from inspect import getfullargspec from typing import Callable, Optional, Tuple, Type, Union import numpy as np import torch TemplateArrayType = Union[np.ndarray, torch.Tensor, list, tuple, int, float] def array_converter(to_torch: bool = True, apply_to: Tuple[str, ...] = tuple(), template_arg_name_: Optional[str] = None, recover: bool = True) -> Callable: """Wrapper function for data-type agnostic processing. First converts input arrays to PyTorch tensors or NumPy arrays for middle calculation, then convert output to original data-type if `recover=True`. Args: to_torch (bool): Whether to convert to PyTorch tensors for middle calculation. Defaults to True. apply_to (Tuple[str]): The arguments to which we apply data-type conversion. Defaults to an empty tuple. template_arg_name_ (str, optional): Argument serving as the template (return arrays should have the same dtype and device as the template). Defaults to None. If None, we will use the first argument in `apply_to` as the template argument. recover (bool): Whether or not to recover the wrapped function outputs to the `template_arg_name_` type. Defaults to True. Raises: ValueError: When template_arg_name_ is not among all args, or when apply_to contains an arg which is not among all args, a ValueError will be raised. 
When the template argument or an argument to convert is a list or tuple, and cannot be converted to a NumPy array, a ValueError will be raised. TypeError: When the type of the template argument or an argument to convert does not belong to the above range, or the contents of such an list-or-tuple-type argument do not share the same data type, a TypeError will be raised. Returns: Callable: Wrapped function. Examples: >>> import torch >>> import numpy as np >>> >>> # Use torch addition for a + b, >>> # and convert return values to the type of a >>> @array_converter(apply_to=('a', 'b')) >>> def simple_add(a, b): >>> return a + b >>> >>> a = np.array([1.1]) >>> b = np.array([2.2]) >>> simple_add(a, b) >>> >>> # Use numpy addition for a + b, >>> # and convert return values to the type of b >>> @array_converter(to_torch=False, apply_to=('a', 'b'), >>> template_arg_name_='b') >>> def simple_add(a, b): >>> return a + b >>> >>> simple_add(a, b) >>> >>> # Use torch funcs for floor(a) if flag=True else ceil(a), >>> # and return the torch tensor >>> @array_converter(apply_to=('a',), recover=False) >>> def floor_or_ceil(a, flag=True): >>> return torch.floor(a) if flag else torch.ceil(a) >>> >>> floor_or_ceil(a, flag=False) """ def array_converter_wrapper(func): """Outer wrapper for the function.""" @functools.wraps(func) def new_func(*args, **kwargs): """Inner wrapper for the arguments.""" if len(apply_to) == 0: return func(*args, **kwargs) func_name = func.__name__ arg_spec = getfullargspec(func) arg_names = arg_spec.args arg_num = len(arg_names) default_arg_values = arg_spec.defaults if default_arg_values is None: default_arg_values = [] no_default_arg_num = len(arg_names) - len(default_arg_values) kwonly_arg_names = arg_spec.kwonlyargs kwonly_default_arg_values = arg_spec.kwonlydefaults if kwonly_default_arg_values is None: kwonly_default_arg_values = {} all_arg_names = arg_names + kwonly_arg_names # in case there are args in the form of *args if len(args) > arg_num: 
named_args = args[:arg_num] nameless_args = args[arg_num:] else: named_args = args nameless_args = [] # template argument data type is used for all array-like arguments if template_arg_name_ is None: template_arg_name = apply_to[0] else: template_arg_name = template_arg_name_ if template_arg_name not in all_arg_names: raise ValueError(f'{template_arg_name} is not among the ' f'argument list of function {func_name}') # inspect apply_to for arg_to_apply in apply_to: if arg_to_apply not in all_arg_names: raise ValueError( f'{arg_to_apply} is not an argument of {func_name}') new_args = [] new_kwargs = {} converter = ArrayConverter() target_type = torch.Tensor if to_torch else np.ndarray # non-keyword arguments for i, arg_value in enumerate(named_args): if arg_names[i] in apply_to: new_args.append( converter.convert(input_array=arg_value, target_type=target_type)) else: new_args.append(arg_value) if arg_names[i] == template_arg_name: template_arg_value = arg_value kwonly_default_arg_values.update(kwargs) kwargs = kwonly_default_arg_values # keyword arguments and non-keyword arguments using default value for i in range(len(named_args), len(all_arg_names)): arg_name = all_arg_names[i] if arg_name in kwargs: if arg_name in apply_to: new_kwargs[arg_name] = converter.convert( input_array=kwargs[arg_name], target_type=target_type) else: new_kwargs[arg_name] = kwargs[arg_name] else: default_value = default_arg_values[i - no_default_arg_num] if arg_name in apply_to: new_kwargs[arg_name] = converter.convert( input_array=default_value, target_type=target_type) else: new_kwargs[arg_name] = default_value if arg_name == template_arg_name: template_arg_value = kwargs[arg_name] # add nameless args provided by *args (if exists) new_args += nameless_args return_values = func(*new_args, **new_kwargs) converter.set_template(template_arg_value) def recursive_recover(input_data): if isinstance(input_data, (tuple, list)): new_data = [] for item in input_data: 
new_data.append(recursive_recover(item)) return tuple(new_data) if isinstance(input_data, tuple) else new_data elif isinstance(input_data, dict): new_data = {} for k, v in input_data.items(): new_data[k] = recursive_recover(v) return new_data elif isinstance(input_data, (torch.Tensor, np.ndarray)): return converter.recover(input_data) else: return input_data if recover: return recursive_recover(return_values) else: return return_values return new_func return array_converter_wrapper class ArrayConverter: """Utility class for data-type agnostic processing. Args: template_array (np.ndarray or torch.Tensor or list or tuple or int or float, optional): Template array. Defaults to None. """ SUPPORTED_NON_ARRAY_TYPES = (int, float, np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64, np.float16, np.float32, np.float64) def __init__(self, template_array: Optional[TemplateArrayType] = None) -> None: if template_array is not None: self.set_template(template_array) def set_template(self, array: TemplateArrayType) -> None: """Set template array. Args: array (np.ndarray or torch.Tensor or list or tuple or int or float): Template array. Raises: ValueError: If input is list or tuple and cannot be converted to a NumPy array, a ValueError is raised. TypeError: If input type does not belong to the above range, or the contents of a list or tuple do not share the same data type, a TypeError is raised. 
""" self.array_type = type(array) self.is_num = False self.device = 'cpu' if isinstance(array, np.ndarray): self.dtype = array.dtype elif isinstance(array, torch.Tensor): self.dtype = array.dtype self.device = array.device elif isinstance(array, (list, tuple)): try: array = np.array(array) if array.dtype not in self.SUPPORTED_NON_ARRAY_TYPES: raise TypeError self.dtype = array.dtype except (ValueError, TypeError): print('The following list cannot be converted to a numpy ' f'array of supported dtype:\n{array}') raise elif isinstance(array, (int, float)): self.array_type = np.ndarray self.is_num = True self.dtype = np.dtype(type(array)) else: raise TypeError( f'Template type {self.array_type} is not supported.') def convert( self, input_array: TemplateArrayType, target_type: Optional[Type] = None, target_array: Optional[Union[np.ndarray, torch.Tensor]] = None ) -> Union[np.ndarray, torch.Tensor]: """Convert input array to target data type. Args: input_array (np.ndarray or torch.Tensor or list or tuple or int or float): Input array. target_type (Type, optional): Type to which input array is converted. It should be `np.ndarray` or `torch.Tensor`. Defaults to None. target_array (np.ndarray or torch.Tensor, optional): Template array to which input array is converted. Defaults to None. Raises: ValueError: If input is list or tuple and cannot be converted to a NumPy array, a ValueError is raised. TypeError: If input type does not belong to the above range, or the contents of a list or tuple do not share the same data type, a TypeError is raised. Returns: np.ndarray or torch.Tensor: The converted array. 
""" if isinstance(input_array, (list, tuple)): try: input_array = np.array(input_array) if input_array.dtype not in self.SUPPORTED_NON_ARRAY_TYPES: raise TypeError except (ValueError, TypeError): print('The input cannot be converted to a single-type numpy ' f'array:\n{input_array}') raise elif isinstance(input_array, self.SUPPORTED_NON_ARRAY_TYPES): input_array = np.array(input_array) array_type = type(input_array) assert target_type is not None or target_array is not None, \ 'must specify a target' if target_type is not None: assert target_type in (np.ndarray, torch.Tensor), \ 'invalid target type' if target_type == array_type: return input_array elif target_type == np.ndarray: # default dtype is float32 converted_array = input_array.cpu().numpy().astype(np.float32) else: # default dtype is float32, device is 'cpu' converted_array = torch.tensor(input_array, dtype=torch.float32) else: assert isinstance(target_array, (np.ndarray, torch.Tensor)), \ 'invalid target array type' if isinstance(target_array, array_type): return input_array elif isinstance(target_array, np.ndarray): converted_array = input_array.cpu().numpy().astype( target_array.dtype) else: converted_array = target_array.new_tensor(input_array) return converted_array def recover( self, input_array: Union[np.ndarray, torch.Tensor] ) -> Union[np.ndarray, torch.Tensor, int, float]: """Recover input type to original array type. Args: input_array (np.ndarray or torch.Tensor): Input array. Returns: np.ndarray or torch.Tensor or int or float: Converted array. 
""" assert isinstance(input_array, (np.ndarray, torch.Tensor)), \ 'invalid input array type' if isinstance(input_array, self.array_type): return input_array elif isinstance(input_array, torch.Tensor): converted_array = input_array.cpu().numpy().astype(self.dtype) else: converted_array = torch.tensor(input_array, dtype=self.dtype, device=self.device) if self.is_num: converted_array = converted_array.item() return converted_array ================================================ FILE: bip3d/utils/default_color_map.py ================================================ DEFAULT_COLOR_MAP = { 'floor': [255, 193, 193], 'wall': [137, 54, 74], 'chair': [153, 69, 1], 'cabinet': [134, 199, 156], 'door': [92, 136, 89], 'table': [255.0, 187.0, 120.0], 'couch': [3, 95, 161], 'shelf': [255, 160, 98], 'window': [183, 121, 142], 'bed': [123, 104, 238], 'curtain': [210, 170, 100], 'plant': [163, 255, 0], 'stairs': [104, 84, 109], 'pillow': [76, 91, 113], 'counter': [146, 112, 198], 'bench': [250, 0, 30], 'rail': [230, 150, 140], 'sink': [135, 206, 250], 'mirror': [154, 208, 0], 'toilet': [0, 165, 120], 'refrigerator': [59, 105, 106], 'book': [142, 108, 45], 'tv': [183, 130, 88], 'blanket': [147, 211, 203], 'rack': [255, 208, 186], 'towel': [225, 199, 255], 'backpack': [255, 179, 240], 'roof': [92, 82, 55], 'bag': [209, 0, 151], 'board': [133, 129, 255], 'bicycle': [119, 11, 32], 'oven': [178, 90, 62], 'microwave': [79, 210, 114], 'desk': [109, 63, 54], 'doorframe': [199, 100, 0], 'wardrobe': [7, 246, 231], 'picture': [171, 134, 1], 'bathtub': [92, 0, 73], 'box': [188, 208, 182], 'stand': [146, 139, 141], 'clothes': [96, 96, 96], 'lamp': [107, 255, 200], 'dresser': [206, 186, 171], 'stool': [73, 77, 174], 'fireplace': [255, 77, 255], 'commode': [102, 102, 156], 'washing machine': [152, 251, 152], 'monitor': [208, 195, 210], 'window frame': [227, 255, 205], 'radiator': [191, 162, 208], 'mat': [250, 141, 255], 'shower': [154, 255, 154], 'ottoman': [95, 32, 0], 'column': [60, 143, 255], 
'blinds': [134, 134, 103], 'stove': [128, 64, 128], 'bar': [72, 0, 118], 'pillar': [220, 20, 60], 'bin': [187, 255, 255], 'heater': [209, 226, 140], 'clothes dryer': [100, 170, 30], 'blackboard': [0, 82, 0], 'decoration': [107, 142, 35], 'steps': [120, 166, 157], 'windowsill': [9, 80, 61], 'cushion': [0, 228, 0], 'carpet': [175, 116, 175], 'copier': [241, 129, 0], 'countertop': [207, 138, 255], 'basket': [0, 0, 70], 'mailbox': [150, 100, 100], 'kitchen island': [220, 220, 0], 'washbasin': [0, 80, 100], 'drawer': [0, 220, 176], 'piano': [78, 180, 255], 'exercise equipment': [151, 0, 95], 'beam': [255, 255, 128], 'partition': [168, 171, 172], 'printer': [179, 0, 194], 'frame': [255, 180, 195], 'object': [0, 0, 0], 'adhesive tape': [0, 220, 176], 'air conditioner': [109, 63, 54], 'alarm': [0, 114, 143], 'album': [147, 186, 208], 'arch': [135, 158, 223], 'balcony': [70, 70, 70], 'ball': [96, 96, 96], 'banister': [196, 172, 0], 'barricade': [45, 89, 255], 'baseboard': [153, 69, 1], 'basin': [255.0, 187.0, 120.0], 'beanbag': [190, 153, 153], 'bidet': [123, 104, 238], 'body loofah': [196, 172, 0], 'boots': [134, 199, 156], 'bottle': [241, 129, 0], 'bowl': [92, 136, 89], 'bread': [119, 11, 32], 'broom': [0, 226, 252], 'brush': [255, 255, 128], 'bucket': [255, 73, 97], 'calendar': [76, 91, 113], 'camera': [72, 0, 118], 'can': [109, 63, 54], 'candle': [78, 180, 255], 'candlestick': [104, 84, 109], 'cap': [128, 76, 255], 'car': [107, 142, 35], 'cart': [255, 255, 128], 'case': [0, 0, 230], 'chandelier': [169, 164, 131], 'cleanser': [0, 165, 120], 'clock': [190, 153, 153], 'coat hanger': [179, 0, 194], 'coffee maker': [0, 82, 0], 'coil': [255, 179, 240], 'computer': [225, 199, 255], 'conducting wire': [150, 100, 100], 'container': [0, 0, 70], 'control': [255, 77, 255], 'cosmetics': [142, 108, 45], 'crate': [0, 226, 252], 'crib': [169, 164, 131], 'cube': [116, 112, 0], 'cup': [175, 116, 175], 'detergent': [255, 208, 186], 'device': [146, 139, 141], 'dish rack': [0, 0, 142], 
'dishwasher': [92, 82, 55], 'dispenser': [95, 32, 0], 'divider': [219, 142, 185], 'door knob': [166, 74, 118], 'doorway': [134, 134, 103], 'dress': [0, 114, 143], 'drum': [107, 142, 35], 'duct': [0, 80, 100], 'dumbbell': [0, 0, 192], 'dustpan': [78, 180, 255], 'dvd': [0, 143, 149], 'eraser': [0, 82, 0], 'fan': [0, 0, 70], 'faucet': [84, 105, 51], 'fence': [190, 153, 153], 'file': [255, 228, 255], 'fire extinguisher': [107, 255, 200], 'flowerpot': [9, 80, 61], 'flush': [227, 255, 205], 'folder': [208, 229, 228], 'food': [109, 63, 54], 'footstool': [133, 129, 255], 'fruit': [179, 0, 194], 'furniture': [220, 20, 60], 'garage door': [217, 17, 255], 'garbage': [0, 82, 0], 'glass': [255, 99, 164], 'globe': [255, 77, 255], 'glove': [166, 196, 102], 'grab bar': [145, 148, 174], 'grass': [0, 60, 100], 'guitar': [73, 77, 174], 'hair dryer': [169, 164, 131], 'hamper': [241, 129, 0], 'handle': [142, 108, 45], 'hanger': [150, 100, 100], 'hat': [154, 208, 0], 'headboard': [171, 134, 1], 'headphones': [124, 74, 181], 'helmets': [209, 226, 140], 'holder': [151, 0, 95], 'hook': [92, 136, 89], 'humidifier': [209, 99, 106], 'ironware': [127, 167, 115], 'jacket': [255, 73, 97], 'jalousie': [255, 179, 240], 'jar': [106, 154, 176], 'kettle': [196, 172, 0], 'keyboard': [0, 125, 92], 'kitchenware': [74, 65, 105], 'knife': [70, 130, 180], 'label': [0, 228, 0], 'ladder': [0, 114, 143], 'laptop': [255, 180, 195], 'ledge': [58, 41, 149], 'letter': [0, 0, 192], 'light': [78, 180, 255], 'luggage': [0, 226, 252], 'machine': [197, 226, 255], 'magazine': [199, 100, 0], 'map': [183, 121, 142], 'mask': [74, 65, 105], 'mattress': [255, 179, 240], 'menu': [255, 255, 128], 'molding': [104, 84, 109], 'mop': [199, 100, 0], 'mouse': [5, 121, 0], 'napkins': [165, 42, 42], 'notebook': [175, 116, 175], 'pack': [0, 143, 149], 'package': [166, 196, 102], 'pad': [208, 229, 228], 'pan': [209, 99, 106], 'panel': [201, 57, 1], 'paper': [255, 179, 240], 'paper cutter': [207, 138, 255], 'pedestal': [64, 170, 64], 
'pen': [193, 0, 92], 'person': [7, 246, 231], 'pipe': [255, 180, 195], 'pitcher': [220, 20, 60], 'plate': [142, 108, 45], 'player': [0, 143, 149], 'plug': [255, 77, 255], 'plunger': [165, 42, 42], 'pool': [153, 69, 1], 'pool table': [0, 0, 230], 'poster': [130, 114, 135], 'pot': [96, 36, 108], 'price tag': [255, 77, 255], 'projector': [179, 0, 194], 'purse': [0, 228, 0], 'radio': [116, 112, 0], 'range hood': [199, 100, 0], 'remote control': [188, 208, 182], 'ridge': [59, 105, 106], 'rod': [207, 138, 255], 'roll': [123, 104, 238], 'rope': [110, 76, 0], 'sack': [190, 153, 153], 'salt': [250, 0, 30], 'scale': [58, 41, 149], 'scissors': [60, 143, 255], 'screen': [0, 82, 0], 'seasoning': [254, 212, 124], 'shampoo': [70, 130, 180], 'sheet': [151, 0, 95], 'shirt': [190, 153, 153], 'shoe': [199, 100, 0], 'shovel': [241, 129, 0], 'sign': [208, 195, 210], 'soap': [109, 63, 54], 'soap dish': [166, 74, 118], 'soap dispenser': [95, 32, 0], 'socket': [255, 255, 255], 'speaker': [65, 70, 15], 'sponge': [0, 220, 176], 'spoon': [134, 134, 103], 'stall': [0, 60, 100], 'stapler': [246, 0, 122], 'statue': [196, 172, 0], 'stick': [0, 165, 120], 'stopcock': [0, 60, 100], 'structure': [220, 20, 60], 'sunglasses': [142, 108, 45], 'support': [209, 226, 140], 'switch': [7, 246, 231], 'tablet': [137, 54, 74], 'teapot': [0, 80, 100], 'telephone': [220, 220, 0], 'thermostat': [128, 76, 255], 'tissue': [73, 77, 174], 'tissue box': [96, 96, 96], 'toaster': [106, 0, 228], 'toilet paper': [84, 105, 51], 'toiletry': [128, 64, 128], 'tool': [220, 20, 60], 'toothbrush': [130, 114, 135], 'toothpaste': [0, 143, 149], 'toy': [255.0, 187.0, 120.0], 'tray': [255, 179, 240], 'treadmill': [166, 74, 118], 'trophy': [0, 220, 176], 'tube': [255, 255, 128], 'umbrella': [250, 0, 30], 'urn': [152, 251, 152], 'utensil': [220, 220, 0], 'vacuum cleaner': [96, 36, 108], 'vanity': [5, 121, 0], 'vase': [255, 193, 193], 'vent': [209, 226, 140], 'ventilation': [123, 104, 238], 'water cooler': [255, 255, 128], 'water 
heater': [145, 148, 174],
    'wine': [220, 220, 0],
    'wire': [96, 36, 108],
    'wood': [127, 167, 115],
    'wrap': [175, 116, 175],
}


# ================================================
# FILE: bip3d/utils/dist_utils.py
# ================================================
import torch.distributed as dist


def reduce_mean(tensor):
    """Obtain the mean of tensor on different GPUs."""
    # Non-distributed (or uninitialized) runs: nothing to average over.
    if not (dist.is_available() and dist.is_initialized()):
        return tensor
    # Clone so the caller's tensor is not mutated by the in-place div_.
    tensor = tensor.clone()
    # Divide by world size locally, then SUM across ranks == global mean.
    dist.all_reduce(tensor.div_(dist.get_world_size()), op=dist.ReduceOp.SUM)
    return tensor


# ================================================
# FILE: bip3d/utils/line_mesh.py
# ================================================
"""Adapted from
https://github.com/isl-org/Open3D/pull/738#issuecomment-564785941.

Module which creates mesh lines from a line set Open3D relies upon using
glLineWidth to set line width on a LineSet However, this method is now
deprecated and not fully supported in newer OpenGL versions.

See:
    Open3D Github Pull Request:
    https://github.com/intel-isl/Open3D/pull/738
    Other Framework Issues:
    https://github.com/openframeworks/openFrameworks/issues/3460

This module aims to solve this by converting a line into a triangular mesh
(which has thickness). The basic idea is to create a cylinder for each line
segment, translate it, and then rotate it.
License: MIT """ import numpy as np import open3d as o3d def align_vector_to_another(a=np.array([0, 0, 1]), b=np.array([1, 0, 0])): """Aligns vector a to vector b with axis angle rotation.""" if np.array_equal(a, b): return None, None axis_ = np.cross(a, b) axis_ = axis_ / np.linalg.norm(axis_) angle = np.arccos(np.dot(a, b)) return axis_, angle def normalized(a, axis=-1, order=2): """Normalizes a numpy array of points.""" l2 = np.atleast_1d(np.linalg.norm(a, order, axis)) l2[l2 == 0] = 1 return a / np.expand_dims(l2, axis), l2 class LineMesh(object): def __init__(self, points, lines=None, colors=[0, 1, 0], radius=0.15): """Creates a line represented as sequence of cylinder triangular meshes. Arguments: points {ndarray} -- Numpy array of ponts Nx3. Keyword Arguments: lines {list[list] or None} -- List of point index pairs denoting line segments. If None, implicit lines from ordered pairwise points. (default: {None}) colors {list} -- list of colors, or single color of the line (default: {[0, 1, 0]}) radius {float} -- radius of cylinder (default: {0.15}) """ self.points = np.array(points) self.lines = np.array( lines) if lines is not None else self.lines_from_ordered_points( self.points) self.colors = np.array(colors) self.radius = radius self.cylinder_segments = [] self.create_line_mesh() @staticmethod def lines_from_ordered_points(points): lines = [[i, i + 1] for i in range(0, points.shape[0] - 1, 1)] return np.array(lines) def create_line_mesh(self): first_points = self.points[self.lines[:, 0], :] second_points = self.points[self.lines[:, 1], :] line_segments = second_points - first_points line_segments_unit, line_lengths = normalized(line_segments) z_axis = np.array([0, 0, 1]) # Create triangular mesh cylinder segments of line for i in range(line_segments_unit.shape[0]): line_segment = line_segments_unit[i, :] line_length = line_lengths[i] # get axis angle rotation to allign cylinder with line segment axis, angle = align_vector_to_another(z_axis, line_segment) # 
Get translation vector translation = first_points[i, :] + \ line_segment * line_length * 0.5 # create cylinder and apply transformations cylinder_segment = o3d.geometry.TriangleMesh.create_cylinder( self.radius, line_length) cylinder_segment = cylinder_segment.translate(translation, relative=False) if axis is not None: axis_a = axis * angle cylinder_segment = cylinder_segment.rotate( R=o3d.geometry.get_rotation_matrix_from_axis_angle( axis_a)) # center=True) # cylinder_segment = cylinder_segment.rotate( # axis_a, center=True, # type=o3d.geometry.RotationType.AxisAngle) # color cylinder color = self.colors if self.colors.ndim == 1 \ else self.colors[i, :] cylinder_segment.paint_uniform_color(color) self.cylinder_segments.append(cylinder_segment) def add_line(self, vis): """Adds this line to the visualizer.""" for cylinder in self.cylinder_segments: vis.add_geometry(cylinder) def remove_line(self, vis): """Removes this line from the visualizer.""" for cylinder in self.cylinder_segments: vis.remove_geometry(cylinder) if __name__ == '__main__': def main(): print('Demonstrating LineMesh vs LineSet') # Create Line Set points = [[0, 0, 0], [1, 0, 0], [0, 1, 0], [1, 1, 0], [0, 0, 1], [1, 0, 1], [0, 1, 1], [1, 1, 1]] lines = [[0, 1], [0, 2], [1, 3], [2, 3], [4, 5], [4, 6], [5, 7], [6, 7], [0, 4], [1, 5], [2, 6], [3, 7]] colors = [[1, 0, 0] for i in range(len(lines))] line_set = o3d.geometry.LineSet() line_set.points = o3d.utility.Vector3dVector(points) line_set.lines = o3d.utility.Vector2iVector(lines) line_set.colors = o3d.utility.Vector3dVector(colors) # Create Line Mesh 1 points = np.array(points) + [0, 0, 2] line_mesh1 = LineMesh(points, lines, colors, radius=0.02) line_mesh1_geoms = line_mesh1.cylinder_segments # Create Line Mesh 1 points = np.array(points) + [0, 2, 0] line_mesh2 = LineMesh(points, radius=0.03) line_mesh2_geoms = line_mesh2.cylinder_segments o3d.visualization.draw_geometries( [line_set, *line_mesh1_geoms, *line_mesh2_geoms]) main() 
# ================================================
# FILE: bip3d/utils/typing_config.py
# ================================================
from collections.abc import Sized
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import torch
from mmdet.models.task_modules.samplers import SamplingResult
from mmengine.config import ConfigDict
from mmengine.structures import BaseDataElement, InstanceData


class Det3DDataElement(BaseDataElement):
    """Data element exposing typed access to 3D ground-truth and predicted
    instance fields (stored via ``BaseDataElement.set_field``)."""

    @property
    def gt_instances_3d(self) -> InstanceData:
        return self._gt_instances_3d

    @gt_instances_3d.setter
    def gt_instances_3d(self, value: InstanceData) -> None:
        # ``dtype`` makes set_field type-check the assigned value.
        self.set_field(value, '_gt_instances_3d', dtype=InstanceData)

    @gt_instances_3d.deleter
    def gt_instances_3d(self) -> None:
        del self._gt_instances_3d

    @property
    def pred_instances_3d(self) -> InstanceData:
        return self._pred_instances_3d

    @pred_instances_3d.setter
    def pred_instances_3d(self, value: InstanceData) -> None:
        self.set_field(value, '_pred_instances_3d', dtype=InstanceData)

    @pred_instances_3d.deleter
    def pred_instances_3d(self) -> None:
        del self._pred_instances_3d


# Every index form accepted by ``PointData.__getitem__``.
IndexType = Union[str, slice, int, list, torch.LongTensor,
                  torch.cuda.LongTensor, torch.BoolTensor,
                  torch.cuda.BoolTensor, np.ndarray]


class PointData(BaseDataElement):
    """Data structure for point-level annotations or predictions.

    All data items in ``data_fields`` of ``PointData`` meet the following
    requirements:

    - They are all one dimension.
    - They should have the same length.

    `PointData` is used to save point-level semantic and instance mask, it
    also can save `instances_labels` and `instances_scores` temporarily. In
    the future, we would consider to move the instance-level info into
    `gt_instances_3d` and `pred_instances_3d`.

    Examples:
        >>> metainfo = dict(
        ...     sample_idx=random.randint(0, 100))
        >>> points = np.random.randint(0, 255, (100, 3))
        >>> point_data = PointData(metainfo=metainfo,
        ...                        points=points)
        >>> print(len(point_data))
        100

        >>> # slice
        >>> slice_data = point_data[10:60]
        >>> assert len(slice_data) == 50

        >>> # set
        >>> point_data.pts_semantic_mask = torch.randint(0, 255, (100,))
        >>> point_data.pts_instance_mask = torch.randint(0, 255, (100,))
        >>> assert tuple(point_data.pts_semantic_mask.shape) == (100,)
        >>> assert tuple(point_data.pts_instance_mask.shape) == (100,)
    """

    def __setattr__(self, name: str, value: Sized) -> None:
        """setattr is only used to set data.

        The value must have the attribute of `__len__` and have the same
        length of `PointData`.
        """
        if name in ('_metainfo_fields', '_data_fields'):
            # The two bookkeeping dicts may only be created once.
            if not hasattr(self, name):
                super().__setattr__(name, value)
            else:
                raise AttributeError(f'{name} has been used as a '
                                     'private attribute, which is immutable.')
        else:
            assert isinstance(value,
                              Sized), 'value must contain `__len__` attribute'
            # TODO: make sure the input value share the same length
            super().__setattr__(name, value)

    __setitem__ = __setattr__

    def __getitem__(self, item: IndexType) -> 'PointData':
        """
        Args:
            item (str, int, list, :obj:`slice`, :obj:`numpy.ndarray`,
                :obj:`torch.LongTensor`, :obj:`torch.BoolTensor`): Get the
                corresponding values according to item.

        Returns:
            :obj:`PointData`: Corresponding values.
        """
        # Normalize list/ndarray indices to a torch tensor first.
        if isinstance(item, list):
            item = np.array(item)
        if isinstance(item, np.ndarray):
            # The default int type of numpy is platform dependent, int32 for
            # windows and int64 for linux. `torch.Tensor` requires the index
            # should be int64, therefore we simply convert it to int64 here.
            # Mode details in https://github.com/numpy/numpy/issues/9464
            item = item.astype(np.int64) if item.dtype == np.int32 else item
            item = torch.from_numpy(item)
        assert isinstance(
            item, (str, slice, int, torch.LongTensor, torch.cuda.LongTensor,
                   torch.BoolTensor, torch.cuda.BoolTensor))

        if isinstance(item, str):
            # String key: direct field access.
            return getattr(self, item)

        if isinstance(item, int):
            if item >= len(self) or item < -len(self):  # type: ignore
                raise IndexError(f'Index {item} out of range!')
            else:
                # keep the dimension
                item = slice(item, None, len(self))

        new_data = self.__class__(metainfo=self.metainfo)
        if isinstance(item, torch.Tensor):
            assert item.dim() == 1, 'Only support to get the' \
                                    ' values along the first dimension.'
            if isinstance(item, (torch.BoolTensor, torch.cuda.BoolTensor)):
                assert len(item) == len(self), 'The shape of the ' \
                                               'input(BoolTensor) ' \
                                               f'{len(item)} ' \
                                               'does not match the shape ' \
                                               'of the indexed tensor ' \
                                               'in results_field ' \
                                               f'{len(self)} at ' \
                                               'first dimension.'

            for k, v in self.items():
                if isinstance(v, torch.Tensor):
                    new_data[k] = v[item]
                elif isinstance(v, np.ndarray):
                    new_data[k] = v[item.cpu().numpy()]
                elif isinstance(
                        v, (str, list,
                            tuple)) or (hasattr(v, '__getitem__')
                                        and hasattr(v, 'cat')):
                    # convert to indexes from BoolTensor
                    if isinstance(item,
                                  (torch.BoolTensor, torch.cuda.BoolTensor)):
                        indexes = torch.nonzero(item).view(
                            -1).cpu().numpy().tolist()
                    else:
                        indexes = item.cpu().numpy().tolist()
                    # Each selected element is fetched with a length-1 slice
                    # so its container dimension is preserved, then the
                    # pieces are re-joined with `+` (str/list/tuple) or
                    # `.cat` (box-like objects).
                    slice_list = []
                    if indexes:
                        for index in indexes:
                            slice_list.append(slice(index, None, len(v)))
                    else:
                        slice_list.append(slice(None, 0, None))
                    r_list = [v[s] for s in slice_list]
                    if isinstance(v, (str, list, tuple)):
                        new_value = r_list[0]
                        for r in r_list[1:]:
                            new_value = new_value + r
                    else:
                        new_value = v.cat(r_list)
                    new_data[k] = new_value
                else:
                    raise ValueError(
                        f'The type of `{k}` is `{type(v)}`, which has no '
                        'attribute of `cat`, so it does not '
                        'support slice with `bool`')
        else:
            # item is a slice
            for k, v in self.items():
                new_data[k] = v[item]
        return
new_data # type: ignore def __len__(self) -> int: """int: The length of `PointData`.""" if len(self._data_fields) > 0: return len(self.values()[0]) else: return 0 # Type hint of config data ConfigType = Union[ConfigDict, dict] OptConfigType = Optional[ConfigType] # Type hint of one or more config data MultiConfig = Union[ConfigType, List[ConfigType]] OptMultiConfig = Optional[MultiConfig] InstanceList = List[InstanceData] OptInstanceList = Optional[InstanceList] ForwardResults = Union[Dict[str, torch.Tensor], List[Det3DDataElement], Tuple[torch.Tensor], torch.Tensor] SamplingResultList = List[SamplingResult] OptSamplingResultList = Optional[SamplingResultList] SampleList = List[Det3DDataElement] OptSampleList = Optional[SampleList] ================================================ FILE: configs/__init__.py ================================================ ================================================ FILE: configs/bip3d_det.py ================================================ _base_ = ["./default_runtime.py"] import os from bip3d.datasets.embodiedscan_det_grounding_dataset import ( class_names, head_labels, common_labels, tail_labels ) DEBUG = os.environ.get("CLUSTER") is None del os backend_args = None metainfo = dict(classes="all") depth = True depth_loss = True z_range=[-0.2, 3] min_depth = 0.25 max_depth = 10 num_depth = 64 model = dict( type="BIP3D", input_3d="depth_img", use_depth_grid_mask=True, data_preprocessor=dict( type="CustomDet3DDataPreprocessor", mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], bgr_to_rgb=True, pad_size_divisor=32, ), backbone=dict( type="mmdet.SwinTransformer", embed_dims=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], window_size=7, mlp_ratio=4, qkv_bias=True, qk_scale=None, drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.2, patch_norm=True, out_indices=(1, 2, 3), with_cp=True, convert_weights=False, ), neck=dict( type="mmdet.ChannelMapper", in_channels=[192, 384, 768], kernel_size=1, out_channels=256, 
act_cfg=None, bias=True, norm_cfg=dict(type="GN", num_groups=32), num_outs=4, ), text_encoder=dict( type="BertModel", special_tokens_list=["[CLS]", "[SEP]"], name="./ckpt/bert-base-uncased", pad_to_max=False, use_sub_sentence_represent=True, add_pooling_layer=False, max_tokens=768, use_checkpoint=True, return_tokenized=True, ), backbone_3d=( dict( type="mmdet.ResNet", depth=34, in_channels=1, base_channels=4, num_stages=4, out_indices=(1, 2, 3), norm_cfg=dict(type="BN", requires_grad=True), norm_eval=True, with_cp=True, style="pytorch", ) if depth else None ), neck_3d=( dict( type="mmdet.ChannelMapper", in_channels=[8, 16, 32], kernel_size=1, out_channels=32, act_cfg=None, bias=True, norm_cfg=dict(type="GN", num_groups=4), num_outs=4, ) if depth else None ), feature_enhancer=dict( type="TextImageDeformable2DEnhancer", num_layers=6, text_img_attn_block=dict( v_dim=256, l_dim=256, embed_dim=1024, num_heads=4, init_values=1e-4 ), img_attn_block=dict( self_attn_cfg=dict( embed_dims=256, num_levels=4, dropout=0.0, im2col_step=64 ), ffn_cfg=dict( embed_dims=256, feedforward_channels=2048, ffn_drop=0.0 ), ), text_attn_block=dict( self_attn_cfg=dict(num_heads=4, embed_dims=256, dropout=0.0), ffn_cfg=dict( embed_dims=256, feedforward_channels=1024, ffn_drop=0.0 ), ), embed_dims=256, num_feature_levels=4, positional_encoding=dict( num_feats=128, normalize=True, offset=0.0, temperature=20 ), ), spatial_enhancer=dict( type="DepthFusionSpatialEnhancer", embed_dims=256, feature_3d_dim=32, num_depth_layers=2, min_depth=min_depth, max_depth=max_depth, num_depth=num_depth, with_feature_3d=depth, loss_depth_weight=1.0 if depth_loss else -1, ), decoder=dict( type="BBox3DDecoder", look_forward_twice=True, instance_bank=dict( type="InstanceBank", num_anchor=50, anchor="anchor_files/embodiedscan_kmeans.npy", embed_dims=256, anchor_in_camera=True, ), anchor_encoder=dict( type="DoF9BoxEncoder", embed_dims=256, rot_dims=3, ), graph_model=dict( type="MultiheadAttention", embed_dims=256, 
num_heads=8, dropout=0.0, batch_first=True, ), ffn=dict( type="FFN", embed_dims=256, feedforward_channels=2048, ffn_drop=0.0, ), norm_layer=dict(type="LN", normalized_shape=256), deformable_model=dict( type="DeformableFeatureAggregation", embed_dims=256, num_groups=8, num_levels=4, use_camera_embed=True, use_deformable_func=True, with_depth=True, min_depth=min_depth, max_depth=max_depth, kps_generator=dict( type="SparseBox3DKeyPointsGenerator", fix_scale=[ [0, 0, 0], [0.45, 0, 0], [-0.45, 0, 0], [0, 0.45, 0], [0, -0.45, 0], [0, 0, 0.45], [0, 0, -0.45], ], num_learnable_pts=9, ), with_value_proj=True, filter_outlier=True, ), text_cross_attn=dict( type="MultiheadAttention", embed_dims=256, num_heads=8, dropout=0.0, batch_first=True, ), refine_layer=dict( type="GroundingRefineClsHead", embed_dims=256, output_dim=9, cls_bias=True, # cls_layers=True, ), loss_cls=dict( type="mmdet.FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0, ), loss_reg=dict( type="DoF9BoxLoss", loss_weight_wd=1.0, loss_weight_pcd=0.0, loss_weight_cd=0.8, ), sampler=dict( type="Grounding3DTarget", cls_weight=1.0, box_weight=1.0, num_dn=100, cost_weight_wd=1.0, cost_weight_pcd=0.0, cost_weight_cd=0.8, with_dn_query=True, num_classes=284, embed_dims=256, ), gt_reg_key="gt_bboxes_3d", gt_cls_key="tokens_positive", post_processor=dict( type="GroundingBox3DPostProcess", num_output=1000, ), with_instance_id=False, ), ) dataset_type = "EmbodiedScanDetGroundingDataset" data_root = "data" metainfo = dict( classes=class_names, classes_split=(head_labels, common_labels, tail_labels), box_type_3d="euler-depth", ) image_wh = (512, 512) rotate_3rscan = True sep_token = "[SEP]" cam_standardization = True if cam_standardization: resize = dict( type="CamIntrisicStandardization", dst_intrinsic=[ [432.57943431339237, 0.0, 256], [0.0, 539.8570854208559, 256], [0.0, 0.0, 1.0], ], dst_wh=image_wh, ) else: resize = dict(type="CustomResize", scale=image_wh, keep_ratio=False) train_pipeline = [ 
dict(type="LoadAnnotations3D"), dict( type="MultiViewPipeline", n_images=1, max_n_images=18, transforms=[ dict(type="LoadImageFromFile", backend_args=backend_args), dict(type="LoadDepthFromFile", backend_args=backend_args), resize, dict( type="ResizeCropFlipImage", data_aug_conf={ "resize_lim": (1.0, 1.0), "final_dim": image_wh, "bot_pct_lim": (0.0, 0.05), "rot_lim": (0, 0), "H": image_wh[1], "W": image_wh[0], "rand_flip": False, }, ), ], rotate_3rscan=rotate_3rscan, ordered=False, ), dict( type="CategoryGroundingDataPrepare", classes=class_names, filter_others=True, sep_token=sep_token, max_class=128, training=True, z_range=z_range, ), dict( type="Pack3DDetInputs", keys=["img", "depth_img", "gt_bboxes_3d", "gt_labels_3d"], ), ] if depth_loss: train_pipeline.append( dict( type="DepthProbLabelGenerator", origin_stride=4, min_depth=min_depth, max_depth=max_depth, num_depth=num_depth, ), ) test_pipeline = [ dict(type="LoadAnnotations3D"), dict( type="MultiViewPipeline", n_images=1, max_n_images=50, ordered=True, transforms=[ dict(type="LoadImageFromFile", backend_args=backend_args), dict(type="LoadDepthFromFile", backend_args=backend_args), resize, ], rotate_3rscan=rotate_3rscan, ), dict( type="CategoryGroundingDataPrepare", classes=class_names, sep_token=sep_token, training=False, filter_others=False, ), dict( type="Pack3DDetInputs", keys=["img", "depth_img", "gt_bboxes_3d", "gt_labels_3d"], ), ] trainval = False data_version = "v1" if data_version == "v1": train_ann_file = "embodiedscan/embodiedscan_infos_train.pkl" val_ann_file = "embodiedscan/embodiedscan_infos_val.pkl" elif data_version == "v2": train_ann_file = "embodiedscan-v2/embodiedscan_infos_train.pkl" val_ann_file = "embodiedscan-v2/embodiedscan_infos_val.pkl" else: assert False train_dataset = dict( type=dataset_type, data_root=data_root, ann_file=train_ann_file, pipeline=train_pipeline, test_mode=False, filter_empty_gt=True, box_type_3d="Euler-Depth", metainfo=metainfo, ) if trainval: train_dataset = 
dict( type="ConcatDataset", datasets=[ train_dataset, dict( type=dataset_type, data_root=data_root, ann_file=val_ann_file, pipeline=train_pipeline, test_mode=False, filter_empty_gt=True, box_type_3d="Euler-Depth", metainfo=metainfo, ) ] ) train_dataloader = dict( batch_size=1, num_workers=4 if not DEBUG else 0, persistent_workers=False, sampler=dict(type="DefaultSampler", shuffle=True), dataset=dict( type="RepeatDataset", times=10, dataset=train_dataset, ), ) val_dataloader = dict( batch_size=1, num_workers=4, persistent_workers=False, drop_last=False, sampler=dict(type="DefaultSampler", shuffle=False), dataset=dict( type=dataset_type, data_root=data_root, ann_file=val_ann_file, pipeline=test_pipeline, test_mode=True, filter_empty_gt=True, box_type_3d="Euler-Depth", metainfo=metainfo, ), ) test_dataloader = val_dataloader val_evaluator = dict( type="IndoorDetMetric", collect_dir="/job_data/.dist_test" if not DEBUG else None, # collect_device="gpu" ) test_evaluator = val_evaluator max_epochs = 24 train_cfg = dict( type="EpochBasedTrainLoop", max_epochs=max_epochs, val_interval=1 ) val_cfg = dict(type="ValLoop") test_cfg = dict(type="TestLoop") lr = 2e-4 optim_wrapper = dict( type="OptimWrapper", optimizer=dict(type="AdamW", lr=lr, weight_decay=0.0005), paramwise_cfg=dict( custom_keys={ "backbone.": dict(lr_mult=0.1), "text_encoder": dict(lr_mult=0.05), "absolute_pos_embed": dict(decay_mult=0.0), } ), clip_grad=dict(max_norm=10, norm_type=2), ) # learning rate param_scheduler = [ dict( type="LinearLR", start_factor=0.001, by_epoch=False, begin=0, end=500 ), dict( type="MultiStepLR", begin=0, end=max_epochs, by_epoch=True, milestones=[int(max_epochs / 12 * 8), int(max_epochs / 12 * 11)], gamma=0.1, ), ] custom_hooks = [dict(type="EmptyCacheHook", after_iter=False)] default_hooks = dict( checkpoint=dict(type="CheckpointHook", interval=1, max_keep_ckpts=3), ) vis_backends = [ dict( type="TensorboardVisBackend", save_dir="/job_tboard" if not DEBUG else "./work-dir", ), ] 
visualizer = dict( type="Visualizer", vis_backends=vis_backends, name="visualizer", ) load_from = "ckpt/groundingdino_swint_ogc_mmdet-822d7e9d-rename.pth" ================================================ FILE: configs/bip3d_det_grounding.py ================================================ _base_ = ["./bip3d_grounding.py"] from mmengine import Config import os det_config = Config.fromfile("configs/bip3d_det.py") det_train_dataset = det_config.train_dataset del Config, det_config train_dataloader = _base_.train_dataloader DEBUG = os.environ.get("CLUSTER") is None train_dataloader["dataset"] = dict( type="ConcatDataset", datasets=[ dict( type="RepeatDataset", times=20, dataset=det_train_dataset, ), train_dataloader["dataset"], ] ) max_iters = 50000 train_cfg = dict( type="IterBasedTrainLoop", max_iters=max_iters, val_interval=25000, _delete_=True, ) val_cfg = dict(type="ValLoop") test_cfg = dict(type="TestLoop") lr = 2e-4 optim_wrapper = dict( type="OptimWrapper", optimizer=dict(type="AdamW", lr=lr, weight_decay=0.0005), paramwise_cfg=dict( custom_keys={ "backbone.": dict(lr_mult=0.1), "text_encoder": dict(lr_mult=0.05), "absolute_pos_embed": dict(decay_mult=0.0), } ), clip_grad=dict(max_norm=10, norm_type=2), ) # learning rate param_scheduler = [ dict( type="LinearLR", start_factor=0.001, by_epoch=False, begin=0, end=500 ), dict( type="MultiStepLR", begin=0, end=max_iters, by_epoch=False, milestones=[int(max_iters*0.8), int(max_iters*0.95)], gamma=0.1, ), ] custom_hooks = [dict(type="EmptyCacheHook", after_iter=False)] default_hooks = dict( logger=dict( type="LoggerHook", interval=25, log_metric_by_epoch=False, ), checkpoint=dict( type="CheckpointHook", interval=25000, max_keep_ckpts=3, by_epoch=False ), ) vis_backends = [ dict( type="TensorboardVisBackend", save_dir="/job_tboard" if not DEBUG else "./work-dir", ), ] visualizer = dict( type="Visualizer", vis_backends=vis_backends, name="visualizer", ) log_processor = dict(type="LogProcessor", window_size=50, 
by_epoch=False) load_from = "ckpt/bip3d_det.pth" ================================================ FILE: configs/bip3d_det_rgb.py ================================================ _base_ = ["./bip3d_det.py"] model = dict( backbone_3d=None, neck_3d=None, spatial_enhancer=dict(with_feature_3d=False), decoder=dict(deformable_model=dict(with_depth=False)), ) ================================================ FILE: configs/bip3d_grounding.py ================================================ _base_ = ["./default_runtime.py"] import os from bip3d.datasets.embodiedscan_det_grounding_dataset import ( class_names, head_labels, common_labels, tail_labels ) DEBUG = os.environ.get("CLUSTER") is None del os backend_args = None metainfo = dict(classes="all") depth = True depth_loss = True z_range=[-0.2, 3] min_depth = 0.25 max_depth = 10 num_depth = 64 model = dict( type="BIP3D", input_3d="depth_img", use_depth_grid_mask=True, data_preprocessor=dict( type="CustomDet3DDataPreprocessor", mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], bgr_to_rgb=True, pad_size_divisor=32, ), backbone=dict( type="mmdet.SwinTransformer", embed_dims=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], window_size=7, mlp_ratio=4, qkv_bias=True, qk_scale=None, drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.2, patch_norm=True, out_indices=(1, 2, 3), with_cp=True, convert_weights=False, ), neck=dict( type="mmdet.ChannelMapper", in_channels=[192, 384, 768], kernel_size=1, out_channels=256, act_cfg=None, bias=True, norm_cfg=dict(type="GN", num_groups=32), num_outs=4, ), text_encoder=dict( type="BertModel", special_tokens_list=["[CLS]", "[SEP]"], name="./ckpt/bert-base-uncased", pad_to_max=False, use_sub_sentence_represent=True, add_pooling_layer=False, max_tokens=768, use_checkpoint=True, return_tokenized=True, ), backbone_3d=( dict( type="mmdet.ResNet", depth=34, in_channels=1, base_channels=4, num_stages=4, out_indices=(1, 2, 3), norm_cfg=dict(type="BN", requires_grad=True), norm_eval=True, 
with_cp=True, style="pytorch", ) if depth else None ), neck_3d=( dict( type="mmdet.ChannelMapper", in_channels=[8, 16, 32], kernel_size=1, out_channels=32, act_cfg=None, bias=True, norm_cfg=dict(type="GN", num_groups=4), num_outs=4, ) if depth else None ), feature_enhancer=dict( type="TextImageDeformable2DEnhancer", num_layers=6, text_img_attn_block=dict( v_dim=256, l_dim=256, embed_dim=1024, num_heads=4, init_values=1e-4 ), img_attn_block=dict( self_attn_cfg=dict( embed_dims=256, num_levels=4, dropout=0.0, im2col_step=64 ), ffn_cfg=dict( embed_dims=256, feedforward_channels=2048, ffn_drop=0.0 ), ), text_attn_block=dict( self_attn_cfg=dict(num_heads=4, embed_dims=256, dropout=0.0), ffn_cfg=dict( embed_dims=256, feedforward_channels=1024, ffn_drop=0.0 ), ), embed_dims=256, num_feature_levels=4, positional_encoding=dict( num_feats=128, normalize=True, offset=0.0, temperature=20 ), ), spatial_enhancer=dict( type="DepthFusionSpatialEnhancer", embed_dims=256, feature_3d_dim=32, num_depth_layers=2, min_depth=min_depth, max_depth=max_depth, num_depth=num_depth, with_feature_3d=depth, loss_depth_weight=1.0 if depth_loss else -1, ), decoder=dict( type="BBox3DDecoder", look_forward_twice=True, instance_bank=dict( type="InstanceBank", num_anchor=50, anchor="anchor_files/embodiedscan_kmeans.npy", embed_dims=256, anchor_in_camera=True, ), anchor_encoder=dict( type="DoF9BoxEncoder", embed_dims=256, rot_dims=3, ), graph_model=dict( type="MultiheadAttention", embed_dims=256, num_heads=8, dropout=0.0, batch_first=True, ), ffn=dict( type="FFN", embed_dims=256, feedforward_channels=2048, ffn_drop=0.0, ), norm_layer=dict(type="LN", normalized_shape=256), deformable_model=dict( type="DeformableFeatureAggregation", embed_dims=256, num_groups=8, num_levels=4, use_camera_embed=True, use_deformable_func=True, with_depth=True, min_depth=min_depth, max_depth=max_depth, kps_generator=dict( type="SparseBox3DKeyPointsGenerator", fix_scale=[ [0, 0, 0], [0.45, 0, 0], [-0.45, 0, 0], [0, 0.45, 0], 
[0, -0.45, 0], [0, 0, 0.45], [0, 0, -0.45], ], num_learnable_pts=9, ), with_value_proj=True, filter_outlier=True, ), text_cross_attn=dict( type="MultiheadAttention", embed_dims=256, num_heads=8, dropout=0.0, batch_first=True, ), refine_layer=dict( type="GroundingRefineClsHead", embed_dims=256, output_dim=9, cls_bias=True, # cls_layers=True, ), loss_cls=dict( type="mmdet.FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0, ), loss_reg=dict( type="DoF9BoxLoss", loss_weight_wd=1.0, loss_weight_pcd=0.0, loss_weight_cd=0.8, ), sampler=dict( type="Grounding3DTarget", cls_weight=1.0, box_weight=1.0, num_dn=100, cost_weight_wd=1.0, cost_weight_pcd=0.0, cost_weight_cd=0.8, with_dn_query=True, num_classes=284, embed_dims=256, ), gt_reg_key="gt_bboxes_3d", gt_cls_key="tokens_positive", post_processor=dict( type="GroundingBox3DPostProcess", num_output=1000, ), with_instance_id=False, ), ) dataset_type = "EmbodiedScanDetGroundingDataset" data_root = "data" metainfo = dict( classes=class_names, classes_split=(head_labels, common_labels, tail_labels), box_type_3d="euler-depth", ) image_wh = (512, 512) rotate_3rscan = True sep_token = "[SEP]" cam_standardization = True if cam_standardization: resize = dict( type="CamIntrisicStandardization", dst_intrinsic=[ [432.57943431339237, 0.0, 256], [0.0, 539.8570854208559, 256], [0.0, 0.0, 1.0], ], dst_wh=image_wh, ) else: resize = dict(type="CustomResize", scale=image_wh, keep_ratio=False) train_pipeline = [ dict(type="LoadAnnotations3D"), dict( type="MultiViewPipeline", n_images=1, max_n_images=18, transforms=[ dict(type="LoadImageFromFile", backend_args=backend_args), dict(type="LoadDepthFromFile", backend_args=backend_args), resize, dict( type="ResizeCropFlipImage", data_aug_conf={ "resize_lim": (1.0, 1.0), "final_dim": image_wh, "bot_pct_lim": (0.0, 0.05), "rot_lim": (0, 0), "H": image_wh[1], "W": image_wh[0], "rand_flip": False, }, ), ], rotate_3rscan=rotate_3rscan, ordered=True, ), dict( type="Pack3DDetInputs", 
keys=["img", "depth_img", "gt_bboxes_3d", "gt_labels_3d"], ), ] if depth_loss: train_pipeline.append( dict( type="DepthProbLabelGenerator", origin_stride=4, min_depth=min_depth, max_depth=max_depth, num_depth=num_depth, ), ) test_pipeline = [ dict(type="LoadAnnotations3D"), dict( type="MultiViewPipeline", n_images=1, max_n_images=50, ordered=True, transforms=[ dict(type="LoadImageFromFile", backend_args=backend_args), dict(type="LoadDepthFromFile", backend_args=backend_args), resize, ], rotate_3rscan=rotate_3rscan, ), dict( type="Pack3DDetInputs", keys=["img", "depth_img", "gt_bboxes_3d", "gt_labels_3d"], ), ] trainval = False data_version = "v1" if data_version == "v1": train_ann_file = "embodiedscan/embodiedscan_infos_train.pkl" val_ann_file = "embodiedscan/embodiedscan_infos_val.pkl" train_vg_file = "embodiedscan/embodiedscan_train_vg_all.json" val_vg_file = "embodiedscan/embodiedscan_val_vg_all.json" elif data_version == "v1-mini": train_ann_file = "embodiedscan/embodiedscan_infos_train.pkl" val_ann_file = "embodiedscan/embodiedscan_infos_val.pkl" train_vg_file = "embodiedscan/embodiedscan_train_mini_vg.json" val_vg_file = "embodiedscan/embodiedscan_val_mini_vg.json" elif data_version == "v2": train_ann_file = "embodiedscan-v2/embodiedscan_infos_train.pkl" val_ann_file = "embodiedscan-v2/embodiedscan_infos_val.pkl" train_vg_file = "embodiedscan-v2/embodiedscan_train_vg.json" val_vg_file = "embodiedscan-v2/embodiedscan_val_vg.json" test_ann_file = "embodiedscan/embodiedscan_infos_test.pkl" test_vg_file = "embodiedscan/embodiedscan_test_vg.json" train_dataset = dict( type=dataset_type, data_root=data_root, ann_file=train_ann_file, pipeline=train_pipeline, test_mode=False, filter_empty_gt=True, box_type_3d="Euler-Depth", metainfo=metainfo, mode="grounding", vg_file=train_vg_file, num_text=10, sep_token=sep_token, ) if trainval: train_dataset = dict( type="ConcatDataset", datasets=[ train_dataset, dict( type=dataset_type, data_root=data_root, ann_file=val_ann_file, 
pipeline=train_pipeline, test_mode=False, filter_empty_gt=True, box_type_3d="Euler-Depth", metainfo=metainfo, mode="grounding", vg_file=val_vg_file, num_text=10, sep_token=sep_token, ) ], ) train_dataloader = dict( batch_size=1, num_workers=4 if not DEBUG else 0, persistent_workers=False, sampler=dict(type="DefaultSampler", shuffle=True), dataset=train_dataset, ) val_dataloader = dict( batch_size=1, num_workers=4 if not DEBUG else 0, persistent_workers=False, drop_last=False, sampler=dict(type="DefaultSampler", shuffle=False), dataset=dict( type=dataset_type, data_root=data_root, ann_file=val_ann_file, pipeline=test_pipeline, test_mode=True, filter_empty_gt=True, box_type_3d="Euler-Depth", metainfo=metainfo, mode="grounding", vg_file=val_vg_file, ), ) test_dataloader = dict( batch_size=1, num_workers=4 if not DEBUG else 0, persistent_workers=False, drop_last=False, sampler=dict(type="DefaultSampler", shuffle=False), dataset=dict( type=dataset_type, data_root=data_root, ann_file=test_ann_file, pipeline=[ dict(type="LoadAnnotations3D"), dict( type="MultiViewPipeline", n_images=1, max_n_images=50, ordered=True, transforms=[ dict(type="LoadImageFromFile", backend_args=backend_args), dict(type="LoadDepthFromFile", backend_args=backend_args), resize, ], rotate_3rscan=rotate_3rscan, ), dict( type="Pack3DDetInputs", keys=["img", "depth_img", "gt_bboxes_3d", "gt_labels_3d"], ), ], test_mode=True, filter_empty_gt=True, box_type_3d="Euler-Depth", metainfo=metainfo, mode="grounding", vg_file=test_vg_file, ), ) val_evaluator = dict( type="GroundingMetric", collect_dir="/job_data/.dist_test" if not DEBUG else None, ) test_evaluator = dict( type="GroundingMetric", collect_dir="/job_data/.dist_test" if not DEBUG else None, format_only=True, submit_info={ 'method': 'BIP3D', 'team': 'robot-lab manipulation team', 'authors': ['xuewu lin'], 'e-mail': '878585984@qq.com', 'institution': 'Horizon', 'country': 'China', }, result_dir="/job_data" if not DEBUG else "./", ) max_epochs = 2 
train_cfg = dict( type="EpochBasedTrainLoop", max_epochs=max_epochs, val_interval=1 ) val_cfg = dict(type="ValLoop") test_cfg = dict(type="TestLoop") lr = 2e-4 optim_wrapper = dict( type="OptimWrapper", optimizer=dict(type="AdamW", lr=lr, weight_decay=0.0005), paramwise_cfg=dict( custom_keys={ "backbone.": dict(lr_mult=0.1), # "text_encoder": dict(lr_mult=0.05), "absolute_pos_embed": dict(decay_mult=0.0), } ), clip_grad=dict(max_norm=10, norm_type=2), ) # learning rate param_scheduler = [ dict( type="LinearLR", start_factor=0.001, by_epoch=False, begin=0, end=500 ), dict( type="MultiStepLR", begin=0, end=62000, by_epoch=False, milestones=[40000, 55000], gamma=0.1, ), ] custom_hooks = [dict(type="EmptyCacheHook", after_iter=False)] default_hooks = dict( checkpoint=dict(type="CheckpointHook", interval=1, max_keep_ckpts=3), ) vis_backends = [ dict( type="TensorboardVisBackend", save_dir="/job_tboard" if not DEBUG else "./work-dir", ), ] visualizer = dict( type="Visualizer", vis_backends=vis_backends, name="visualizer", ) load_from = "ckpt/bip3d_det.pth" ================================================ FILE: configs/bip3d_grounding_rgb.py ================================================ _base_ = ["./bip3d_grounding.py"] model = dict( backbone_3d=None, neck_3d=None, spatial_enhancer=dict(with_feature_3d=False), decoder=dict(deformable_model=dict(with_depth=False)), ) load_from = "ckpt/bip3d_det_rgb.pth" ================================================ FILE: configs/default_runtime.py ================================================ default_scope = "bip3d" default_hooks = dict( timer=dict(type="IterTimerHook"), logger=dict( type="LoggerHook", interval=25, ), param_scheduler=dict(type="ParamSchedulerHook"), checkpoint=dict(type="CheckpointHook", interval=1, max_keep_ckpts=4), sampler_seed=dict(type="DistSamplerSeedHook"), ) # visualization=dict(type='Det3DVisualizationHook')) env_cfg = dict( cudnn_benchmark=False, mp_cfg=dict(mp_start_method="fork", opencv_num_threads=0), 
dist_cfg=dict(backend="nccl"), ) log_processor = dict(type="LogProcessor", window_size=50, by_epoch=True) log_level = "INFO" load_from = None resume = False # TODO: support auto scaling lr randomness = dict(seed=0) find_unused_parameters = False ================================================ FILE: docs/quick_start.md ================================================ # Quick Start ### Set up python environment ```bash virtualenv mm_bip3d --python=python3.8 source mm_bip3d/bin/activate pip3 install --upgrade pip bip3d_path="path/to/bip3d" cd ${bip3d_path} # MMCV recommends installing via a wheel package, url: https://download.openmmlab.com/mmcv/dist/cu{$cuda_version}/torch{$torch_version}/index.html pip3 install -r requirement.txt ``` ### Compile the deformable_aggregation CUDA op ```bash cd bip3d/ops python3 setup.py develop cd ../../ ``` ### Prepare the data Download the [EmbodiedScan dataset](https://github.com/OpenRobotLab/EmbodiedScan) and create symbolic links. ```bash cd ${bip3d_path} mkdir data ln -s path/to/embodiedscan ./data/embodiedscan ``` Download datasets [ScanNet](https://github.com/ScanNet/ScanNet), [3RScan](https://github.com/WaldJohannaU/3RScan), [Matterport3D](https://github.com/niessner/Matterport), and optionally download [ARKitScenes](https://github.com/apple/ARKitScenes). Adjust the data directory structure as follows: ```bash ${bip3d_path} └──data ├──embodiedscan │   ├──embodiedscan_infos_train.pkl │   ├──embodiedscan_infos_val.pkl │   ... ├──3rscan │   ├──00d42bed-778d-2ac6-86a7-0e0e5f5f5660 │   ... ├──scannet │   └──posed_images ├──matterport3d │   ├──17DRP5sb8fy │   ... 
└──arkitscenes ├──Training    └──Validation ``` ### Prepare pre-trained weights Download the required Grounding-DINO pre-trained weights: [Swin-Tiny](https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/groundingdino_swint_ogc_mmdet-822d7e9d.pth) and [Swin-Base](https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/groundingdino_swinb_cogcoor_mmdet-55949c9c.pth). ```bash mkdir ckpt # Swin-Tiny wget https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/groundingdino_swint_ogc_mmdet-822d7e9d.pth -O ckpt/groundingdino_swint_ogc_mmdet-822d7e9d.pth python tools/ckpt_rename.py ckpt/groundingdino_swint_ogc_mmdet-822d7e9d.pth # Swin-Base wget https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/groundingdino_swinb_cogcoor_mmdet-55949c9c.pth -O ckpt/groundingdino_swinb_cogcoor_mmdet-55949c9c.pth python tools/ckpt_rename.py ckpt/groundingdino_swinb_cogcoor_mmdet-55949c9c.pth ``` Download bert config and pretrain weights from [huggingface](https://huggingface.co/google-bert/bert-base-uncased/tree/main). ```bash ${bip3d_path} └──ckpt ├──groundingdino_swint_ogc_mmdet-822d7e9d.pth ├──groundingdino_swinb_cogcoor_mmdet-55949c9c.pth └──bert-base-uncased ├──config.json ├──tokenizer_config.json ├──tokenizer.json ├──pytorch_model.bin ... ``` ### Generate anchors by K-means ```bash mkdir anchor_files python3 tools/anchor_bbox3d_kmeans.py \ --ann_file data/embodiedscan/embodiedscan_infos_train.pkl \ --output_file anchor_files/embodiedscan_kmeans.npy ``` You can also download the anchor file we provide. ### Modify config According to personal needs, modify some config items, such as [`DEBUG`](../configs/bip3d_det.py#L8), [`collect_dir`](../configs/bip3d_det.py#L425), [`save_dir`](../configs/bip3d_det.py#L475), and [`load_from`](../configs/bip3d_det.py#L485). If performing multi-machine training, ensure that [`collect_dir`](../configs/bip3d_det.py#L425) is a shared folder accessible by all machines. 
def sample(ids, n):
    """Return exactly ``n`` entries drawn evenly from ``ids``.

    If ``n`` equals the number of ids, a copy is returned; if ``n`` is
    larger, the full array is repeated (recursively) until ``n`` entries are
    collected; otherwise evenly spaced entries are picked.
    """
    total = len(ids)
    if n == total:
        return np.copy(ids)
    if n > total:
        # Not enough ids: keep them all, then recursively pad with the rest.
        return np.concatenate([ids, sample(ids, n - total)])
    # Downsample: pick evenly spaced entries at a fractional stride.
    step = total / n
    return np.array([ids[int(step * i)] for i in range(n)])
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description='anchor bbox3d kmeans for embodiedscan dataset')
    parser.add_argument("ann_file")
    parser.add_argument("--output_file")
    # BUG FIX: the keyword was misspelled `defaule`, which made argparse
    # raise `TypeError: ... unexpected keyword argument 'defaule'` on every
    # launch. `type=float` additionally ensures user-supplied thresholds
    # arrive as numbers instead of strings.
    parser.add_argument("--z_min", type=float, default=-0.2)
    parser.add_argument("--z_max", type=float, default=3)
    args = parser.parse_args()
    kmeans(args.ann_file, args.output_file, args.z_min, args.z_max)
import argparse import os from mmengine import MMLogger from mmengine.config import Config, DictAction from mmengine.dist import init_dist from mmengine.registry import init_default_scope from mmengine.utils import mkdir_or_exist from mmdet.utils.benchmark import (DataLoaderBenchmark, DatasetBenchmark, InferenceBenchmark) from embodiedscan import * def parse_args(): parser = argparse.ArgumentParser(description='MMDet benchmark') parser.add_argument('config', help='test config file path') parser.add_argument('--checkpoint', help='checkpoint file') parser.add_argument( '--task', choices=['inference', 'dataloader', 'dataset'], default='dataloader', help='Which task do you want to go to benchmark') parser.add_argument( '--repeat-num', type=int, default=1, help='number of repeat times of measurement for averaging the results') parser.add_argument( '--max-iter', type=int, default=2000, help='num of max iter') parser.add_argument( '--log-interval', type=int, default=50, help='interval of logging') parser.add_argument( '--num-warmup', type=int, default=5, help='Number of warmup') parser.add_argument( '--fuse-conv-bn', action='store_true', help='Whether to fuse conv and bn, this will slightly increase' 'the inference speed') parser.add_argument( '--dataset-type', choices=['train', 'val', 'test'], default='test', help='Benchmark dataset type. only supports train, val and test') parser.add_argument( '--work-dir', help='the directory to save the file containing ' 'benchmark metrics') parser.add_argument( '--cfg-options', nargs='+', action=DictAction, help='override some settings in the used config, the key-value pair ' 'in xxx=yyy format will be merged into config file. If the value to ' 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' 'Note that the quotation marks are necessary and that no white space ' 'is allowed.') parser.add_argument( '--launcher', choices=['none', 'pytorch', 'slurm', 'mpi'], default='none', help='job launcher') parser.add_argument('--local_rank', type=int, default=0) args = parser.parse_args() if 'LOCAL_RANK' not in os.environ: os.environ['LOCAL_RANK'] = str(args.local_rank) return args def inference_benchmark(args, cfg, distributed, logger): benchmark = InferenceBenchmark( cfg, args.checkpoint, distributed, args.fuse_conv_bn, args.max_iter, args.log_interval, args.num_warmup, logger=logger) return benchmark def dataloader_benchmark(args, cfg, distributed, logger): benchmark = DataLoaderBenchmark( cfg, distributed, args.dataset_type, args.max_iter, args.log_interval, args.num_warmup, logger=logger) return benchmark def dataset_benchmark(args, cfg, distributed, logger): benchmark = DatasetBenchmark( cfg, args.dataset_type, args.max_iter, args.log_interval, args.num_warmup, logger=logger) return benchmark def main(): args = parse_args() cfg = Config.fromfile(args.config) if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) init_default_scope(cfg.get('default_scope', 'mmdet')) distributed = False if args.launcher != 'none': init_dist(args.launcher, **cfg.get('env_cfg', {}).get('dist_cfg', {})) distributed = True log_file = None if args.work_dir: log_file = os.path.join(args.work_dir, 'benchmark.log') mkdir_or_exist(args.work_dir) logger = MMLogger.get_instance( 'mmdet', log_file=log_file, log_level='INFO') benchmark = eval(f'{args.task}_benchmark')(args, cfg, distributed, logger) benchmark.run(args.repeat_num) if __name__ == '__main__': main() ================================================ FILE: tools/ckpt_rename.py ================================================ import os import torch def get_renamed_ckpt(file, output="./"): ckpt_rename = dict() ckpt = torch.load(file) if "state_dict" in ckpt: ckpt = ckpt["state_dict"] for key,value 
in ckpt.items(): k = None if key.startswith("backbone") or key.startswith("neck"): k = key elif key.startswith("language_model."): k = key.replace("language_model.", "text_encoder.") elif key.startswith("encoder."): if key.startswith("encoder.layers."): k = key.replace("encoder.layers.", "feature_enhancer.img_attn_blocks.") elif key.startswith("encoder.fusion_layers."): k = key.replace("encoder.fusion_layers.", "feature_enhancer.text_img_attn_blocks.") elif key.startswith("encoder.text_layers."): k = key.replace("encoder.text_layers.", "feature_enhancer.text_attn_blocks.") elif key.startswith("decoder.layers."): ops = [ "gnn", "norm", "text_cross_attn", "norm", "deformable", "norm", "ffn", "norm", "refine", ] layer_id = int(key.split(".")[2]) name = key.split(".")[3] for op in ops: if name == "self_attn": block_id = 0 elif name == "cross_attn_text": block_id = 2 elif name == "cross_attn": block_id = 4 elif name == "ffn": block_id = 6 elif name == "norms": norm_id = int(key.split(".")[4]) block_id = (norm_id + 1) * 2 - 1 op_id = block_id + layer_id * len(ops) k = f"decoder.layers.{op_id}." if name == "norms": k += ".".join(key.split(".")[5:]) else: k += ".".join(key.split(".")[4:]) elif key.startswith("bbox_head."): layer_id = int(key.split(".")[2]) op_id = 8 + layer_id * 9 if "reg_branches" in key: k = f"decoder.layers.{op_id}." + ".".join(key.split(".")[3:]) elif "cls_branches" in key: k = f"decoder.layers.{op_id}.bias" elif "pts_prob_fc" in key: k = "spatial_enhancer." + key elif key in [ "pts_prob_pre_fc.weight", "pts_prob_pre_fc.bias", "pts_fc.weight", "pts_fc.bias", "fusion_fc.0.layers.0.0.weight", "fusion_fc.0.layers.0.0.bias", "fusion_fc.0.layers.1.weight", "fusion_fc.0.layers.1.bias", "fusion_fc.1.weight", "fusion_fc.1.bias", "fusion_norm.weight", "fusion_norm.bias", ]: k = "spatial_enhancer." 
+ key elif key == "level_embed": k = "feature_enhancer.level_embed" elif key == "query_embedding.weight": k = "decoder.instance_bank.instance_feature" if k is None: # print(key) k = key if k == "decoder.norm.weight": print(key) ckpt_rename[k] = value path, file_name = os.path.split(file) file_name = file_name[:-4]+"-rename.pth" output_file = os.path.join(output, file_name) torch.save(ckpt_rename, output_file) if __name__ == "__main__": import argparse parser = argparse.ArgumentParser( description='mmdet Grounding-DINO checkpoint rename to BIP3D') parser.add_argument("file") parser.add_argument("--output", default=None) args = parser.parse_args() if args.output is None: output = "./" get_renamed_ckpt(args.file, output) ================================================ FILE: tools/dist_test.sh ================================================ #!/usr/bin/env bash CONFIG=$1 CHECKPOINT=$2 GPUS=$3 PORT=${PORT:-11000} PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ python3 -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ $(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4} ================================================ FILE: tools/dist_train.sh ================================================ #!/usr/bin/env bash CONFIG=$1 GPUS=$2 PORT=${PORT:-12050} PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ python3 -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} ================================================ FILE: tools/test.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
import time import os def wait_before_import_config(): t = int(os.environ.get('LOCAL_RANK', 0)) time.sleep(t * 0.5) def wait_after_import_config(): t = int(os.environ.get('WORLD_SIZE', 0)) - int(os.environ.get('LOCAL_RANK', 0)) time.sleep(t * 0.5) wait_before_import_config() from mmengine.config import Config wait_after_import_config() import argparse import os import os.path as osp from mmengine.config import Config, ConfigDict, DictAction from mmengine.registry import RUNNERS from mmengine.runner import Runner import torch torch.multiprocessing.set_sharing_strategy('file_system') # from embodiedscan.utils import replace_ceph_backend # TODO: support fuse_conv_bn and format_only def parse_args(): parser = argparse.ArgumentParser( description='MMDet3D test (and eval) a model') parser.add_argument('config', help='test config file path') parser.add_argument('checkpoint', help='checkpoint file') parser.add_argument( '--work-dir', help='the directory to save the file containing evaluation metrics') parser.add_argument('--task-name', help='task names') parser.add_argument('--ceph', action='store_true', help='Use ceph as data storage backend') parser.add_argument('--show', action='store_true', help='show prediction results') parser.add_argument('--show-dir', help='directory where painted images will be saved. ' 'If specified, it will be automatically saved ' 'to the work_dir/timestamp/show_dir') parser.add_argument('--score-thr', type=float, default=0.1, help='bbox score threshold') parser.add_argument( '--task', type=str, choices=[ 'mono_det', 'multi-view_det', 'lidar_det', 'lidar_seg', 'multi-modality_det' ], help='Determine the visualization method depending on the task.') parser.add_argument('--wait-time', type=float, default=2, help='the interval of show (s)') parser.add_argument( '--cfg-options', nargs='+', action=DictAction, help='override some settings in the used config, the key-value pair ' 'in xxx=yyy format will be merged into config file. 
If the value to ' 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' 'Note that the quotation marks are necessary and that no white space ' 'is allowed.') parser.add_argument('--launcher', choices=['none', 'pytorch', 'slurm', 'mpi'], default='none', help='job launcher') parser.add_argument('--tta', action='store_true', help='Test time augmentation') # When using PyTorch version >= 2.0.0, the `torch.distributed.launch` # will pass the `--local-rank` parameter to `tools/test.py` instead # of `--local_rank`. parser.add_argument('--local_rank', '--local-rank', type=int, default=0) args = parser.parse_args() if 'LOCAL_RANK' not in os.environ: os.environ['LOCAL_RANK'] = str(args.local_rank) return args def trigger_visualization_hook(cfg, args): default_hooks = cfg.default_hooks if 'visualization' in default_hooks: visualization_hook = default_hooks['visualization'] # Turn on visualization visualization_hook['draw'] = True if args.show: visualization_hook['show'] = True visualization_hook['wait_time'] = args.wait_time if args.show_dir: visualization_hook['test_out_dir'] = args.show_dir all_task_choices = [ 'mono_det', 'multi-view_det', 'lidar_det', 'lidar_seg', 'multi-modality_det' ] assert args.task in all_task_choices, 'You must set '\ f"'--task' in {all_task_choices} in the command " \ 'if you want to use visualization hook' visualization_hook['vis_task'] = args.task visualization_hook['score_thr'] = args.score_thr else: raise RuntimeError( 'VisualizationHook must be included in default_hooks.' 
'refer to usage ' '"visualization=dict(type=\'VisualizationHook\')"') return cfg def main(): args = parse_args() # load config cfg = Config.fromfile(args.config) # TODO: We will unify the ceph support approach with other OpenMMLab repos # if args.ceph: # cfg = replace_ceph_backend(cfg) cfg.launcher = args.launcher if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) # work_dir is determined in this priority: CLI > segment in file > filename if args.work_dir is not None: # update configs according to CLI args if args.work_dir is not None cfg.work_dir = args.work_dir elif args.task_name is not None: cfg.work_dir = osp.join('./work_dirs', args.task_name) elif cfg.get('work_dir', None) is None: # use config filename as default work_dir if cfg.work_dir is None cfg.work_dir = osp.join('./work_dirs', osp.splitext(osp.basename(args.config))[0]) cfg.load_from = args.checkpoint if args.show or args.show_dir: cfg = trigger_visualization_hook(cfg, args) if args.tta: # Currently, we only support tta for 3D segmentation # TODO: Support tta for 3D detection assert 'tta_model' in cfg, 'Cannot find ``tta_model`` in config.' assert 'tta_pipeline' in cfg, 'Cannot find ``tta_pipeline`` in config.' cfg.test_dataloader.dataset.pipeline = cfg.tta_pipeline cfg.model = ConfigDict(**cfg.tta_model, module=cfg.model) # build the runner from config if 'runner_type' not in cfg: # build the default runner runner = Runner.from_cfg(cfg) else: # build customized runner from the registry # if 'runner_type' is set in the cfg runner = RUNNERS.build(cfg) # start testing runner.test() if __name__ == '__main__': main() ================================================ FILE: tools/train.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
import time
import os


def wait_before_import_config():
    """Stagger the config import: local rank r sleeps r * 0.5 s."""
    t = int(os.environ.get('LOCAL_RANK', 0))
    time.sleep(t * 0.5)


def wait_after_import_config():
    """Sleep so all ranks leave the import phase at roughly the same time."""
    t = int(os.environ.get('WORLD_SIZE', 0)) - int(
        os.environ.get('LOCAL_RANK', 0))
    # BUGFIX: if LOCAL_RANK is set but WORLD_SIZE is not, t is negative and
    # time.sleep raises ValueError; clamp to zero.
    time.sleep(max(t, 0) * 0.5)


wait_before_import_config()
from mmengine.config import Config
wait_after_import_config()

import argparse
import logging
import os.path as osp

from mmengine.config import Config, DictAction
from mmengine.logging import print_log
from mmengine.registry import RUNNERS
from mmengine.runner import Runner

from bip3d import *

import torch

# Note: ``import os`` was duplicated here in the original; the top-level
# import above already provides it.
# torch.autograd.set_detect_anomaly(True)
torch.multiprocessing.set_sharing_strategy('file_system')


def parse_args():
    """Parse CLI options for training a 3D detector."""
    parser = argparse.ArgumentParser(description='Train a 3D detector')
    parser.add_argument('config', help='train config file path')
    parser.add_argument('--work-dir', help='the dir to save logs and models')
    parser.add_argument('--task-name', help='task names')
    parser.add_argument('--amp',
                        action='store_true',
                        default=False,
                        help='enable automatic-mixed-precision training')
    parser.add_argument('--auto-scale-lr',
                        action='store_true',
                        help='enable automatically scaling LR.')
    parser.add_argument(
        '--resume',
        nargs='?',
        type=str,
        const='auto',
        help='If specify checkpoint path, resume from it, while if not '
        'specify, try to auto resume from the latest checkpoint '
        'in the work directory.')
    parser.add_argument('--ceph', action='store_true',
                        help='Use ceph as data storage backend')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.')
    parser.add_argument('--launcher',
                        choices=['none', 'pytorch', 'slurm', 'mpi'],
                        default='none',
                        help='job launcher')
    # When using PyTorch version >= 2.0.0, the `torch.distributed.launch`
    # will pass the `--local-rank` parameter to `tools/train.py` instead
    # of `--local_rank`.
    parser.add_argument('--local_rank', '--local-rank', type=int, default=0)
    args = parser.parse_args()
    if 'LOCAL_RANK' not in os.environ:
        os.environ['LOCAL_RANK'] = str(args.local_rank)
    return args


def main():
    args = parse_args()

    # load config
    cfg = Config.fromfile(args.config)

    # TODO: We will unify the ceph support approach with other OpenMMLab repos
    # if args.ceph:
    #     cfg = replace_ceph_backend(cfg)

    cfg.launcher = args.launcher
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)

    # work_dir is determined in this priority: CLI > segment in file > filename
    if args.work_dir is not None:
        # update configs according to CLI args if args.work_dir is not None
        cfg.work_dir = args.work_dir
    elif args.task_name is not None:
        cfg.work_dir = osp.join('./work_dirs', args.task_name)
    elif cfg.get('work_dir', None) is None:
        # use config filename as default work_dir if cfg.work_dir is None
        cfg.work_dir = osp.join('./work_dirs',
                                osp.splitext(osp.basename(args.config))[0])

    # enable automatic-mixed-precision training
    if args.amp is True:
        optim_wrapper = cfg.optim_wrapper.type
        if optim_wrapper == 'AmpOptimWrapper':
            print_log('AMP training is already enabled in your config.',
                      logger='current',
                      level=logging.WARNING)
        else:
            assert optim_wrapper == 'OptimWrapper', (
                '`--amp` is only supported when the optimizer wrapper type is '
                f'`OptimWrapper` but got {optim_wrapper}.')
            cfg.optim_wrapper.type = 'AmpOptimWrapper'
            cfg.optim_wrapper.loss_scale = 'dynamic'

    # enable automatically scaling LR
    if args.auto_scale_lr:
        if 'auto_scale_lr' in cfg and \
                'enable' in cfg.auto_scale_lr and \
                'base_batch_size' in cfg.auto_scale_lr:
            cfg.auto_scale_lr.enable = True
        else:
            raise RuntimeError('Can not find "auto_scale_lr" or '
                               '"auto_scale_lr.enable" or '
                               '"auto_scale_lr.base_batch_size" in your'
                               ' configuration file.')

    # resume is determined in this priority: resume from > auto_resume
    if args.resume == 'auto':
        cfg.resume = True
        cfg.load_from = None
    elif args.resume is not None:
        cfg.resume = True
        cfg.load_from = args.resume

    # build the runner from config
    if 'runner_type' not in cfg:
        # build the default runner
        runner = Runner.from_cfg(cfg)
    else:
        # build customized runner from the registry
        # if 'runner_type' is set in the cfg
        runner = RUNNERS.build(cfg)

    # start training
    runner.train()


if __name__ == '__main__':
    main()