Repository: Megvii-BaseDetection/BEVDepth
Branch: main
Commit: d78c7b58b10b
Files: 59
Total size: 278.6 KB
Directory structure:
gitextract_ucu3202x/
├── .github/
│ └── workflows/
│ └── lint.yml
├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE.md
├── README.md
├── bevdepth/
│ ├── callbacks/
│ │ └── ema.py
│ ├── datasets/
│ │ └── nusc_det_dataset.py
│ ├── evaluators/
│ │ └── det_evaluators.py
│ ├── exps/
│ │ ├── base_cli.py
│ │ └── nuscenes/
│ │ ├── MatrixVT/
│ │ │ └── matrixvt_bev_depth_lss_r50_256x704_128x128_24e_ema.py
│ │ ├── base_exp.py
│ │ ├── fusion/
│ │ │ ├── bev_depth_fusion_lss_r50_256x704_128x128_24e.py
│ │ │ ├── bev_depth_fusion_lss_r50_256x704_128x128_24e_2key.py
│ │ │ ├── bev_depth_fusion_lss_r50_256x704_128x128_24e_2key_trainval.py
│ │ │ └── bev_depth_fusion_lss_r50_256x704_128x128_24e_key4.py
│ │ └── mv/
│ │ ├── bev_depth_lss_r50_256x704_128x128_20e_cbgs_2key_da.py
│ │ ├── bev_depth_lss_r50_256x704_128x128_20e_cbgs_2key_da_ema.py
│ │ ├── bev_depth_lss_r50_256x704_128x128_24e_2key.py
│ │ ├── bev_depth_lss_r50_256x704_128x128_24e_2key_ema.py
│ │ ├── bev_depth_lss_r50_256x704_128x128_24e_ema.py
│ │ ├── bev_depth_lss_r50_512x1408_128x128_24e_2key.py
│ │ ├── bev_depth_lss_r50_640x1600_128x128_24e_2key.py
│ │ ├── bev_stereo_lss_r50_256x704_128x128_20e_cbgs_2key_da.py
│ │ ├── bev_stereo_lss_r50_256x704_128x128_20e_cbgs_2key_da_ema.py
│ │ ├── bev_stereo_lss_r50_256x704_128x128_24e_2key.py
│ │ ├── bev_stereo_lss_r50_256x704_128x128_24e_2key_ema.py
│ │ ├── bev_stereo_lss_r50_256x704_128x128_24e_key4.py
│ │ └── bev_stereo_lss_r50_256x704_128x128_24e_key4_ema.py
│ ├── layers/
│ │ ├── __init__.py
│ │ ├── backbones/
│ │ │ ├── __init__.py
│ │ │ ├── base_lss_fpn.py
│ │ │ ├── bevstereo_lss_fpn.py
│ │ │ ├── fusion_lss_fpn.py
│ │ │ └── matrixvt.py
│ │ └── heads/
│ │ ├── __init__.py
│ │ └── bev_depth_head.py
│ ├── models/
│ │ ├── base_bev_depth.py
│ │ ├── bev_stereo.py
│ │ ├── fusion_bev_depth.py
│ │ └── matrixvt_det.py
│ ├── ops/
│ │ ├── voxel_pooling_inference/
│ │ │ ├── __init__.py
│ │ │ ├── src/
│ │ │ │ ├── voxel_pooling_inference_forward.cpp
│ │ │ │ └── voxel_pooling_inference_forward_cuda.cu
│ │ │ └── voxel_pooling_inference.py
│ │ └── voxel_pooling_train/
│ │ ├── __init__.py
│ │ ├── src/
│ │ │ ├── voxel_pooling_train_forward.cpp
│ │ │ └── voxel_pooling_train_forward_cuda.cu
│ │ └── voxel_pooling_train.py
│ └── utils/
│ └── torch_dist.py
├── requirements-dev.txt
├── requirements.txt
├── scripts/
│ ├── gen_info.py
│ └── visualize_nusc.py
├── setup.py
└── test/
├── test_dataset/
│ └── test_nusc_mv_det_dataset.py
├── test_layers/
│ ├── test_backbone.py
│ ├── test_head.py
│ └── test_matrixvt.py
└── test_ops/
└── test_voxel_pooling.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .github/workflows/lint.yml
================================================
name: lint
on: [push, pull_request]
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
lint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Set up Python 3.7
uses: actions/setup-python@v2
with:
python-version: 3.7
- name: Install pre-commit hook
run: |
pip install pre-commit
pre-commit install
- name: Linting
run: pre-commit run --all-files
- name: Format c/cuda codes with clang-format
uses: DoozyX/clang-format-lint-action@v0.11
with:
source: bevdepth/ops
extensions: h,c,cpp,hpp,cu,cuh
style: google
- name: Check docstring coverage
run: |
pip install interrogate
interrogate -v --ignore-init-method --ignore-module --ignore-nested-functions --ignore-regex "__repr__" -e 'bevdepth/exps' -e 'test/' -e 'scripts' -e 'setup.py' -e 'bevdepth/ops' -e 'bevdepth/utils/' --fail-under 50
================================================
FILE: .gitignore
================================================
### Linux ###
*~
# temporary files which can be created if a process still has a handle open of a deleted file
.fuse_hidden*
# KDE directory preferences
.directory
# Linux trash folder which might appear on any partition or disk
.Trash-*
# .nfs files are created when an open file is removed but is still being accessed
.nfs*
### PyCharm ###
# User-specific stuff
.idea
# CMake
cmake-build-*/
# Mongo Explorer plugin
.idea/**/mongoSettings.xml
# File-based project format
*.iws
# IntelliJ
out/
# mpeltonen/sbt-idea plugin
.idea_modules/
# JIRA plugin
atlassian-ide-plugin.xml
# Cursive Clojure plugin
.idea/replstate.xml
# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties
# Editor-based Rest Client
.idea/httpRequests
# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser
# JetBrains templates
**___jb_tmp___
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
docs/build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don’t work, or not
# install all needed dependencies.
#Pipfile.lock
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
### Vim ###
# Swap
[._]*.s[a-v][a-z]
[._]*.sw[a-p]
[._]s[a-rt-v][a-z]
[._]ss[a-gi-z]
[._]sw[a-p]
# Session
Session.vim
# Temporary
.netrwhist
# Auto-generated tag files
tags
# Persistent undo
[._]*.un~
### Researcher ###
# output
train_log
docs/api
.code-workspace.code-workspace
output
outputs
instant_test_output
inference_test_output
*.pkl
*.npy
*.pth
events.out.tfevents*
# vscode
*.code-workspace
.vscode
# vim
.vim
================================================
FILE: .pre-commit-config.yaml
================================================
repos:
- repo: https://github.com/PyCQA/flake8
rev: 5.0.4
hooks:
- id: flake8
- repo: https://github.com/PyCQA/isort
rev: 5.10.1
hooks:
- id: isort
- repo: https://github.com/pre-commit/mirrors-yapf
rev: v0.32.0
hooks:
- id: yapf
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.3.0
hooks:
- id: trailing-whitespace
- id: check-yaml
- id: end-of-file-fixer
- id: requirements-txt-fixer
- id: double-quote-string-fixer
- id: check-merge-conflict
- id: fix-encoding-pragma
args: ["--remove"]
- id: mixed-line-ending
args: ["--fix=lf"]
- repo: https://github.com/codespell-project/codespell
rev: v2.2.1
hooks:
- id: codespell
================================================
FILE: LICENSE.md
================================================
MIT License
Copyright (c) 2022 Megvii-BaseDetection
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: README.md
================================================
## BEVDepth
BEVDepth is a new 3D object detector with a trustworthy depth
estimation. For more details, please refer to our [paper on Arxiv](https://arxiv.org/abs/2206.10092).
## BEVStereo
BEVStereo is a new multi-view 3D object detector using temporal stereo to enhance depth estimation.
## MatrixVT
[MatrixVT](bevdepth/exps/nuscenes/MatrixVT/matrixvt_bev_depth_lss_r50_256x704_128x128_24e_ema.py) is a novel View Transformer for the BEV paradigm with high efficiency and no customized operators. For more details, please refer to our [paper on Arxiv](https://arxiv.org/abs/2211.10593). Try MatrixVT on **CPU** by running [this file](bevdepth/layers/backbones/matrixvt.py)!
## Updates!!
* 【2022/12/06】 We released our new View Transformer (MatrixVT), the paper is on [Arxiv](https://arxiv.org/abs/2211.10593).
* 【2022/11/30】 We updated our paper(BEVDepth) on [Arxiv](https://arxiv.org/abs/2206.10092).
* 【2022/11/18】 Both BEVDepth and BEVStereo were accepted by AAAI'2023.
* 【2022/09/22】 We released our paper(BEVStereo) on [Arxiv](https://arxiv.org/abs/2209.10248).
* 【2022/08/24】 We submitted our result(BEVStereo) on [nuScenes Detection Task](https://nuscenes.org/object-detection?externalData=all&mapData=all&modalities=Camera) and achieved the SOTA.
* 【2022/06/23】 We submitted our result(BEVDepth) without extra data on [nuScenes Detection Task](https://nuscenes.org/object-detection?externalData=all&mapData=all&modalities=Camera) and achieved the SOTA.
* 【2022/06/21】 We released our paper(BEVDepth) on [Arxiv](https://arxiv.org/abs/2206.10092).
* 【2022/04/11】 We submitted our result(BEVDepth) on [nuScenes Detection Task](https://nuscenes.org/object-detection?externalData=all&mapData=all&modalities=Camera) and achieved the SOTA.
## Quick Start
### Installation
**Step 0.** Install [pytorch](https://pytorch.org/)(v1.9.0).
**Step 1.** Install [MMDetection3D](https://github.com/open-mmlab/mmdetection3d)(v1.0.0rc4).
**Step 2.** Install requirements.
```shell
pip install -r requirements.txt
```
**Step 3.** Install BEVDepth(gpu required).
```shell
python setup.py develop
```
### Data preparation
**Step 0.** Download nuScenes official dataset.
**Step 1.** Symlink the dataset root to `./data/`.
```
ln -s [nuscenes root] ./data/
```
The directory will be as follows.
```
BEVDepth
├── data
│ ├── nuScenes
│ │ ├── maps
│ │ ├── samples
│ │ ├── sweeps
│ │ ├── v1.0-test
| | ├── v1.0-trainval
```
**Step 2.** Prepare infos.
```
python scripts/gen_info.py
```
### Tutorials
**Train.**
```
python [EXP_PATH] --amp_backend native -b 8 --gpus 8
```
**Eval.**
```
python [EXP_PATH] --ckpt_path [CKPT_PATH] -e -b 8 --gpus 8
```
### Benchmark
|Exp |EMA| CBGS |mAP |mATE| mASE | mAOE |mAVE| mAAE | NDS | weights |
| ------ | :---: | :---: | :---: |:---: |:---: | :---: | :----: | :----: | :----: | :----: |
|[BEVDepth](bevdepth/exps/nuscenes/mv/bev_depth_lss_r50_256x704_128x128_24e_2key.py)| | |0.3304| 0.7021| 0.2795| 0.5346| 0.5530| 0.2274| 0.4355 | [github](https://github.com/Megvii-BaseDetection/BEVDepth/releases/download/v0.0.2/bev_depth_lss_r50_256x704_128x128_24e_2key.pth)
|[BEVDepth](bevdepth/exps/nuscenes/mv/bev_depth_lss_r50_256x704_128x128_24e_2key_ema.py)|√ | |0.3329 | 0.6832 |0.2761 | 0.5446 | 0.5258 | 0.2259 | 0.4409 | [github](https://github.com/Megvii-BaseDetection/BEVDepth/releases/download/v0.0.2/bev_depth_lss_r50_256x704_128x128_24e_2key_ema.pth)
|[BEVDepth](bevdepth/exps/nuscenes/mv/bev_depth_lss_r50_256x704_128x128_20e_cbgs_2key_da.py)| |√ |0.3484| 0.6159| 0.2716| 0.4144| 0.4402| 0.1954| 0.4805 | [github](https://github.com/Megvii-BaseDetection/BEVDepth/releases/download/v0.0.2/bev_depth_lss_r50_256x704_128x128_20e_cbgs_2key_da.pth)
|[BEVDepth](bevdepth/exps/nuscenes/mv/bev_depth_lss_r50_256x704_128x128_20e_cbgs_2key_da_ema.py)|√ |√ |0.3589 | 0.6119 |0.2692 | 0.5074 | 0.4086 | 0.2009 | 0.4797 | [github](https://github.com/Megvii-BaseDetection/BEVDepth/releases/download/v0.0.2/bev_depth_lss_r50_256x704_128x128_20e_cbgs_2key_da_ema.pth) |
|[BEVStereo](bevdepth/exps/nuscenes/mv/bev_stereo_lss_r50_256x704_128x128_24e_2key.py)| | |0.3456 | 0.6589 | 0.2774 | 0.5500 | 0.4980 | 0.2278 | 0.4516 | [github](https://github.com/Megvii-BaseDetection/BEVStereo/releases/download/v0.0.2/bev_stereo_lss_r50_256x704_128x128_24e_2key.pth) |
|[BEVStereo](bevdepth/exps/nuscenes/mv/bev_stereo_lss_r50_256x704_128x128_24e_2key_ema.py)|√ | |0.3494| 0.6671| 0.2785| 0.5606| 0.4686| 0.2295| 0.4543 | [github](https://github.com/Megvii-BaseDetection/BEVStereo/releases/download/v0.0.2/bev_stereo_lss_r50_256x704_128x128_24e_2key_ema.pth) |
|[BEVStereo](bevdepth/exps/nuscenes/mv/bev_stereo_lss_r50_256x704_128x128_24e_key4.py)| | |0.3427| 0.6560| 0.2784| 0.5982| 0.5347| 0.2228| 0.4423 | [github](https://github.com/Megvii-BaseDetection/BEVStereo/releases/download/v0.0.2/bev_stereo_lss_r50_256x704_128x128_24e_key4.pth) |
|[BEVStereo](bevdepth/exps/nuscenes/mv/bev_stereo_lss_r50_256x704_128x128_24e_key4_ema.py)|√ | |0.3435| 0.6585| 0.2757| 0.5792| 0.5034| 0.2163| 0.4485 | [github](https://github.com/Megvii-BaseDetection/BEVStereo/releases/download/v0.0.2/bev_stereo_lss_r50_256x704_128x128_24e_key4_ema.pth) |
|[BEVStereo](bevdepth/exps/nuscenes/mv/bev_stereo_lss_r50_256x704_128x128_20e_cbgs_2key_da.py)| |√ |0.3576| 0.6071| 0.2684| 0.4157| 0.3928| 0.2021| 0.4902 | [github](https://github.com/Megvii-BaseDetection/BEVStereo/releases/download/v0.0.2/bev_stereo_lss_r50_256x704_128x128_20e_cbgs_2key_da.pth) |
|[BEVStereo](bevdepth/exps/nuscenes/mv/bev_stereo_lss_r50_256x704_128x128_20e_cbgs_2key_da_ema.py)|√ |√ |0.3721| 0.5980| 0.2701| 0.4381| 0.3672| 0.1898| 0.4997 | [github](https://github.com/Megvii-BaseDetection/BEVStereo/releases/download/v0.0.2/bev_stereo_lss_r50_256x704_128x128_20e_cbgs_2key_da_ema.pth) |
## FAQ
### EMA
- The results are different between evaluation during training and evaluation from ckpt.
Due to the working mechanism of EMA, the model parameters saved by ckpt are different from the model parameters used in the training stage.
- EMA exps are unable to resume training from ckpt.
We used the customized EMA callback and this function is not supported for now.
## Cite BEVDepth & BEVStereo & MatrixVT
If you use BEVDepth, BEVStereo, or MatrixVT in your research, please cite our work by using the following BibTeX entry:
```latex
@article{li2022bevdepth,
title={BEVDepth: Acquisition of Reliable Depth for Multi-view 3D Object Detection},
author={Li, Yinhao and Ge, Zheng and Yu, Guanyi and Yang, Jinrong and Wang, Zengran and Shi, Yukang and Sun, Jianjian and Li, Zeming},
journal={arXiv preprint arXiv:2206.10092},
year={2022}
}
@article{li2022bevstereo,
title={Bevstereo: Enhancing depth estimation in multi-view 3d object detection with dynamic temporal stereo},
author={Li, Yinhao and Bao, Han and Ge, Zheng and Yang, Jinrong and Sun, Jianjian and Li, Zeming},
journal={arXiv preprint arXiv:2209.10248},
year={2022}
}
@article{zhou2022matrixvt,
title={MatrixVT: Efficient Multi-Camera to BEV Transformation for 3D Perception},
author={Zhou, Hongyu and Ge, Zheng and Li, Zeming and Zhang, Xiangyu},
journal={arXiv preprint arXiv:2211.10593},
year={2022}
}
```
================================================
FILE: bevdepth/callbacks/ema.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
import math
import os
from copy import deepcopy
import torch
import torch.nn as nn
from pytorch_lightning.callbacks import Callback
__all__ = ['ModelEMA', 'is_parallel']
def is_parallel(model):
"""check if model is in parallel mode."""
parallel_type = (
nn.parallel.DataParallel,
nn.parallel.DistributedDataParallel,
)
return isinstance(model, parallel_type)
class ModelEMA:
    """Exponential moving average of a model's state_dict.

    Adapted from https://github.com/rwightman/pytorch-image-models. A
    smoothed (FP32) copy of all parameters and buffers is kept and blended
    toward the live model after every update, mirroring
    tf.train.ExponentialMovingAverage. Initialize this AFTER the model is
    built, moved to its device, and wrapped for distributed training.
    """

    def __init__(self, model, decay=0.9999, updates=0):
        """
        Args:
            model (nn.Module): model to track with EMA.
            decay (float): asymptotic EMA decay rate.
            updates (int): starting value of the update counter.
        """
        parallel_wrappers = (
            nn.parallel.DataParallel,
            nn.parallel.DistributedDataParallel,
        )
        source = model.module if isinstance(model, parallel_wrappers) \
            else model
        # The EMA copy never trains; freeze it in eval mode.
        self.ema = deepcopy(source).eval()
        self.updates = updates
        # Exponential ramp: the effective decay starts near 0 and
        # approaches `decay`, which stabilizes early epochs.
        self.decay = lambda x: decay * (1 - math.exp(-x / 2000))
        for param in self.ema.parameters():
            param.requires_grad_(False)

    def update(self, trainer, model):
        """Blend the live model's weights into the EMA copy.

        Args:
            trainer: unused; kept for callback-compatible signature.
            model (nn.Module): model providing the fresh weights.
        """
        with torch.no_grad():
            self.updates += 1
            d = self.decay(self.updates)
            parallel_wrappers = (
                nn.parallel.DataParallel,
                nn.parallel.DistributedDataParallel,
            )
            source = model.module if isinstance(model, parallel_wrappers) \
                else model
            fresh_state = source.state_dict()
            # state_dict() tensors alias the EMA module's storage, so the
            # in-place ops below update the EMA model directly. Integer
            # buffers (e.g. num_batches_tracked) are skipped.
            for name, ema_val in self.ema.state_dict().items():
                if ema_val.dtype.is_floating_point:
                    ema_val *= d
                    ema_val += (1.0 - d) * fresh_state[name].detach()
class EMACallback(Callback):
    """Lightning callback that maintains an EMA copy of the model and saves
    it to disk at the end of every training epoch."""

    def __init__(self, len_updates) -> None:
        """
        Args:
            len_updates (int): initial value for the EMA update counter,
                which warm-starts the decay ramp in ModelEMA.
        """
        super().__init__()
        self.len_updates = len_updates

    def on_fit_start(self, trainer, pl_module):
        """Build the EMA model when training begins.

        SyncBatchNorm layers hold a distributed process-group handle that
        the ``deepcopy`` inside ModelEMA cannot copy, so each layer's
        process group is detached before the copy and restored afterwards.
        NOTE(review): ``trainer.model.module.module.model`` assumes a
        specific DDP/Lightning wrapping depth — confirm against the
        training setup before changing.
        """
        # Todo (@lizeming@megvii.com): delete manually specified device
        from torch.nn.modules.batchnorm import SyncBatchNorm
        bn_model_list = list()
        bn_model_dist_group_list = list()
        for model_ref in trainer.model.modules():
            if isinstance(model_ref, SyncBatchNorm):
                bn_model_list.append(model_ref)
                bn_model_dist_group_list.append(model_ref.process_group)
                model_ref.process_group = None
        trainer.ema_model = ModelEMA(trainer.model.module.module.model.cuda(),
                                     0.9990)
        # Restore the process groups that were detached above.
        for bn_model, dist_group in zip(bn_model_list,
                                        bn_model_dist_group_list):
            bn_model.process_group = dist_group
        trainer.ema_model.updates = self.len_updates

    def on_train_batch_end(self,
                           trainer,
                           pl_module,
                           outputs,
                           batch,
                           batch_idx,
                           unused=0):
        """Blend the latest weights into the EMA model after every batch."""
        trainer.ema_model.update(trainer, trainer.model.module.module.model)

    def on_train_epoch_end(self, trainer, pl_module) -> None:
        """Checkpoint the EMA weights to ``<log_dir>/<epoch>.pth``."""
        state_dict = trainer.ema_model.ema.state_dict()
        state_dict_keys = list(state_dict.keys())
        # TODO: Change to more elegant way.
        # Prefix every key with 'model.' so the checkpoint layout matches
        # the LightningModule's state_dict and can be loaded via ckpt_path.
        for state_dict_key in state_dict_keys:
            new_key = 'model.' + state_dict_key
            state_dict[new_key] = state_dict.pop(state_dict_key)
        checkpoint = {
            # the epoch and global step are saved for
            # compatibility but they are not relevant for restoration
            'epoch': trainer.current_epoch,
            'global_step': trainer.global_step,
            'state_dict': state_dict
        }
        torch.save(
            checkpoint,
            os.path.join(trainer.log_dir, f'{trainer.current_epoch}.pth'))
================================================
FILE: bevdepth/datasets/nusc_det_dataset.py
================================================
import os
import mmcv
import numpy as np
import torch
from mmdet3d.core.bbox.structures.lidar_box3d import LiDARInstance3DBoxes
from nuscenes.utils.data_classes import Box, LidarPointCloud
from nuscenes.utils.geometry_utils import view_points
from PIL import Image
from pyquaternion import Quaternion
from torch.utils.data import Dataset
__all__ = ['NuscDetDataset']

# Maps fine-grained nuScenes category names onto the coarser detection
# classes used for training/evaluation; categories mapped to 'ignore'
# are dropped.
map_name_from_general_to_detection = {
    'human.pedestrian.adult': 'pedestrian',
    'human.pedestrian.child': 'pedestrian',
    'human.pedestrian.wheelchair': 'ignore',
    'human.pedestrian.stroller': 'ignore',
    'human.pedestrian.personal_mobility': 'ignore',
    'human.pedestrian.police_officer': 'pedestrian',
    'human.pedestrian.construction_worker': 'pedestrian',
    'animal': 'ignore',
    'vehicle.car': 'car',
    'vehicle.motorcycle': 'motorcycle',
    'vehicle.bicycle': 'bicycle',
    'vehicle.bus.bendy': 'bus',
    'vehicle.bus.rigid': 'bus',
    'vehicle.truck': 'truck',
    'vehicle.construction': 'construction_vehicle',
    'vehicle.emergency.ambulance': 'ignore',
    'vehicle.emergency.police': 'ignore',
    'vehicle.trailer': 'trailer',
    'movable_object.barrier': 'barrier',
    'movable_object.trafficcone': 'traffic_cone',
    'movable_object.pushable_pullable': 'ignore',
    'movable_object.debris': 'ignore',
    'static_object.bicycle_rack': 'ignore',
}
def get_rot(h):
    """Return the 2x2 rotation matrix for angle ``h`` (radians)."""
    cos_h, sin_h = np.cos(h), np.sin(h)
    return torch.Tensor([[cos_h, sin_h], [-sin_h, cos_h]])
def img_transform(img, resize, resize_dims, crop, flip, rotate):
    """Apply the ida augmentation (resize, crop, flip, rotate) to an image.

    Args:
        img (PIL.Image): input image.
        resize (float): resize factor.
        resize_dims (tuple): (W, H) dimensions to resize to.
        crop (tuple): crop box (x1, y1, x2, y2).
        flip (bool): whether to flip horizontally.
        rotate (float): rotation angle in degrees.

    Returns:
        tuple: the transformed image and the 4x4 matrix (ida_mat)
        encoding the same 2D transform for later coordinate mapping.
    """
    rot2d = torch.eye(2)
    tran2d = torch.zeros(2)
    # Transform the image itself.
    img = img.resize(resize_dims)
    img = img.crop(crop)
    if flip:
        img = img.transpose(method=Image.FLIP_LEFT_RIGHT)
    img = img.rotate(rotate)
    # Accumulate the identical operations as an affine transform.
    rot2d *= resize
    tran2d -= torch.Tensor(crop[:2])
    if flip:
        A = torch.Tensor([[-1, 0], [0, 1]])
        b = torch.Tensor([crop[2] - crop[0], 0])
        rot2d = A.matmul(rot2d)
        tran2d = A.matmul(tran2d) + b
    # Rotation happens about the crop center, hence the extra offset b.
    theta = rotate / 180 * np.pi
    A = torch.Tensor([
        [np.cos(theta), np.sin(theta)],
        [-np.sin(theta), np.cos(theta)],
    ])
    b = torch.Tensor([crop[2] - crop[0], crop[3] - crop[1]]) / 2
    b = A.matmul(-b) + b
    rot2d = A.matmul(rot2d)
    tran2d = A.matmul(tran2d) + b
    # Pack the 2D affine transform into a homogeneous 4x4 matrix.
    ida_mat = rot2d.new_zeros(4, 4)
    ida_mat[3, 3] = 1
    ida_mat[2, 2] = 1
    ida_mat[:2, :2] = rot2d
    ida_mat[:2, 3] = tran2d
    return img, ida_mat
def bev_transform(gt_boxes, rotate_angle, scale_ratio, flip_dx, flip_dy):
    """Apply BEV-space augmentation (rotate, scale, flip) to gt boxes.

    Args:
        gt_boxes (Tensor): Nx(>=7) boxes as (x, y, z, dx, dy, dz, yaw,
            [vx, vy, ...]); modified in place.
        rotate_angle (float): rotation in degrees.
        scale_ratio (float): uniform scale factor.
        flip_dx (bool): flip along the x axis.
        flip_dy (bool): flip along the y axis.

    Returns:
        tuple: (gt_boxes, rot_mat) where rot_mat is the combined 3x3
        flip @ scale @ rotation matrix applied to box centers.
    """
    angle = torch.tensor(rotate_angle / 180 * np.pi)
    sin_a = torch.sin(angle)
    cos_a = torch.cos(angle)
    rotation = torch.Tensor([
        [cos_a, -sin_a, 0],
        [sin_a, cos_a, 0],
        [0, 0, 1],
    ])
    scaling = torch.Tensor([
        [scale_ratio, 0, 0],
        [0, scale_ratio, 0],
        [0, 0, scale_ratio],
    ])
    flipping = torch.Tensor([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
    if flip_dx:
        flipping = flipping @ torch.Tensor([[-1, 0, 0], [0, 1, 0], [0, 0, 1]])
    if flip_dy:
        flipping = flipping @ torch.Tensor([[1, 0, 0], [0, -1, 0], [0, 0, 1]])
    combined = flipping @ (scaling @ rotation)
    if gt_boxes.shape[0] > 0:
        # Centers get the full transform; sizes only scale.
        gt_boxes[:, :3] = (
            combined @ gt_boxes[:, :3].unsqueeze(-1)).squeeze(-1)
        gt_boxes[:, 3:6] *= scale_ratio
        gt_boxes[:, 6] += angle
        if flip_dx:
            # 2*asin(1) == pi: mirror the yaw across the y axis.
            gt_boxes[:, 6] = 2 * torch.asin(torch.tensor(1.0)) - gt_boxes[:, 6]
        if flip_dy:
            gt_boxes[:, 6] = -gt_boxes[:, 6]
        # Velocities (columns 7+) rotate/flip in the BEV plane.
        gt_boxes[:, 7:] = (
            combined[:2, :2] @ gt_boxes[:, 7:].unsqueeze(-1)).squeeze(-1)
    return gt_boxes, combined
def depth_transform(cam_depth, resize, resize_dims, crop, flip, rotate):
    """Apply the ida augmentation to sparse depth points and rasterize
    them into a dense depth map.

    Args:
        cam_depth (np.ndarray): Nx3 points as (x, y, depth).
        resize (float): resize factor.
        resize_dims (list): final (H, W) of the output map.
        crop (list): crop box (x1, y1, x2, y2).
        flip (bool): whether a horizontal flip was applied.
        rotate (float): rotation angle in degrees.

    Returns:
        torch.Tensor: dense depth map of shape ``resize_dims``; pixels
        with no point are zero.
    """
    H, W = resize_dims
    # Resize then shift by the crop origin, mirroring img_transform.
    cam_depth[:, :2] = cam_depth[:, :2] * resize
    cam_depth[:, 0] -= crop[0]
    cam_depth[:, 1] -= crop[1]
    if flip:
        cam_depth[:, 0] = resize_dims[1] - cam_depth[:, 0]
    # Rotate point coordinates about the image center.
    cam_depth[:, 0] -= W / 2.0
    cam_depth[:, 1] -= H / 2.0
    theta = rotate / 180 * np.pi
    rot_matrix = [
        [np.cos(theta), np.sin(theta)],
        [-np.sin(theta), np.cos(theta)],
    ]
    cam_depth[:, :2] = np.matmul(rot_matrix, cam_depth[:, :2].T).T
    cam_depth[:, 0] += W / 2.0
    cam_depth[:, 1] += H / 2.0
    # Rasterize: keep only the points that land inside the map.
    coords = cam_depth[:, :2].astype(np.int16)
    depth_map = np.zeros(resize_dims)
    inside = ((coords[:, 1] >= 0)
              & (coords[:, 1] < resize_dims[0])
              & (coords[:, 0] >= 0)
              & (coords[:, 0] < resize_dims[1]))
    depth_map[coords[inside, 1], coords[inside, 0]] = cam_depth[inside, 2]
    return torch.Tensor(depth_map)
def map_pointcloud_to_image(
    lidar_points,
    img,
    lidar_calibrated_sensor,
    lidar_ego_pose,
    cam_calibrated_sensor,
    cam_ego_pose,
    min_dist: float = 0.0,
):
    """Project a lidar point cloud into a camera image.

    The points are chained through four frames: lidar sensor -> lidar-time
    ego -> global -> camera-time ego -> camera, then projected with the
    camera intrinsics. The order of these transforms is essential.

    Args:
        lidar_points (np.ndarray): Nx4 points in the lidar sensor frame.
        img (PIL.Image): target camera image (only its size is used).
        lidar_calibrated_sensor (dict): lidar extrinsics with 'rotation'
            (quaternion) and 'translation'.
        lidar_ego_pose (dict): ego pose at the lidar timestamp.
        cam_calibrated_sensor (dict): camera calibration including
            'camera_intrinsic'.
        cam_ego_pose (dict): ego pose at the camera timestamp.
        min_dist (float): minimum depth for a point to be kept.

    Returns:
        tuple: (points, coloring) — projected pixel coordinates of the
        surviving points and their per-point depths.
    """
    # Points live in the point sensor frame. So they need to be
    # transformed via global to the image plane.
    # First step: transform the pointcloud to the ego vehicle
    # frame for the timestamp of the sweep.
    lidar_points = LidarPointCloud(lidar_points.T)
    lidar_points.rotate(
        Quaternion(lidar_calibrated_sensor['rotation']).rotation_matrix)
    lidar_points.translate(np.array(lidar_calibrated_sensor['translation']))
    # Second step: transform from ego to the global frame.
    lidar_points.rotate(Quaternion(lidar_ego_pose['rotation']).rotation_matrix)
    lidar_points.translate(np.array(lidar_ego_pose['translation']))
    # Third step: transform from global into the ego vehicle
    # frame for the timestamp of the image.
    lidar_points.translate(-np.array(cam_ego_pose['translation']))
    lidar_points.rotate(Quaternion(cam_ego_pose['rotation']).rotation_matrix.T)
    # Fourth step: transform from ego into the camera.
    lidar_points.translate(-np.array(cam_calibrated_sensor['translation']))
    lidar_points.rotate(
        Quaternion(cam_calibrated_sensor['rotation']).rotation_matrix.T)
    # Fifth step: actually take a "picture" of the point cloud.
    # Grab the depths (camera frame z axis points away from the camera).
    depths = lidar_points.points[2, :]
    coloring = depths
    # Take the actual picture (matrix multiplication with camera-matrix
    # + renormalization).
    points = view_points(lidar_points.points[:3, :],
                         np.array(cam_calibrated_sensor['camera_intrinsic']),
                         normalize=True)
    # Remove points that are either outside or behind the camera.
    # Leave a margin of 1 pixel for aesthetic reasons. Also make
    # sure points are at least 1m in front of the camera to avoid
    # seeing the lidar points on the camera casing for non-keyframes
    # which are slightly out of sync.
    mask = np.ones(depths.shape[0], dtype=bool)
    mask = np.logical_and(mask, depths > min_dist)
    mask = np.logical_and(mask, points[0, :] > 1)
    mask = np.logical_and(mask, points[0, :] < img.size[0] - 1)
    mask = np.logical_and(mask, points[1, :] > 1)
    mask = np.logical_and(mask, points[1, :] < img.size[1] - 1)
    points = points[:, mask]
    coloring = coloring[mask]
    return points, coloring
class NuscDetDataset(Dataset):
    def __init__(self,
                 ida_aug_conf,
                 bda_aug_conf,
                 classes,
                 data_root,
                 info_paths,
                 is_train,
                 use_cbgs=False,
                 num_sweeps=1,
                 img_conf=dict(img_mean=[123.675, 116.28, 103.53],
                               img_std=[58.395, 57.12, 57.375],
                               to_rgb=True),
                 return_depth=False,
                 sweep_idxes=list(),
                 key_idxes=list(),
                 use_fusion=False):
        """Dataset used for bevdetection task.
        Args:
            ida_aug_conf (dict): Config for ida augmentation.
            bda_aug_conf (dict): Config for bda augmentation.
            classes (list): Class names.
            data_root (str): Root directory of the dataset files.
            info_paths (str | list): Path(s) of the info file(s) to load.
            is_train (bool): Whether the dataset is used for training
                (enables random augmentation and CBGS resampling).
            use_cbgs (bool): Whether to use cbgs strategy,
                Default: False.
            num_sweeps (int): Number of sweeps to be used for each sample.
                default: 1.
            img_conf (dict): Config for image.
            return_depth (bool): Whether to use depth gt.
                default: False.
            sweep_idxes (list): List of sweep idxes to be used.
                default: list().
            key_idxes (list): List of key idxes to be used.
                default: list().
            use_fusion (bool): Whether to use lidar data.
                default: False.
        """
        super().__init__()
        # Info files may be sharded (e.g. train + val); concatenate them.
        if isinstance(info_paths, list):
            self.infos = list()
            for info_path in info_paths:
                self.infos.extend(mmcv.load(info_path))
        else:
            self.infos = mmcv.load(info_paths)
        self.is_train = is_train
        self.ida_aug_conf = ida_aug_conf
        self.bda_aug_conf = bda_aug_conf
        self.data_root = data_root
        self.classes = classes
        self.use_cbgs = use_cbgs
        if self.use_cbgs:
            # Class-balanced grouping and sampling: resample frame indices
            # so rare classes appear more often during training.
            self.cat2id = {name: i for i, name in enumerate(self.classes)}
            self.sample_indices = self._get_sample_indices()
        self.num_sweeps = num_sweeps
        self.img_mean = np.array(img_conf['img_mean'], np.float32)
        self.img_std = np.array(img_conf['img_std'], np.float32)
        self.to_rgb = img_conf['to_rgb']
        self.return_depth = return_depth
        assert sum([sweep_idx >= 0 for sweep_idx in sweep_idxes]) \
            == len(sweep_idxes), 'All `sweep_idxes` must greater \
                than or equal to 0.'
        self.sweeps_idx = sweep_idxes
        assert sum([key_idx < 0 for key_idx in key_idxes]) == len(key_idxes),\
            'All `key_idxes` must less than 0.'
        # Index 0 is the current key frame; negative idxes select
        # previous key frames.
        self.key_idxes = [0] + key_idxes
        self.use_fusion = use_fusion
def _get_sample_indices(self):
"""Load annotations from ann_file.
Args:
ann_file (str): Path of the annotation file.
Returns:
list[dict]: List of annotations after class sampling.
"""
class_sample_idxs = {cat_id: [] for cat_id in self.cat2id.values()}
for idx, info in enumerate(self.infos):
gt_names = set(
[ann_info['category_name'] for ann_info in info['ann_infos']])
for gt_name in gt_names:
gt_name = map_name_from_general_to_detection[gt_name]
if gt_name not in self.classes:
continue
class_sample_idxs[self.cat2id[gt_name]].append(idx)
duplicated_samples = sum(
[len(v) for _, v in class_sample_idxs.items()])
class_distribution = {
k: len(v) / duplicated_samples
for k, v in class_sample_idxs.items()
}
sample_indices = []
frac = 1.0 / len(self.classes)
ratios = [frac / v for v in class_distribution.values()]
for cls_inds, ratio in zip(list(class_sample_idxs.values()), ratios):
sample_indices += np.random.choice(cls_inds,
int(len(cls_inds) *
ratio)).tolist()
return sample_indices
def sample_ida_augmentation(self):
"""Generate ida augmentation values based on ida_config."""
H, W = self.ida_aug_conf['H'], self.ida_aug_conf['W']
fH, fW = self.ida_aug_conf['final_dim']
if self.is_train:
resize = np.random.uniform(*self.ida_aug_conf['resize_lim'])
resize_dims = (int(W * resize), int(H * resize))
newW, newH = resize_dims
crop_h = int(
(1 - np.random.uniform(*self.ida_aug_conf['bot_pct_lim'])) *
newH) - fH
crop_w = int(np.random.uniform(0, max(0, newW - fW)))
crop = (crop_w, crop_h, crop_w + fW, crop_h + fH)
flip = False
if self.ida_aug_conf['rand_flip'] and np.random.choice([0, 1]):
flip = True
rotate_ida = np.random.uniform(*self.ida_aug_conf['rot_lim'])
else:
resize = max(fH / H, fW / W)
resize_dims = (int(W * resize), int(H * resize))
newW, newH = resize_dims
crop_h = int(
(1 - np.mean(self.ida_aug_conf['bot_pct_lim'])) * newH) - fH
crop_w = int(max(0, newW - fW) / 2)
crop = (crop_w, crop_h, crop_w + fW, crop_h + fH)
flip = False
rotate_ida = 0
return resize, resize_dims, crop, flip, rotate_ida
def sample_bda_augmentation(self):
"""Generate bda augmentation values based on bda_config."""
if self.is_train:
rotate_bda = np.random.uniform(*self.bda_aug_conf['rot_lim'])
scale_bda = np.random.uniform(*self.bda_aug_conf['scale_lim'])
flip_dx = np.random.uniform() < self.bda_aug_conf['flip_dx_ratio']
flip_dy = np.random.uniform() < self.bda_aug_conf['flip_dy_ratio']
else:
rotate_bda = 0
scale_bda = 1.0
flip_dx = False
flip_dy = False
return rotate_bda, scale_bda, flip_dx, flip_dy
def get_lidar_depth(self, lidar_points, img, lidar_info, cam_info):
lidar_calibrated_sensor = lidar_info['LIDAR_TOP']['calibrated_sensor']
lidar_ego_pose = lidar_info['LIDAR_TOP']['ego_pose']
cam_calibrated_sensor = cam_info['calibrated_sensor']
cam_ego_pose = cam_info['ego_pose']
pts_img, depth = map_pointcloud_to_image(
lidar_points.copy(), img, lidar_calibrated_sensor.copy(),
lidar_ego_pose.copy(), cam_calibrated_sensor, cam_ego_pose)
return np.concatenate([pts_img[:2, :].T, depth[:, None]],
axis=1).astype(np.float32)
def get_image(self, cam_infos, cams, lidar_infos=None):
    """Given data and cam_names, return image data needed.

    Args:
        cam_infos (list): Camera info dicts, one per frame; index 0 is
            the key frame, the rest are sweeps.
        cams (list): Camera names.
        lidar_infos (list, optional): Lidar info dicts aligned with
            ``cam_infos``; required when depth labels or fusion are used.

    Returns:
        Tensor: Image data after processing.
        Tensor: Transformation matrix from camera to ego.
        Tensor: Intrinsic matrix.
        Tensor: Transformation matrix for ida.
        Tensor: Transformation matrix from key
            frame camera to sweep frame camera.
        Tensor: timestamps.
        dict: meta infos needed for evaluation.
    """
    assert len(cam_infos) > 0
    sweep_imgs = list()
    sweep_sensor2ego_mats = list()
    sweep_intrin_mats = list()
    sweep_ida_mats = list()
    sweep_sensor2sensor_mats = list()
    sweep_timestamps = list()
    sweep_lidar_depth = list()
    if self.return_depth or self.use_fusion:
        # Load the raw point cloud once per frame; the file stores 5
        # float32 columns, of which the first 4 are kept.
        sweep_lidar_points = list()
        for lidar_info in lidar_infos:
            lidar_path = lidar_info['LIDAR_TOP']['filename']
            lidar_points = np.fromfile(os.path.join(
                self.data_root, lidar_path),
                                       dtype=np.float32,
                                       count=-1).reshape(-1, 5)[..., :4]
            sweep_lidar_points.append(lidar_points)
    for cam in cams:
        imgs = list()
        sensor2ego_mats = list()
        intrin_mats = list()
        ida_mats = list()
        sensor2sensor_mats = list()
        timestamps = list()
        lidar_depth = list()
        key_info = cam_infos[0]
        # One ida augmentation is sampled per camera and reused for all
        # sweeps, so image and depth transforms stay consistent.
        resize, resize_dims, crop, flip, \
            rotate_ida = self.sample_ida_augmentation(
            )
        for sweep_idx, cam_info in enumerate(cam_infos):
            img = Image.open(
                os.path.join(self.data_root, cam_info[cam]['filename']))
            # img = Image.fromarray(img)
            w, x, y, z = cam_info[cam]['calibrated_sensor']['rotation']
            # sweep sensor to sweep ego
            sweepsensor2sweepego_rot = torch.Tensor(
                Quaternion(w, x, y, z).rotation_matrix)
            sweepsensor2sweepego_tran = torch.Tensor(
                cam_info[cam]['calibrated_sensor']['translation'])
            # Assemble a 4x4 homogeneous transform from rotation +
            # translation.
            sweepsensor2sweepego = sweepsensor2sweepego_rot.new_zeros(
                (4, 4))
            sweepsensor2sweepego[3, 3] = 1
            sweepsensor2sweepego[:3, :3] = sweepsensor2sweepego_rot
            sweepsensor2sweepego[:3, -1] = sweepsensor2sweepego_tran
            # sweep ego to global
            w, x, y, z = cam_info[cam]['ego_pose']['rotation']
            sweepego2global_rot = torch.Tensor(
                Quaternion(w, x, y, z).rotation_matrix)
            sweepego2global_tran = torch.Tensor(
                cam_info[cam]['ego_pose']['translation'])
            sweepego2global = sweepego2global_rot.new_zeros((4, 4))
            sweepego2global[3, 3] = 1
            sweepego2global[:3, :3] = sweepego2global_rot
            sweepego2global[:3, -1] = sweepego2global_tran
            # global sensor to cur ego
            w, x, y, z = key_info[cam]['ego_pose']['rotation']
            keyego2global_rot = torch.Tensor(
                Quaternion(w, x, y, z).rotation_matrix)
            keyego2global_tran = torch.Tensor(
                key_info[cam]['ego_pose']['translation'])
            keyego2global = keyego2global_rot.new_zeros((4, 4))
            keyego2global[3, 3] = 1
            keyego2global[:3, :3] = keyego2global_rot
            keyego2global[:3, -1] = keyego2global_tran
            global2keyego = keyego2global.inverse()
            # cur ego to sensor
            w, x, y, z = key_info[cam]['calibrated_sensor']['rotation']
            keysensor2keyego_rot = torch.Tensor(
                Quaternion(w, x, y, z).rotation_matrix)
            keysensor2keyego_tran = torch.Tensor(
                key_info[cam]['calibrated_sensor']['translation'])
            keysensor2keyego = keysensor2keyego_rot.new_zeros((4, 4))
            keysensor2keyego[3, 3] = 1
            keysensor2keyego[:3, :3] = keysensor2keyego_rot
            keysensor2keyego[:3, -1] = keysensor2keyego_tran
            keyego2keysensor = keysensor2keyego.inverse()
            # Chain: sweep camera -> sweep ego -> global -> key ego ->
            # key camera; the inverse maps key camera -> sweep camera.
            keysensor2sweepsensor = (
                keyego2keysensor @ global2keyego @ sweepego2global
                @ sweepsensor2sweepego).inverse()
            sweepsensor2keyego = global2keyego @ sweepego2global @\
                sweepsensor2sweepego
            sensor2ego_mats.append(sweepsensor2keyego)
            sensor2sensor_mats.append(keysensor2sweepsensor)
            # Embed the 3x3 intrinsics into a 4x4 homogeneous matrix.
            intrin_mat = torch.zeros((4, 4))
            intrin_mat[3, 3] = 1
            intrin_mat[:3, :3] = torch.Tensor(
                cam_info[cam]['calibrated_sensor']['camera_intrinsic'])
            # Depth labels come from the key frame only, unless fusion
            # needs them for every sweep.
            if self.return_depth and (self.use_fusion or sweep_idx == 0):
                point_depth = self.get_lidar_depth(
                    sweep_lidar_points[sweep_idx], img,
                    lidar_infos[sweep_idx], cam_info[cam])
                # Apply the same geometric augmentation to the projected
                # depth points as to the image below.
                point_depth_augmented = depth_transform(
                    point_depth, resize, self.ida_aug_conf['final_dim'],
                    crop, flip, rotate_ida)
                lidar_depth.append(point_depth_augmented)
            img, ida_mat = img_transform(
                img,
                resize=resize,
                resize_dims=resize_dims,
                crop=crop,
                flip=flip,
                rotate=rotate_ida,
            )
            ida_mats.append(ida_mat)
            img = mmcv.imnormalize(np.array(img), self.img_mean,
                                   self.img_std, self.to_rgb)
            # HWC -> CHW for the network.
            img = torch.from_numpy(img).permute(2, 0, 1)
            imgs.append(img)
            intrin_mats.append(intrin_mat)
            timestamps.append(cam_info[cam]['timestamp'])
        sweep_imgs.append(torch.stack(imgs))
        sweep_sensor2ego_mats.append(torch.stack(sensor2ego_mats))
        sweep_intrin_mats.append(torch.stack(intrin_mats))
        sweep_ida_mats.append(torch.stack(ida_mats))
        sweep_sensor2sensor_mats.append(torch.stack(sensor2sensor_mats))
        sweep_timestamps.append(torch.tensor(timestamps))
        if self.return_depth:
            sweep_lidar_depth.append(torch.stack(lidar_depth))
    # Get mean pose of all cams.
    ego2global_rotation = np.mean(
        [key_info[cam]['ego_pose']['rotation'] for cam in cams], 0)
    ego2global_translation = np.mean(
        [key_info[cam]['ego_pose']['translation'] for cam in cams], 0)
    img_metas = dict(
        box_type_3d=LiDARInstance3DBoxes,
        ego2global_translation=ego2global_translation,
        ego2global_rotation=ego2global_rotation,
    )
    # Lists above are stacked per camera; the permutes reorder so the
    # sweep dimension leads: (num_sweeps, num_cams, ...).
    ret_list = [
        torch.stack(sweep_imgs).permute(1, 0, 2, 3, 4),
        torch.stack(sweep_sensor2ego_mats).permute(1, 0, 2, 3),
        torch.stack(sweep_intrin_mats).permute(1, 0, 2, 3),
        torch.stack(sweep_ida_mats).permute(1, 0, 2, 3),
        torch.stack(sweep_sensor2sensor_mats).permute(1, 0, 2, 3),
        torch.stack(sweep_timestamps).permute(1, 0),
        img_metas,
    ]
    if self.return_depth:
        ret_list.append(torch.stack(sweep_lidar_depth).permute(1, 0, 2, 3))
    return ret_list
def get_gt(self, info, cams):
    """Generate gt labels from info.

    Args:
        info(dict): Infos needed to generate gt labels.
        cams(list): Camera names.

    Returns:
        Tensor: GT bboxes (x, y, z, dx, dy, dz, yaw, vx, vy).
        Tensor: GT labels (class indices).
    """
    # Average the key-frame ego pose across all cameras.
    ego2global_rotation = np.mean(
        [info['cam_infos'][cam]['ego_pose']['rotation'] for cam in cams],
        0)
    ego2global_translation = np.mean([
        info['cam_infos'][cam]['ego_pose']['translation'] for cam in cams
    ], 0)
    # Inverse of the ego->global pose, applied as translate-then-rotate
    # below to bring boxes from global into the ego frame.
    trans = -np.array(ego2global_translation)
    rot = Quaternion(ego2global_rotation).inverse
    gt_boxes = list()
    gt_labels = list()
    for ann_info in info['ann_infos']:
        # Use ego coordinate.
        # Skip categories outside the configured class list and boxes
        # that contain no lidar/radar points at all.
        if (map_name_from_general_to_detection[ann_info['category_name']]
                not in self.classes
                or ann_info['num_lidar_pts'] + ann_info['num_radar_pts'] <=
                0):
            continue
        box = Box(
            ann_info['translation'],
            ann_info['size'],
            Quaternion(ann_info['rotation']),
            velocity=ann_info['velocity'],
        )
        # Order matters: translate first, then rotate.
        box.translate(trans)
        box.rotate(rot)
        box_xyz = np.array(box.center)
        # Reorder nuScenes (w, l, h) into (dx, dy, dz) = (l, w, h).
        box_dxdydz = np.array(box.wlh)[[1, 0, 2]]
        box_yaw = np.array([box.orientation.yaw_pitch_roll[0]])
        box_velo = np.array(box.velocity[:2])
        gt_box = np.concatenate([box_xyz, box_dxdydz, box_yaw, box_velo])
        gt_boxes.append(gt_box)
        gt_labels.append(
            self.classes.index(map_name_from_general_to_detection[
                ann_info['category_name']]))
    return torch.Tensor(gt_boxes), torch.tensor(gt_labels)
def choose_cams(self):
    """Pick the camera names used for this sample.

    Returns:
        list: a random subset of size ``Ncams`` during training when
            fewer than all cameras are requested, otherwise the full
            configured camera list.
    """
    all_cams = self.ida_aug_conf['cams']
    num_wanted = self.ida_aug_conf['Ncams']
    if self.is_train and num_wanted < len(all_cams):
        return np.random.choice(all_cams, num_wanted, replace=False)
    return all_cams
def __getitem__(self, idx):
    """Assemble one sample.

    Returns:
        list: [sweep_imgs, sensor2ego mats, intrinsics, ida mats,
            sensor2sensor mats, bda matrix, timestamps, img_metas,
            gt_boxes, gt_labels] plus depth labels when
            ``self.return_depth`` is set.
    """
    if self.use_cbgs:
        # Map through the class-balanced (CBGS) resampling indices.
        idx = self.sample_indices[idx]
    cam_infos = list()
    lidar_infos = list()
    # TODO: Check if it still works when number of cameras is reduced.
    cams = self.choose_cams()
    for key_idx in self.key_idxes:
        cur_idx = key_idx + idx
        # Handle scenarios when current idx doesn't have previous key
        # frame or previous key frame is from another scene.
        if cur_idx < 0:
            cur_idx = idx
        elif self.infos[cur_idx]['scene_token'] != self.infos[idx][
                'scene_token']:
            cur_idx = idx
        info = self.infos[cur_idx]
        cam_infos.append(info['cam_infos'])
        lidar_infos.append(info['lidar_infos'])
        lidar_sweep_timestamps = [
            lidar_sweep['LIDAR_TOP']['timestamp']
            for lidar_sweep in info['lidar_sweeps']
        ]
        for sweep_idx in self.sweeps_idx:
            if len(info['cam_sweeps']) == 0:
                # No sweeps recorded: fall back to the key frame itself.
                cam_infos.append(info['cam_infos'])
                lidar_infos.append(info['lidar_infos'])
            else:
                # Handle scenarios when current sweep doesn't have all
                # cam keys: walk back towards the key frame until a sweep
                # containing every chosen camera is found.
                for i in range(min(len(info['cam_sweeps']) - 1, sweep_idx),
                               -1, -1):
                    if sum([cam in info['cam_sweeps'][i]
                            for cam in cams]) == len(cams):
                        cam_infos.append(info['cam_sweeps'][i])
                        cam_timestamp = np.mean([
                            val['timestamp']
                            for val in info['cam_sweeps'][i].values()
                        ])
                        # Find the closest lidar frame to the cam frame.
                        lidar_idx = np.abs(lidar_sweep_timestamps -
                                           cam_timestamp).argmin()
                        lidar_infos.append(info['lidar_sweeps'][lidar_idx])
                        break
    if self.return_depth or self.use_fusion:
        image_data_list = self.get_image(cam_infos, cams, lidar_infos)
    else:
        image_data_list = self.get_image(cam_infos, cams)
    ret_list = list()
    (
        sweep_imgs,
        sweep_sensor2ego_mats,
        sweep_intrins,
        sweep_ida_mats,
        sweep_sensor2sensor_mats,
        sweep_timestamps,
        img_metas,
    ) = image_data_list[:7]
    img_metas['token'] = self.infos[idx]['sample_token']
    if self.is_train:
        gt_boxes, gt_labels = self.get_gt(self.infos[idx], cams)
    # Temporary solution for test.
    else:
        gt_boxes = sweep_imgs.new_zeros(0, 7)
        gt_labels = sweep_imgs.new_zeros(0, )
    rotate_bda, scale_bda, flip_dx, flip_dy = self.sample_bda_augmentation(
    )
    bda_mat = sweep_imgs.new_zeros(4, 4)
    bda_mat[3, 3] = 1
    # bev_transform augments the boxes and also reports the 3x3 rotation
    # it applied, which is stored so the model can account for it.
    gt_boxes, bda_rot = bev_transform(gt_boxes, rotate_bda, scale_bda,
                                      flip_dx, flip_dy)
    bda_mat[:3, :3] = bda_rot
    ret_list = [
        sweep_imgs,
        sweep_sensor2ego_mats,
        sweep_intrins,
        sweep_ida_mats,
        sweep_sensor2sensor_mats,
        bda_mat,
        sweep_timestamps,
        img_metas,
        gt_boxes,
        gt_labels,
    ]
    if self.return_depth:
        ret_list.append(image_data_list[7])
    return ret_list
def __str__(self):
    """Human-readable summary: sample count, split and ida aug config."""
    return f"""NuscData: {len(self)} samples. Split: \
{"train" if self.is_train else "val"}.
Augmentation Conf: {self.ida_aug_conf}"""
def __len__(self):
    """Dataset size: CBGS-resampled indices when enabled, raw infos otherwise."""
    source = self.sample_indices if self.use_cbgs else self.infos
    return len(source)
def collate_fn(data, is_return_depth=False):
    """Batch a list of NuscDetDataset samples into model-ready structures.

    Args:
        data (list): per-sample outputs of ``NuscDetDataset.__getitem__``.
        is_return_depth (bool): whether each sample carries a depth label
            as its 11th element.

    Returns:
        list: [stacked images, dict of stacked transformation matrices,
            stacked timestamps, img_metas list, gt_boxes list,
            gt_labels list], plus stacked depth labels when
            ``is_return_depth`` is True.
    """
    # Transpose the list-of-samples into per-field columns.
    transposed = list(zip(*(sample[:10] for sample in data)))
    (imgs, sensor2ego_mats, intrin_mats, ida_mats, sensor2sensor_mats,
     bda_mats, timestamps, img_metas, gt_boxes, gt_labels) = transposed
    mats_dict = {
        'sensor2ego_mats': torch.stack(sensor2ego_mats),
        'intrin_mats': torch.stack(intrin_mats),
        'ida_mats': torch.stack(ida_mats),
        'sensor2sensor_mats': torch.stack(sensor2sensor_mats),
        'bda_mat': torch.stack(bda_mats),
    }
    ret_list = [
        torch.stack(imgs),
        mats_dict,
        torch.stack(timestamps),
        list(img_metas),
        list(gt_boxes),
        list(gt_labels),
    ]
    if is_return_depth:
        depth_labels = [sample[10] for sample in data]
        ret_list.append(torch.stack(depth_labels))
    return ret_list
================================================
FILE: bevdepth/evaluators/det_evaluators.py
================================================
'''Modified from # https://github.com/nutonomy/nuscenes-devkit/blob/57889ff20678577025326cfc24e57424a829be0a/python-sdk/nuscenes/eval/detection/evaluate.py#L222 # noqa
'''
import os.path as osp
import tempfile
import mmcv
import numpy as np
import pyquaternion
from nuscenes.utils.data_classes import Box
from pyquaternion import Quaternion
__all__ = ['DetNuscEvaluator']
class DetNuscEvaluator():
    """Evaluator running the official nuScenes detection protocol.

    Converts raw model predictions into the nuScenes submission JSON
    format and scores them with the nuScenes devkit (mAP, NDS, TP
    errors).
    """

    # devkit TP-error keys -> conventional short metric names.
    ErrNameMapping = {
        'trans_err': 'mATE',
        'scale_err': 'mASE',
        'orient_err': 'mAOE',
        'vel_err': 'mAVE',
        'attr_err': 'mAAE',
    }

    # Fallback attribute reported per class when the velocity-based
    # heuristic in _format_bbox gives no better hint.
    DefaultAttribute = {
        'car': 'vehicle.parked',
        'pedestrian': 'pedestrian.moving',
        'trailer': 'vehicle.parked',
        'truck': 'vehicle.parked',
        'bus': 'vehicle.moving',
        'motorcycle': 'cycle.without_rider',
        'construction_vehicle': 'vehicle.parked',
        'bicycle': 'cycle.without_rider',
        'barrier': '',
        'traffic_cone': '',
    }

    def __init__(
        self,
        class_names,
        eval_version='detection_cvpr_2019',
        data_root='./data/nuScenes',
        version='v1.0-trainval',
        modality=dict(use_lidar=False,
                      use_camera=True,
                      use_radar=False,
                      use_map=False,
                      use_external=False),
        output_dir=None,
    ) -> None:
        """
        Args:
            class_names (list[str]): Detection classes in label order.
            eval_version (str | None): devkit config name; ``None`` skips
                loading the devkit config (construction without devkit).
            data_root (str): nuScenes data root.
            version (str): nuScenes split version.
            modality (dict): Sensor-modality flags for the submission meta.
            output_dir (str | None): If set, formatted results always go
                here instead of a temporary directory.
        """
        self.eval_version = eval_version
        self.data_root = data_root
        if self.eval_version is not None:
            # Lazy import so the evaluator can be constructed without the
            # devkit when only formatting is needed.
            from nuscenes.eval.detection.config import config_factory
            self.eval_detection_configs = config_factory(self.eval_version)
        self.version = version
        self.class_names = class_names
        self.modality = modality
        self.output_dir = output_dir

    def _evaluate_single(self,
                         result_path,
                         logger=None,
                         metric='bbox',
                         result_name='pts_bbox'):
        """Evaluation for a single model in nuScenes protocol.

        Args:
            result_path (str): Path of the result file.
            logger (logging.Logger | str | None): Logger used for printing
                related information during evaluation. Default: None.
            metric (str): Metric name used for evaluation. Default: 'bbox'.
            result_name (str): Result name in the metric prefix.
                Default: 'pts_bbox'.

        Returns:
            dict: Dictionary of evaluation details.
        """
        from nuscenes import NuScenes
        from nuscenes.eval.detection.evaluate import NuScenesEval
        # Write devkit outputs next to the results file.
        output_dir = osp.join(*osp.split(result_path)[:-1])
        nusc = NuScenes(version=self.version,
                        dataroot=self.data_root,
                        verbose=False)
        eval_set_map = {
            'v1.0-mini': 'mini_val',
            'v1.0-trainval': 'val',
        }
        nusc_eval = NuScenesEval(nusc,
                                 config=self.eval_detection_configs,
                                 result_path=result_path,
                                 eval_set=eval_set_map[self.version],
                                 output_dir=output_dir,
                                 verbose=False)
        nusc_eval.main(render_curves=False)
        # Record metrics from the summary the devkit just wrote.
        metrics = mmcv.load(osp.join(output_dir, 'metrics_summary.json'))
        detail = dict()
        metric_prefix = f'{result_name}_NuScenes'
        for class_name in self.class_names:
            for k, v in metrics['label_aps'][class_name].items():
                val = float('{:.4f}'.format(v))
                detail['{}/{}_AP_dist_{}'.format(metric_prefix, class_name,
                                                 k)] = val
            for k, v in metrics['label_tp_errors'][class_name].items():
                val = float('{:.4f}'.format(v))
                detail['{}/{}_{}'.format(metric_prefix, class_name, k)] = val
        for k, v in metrics['tp_errors'].items():
            val = float('{:.4f}'.format(v))
            detail['{}/{}'.format(metric_prefix,
                                  self.ErrNameMapping[k])] = val
        detail['{}/NDS'.format(metric_prefix)] = metrics['nd_score']
        detail['{}/mAP'.format(metric_prefix)] = metrics['mean_ap']
        return detail

    def format_results(self,
                       results,
                       img_metas,
                       result_names=['img_bbox'],
                       jsonfile_prefix=None,
                       **kwargs):
        """Format the results to json (standard format for COCO evaluation).

        Args:
            results (list[tuple | numpy.ndarray]): Testing results of the
                dataset.
            img_metas (list[dict]): Per-sample meta info (token, ego pose).
            result_names (list[str]): Result keys to format; 2D entries
                are skipped.
            jsonfile_prefix (str | None): The prefix of json files. It
                includes the file path and the prefix of filename, e.g.,
                "a/b/prefix". If not specified, a temp file will be
                created. Default: None.

        Returns:
            tuple: (result_files, tmp_dir), result_files is a dict
                containing the json filepaths, tmp_dir is the temporal
                directory created for saving json files when
                jsonfile_prefix is not specified.
        """
        assert isinstance(results, list), 'results must be a list'
        if jsonfile_prefix is None:
            tmp_dir = tempfile.TemporaryDirectory()
            jsonfile_prefix = osp.join(tmp_dir.name, 'results')
        else:
            tmp_dir = None
        result_files = dict()
        # Fixed: loop variable was previously misspelled 'rasult_name'.
        for result_name in result_names:
            # 2D predictions are not evaluated on nuScenes.
            if '2d' in result_name:
                continue
            print(f'\nFormatting bboxes of {result_name}')
            # Prefer the user-supplied output directory over the
            # (possibly temporary) jsonfile_prefix.
            if self.output_dir:
                out_path = self.output_dir
            else:
                out_path = osp.join(jsonfile_prefix, result_name)
            result_files[result_name] = self._format_bbox(
                results, img_metas, out_path)
        return result_files, tmp_dir

    def evaluate(
        self,
        results,
        img_metas,
        metric='bbox',
        logger=None,
        jsonfile_prefix=None,
        result_names=['img_bbox'],
        show=False,
        out_dir=None,
        pipeline=None,
    ):
        """Evaluation in nuScenes protocol.

        Args:
            results (list[dict]): Testing results of the dataset.
            img_metas (list[dict]): Per-sample meta info.
            metric (str | list[str]): Metrics to be evaluated (unused;
                kept for interface compatibility).
            logger (logging.Logger | str | None): Logger used for printing
                related information during evaluation. Default: None.
            jsonfile_prefix (str | None): The prefix of json files. It
                includes the file path and the prefix of filename. If not
                specified, a temp file will be created. Default: None.
            result_names (list[str]): Result keys to evaluate.
            show (bool): Whether to visualize. Default: False. (unused)
            out_dir (str): Path to save the visualization results.
                Default: None. (unused)
            pipeline (list[dict], optional): raw data loading for showing.
                Default: None. (unused)

        Returns:
            dict[str, float]: Results of each evaluation metric.
        """
        result_files, tmp_dir = self.format_results(results, img_metas,
                                                    result_names,
                                                    jsonfile_prefix)
        # Fixed: metric dicts used to be computed and silently discarded,
        # despite the docstring promising a return value.
        detail_metrics = dict()
        if isinstance(result_files, dict):
            for name in result_names:
                print('Evaluating bboxes of {}'.format(name))
                detail_metrics.update(
                    self._evaluate_single(result_files[name]))
        elif isinstance(result_files, str):
            detail_metrics.update(self._evaluate_single(result_files))
        if tmp_dir is not None:
            tmp_dir.cleanup()
        return detail_metrics

    def _format_bbox(self, results, img_metas, jsonfile_prefix=None):
        """Convert the results to the standard format.

        Args:
            results (list[dict]): Testing results of the dataset.
            img_metas (list[dict]): Per-sample meta info (sample token and
                ego->global pose).
            jsonfile_prefix (str): The prefix of the output jsonfile.
                You can specify the output directory/filename by
                modifying the jsonfile_prefix. Default: None.

        Returns:
            str: Path of the output json file.
        """
        nusc_annos = {}
        mapped_class_names = self.class_names
        print('Start to convert detection format...')
        for sample_id, det in enumerate(mmcv.track_iter_progress(results)):
            boxes, scores, labels = det
            sample_token = img_metas[sample_id]['token']
            trans = np.array(img_metas[sample_id]['ego2global_translation'])
            rot = Quaternion(img_metas[sample_id]['ego2global_rotation'])
            annos = list()
            for i, box in enumerate(boxes):
                name = mapped_class_names[labels[i]]
                center = box[:3]
                # Reorder (dx, dy, dz) to nuScenes (w, l, h).
                wlh = box[[4, 3, 5]]
                box_yaw = box[6]
                box_vel = box[7:].tolist()
                box_vel.append(0)
                quat = pyquaternion.Quaternion(axis=[0, 0, 1],
                                               radians=box_yaw)
                nusc_box = Box(center, wlh, quat, velocity=box_vel)
                # Ego -> global: rotate first, then translate.
                nusc_box.rotate(rot)
                nusc_box.translate(trans)
                # Attribute heuristic: moving boxes get a motion
                # attribute; otherwise fall back to the class default.
                if np.sqrt(nusc_box.velocity[0]**2 +
                           nusc_box.velocity[1]**2) > 0.2:
                    if name in [
                            'car',
                            'construction_vehicle',
                            'bus',
                            'truck',
                            'trailer',
                    ]:
                        attr = 'vehicle.moving'
                    elif name in ['bicycle', 'motorcycle']:
                        attr = 'cycle.with_rider'
                    else:
                        attr = self.DefaultAttribute[name]
                else:
                    if name in ['pedestrian']:
                        attr = 'pedestrian.standing'
                    elif name in ['bus']:
                        attr = 'vehicle.stopped'
                    else:
                        attr = self.DefaultAttribute[name]
                nusc_anno = dict(
                    sample_token=sample_token,
                    translation=nusc_box.center.tolist(),
                    size=nusc_box.wlh.tolist(),
                    rotation=nusc_box.orientation.elements.tolist(),
                    # tolist() keeps the submission JSON-serializable
                    # (Box.velocity is a numpy array).
                    velocity=nusc_box.velocity[:2].tolist(),
                    detection_name=name,
                    detection_score=float(scores[i]),
                    attribute_name=attr,
                )
                annos.append(nusc_anno)
            # other views results of the same frame should be concatenated
            if sample_token in nusc_annos:
                nusc_annos[sample_token].extend(annos)
            else:
                nusc_annos[sample_token] = annos
        nusc_submissions = {
            'meta': self.modality,
            'results': nusc_annos,
        }
        mmcv.mkdir_or_exist(jsonfile_prefix)
        res_path = osp.join(jsonfile_prefix, 'results_nusc.json')
        print('Results writes to', res_path)
        mmcv.dump(nusc_submissions, res_path)
        return res_path
================================================
FILE: bevdepth/exps/base_cli.py
================================================
# Copyright (c) Megvii Inc. All rights reserved.
import os
from argparse import ArgumentParser
import pytorch_lightning as pl
from bevdepth.callbacks.ema import EMACallback
from bevdepth.utils.torch_dist import all_gather_object, synchronize
from .nuscenes.base_exp import BEVDepthLightningModel
def run_cli(model_class=BEVDepthLightningModel,
            exp_name='base_exp',
            use_ema=False,
            extra_trainer_config_args=None):
    """Shared command-line entry point for all BEVDepth experiments.

    Parses Trainer + model arguments, builds the model and Trainer, then
    runs evaluation, prediction, or training depending on the CLI flags.

    Args:
        model_class: LightningModule subclass to instantiate.
        exp_name (str): Name used for the output directory.
        use_ema (bool): Whether to attach the EMA weight callback.
        extra_trainer_config_args (dict | None): Optional overrides;
            currently only ``'epochs'`` is read. Defaults to empty.
    """
    # Fixed: mutable default argument ({}) replaced by a None sentinel.
    if extra_trainer_config_args is None:
        extra_trainer_config_args = {}
    parent_parser = ArgumentParser(add_help=False)
    parent_parser = pl.Trainer.add_argparse_args(parent_parser)
    parent_parser.add_argument('-e',
                               '--evaluate',
                               dest='evaluate',
                               action='store_true',
                               help='evaluate model on validation set')
    parent_parser.add_argument('-p',
                               '--predict',
                               dest='predict',
                               action='store_true',
                               help='predict model on testing set')
    parent_parser.add_argument('-b', '--batch_size_per_device', type=int)
    parent_parser.add_argument('--seed',
                               type=int,
                               default=0,
                               help='seed for initializing training.')
    parent_parser.add_argument('--ckpt_path', type=str)
    # NOTE(review): model-specific args are always taken from the base
    # BEVDepthLightningModel, not from ``model_class`` — confirm no
    # subclass overrides add_model_specific_args before relying on it.
    parser = BEVDepthLightningModel.add_model_specific_args(parent_parser)
    parser.set_defaults(profiler='simple',
                        deterministic=False,
                        max_epochs=extra_trainer_config_args.get('epochs', 24),
                        accelerator='ddp',
                        num_sanity_val_steps=0,
                        gradient_clip_val=5,
                        limit_val_batches=0,
                        enable_checkpointing=True,
                        precision=16,
                        default_root_dir=os.path.join('./outputs/', exp_name))
    args = parser.parse_args()
    if args.seed is not None:
        pl.seed_everything(args.seed)
    model = model_class(**vars(args))
    if use_ema:
        # EMA decay horizon is the total number of training samples seen.
        train_dataloader = model.train_dataloader()
        ema_callback = EMACallback(
            len(train_dataloader.dataset) * args.max_epochs)
        trainer = pl.Trainer.from_argparse_args(args, callbacks=[ema_callback])
    else:
        trainer = pl.Trainer.from_argparse_args(args)
    if args.evaluate:
        trainer.test(model, ckpt_path=args.ckpt_path)
    elif args.predict:
        predict_step_outputs = trainer.predict(model, ckpt_path=args.ckpt_path)
        all_pred_results = list()
        all_img_metas = list()
        for predict_step_output in predict_step_outputs:
            for i in range(len(predict_step_output)):
                all_pred_results.append(predict_step_output[i][:3])
                all_img_metas.append(predict_step_output[i][3])
        synchronize()
        # Re-interleave per-rank results into dataset order and truncate
        # to the dataset length.
        len_dataset = len(model.test_dataloader().dataset)
        all_pred_results = sum(
            map(list, zip(*all_gather_object(all_pred_results))),
            [])[:len_dataset]
        all_img_metas = sum(map(list, zip(*all_gather_object(all_img_metas))),
                            [])[:len_dataset]
        model.evaluator._format_bbox(all_pred_results, all_img_metas,
                                     os.path.dirname(args.ckpt_path))
    else:
        trainer.fit(model)
================================================
FILE: bevdepth/exps/nuscenes/MatrixVT/matrixvt_bev_depth_lss_r50_256x704_128x128_24e_ema.py
================================================
# Copyright (c) Megvii Inc. All rights reserved.
# isort: skip_file
from bevdepth.exps.base_cli import run_cli
# Basic Experiment
from bevdepth.exps.nuscenes.mv.bev_depth_lss_r50_256x704_128x128_24e_ema import \
BEVDepthLightningModel as BaseExp # noqa
# new model
from bevdepth.models.matrixvt_det import MatrixVT_Det
class MatrixVT_Exp(BaseExp):
    """BEVDepth experiment that swaps in the MatrixVT detector."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Enable class-balanced resampling for this experiment.
        self.data_use_cbgs = True
        # Replace the base detector with MatrixVT; keep depth supervision.
        self.model = MatrixVT_Det(
            self.backbone_conf, self.head_conf, is_train_depth=True)
if __name__ == '__main__':
    # Launch the CBGS-balanced MatrixVT experiment with EMA weights.
    run_cli(
        MatrixVT_Exp,
        'matrixvt_bev_depth_lss_r50_256x704_128x128_24e_ema_cbgs',
        use_ema=True,
    )
================================================
FILE: bevdepth/exps/nuscenes/base_exp.py
================================================
# Copyright (c) Megvii Inc. All rights reserved.
import os
from functools import partial
import mmcv
import torch
import torch.nn.functional as F
import torch.nn.parallel
import torch.utils.data
import torch.utils.data.distributed
import torchvision.models as models
from pytorch_lightning.core import LightningModule
from torch.cuda.amp.autocast_mode import autocast
from torch.optim.lr_scheduler import MultiStepLR
from bevdepth.datasets.nusc_det_dataset import NuscDetDataset, collate_fn
from bevdepth.evaluators.det_evaluators import DetNuscEvaluator
from bevdepth.models.base_bev_depth import BaseBEVDepth
from bevdepth.utils.torch_dist import all_gather_object, get_rank, synchronize
# Raw nuScenes image resolution (pixels) and the network input size.
H = 900
W = 1600
final_dim = (256, 704)
# Image normalisation constants, passed through to mmcv.imnormalize.
img_conf = dict(img_mean=[123.675, 116.28, 103.53],
                img_std=[58.395, 57.12, 57.375],
                to_rgb=True)

# Image backbone / view-transformer configuration.
backbone_conf = {
    # BEV grid extents: [min, max, step].
    'x_bound': [-51.2, 51.2, 0.8],
    'y_bound': [-51.2, 51.2, 0.8],
    'z_bound': [-5, 3, 8],
    # Depth discretisation: [min, max, bin size].
    'd_bound': [2.0, 58.0, 0.5],
    'final_dim': final_dim,
    'output_channels': 80,
    # Image feature stride; also used to downsample depth labels.
    'downsample_factor': 16,
    'img_backbone_conf': dict(
        type='ResNet',
        depth=50,
        frozen_stages=0,
        out_indices=[0, 1, 2, 3],
        norm_eval=False,
        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
    ),
    'img_neck_conf': dict(
        type='SECONDFPN',
        in_channels=[256, 512, 1024, 2048],
        upsample_strides=[0.25, 0.5, 1, 2],
        out_channels=[128, 128, 128, 128],
    ),
    'depth_net_conf': dict(in_channels=512, mid_channels=512)
}

# Image-space data augmentation (resize / crop / flip / rotate).
ida_aug_conf = {
    'resize_lim': (0.386, 0.55),
    'final_dim': final_dim,
    'rot_lim': (-5.4, 5.4),
    'H': H,
    'W': W,
    'rand_flip': True,
    'bot_pct_lim': (0.0, 0.0),
    'cams': [
        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
        'CAM_BACK', 'CAM_BACK_RIGHT'
    ],
    'Ncams': 6,
}

# BEV-space data augmentation (rotation / scale / flips).
bda_aug_conf = {
    'rot_lim': (-22.5, 22.5),
    'scale_lim': (0.95, 1.05),
    'flip_dx_ratio': 0.5,
    'flip_dy_ratio': 0.5
}

# BEV feature encoder and neck.
bev_backbone = dict(
    type='ResNet',
    in_channels=80,
    depth=18,
    num_stages=3,
    strides=(1, 2, 2),
    dilations=(1, 1, 1),
    out_indices=[0, 1, 2],
    norm_eval=False,
    base_channels=160,
)
bev_neck = dict(type='SECONDFPN',
                in_channels=[80, 160, 320, 640],
                upsample_strides=[1, 2, 4, 8],
                out_channels=[64, 64, 64, 64])

# Detection classes, in label-index order.
CLASSES = [
    'car',
    'truck',
    'construction_vehicle',
    'bus',
    'trailer',
    'barrier',
    'motorcycle',
    'bicycle',
    'pedestrian',
    'traffic_cone',
]

# CenterPoint-style multi-task head grouping of the classes above.
TASKS = [
    dict(num_class=1, class_names=['car']),
    dict(num_class=2, class_names=['truck', 'construction_vehicle']),
    dict(num_class=2, class_names=['bus', 'trailer']),
    dict(num_class=1, class_names=['barrier']),
    dict(num_class=2, class_names=['motorcycle', 'bicycle']),
    dict(num_class=2, class_names=['pedestrian', 'traffic_cone']),
]

# Regression branches: name -> (out_channels, second value).
# NOTE(review): second tuple element presumed to be the number of conv
# layers per branch — confirm against the head implementation.
common_heads = dict(reg=(2, 2),
                    height=(1, 2),
                    dim=(3, 2),
                    rot=(2, 2),
                    vel=(2, 2))

# Box decoding configuration.
bbox_coder = dict(
    type='CenterPointBBoxCoder',
    post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
    max_num=500,
    score_threshold=0.1,
    out_size_factor=4,
    voxel_size=[0.2, 0.2, 8],
    pc_range=[-51.2, -51.2, -5, 51.2, 51.2, 3],
    code_size=9,
)

# Target assignment settings used during training.
train_cfg = dict(
    point_cloud_range=[-51.2, -51.2, -5, 51.2, 51.2, 3],
    grid_size=[512, 512, 1],
    voxel_size=[0.2, 0.2, 8],
    out_size_factor=4,
    dense_reg=1,
    gaussian_overlap=0.1,
    max_objs=500,
    min_radius=2,
    code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.5, 0.5],
)

# Post-processing / NMS settings used at inference time.
test_cfg = dict(
    post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
    max_per_img=500,
    max_pool_nms=False,
    min_radius=[4, 12, 10, 1, 0.85, 0.175],
    score_threshold=0.1,
    out_size_factor=4,
    voxel_size=[0.2, 0.2, 8],
    nms_type='circle',
    pre_max_size=1000,
    post_max_size=83,
    nms_thr=0.2,
)

# Aggregate head configuration consumed by the detection head.
head_conf = {
    'bev_backbone_conf': bev_backbone,
    'bev_neck_conf': bev_neck,
    'tasks': TASKS,
    'common_heads': common_heads,
    'bbox_coder': bbox_coder,
    'train_cfg': train_cfg,
    'test_cfg': test_cfg,
    'in_channels': 256,  # Equal to bev_neck output_channels.
    'loss_cls': dict(type='GaussianFocalLoss', reduction='mean'),
    'loss_bbox': dict(type='L1Loss', reduction='mean', loss_weight=0.25),
    'gaussian_overlap': 0.1,
    'min_radius': 2,
}
class BEVDepthLightningModel(LightningModule):
MODEL_NAMES = sorted(name for name in models.__dict__
if name.islower() and not name.startswith('__')
and callable(models.__dict__[name]))
def __init__(self,
             gpus: int = 1,
             data_root='data/nuScenes',
             eval_interval=1,
             batch_size_per_device=8,
             class_names=CLASSES,
             backbone_conf=backbone_conf,
             head_conf=head_conf,
             ida_aug_conf=ida_aug_conf,
             bda_aug_conf=bda_aug_conf,
             default_root_dir='./outputs/',
             **kwargs):
    """Configure the BEVDepth Lightning experiment.

    Args:
        gpus (int): Number of GPUs; used for linear lr scaling.
        data_root (str): nuScenes data root directory.
        eval_interval (int): Evaluation interval (epochs, presumably —
            stored but not read in this view).
        batch_size_per_device (int): Per-GPU batch size.
        class_names (list[str]): Detection classes.
        backbone_conf (dict): Image backbone / view-transformer config.
        head_conf (dict): Detection head config.
        ida_aug_conf (dict): Image-space augmentation config.
        bda_aug_conf (dict): BEV-space augmentation config.
        default_root_dir (str): Output directory (created if missing).
        **kwargs: Extra CLI arguments; ignored here.
    """
    super().__init__()
    self.save_hyperparameters()
    self.gpus = gpus
    self.eval_interval = eval_interval
    self.batch_size_per_device = batch_size_per_device
    self.data_root = data_root
    # Base lr per image; scaled by the total batch size in
    # configure_optimizers.
    self.basic_lr_per_img = 2e-4 / 64
    self.class_names = class_names
    self.backbone_conf = backbone_conf
    self.head_conf = head_conf
    self.ida_aug_conf = ida_aug_conf
    self.bda_aug_conf = bda_aug_conf
    mmcv.mkdir_or_exist(default_root_dir)
    self.default_root_dir = default_root_dir
    self.evaluator = DetNuscEvaluator(class_names=self.class_names,
                                      output_dir=self.default_root_dir)
    self.model = BaseBEVDepth(self.backbone_conf,
                              self.head_conf,
                              is_train_depth=True)
    self.mode = 'valid'
    self.img_conf = img_conf
    # Dataset options; subclasses override these to change sweeps/CBGS.
    self.data_use_cbgs = False
    self.num_sweeps = 1
    self.sweep_idxes = list()
    self.key_idxes = list()
    self.data_return_depth = True
    self.downsample_factor = self.backbone_conf['downsample_factor']
    self.dbound = self.backbone_conf['d_bound']
    # Number of discrete depth bins implied by d_bound.
    self.depth_channels = int(
        (self.dbound[1] - self.dbound[0]) / self.dbound[2])
    self.use_fusion = False
    self.train_info_paths = os.path.join(self.data_root,
                                         'nuscenes_infos_train.pkl')
    self.val_info_paths = os.path.join(self.data_root,
                                       'nuscenes_infos_val.pkl')
    self.predict_info_paths = os.path.join(self.data_root,
                                           'nuscenes_infos_test.pkl')
def forward(self, sweep_imgs, mats):
    """Run the detector on multi-sweep images and transform matrices."""
    return self.model(sweep_imgs, mats)
def training_step(self, batch):
    """One training step: detection loss plus weighted depth loss."""
    (sweep_imgs, mats, _, _, gt_boxes, gt_labels, depth_labels) = batch
    if torch.cuda.is_available():
        # Dataloader tensors arrive on CPU; move inputs/targets to GPU.
        for key, value in mats.items():
            mats[key] = value.cuda()
        sweep_imgs = sweep_imgs.cuda()
        gt_boxes = [gt_box.cuda() for gt_box in gt_boxes]
        gt_labels = [gt_label.cuda() for gt_label in gt_labels]
    preds, depth_preds = self(sweep_imgs, mats)
    # Under DDP the real model lives in .module.
    if isinstance(self.model, torch.nn.parallel.DistributedDataParallel):
        targets = self.model.module.get_targets(gt_boxes, gt_labels)
        detection_loss = self.model.module.loss(targets, preds)
    else:
        targets = self.model.get_targets(gt_boxes, gt_labels)
        detection_loss = self.model.loss(targets, preds)
    if len(depth_labels.shape) == 5:
        # only key-frame will calculate depth loss
        depth_labels = depth_labels[:, 0, ...]
    depth_loss = self.get_depth_loss(depth_labels.cuda(), depth_preds)
    self.log('detection_loss', detection_loss)
    self.log('depth_loss', depth_loss)
    return detection_loss + depth_loss
def get_depth_loss(self, depth_labels, depth_preds):
    """Binary cross-entropy depth loss over pixels with a lidar label.

    Args:
        depth_labels: raw depth map, downsampled and one-hot encoded
            internally via ``get_downsampled_gt_depth``.
        depth_preds: predicted depth distribution of shape (B, D, H, W).

    Returns:
        Tensor: scalar loss, scaled by a fixed weight of 3.0.
    """
    onehot_labels = self.get_downsampled_gt_depth(depth_labels)
    flat_preds = depth_preds.permute(0, 2, 3, 1).contiguous().view(
        -1, self.depth_channels)
    # Only positions that received at least one lidar return contribute.
    valid = torch.max(onehot_labels, dim=1).values > 0.0
    # Compute BCE in full precision: autocast is disabled on purpose.
    with autocast(enabled=False):
        per_entry = F.binary_cross_entropy(flat_preds[valid],
                                           onehot_labels[valid],
                                           reduction='none')
        loss = per_entry.sum() / max(1.0, valid.sum())
    return 3.0 * loss
def get_downsampled_gt_depth(self, gt_depths):
    """Downsample lidar depth to feature resolution and one-hot encode it.

    Each ``downsample_factor``-sized patch keeps its smallest non-zero
    depth, which is then discretised into ``depth_channels`` bins.

    Input:
        gt_depths: [B, N, H, W]
    Output:
        gt_depths: [B*N*h*w, d]
    """
    batch, num_cams, height, width = gt_depths.shape
    ds = self.downsample_factor
    # Rearrange so every ds x ds patch becomes one row.
    patches = gt_depths.view(batch * num_cams, height // ds, ds,
                             width // ds, ds, 1)
    patches = patches.permute(0, 1, 3, 5, 2, 4).contiguous()
    patches = patches.view(-1, ds * ds)
    # Zeros mean "no lidar return"; replace them with a huge value so
    # they never win the min below.
    masked = torch.where(patches == 0.0, 1e5 * torch.ones_like(patches),
                         patches)
    min_depth = torch.min(masked, dim=-1).values
    min_depth = min_depth.view(batch * num_cams, height // ds,
                               width // ds)
    # Convert metric depth into a 1-based bin index; bin 0 is "invalid".
    bin_idx = (min_depth -
               (self.dbound[0] - self.dbound[2])) / self.dbound[2]
    bin_idx = torch.where(
        (bin_idx < self.depth_channels + 1) & (bin_idx >= 0.0),
        bin_idx, torch.zeros_like(bin_idx))
    # One-hot over depth_channels + 1 bins, then drop the invalid bin.
    onehot = F.one_hot(bin_idx.long(),
                       num_classes=self.depth_channels + 1).view(
                           -1, self.depth_channels + 1)[:, 1:]
    return onehot.float()
def eval_step(self, batch, batch_idx, prefix: str):
    """Run inference on one batch and return CPU-numpy detections.

    Args:
        batch: (sweep_imgs, mats, _, img_metas, _, _) from the
            val/test dataloader.
        batch_idx: unused.
        prefix (str): caller tag ('val'/'test'); currently unused.

    Returns:
        list: per-sample [boxes, scores, labels, img_meta].
    """
    (sweep_imgs, mats, _, img_metas, _, _) = batch
    if torch.cuda.is_available():
        for key, value in mats.items():
            mats[key] = value.cuda()
        sweep_imgs = sweep_imgs.cuda()
    preds = self.model(sweep_imgs, mats)
    # Under DDP the real model lives in .module.
    if isinstance(self.model, torch.nn.parallel.DistributedDataParallel):
        results = self.model.module.get_bboxes(preds, img_metas)
    else:
        results = self.model.get_bboxes(preds, img_metas)
    # Detach to CPU numpy so the results can be gathered across ranks.
    for i in range(len(results)):
        results[i][0] = results[i][0].detach().cpu().numpy()
        results[i][1] = results[i][1].detach().cpu().numpy()
        results[i][2] = results[i][2].detach().cpu().numpy()
        results[i].append(img_metas[i])
    return results
def validation_step(self, batch, batch_idx):
return self.eval_step(batch, batch_idx, 'val')
def validation_epoch_end(self, validation_step_outputs):
all_pred_results = list()
all_img_metas = list()
for validation_step_output in validation_step_outputs:
for i in range(len(validation_step_output)):
all_pred_results.append(validation_step_output[i][:3])
all_img_metas.append(validation_step_output[i][3])
synchronize()
len_dataset = len(self.val_dataloader().dataset)
all_pred_results = sum(
map(list, zip(*all_gather_object(all_pred_results))),
[])[:len_dataset]
all_img_metas = sum(map(list, zip(*all_gather_object(all_img_metas))),
[])[:len_dataset]
if get_rank() == 0:
self.evaluator.evaluate(all_pred_results, all_img_metas)
def test_epoch_end(self, test_step_outputs):
all_pred_results = list()
all_img_metas = list()
for test_step_output in test_step_outputs:
for i in range(len(test_step_output)):
all_pred_results.append(test_step_output[i][:3])
all_img_metas.append(test_step_output[i][3])
synchronize()
# TODO: Change another way.
dataset_length = len(self.val_dataloader().dataset)
all_pred_results = sum(
map(list, zip(*all_gather_object(all_pred_results))),
[])[:dataset_length]
all_img_metas = sum(map(list, zip(*all_gather_object(all_img_metas))),
[])[:dataset_length]
if get_rank() == 0:
self.evaluator.evaluate(all_pred_results, all_img_metas)
def configure_optimizers(self):
lr = self.basic_lr_per_img * \
self.batch_size_per_device * self.gpus
optimizer = torch.optim.AdamW(self.model.parameters(),
lr=lr,
weight_decay=1e-7)
scheduler = MultiStepLR(optimizer, [19, 23])
return [[optimizer], [scheduler]]
def train_dataloader(self):
train_dataset = NuscDetDataset(ida_aug_conf=self.ida_aug_conf,
bda_aug_conf=self.bda_aug_conf,
classes=self.class_names,
data_root=self.data_root,
info_paths=self.train_info_paths,
is_train=True,
use_cbgs=self.data_use_cbgs,
img_conf=self.img_conf,
num_sweeps=self.num_sweeps,
sweep_idxes=self.sweep_idxes,
key_idxes=self.key_idxes,
return_depth=self.data_return_depth,
use_fusion=self.use_fusion)
train_loader = torch.utils.data.DataLoader(
train_dataset,
batch_size=self.batch_size_per_device,
num_workers=4,
drop_last=True,
shuffle=False,
collate_fn=partial(collate_fn,
is_return_depth=self.data_return_depth
or self.use_fusion),
sampler=None,
)
return train_loader
def val_dataloader(self):
val_dataset = NuscDetDataset(ida_aug_conf=self.ida_aug_conf,
bda_aug_conf=self.bda_aug_conf,
classes=self.class_names,
data_root=self.data_root,
info_paths=self.val_info_paths,
is_train=False,
img_conf=self.img_conf,
num_sweeps=self.num_sweeps,
sweep_idxes=self.sweep_idxes,
key_idxes=self.key_idxes,
return_depth=self.use_fusion,
use_fusion=self.use_fusion)
val_loader = torch.utils.data.DataLoader(
val_dataset,
batch_size=self.batch_size_per_device,
shuffle=False,
collate_fn=partial(collate_fn, is_return_depth=self.use_fusion),
num_workers=4,
sampler=None,
)
return val_loader
def test_dataloader(self):
return self.val_dataloader()
def predict_dataloader(self):
predict_dataset = NuscDetDataset(ida_aug_conf=self.ida_aug_conf,
bda_aug_conf=self.bda_aug_conf,
classes=self.class_names,
data_root=self.data_root,
info_paths=self.predict_info_paths,
is_train=False,
img_conf=self.img_conf,
num_sweeps=self.num_sweeps,
sweep_idxes=self.sweep_idxes,
key_idxes=self.key_idxes,
return_depth=self.use_fusion,
use_fusion=self.use_fusion)
predict_loader = torch.utils.data.DataLoader(
predict_dataset,
batch_size=self.batch_size_per_device,
shuffle=False,
collate_fn=partial(collate_fn, is_return_depth=self.use_fusion),
num_workers=4,
sampler=None,
)
return predict_loader
def test_step(self, batch, batch_idx):
return self.eval_step(batch, batch_idx, 'test')
def predict_step(self, batch, batch_idx):
return self.eval_step(batch, batch_idx, 'predict')
@staticmethod
def add_model_specific_args(parent_parser): # pragma: no-cover
return parent_parser
================================================
FILE: bevdepth/exps/nuscenes/fusion/bev_depth_fusion_lss_r50_256x704_128x128_24e.py
================================================
# Copyright (c) Megvii Inc. All rights reserved.
import torch
import torch.nn.parallel
import torch.utils.data
import torch.utils.data.distributed
from bevdepth.exps.base_cli import run_cli
from bevdepth.exps.nuscenes.base_exp import \
BEVDepthLightningModel as BaseBEVDepthLightningModel
from bevdepth.models.fusion_bev_depth import FusionBEVDepth
class BEVDepthLightningModel(BaseBEVDepthLightningModel):
    """Camera-lidar fusion experiment: the model consumes lidar depth as an
    input, so no learned depth supervision is used (is_train_depth=False)."""

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.model = FusionBEVDepth(self.backbone_conf,
                                    self.head_conf,
                                    is_train_depth=False)
        self.use_fusion = True

    def forward(self, sweep_imgs, mats, lidar_depth):
        """Forward pass with the extra lidar depth input."""
        return self.model(sweep_imgs, mats, lidar_depth)

    def training_step(self, batch):
        """One optimization step: detection loss only (no depth loss)."""
        sweep_imgs, mats, _, _, gt_boxes, gt_labels, lidar_depth = batch
        if torch.cuda.is_available():
            for name in mats:
                mats[name] = mats[name].cuda()
            sweep_imgs = sweep_imgs.cuda()
            gt_boxes = [box.cuda() for box in gt_boxes]
            gt_labels = [label.cuda() for label in gt_labels]
        preds = self(sweep_imgs, mats, lidar_depth)
        # Unwrap DDP so target/loss helpers run on the underlying module.
        model = self.model
        if isinstance(model, torch.nn.parallel.DistributedDataParallel):
            model = model.module
        targets = model.get_targets(gt_boxes, gt_labels)
        detection_loss = model.loss(targets, preds)
        if len(lidar_depth.shape) == 5:
            # only key-frame will calculate depth loss
            # NOTE(review): the sliced value is unused below — looks like a
            # leftover from the depth-supervised base class; confirm before
            # removing.
            lidar_depth = lidar_depth[:, 0, ...]
        self.log('detection_loss', detection_loss)
        return detection_loss

    def eval_step(self, batch, batch_idx, prefix: str):
        """Run fusion inference on one batch; return numpy detections."""
        sweep_imgs, mats, _, img_metas, _, _, lidar_depth = batch
        if torch.cuda.is_available():
            for name in mats:
                mats[name] = mats[name].cuda()
            sweep_imgs = sweep_imgs.cuda()
        preds = self.model(sweep_imgs, mats, lidar_depth)
        model = self.model
        if isinstance(model, torch.nn.parallel.DistributedDataParallel):
            model = model.module
        results = model.get_bboxes(preds, img_metas)
        for idx, result in enumerate(results):
            result[0] = result[0].detach().cpu().numpy()
            result[1] = result[1].detach().cpu().numpy()
            result[2] = result[2].detach().cpu().numpy()
            result.append(img_metas[idx])
        return results
if __name__ == '__main__':
    # Launch training/evaluation for this experiment via the shared CLI.
    run_cli(BEVDepthLightningModel,
            'bev_depth_fusion_lss_r50_256x704_128x128_24e')
================================================
FILE: bevdepth/exps/nuscenes/fusion/bev_depth_fusion_lss_r50_256x704_128x128_24e_2key.py
================================================
# Copyright (c) Megvii Inc. All rights reserved.
from bevdepth.exps.base_cli import run_cli
from bevdepth.exps.nuscenes.fusion.bev_depth_fusion_lss_r50_256x704_128x128_24e import \
BEVDepthLightningModel as BaseBEVDepthLightningModel # noqa
from bevdepth.models.fusion_bev_depth import FusionBEVDepth
class BEVDepthLightningModel(BaseBEVDepthLightningModel):
    """Fusion experiment with one previous key frame (key_idxes=[-1])."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.key_idxes = [-1]
        # Concatenating the current and previous key-frame features doubles
        # the BEV channel count.
        self.head_conf['bev_backbone_conf']['in_channels'] = 80 * (
            len(self.key_idxes) + 1)
        self.head_conf['bev_neck_conf']['in_channels'] = [
            80 * (len(self.key_idxes) + 1), 160, 320, 640
        ]
        # BUGFIX: the config key is 'code_weights' (as in the mv/2key
        # experiments); the previous 'code_weight' entry was silently
        # ignored by the head.
        self.head_conf['train_cfg']['code_weights'] = [
            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
        ]
        self.model = FusionBEVDepth(self.backbone_conf, self.head_conf)
if __name__ == '__main__':
    # Launch training/evaluation for this experiment via the shared CLI.
    run_cli(BEVDepthLightningModel,
            'bev_depth_fusion_lss_r50_256x704_128x128_24e_2key')
================================================
FILE: bevdepth/exps/nuscenes/fusion/bev_depth_fusion_lss_r50_256x704_128x128_24e_2key_trainval.py
================================================
# Copyright (c) Megvii Inc. All rights reserved.
from bevdepth.exps.base_cli import run_cli
from .bev_depth_fusion_lss_r50_256x704_128x128_24e_2key import \
BEVDepthLightningModel as BaseBEVDepthLightningModel
class BEVDepthLightningModel(BaseBEVDepthLightningModel):
    """Same as the 2-key fusion experiment, but trained on train + val."""

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        # Train on both splits (used for test-set submissions).
        self.train_info_paths = [
            'data/nuScenes/nuscenes_infos_train.pkl',
            'data/nuScenes/nuscenes_infos_val.pkl',
        ]
if __name__ == '__main__':
    # Launch training/evaluation for this experiment via the shared CLI.
    run_cli(BEVDepthLightningModel,
            'bev_depth_fusion_lss_r50_256x704_128x128_24e_2key_trainval')
================================================
FILE: bevdepth/exps/nuscenes/fusion/bev_depth_fusion_lss_r50_256x704_128x128_24e_key4.py
================================================
# Copyright (c) Megvii Inc. All rights reserved.
from bevdepth.exps.base_cli import run_cli
from bevdepth.exps.nuscenes.fusion.bev_depth_fusion_lss_r50_256x704_128x128_24e import \
BEVDepthLightningModel as BaseBEVDepthLightningModel # noqa
from bevdepth.models.fusion_bev_depth import FusionBEVDepth
class BEVDepthLightningModel(BaseBEVDepthLightningModel):
    """Fusion experiment using sweep index 4 instead of a key frame."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.sweep_idxes = [4]
        # One extra sweep doubles the BEV channel count.
        self.head_conf['bev_backbone_conf']['in_channels'] = 80 * (
            len(self.sweep_idxes) + 1)
        self.head_conf['bev_neck_conf']['in_channels'] = [
            80 * (len(self.sweep_idxes) + 1), 160, 320, 640
        ]
        # BUGFIX: the config key is 'code_weights' (as in the mv/2key
        # experiments); the previous 'code_weight' entry was silently
        # ignored by the head.
        self.head_conf['train_cfg']['code_weights'] = [
            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
        ]
        self.model = FusionBEVDepth(self.backbone_conf,
                                    self.head_conf,
                                    is_train_depth=False)
if __name__ == '__main__':
    # Launch training/evaluation for this experiment via the shared CLI.
    run_cli(BEVDepthLightningModel,
            'bev_depth_fusion_lss_r50_256x704_128x128_24e_key4')
================================================
FILE: bevdepth/exps/nuscenes/mv/bev_depth_lss_r50_256x704_128x128_20e_cbgs_2key_da.py
================================================
# Copyright (c) Megvii Inc. All rights reserved.
"""
mAP: 0.3484
mATE: 0.6159
mASE: 0.2716
mAOE: 0.4144
mAVE: 0.4402
mAAE: 0.1954
NDS: 0.4805
Eval time: 110.7s
Per-class results:
Object Class AP ATE ASE AOE AVE AAE
car 0.553 0.480 0.157 0.117 0.386 0.205
truck 0.252 0.645 0.202 0.097 0.381 0.185
bus 0.378 0.674 0.197 0.090 0.871 0.298
trailer 0.163 0.932 0.230 0.409 0.543 0.098
construction_vehicle 0.076 0.878 0.495 1.015 0.103 0.344
pedestrian 0.361 0.694 0.300 0.816 0.491 0.247
motorcycle 0.319 0.569 0.252 0.431 0.552 0.181
bicycle 0.286 0.457 0.255 0.630 0.194 0.006
traffic_cone 0.536 0.438 0.339 nan nan nan
barrier 0.559 0.392 0.289 0.124 nan nan
"""
import torch
from torch.optim.lr_scheduler import MultiStepLR
from bevdepth.exps.base_cli import run_cli
from bevdepth.exps.nuscenes.mv.bev_depth_lss_r50_256x704_128x128_24e_2key import \
BEVDepthLightningModel as BaseBEVDepthLightningModel # noqa
from bevdepth.models.base_bev_depth import BaseBEVDepth as BaseBEVDepth
class BEVDepthLightningModel(BaseBEVDepthLightningModel):
    """2-key BEVDepth with depth aggregation (use_da) and CBGS sampling."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.backbone_conf['use_da'] = True
        self.data_use_cbgs = True
        self.model = BaseBEVDepth(self.backbone_conf,
                                  self.head_conf,
                                  is_train_depth=True)

    def configure_optimizers(self):
        """AdamW with step decay at epochs 16/19 (20-epoch schedule)."""
        total_lr = (self.basic_lr_per_img * self.batch_size_per_device *
                    self.gpus)
        optimizer = torch.optim.AdamW(self.model.parameters(),
                                      lr=total_lr,
                                      weight_decay=1e-7)
        return [[optimizer], [MultiStepLR(optimizer, [16, 19])]]
if __name__ == '__main__':
    # Run with a 20-epoch trainer schedule.
    run_cli(BEVDepthLightningModel,
            'bev_depth_lss_r50_256x704_128x128_20e_cbgs_2key_da',
            extra_trainer_config_args={'epochs': 20})
================================================
FILE: bevdepth/exps/nuscenes/mv/bev_depth_lss_r50_256x704_128x128_20e_cbgs_2key_da_ema.py
================================================
# Copyright (c) Megvii Inc. All rights reserved.
"""
mAP: 0.3589
mATE: 0.6119
mASE: 0.2692
mAOE: 0.5074
mAVE: 0.4086
mAAE: 0.2009
NDS: 0.4797
Eval time: 183.3s
Per-class results:
Object Class AP ATE ASE AOE AVE AAE
car 0.559 0.475 0.157 0.112 0.370 0.205
truck 0.270 0.659 0.196 0.103 0.356 0.181
bus 0.374 0.651 0.184 0.072 0.846 0.326
trailer 0.179 0.963 0.227 0.512 0.294 0.127
construction_vehicle 0.081 0.825 0.481 1.352 0.094 0.345
pedestrian 0.363 0.690 0.297 0.831 0.491 0.244
motorcycle 0.354 0.580 0.255 0.545 0.615 0.164
bicycle 0.301 0.447 0.280 0.920 0.203 0.015
traffic_cone 0.539 0.435 0.324 nan nan nan
barrier 0.569 0.394 0.293 0.120 nan nan
"""
from bevdepth.exps.base_cli import run_cli
from bevdepth.exps.nuscenes.mv.bev_depth_lss_r50_256x704_128x128_20e_cbgs_2key_da import \
BEVDepthLightningModel # noqa
if __name__ == '__main__':
    # Same experiment as the non-EMA config, with EMA weights enabled.
    run_cli(BEVDepthLightningModel,
            'bev_depth_lss_r50_256x704_128x128_20e_cbgs_2key_da_ema',
            use_ema=True,
            extra_trainer_config_args={'epochs': 20})
================================================
FILE: bevdepth/exps/nuscenes/mv/bev_depth_lss_r50_256x704_128x128_24e_2key.py
================================================
# Copyright (c) Megvii Inc. All rights reserved.
"""
mAP: 0.3304
mATE: 0.7021
mASE: 0.2795
mAOE: 0.5346
mAVE: 0.5530
mAAE: 0.2274
NDS: 0.4355
Eval time: 171.8s
Per-class results:
Object Class AP ATE ASE AOE AVE AAE
car 0.499 0.540 0.165 0.211 0.650 0.233
truck 0.278 0.719 0.218 0.265 0.547 0.215
bus 0.386 0.661 0.211 0.171 1.132 0.274
trailer 0.168 1.034 0.235 0.548 0.408 0.168
construction_vehicle 0.075 1.124 0.510 1.177 0.111 0.385
pedestrian 0.284 0.757 0.298 0.966 0.578 0.301
motorcycle 0.335 0.624 0.263 0.621 0.734 0.237
bicycle 0.305 0.554 0.264 0.653 0.263 0.006
traffic_cone 0.462 0.516 0.355 nan nan nan
barrier 0.512 0.491 0.275 0.200 nan nan
"""
from bevdepth.exps.base_cli import run_cli
from bevdepth.exps.nuscenes.base_exp import \
BEVDepthLightningModel as BaseBEVDepthLightningModel
from bevdepth.models.base_bev_depth import BaseBEVDepth
class BEVDepthLightningModel(BaseBEVDepthLightningModel):
    """BEVDepth with one previous key frame (key_idxes=[-1])."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.key_idxes = [-1]
        # Concatenating per-frame BEV features scales the channel count.
        num_frames = len(self.key_idxes) + 1
        self.head_conf['bev_backbone_conf']['in_channels'] = 80 * num_frames
        self.head_conf['bev_neck_conf']['in_channels'] = [
            80 * num_frames, 160, 320, 640
        ]
        self.head_conf['train_cfg']['code_weights'] = [
            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
        ]
        self.model = BaseBEVDepth(self.backbone_conf,
                                  self.head_conf,
                                  is_train_depth=True)
if __name__ == '__main__':
    # Launch training/evaluation for this experiment via the shared CLI.
    run_cli(BEVDepthLightningModel,
            'bev_depth_lss_r50_256x704_128x128_24e_2key')
================================================
FILE: bevdepth/exps/nuscenes/mv/bev_depth_lss_r50_256x704_128x128_24e_2key_ema.py
================================================
# Copyright (c) Megvii Inc. All rights reserved.
"""
mAP: 0.3329
mATE: 0.6832
mASE: 0.2761
mAOE: 0.5446
mAVE: 0.5258
mAAE: 0.2259
NDS: 0.4409
Per-class results:
Object Class AP ATE ASE AOE AVE AAE
car 0.505 0.531 0.165 0.189 0.618 0.234
truck 0.274 0.731 0.206 0.211 0.546 0.223
bus 0.394 0.673 0.219 0.148 1.061 0.274
trailer 0.174 0.934 0.228 0.544 0.369 0.183
construction_vehicle 0.079 1.043 0.528 1.162 0.112 0.376
pedestrian 0.284 0.748 0.294 0.973 0.575 0.297
motorcycle 0.345 0.633 0.256 0.719 0.667 0.214
bicycle 0.314 0.544 0.252 0.778 0.259 0.007
traffic_cone 0.453 0.519 0.335 nan nan nan
barrier 0.506 0.475 0.279 0.178 nan nan
"""
from bevdepth.exps.base_cli import run_cli
from bevdepth.exps.nuscenes.mv.bev_depth_lss_r50_256x704_128x128_24e_2key import \
BEVDepthLightningModel # noqa
if __name__ == '__main__':
    # Same experiment as the non-EMA config, with EMA weights enabled.
    run_cli(BEVDepthLightningModel,
            'bev_depth_lss_r50_256x704_128x128_24e_2key_ema',
            use_ema=True)
================================================
FILE: bevdepth/exps/nuscenes/mv/bev_depth_lss_r50_256x704_128x128_24e_ema.py
================================================
# Copyright (c) Megvii Inc. All rights reserved.
import torch
import torch.nn.parallel
import torch.utils.data
import torch.utils.data.distributed
from bevdepth.exps.base_cli import run_cli
from bevdepth.exps.nuscenes.base_exp import \
BEVDepthLightningModel as BaseBEVDepthLightningModel
class BEVDepthLightningModel(BaseBEVDepthLightningModel):
    """EMA variant: AdamW with a constant learning rate (no scheduler)."""

    def configure_optimizers(self):
        total_lr = (self.basic_lr_per_img * self.batch_size_per_device *
                    self.gpus)
        return [
            torch.optim.AdamW(self.model.parameters(),
                              lr=total_lr,
                              weight_decay=1e-7)
        ]
if __name__ == '__main__':
    # Launch training with EMA weights enabled.
    run_cli(BEVDepthLightningModel,
            'bev_depth_lss_r50_256x704_128x128_24e_ema',
            use_ema=True)
================================================
FILE: bevdepth/exps/nuscenes/mv/bev_depth_lss_r50_512x1408_128x128_24e_2key.py
================================================
# Copyright (c) Megvii Inc. All rights reserved.
import torch
from torch.optim.lr_scheduler import MultiStepLR
from bevdepth.exps.base_cli import run_cli
from bevdepth.exps.nuscenes.mv.bev_depth_lss_r50_256x704_128x128_24e_2key import \
BEVDepthLightningModel as BaseBEVDepthLightningModel # noqa
from bevdepth.models.base_bev_depth import BaseBEVDepth
class BEVDepthLightningModel(BaseBEVDepthLightningModel):
    """2-key BEVDepth at 512x1408 input resolution."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        final_dim = (512, 1408)
        self.backbone_conf['final_dim'] = final_dim
        # Double the resize range to match the doubled input resolution.
        self.ida_aug_conf['resize_lim'] = (0.386 * 2, 0.55 * 2)
        self.ida_aug_conf['final_dim'] = final_dim
        self.model = BaseBEVDepth(self.backbone_conf,
                                  self.head_conf,
                                  is_train_depth=True)

    def configure_optimizers(self):
        """AdamW (wd=1e-3) with step decay at epochs 19/23."""
        total_lr = (self.basic_lr_per_img * self.batch_size_per_device *
                    self.gpus)
        optimizer = torch.optim.AdamW(self.model.parameters(),
                                      lr=total_lr,
                                      weight_decay=1e-3)
        return [[optimizer], [MultiStepLR(optimizer, [19, 23])]]
if __name__ == '__main__':
    # Launch training/evaluation for this experiment via the shared CLI.
    run_cli(BEVDepthLightningModel,
            'bev_depth_lss_r50_512x1408_128x128_24e_2key')
================================================
FILE: bevdepth/exps/nuscenes/mv/bev_depth_lss_r50_640x1600_128x128_24e_2key.py
================================================
# Copyright (c) Megvii Inc. All rights reserved.
import torch
from torch.optim.lr_scheduler import MultiStepLR
from bevdepth.exps.base_cli import run_cli
from bevdepth.exps.nuscenes.mv.bev_depth_lss_r50_256x704_128x128_24e_2key import \
BEVDepthLightningModel as BaseBEVDepthLightningModel # noqa
from bevdepth.models.base_bev_depth import BaseBEVDepth
class BEVDepthLightningModel(BaseBEVDepthLightningModel):
    """2-key BEVDepth at 640x1600 input resolution."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        final_dim = (640, 1600)
        self.backbone_conf['final_dim'] = final_dim
        # Double the resize range to match the larger input resolution.
        self.ida_aug_conf['resize_lim'] = (0.386 * 2, 0.55 * 2)
        self.ida_aug_conf['final_dim'] = final_dim
        self.model = BaseBEVDepth(self.backbone_conf,
                                  self.head_conf,
                                  is_train_depth=True)

    def configure_optimizers(self):
        """AdamW (wd=1e-3) with step decay at epochs 19/23."""
        total_lr = (self.basic_lr_per_img * self.batch_size_per_device *
                    self.gpus)
        optimizer = torch.optim.AdamW(self.model.parameters(),
                                      lr=total_lr,
                                      weight_decay=1e-3)
        return [[optimizer], [MultiStepLR(optimizer, [19, 23])]]
if __name__ == '__main__':
    # BUGFIX: the experiment name previously read 512x1408 (copy-paste from
    # the smaller-resolution config), which would clash with that
    # experiment's output directory; this file trains at 640x1600.
    run_cli(BEVDepthLightningModel,
            'bev_depth_lss_r50_640x1600_128x128_24e_2key')
================================================
FILE: bevdepth/exps/nuscenes/mv/bev_stereo_lss_r50_256x704_128x128_20e_cbgs_2key_da.py
================================================
# Copyright (c) Megvii Inc. All rights reserved.
"""
mAP: 0.3576
mATE: 0.6071
mASE: 0.2684
mAOE: 0.4157
mAVE: 0.3928
mAAE: 0.2021
NDS: 0.4902
Eval time: 129.7s
Per-class results:
Object Class AP ATE ASE AOE AVE AAE
car 0.559 0.465 0.157 0.110 0.350 0.205
truck 0.285 0.633 0.205 0.101 0.304 0.209
bus 0.373 0.667 0.204 0.076 0.896 0.345
trailer 0.167 0.956 0.228 0.482 0.289 0.100
construction_vehicle 0.077 0.869 0.454 1.024 0.108 0.335
pedestrian 0.402 0.652 0.299 0.821 0.493 0.253
motorcycle 0.321 0.544 0.255 0.484 0.529 0.159
bicycle 0.276 0.466 0.272 0.522 0.173 0.011
traffic_cone 0.551 0.432 0.321 nan nan nan
barrier 0.565 0.386 0.287 0.121 nan nan
"""
import torch
from torch.optim.lr_scheduler import MultiStepLR
from bevdepth.exps.base_cli import run_cli
from bevdepth.exps.nuscenes.mv.bev_stereo_lss_r50_256x704_128x128_24e_2key import \
BEVDepthLightningModel as BaseBEVDepthLightningModel # noqa
from bevdepth.models.bev_stereo import BEVStereo
class BEVDepthLightningModel(BaseBEVDepthLightningModel):
    """BEVStereo 2-key with depth aggregation (use_da) and CBGS sampling."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.backbone_conf['use_da'] = True
        self.data_use_cbgs = True
        self.basic_lr_per_img = 2e-4 / 32
        self.model = BEVStereo(self.backbone_conf,
                               self.head_conf,
                               is_train_depth=True)

    def configure_optimizers(self):
        """AdamW (wd=1e-2) with step decay at epochs 16/19."""
        total_lr = (self.basic_lr_per_img * self.batch_size_per_device *
                    self.gpus)
        optimizer = torch.optim.AdamW(self.model.parameters(),
                                      lr=total_lr,
                                      weight_decay=1e-2)
        return [[optimizer], [MultiStepLR(optimizer, [16, 19])]]
if __name__ == '__main__':
    # Run with a 20-epoch trainer schedule.
    run_cli(BEVDepthLightningModel,
            'bev_stereo_lss_r50_256x704_128x128_20e_cbgs_2key_da',
            extra_trainer_config_args={'epochs': 20})
================================================
FILE: bevdepth/exps/nuscenes/mv/bev_stereo_lss_r50_256x704_128x128_20e_cbgs_2key_da_ema.py
================================================
# Copyright (c) Megvii Inc. All rights reserved.
"""
mAP: 0.3721
mATE: 0.5980
mASE: 0.2701
mAOE: 0.4381
mAVE: 0.3672
mAAE: 0.1898
NDS: 0.4997
Eval time: 138.0s
Per-class results:
Object Class AP ATE ASE AOE AVE AAE
car 0.567 0.457 0.156 0.104 0.343 0.204
truck 0.299 0.650 0.205 0.103 0.321 0.197
bus 0.394 0.613 0.203 0.106 0.643 0.252
trailer 0.178 0.991 0.239 0.433 0.345 0.070
construction_vehicle 0.102 0.826 0.458 1.055 0.114 0.372
pedestrian 0.402 0.653 0.297 0.803 0.479 0.249
motorcycle 0.356 0.553 0.251 0.450 0.512 0.168
bicycle 0.311 0.440 0.265 0.779 0.180 0.006
traffic_cone 0.552 0.420 0.336 nan nan nan
barrier 0.561 0.377 0.291 0.111 nan nan
"""
from bevdepth.exps.base_cli import run_cli
from bevdepth.exps.nuscenes.mv.bev_stereo_lss_r50_256x704_128x128_20e_cbgs_2key_da import \
BEVDepthLightningModel # noqa
if __name__ == '__main__':
    # Same experiment as the non-EMA config, with EMA weights enabled.
    run_cli(BEVDepthLightningModel,
            'bev_stereo_lss_r50_256x704_128x128_20e_cbgs_2key_da_ema',
            use_ema=True,
            extra_trainer_config_args={'epochs': 20})
================================================
FILE: bevdepth/exps/nuscenes/mv/bev_stereo_lss_r50_256x704_128x128_24e_2key.py
================================================
# Copyright (c) Megvii Inc. All rights reserved.
"""
mAP: 0.3456
mATE: 0.6589
mASE: 0.2774
mAOE: 0.5500
mAVE: 0.4980
mAAE: 0.2278
NDS: 0.4516
Eval time: 158.2s
Per-class results:
Object Class AP ATE ASE AOE AVE AAE
car 0.510 0.525 0.165 0.188 0.510 0.226
truck 0.288 0.698 0.220 0.205 0.443 0.227
bus 0.378 0.622 0.210 0.135 0.896 0.289
trailer 0.156 1.003 0.219 0.482 0.609 0.179
construction_vehicle 0.094 0.929 0.502 1.209 0.108 0.365
pedestrian 0.356 0.728 0.297 1.005 0.579 0.319
motorcycle 0.361 0.571 0.258 0.734 0.631 0.211
bicycle 0.318 0.533 0.269 0.793 0.208 0.007
traffic_cone 0.488 0.501 0.355 nan nan nan
barrier 0.506 0.478 0.277 0.200 nan nan
"""
from bevdepth.exps.base_cli import run_cli
from bevdepth.exps.nuscenes.base_exp import \
BEVDepthLightningModel as BaseBEVDepthLightningModel
from bevdepth.models.bev_stereo import BEVStereo
class BEVDepthLightningModel(BaseBEVDepthLightningModel):
    """BEVStereo with one previous key frame and size-aware circle NMS."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.key_idxes = [-1]
        # Concatenating per-frame BEV features scales the channel count.
        num_frames = len(self.key_idxes) + 1
        self.head_conf['bev_backbone_conf']['in_channels'] = 80 * num_frames
        self.head_conf['bev_neck_conf']['in_channels'] = [
            80 * num_frames, 160, 320, 640
        ]
        self.head_conf['train_cfg']['code_weights'] = [
            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
        ]
        self.head_conf['test_cfg']['thresh_scale'] = [
            0.6, 0.4, 0.4, 0.7, 0.8, 0.9
        ]
        self.head_conf['test_cfg']['nms_type'] = 'size_aware_circle'
        self.model = BEVStereo(self.backbone_conf,
                               self.head_conf,
                               is_train_depth=True)
if __name__ == '__main__':
    # Launch training/evaluation for this experiment via the shared CLI.
    run_cli(BEVDepthLightningModel,
            'bev_stereo_lss_r50_256x704_128x128_24e_2key')
================================================
FILE: bevdepth/exps/nuscenes/mv/bev_stereo_lss_r50_256x704_128x128_24e_2key_ema.py
================================================
# Copyright (c) Megvii Inc. All rights reserved.
"""
mAP: 0.3494
mATE: 0.6672
mASE: 0.2785
mAOE: 0.5607
mAVE: 0.4687
mAAE: 0.2295
NDS: 0.4542
Eval time: 166.7s
Per-class results:
Object Class AP ATE ASE AOE AVE AAE
car 0.509 0.522 0.163 0.187 0.507 0.228
truck 0.287 0.694 0.213 0.202 0.449 0.229
bus 0.390 0.681 0.207 0.152 0.902 0.261
trailer 0.167 0.945 0.248 0.491 0.340 0.185
construction_vehicle 0.087 1.057 0.515 1.199 0.104 0.377
pedestrian 0.351 0.729 0.299 0.987 0.575 0.321
motorcycle 0.368 0.581 0.262 0.721 0.663 0.226
bicycle 0.338 0.494 0.258 0.921 0.209 0.008
traffic_cone 0.494 0.502 0.341 nan nan nan
barrier 0.502 0.467 0.278 0.185 nan nan
"""
from bevdepth.exps.base_cli import run_cli
from bevdepth.exps.nuscenes.mv.bev_stereo_lss_r50_256x704_128x128_24e_2key import \
BEVDepthLightningModel # noqa
if __name__ == '__main__':
    # Same experiment as the non-EMA config, with EMA weights enabled.
    run_cli(BEVDepthLightningModel,
            'bev_stereo_lss_r50_256x704_128x128_24e_2key_ema',
            use_ema=True)
================================================
FILE: bevdepth/exps/nuscenes/mv/bev_stereo_lss_r50_256x704_128x128_24e_key4.py
================================================
# Copyright (c) Megvii Inc. All rights reserved.
"""
mAP: 0.3427
mATE: 0.6560
mASE: 0.2784
mAOE: 0.5982
mAVE: 0.5347
mAAE: 0.2228
NDS: 0.4423
Eval time: 116.3s
Per-class results:
Object Class AP ATE ASE AOE AVE AAE
car 0.508 0.518 0.163 0.188 0.534 0.230
truck 0.268 0.709 0.214 0.215 0.510 0.226
bus 0.379 0.640 0.207 0.142 1.049 0.315
trailer 0.151 0.953 0.240 0.541 0.618 0.113
construction_vehicle 0.092 0.955 0.514 1.360 0.113 0.394
pedestrian 0.350 0.727 0.300 1.013 0.598 0.328
motorcycle 0.371 0.576 0.259 0.777 0.634 0.175
bicycle 0.325 0.512 0.261 0.942 0.221 0.002
traffic_cone 0.489 0.503 0.345 nan nan nan
barrier 0.495 0.468 0.280 0.206 nan nan
"""
from bevdepth.exps.base_cli import run_cli
from bevdepth.exps.nuscenes.mv.bev_stereo_lss_r50_256x704_128x128_24e_2key import \
BEVDepthLightningModel as BaseBEVDepthLightningModel # noqa
class BEVDepthLightningModel(BaseBEVDepthLightningModel):
    """BEVStereo using sweep index 4 instead of a previous key frame."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.num_sweeps = 2
        self.sweep_idxes = [4]
        self.key_idxes = []
if __name__ == '__main__':
    # Launch training/evaluation for this experiment via the shared CLI.
    run_cli(BEVDepthLightningModel,
            'bev_stereo_lss_r50_256x704_128x128_24e_key4')
================================================
FILE: bevdepth/exps/nuscenes/mv/bev_stereo_lss_r50_256x704_128x128_24e_key4_ema.py
================================================
# Copyright (c) Megvii Inc. All rights reserved.
"""
mAP: 0.3427
mATE: 0.6560
mASE: 0.2784
mAOE: 0.5982
mAVE: 0.5347
mAAE: 0.2228
NDS: 0.4423
Eval time: 116.3s
Per-class results:
Object Class AP ATE ASE AOE AVE AAE
car 0.508 0.518 0.163 0.188 0.534 0.230
truck 0.268 0.709 0.214 0.215 0.510 0.226
bus 0.379 0.640 0.207 0.142 1.049 0.315
trailer 0.151 0.953 0.240 0.541 0.618 0.113
construction_vehicle 0.092 0.955 0.514 1.360 0.113 0.394
pedestrian 0.350 0.727 0.300 1.013 0.598 0.328
motorcycle 0.371 0.576 0.259 0.777 0.634 0.175
bicycle 0.325 0.512 0.261 0.942 0.221 0.002
traffic_cone 0.489 0.503 0.345 nan nan nan
barrier 0.495 0.468 0.280 0.206 nan nan
"""
from bevdepth.exps.base_cli import run_cli
from bevdepth.exps.nuscenes.mv.bev_stereo_lss_r50_256x704_128x128_24e_key4 import \
BEVDepthLightningModel as BaseBEVDepthLightningModel # noqa
class BEVDepthLightningModel(BaseBEVDepthLightningModel):
    """EMA variant of the key4 BEVStereo experiment.

    NOTE(review): the parent __init__ already assigns these same values;
    the re-assignments are kept for behavioral parity.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.num_sweeps = 2
        self.sweep_idxes = [4]
        self.key_idxes = []
if __name__ == '__main__':
    # BUGFIX: every other *_ema experiment passes use_ema=True; without it
    # this config trains without EMA despite its name.
    run_cli(BEVDepthLightningModel,
            'bev_stereo_lss_r50_256x704_128x128_24e_key4_ema',
            use_ema=True)
================================================
FILE: bevdepth/layers/__init__.py
================================================
from .heads.bev_depth_head import BEVDepthHead
__all__ = ['BEVDepthHead']
================================================
FILE: bevdepth/layers/backbones/__init__.py
================================================
from .base_lss_fpn import BaseLSSFPN
from .fusion_lss_fpn import FusionLSSFPN
__all__ = ['BaseLSSFPN', 'FusionLSSFPN']
================================================
FILE: bevdepth/layers/backbones/base_lss_fpn.py
================================================
# Copyright (c) Megvii Inc. All rights reserved.
import torch
import torch.nn.functional as F
from mmcv.cnn import build_conv_layer
from mmdet3d.models import build_neck
from mmdet.models import build_backbone
from mmdet.models.backbones.resnet import BasicBlock
from torch import nn
from torch.cuda.amp.autocast_mode import autocast
try:
from bevdepth.ops.voxel_pooling_inference import voxel_pooling_inference
from bevdepth.ops.voxel_pooling_train import voxel_pooling_train
except ImportError:
print('Import VoxelPooling fail.')
__all__ = ['BaseLSSFPN']
class _ASPPModule(nn.Module):
def __init__(self, inplanes, planes, kernel_size, padding, dilation,
BatchNorm):
super(_ASPPModule, self).__init__()
self.atrous_conv = nn.Conv2d(inplanes,
planes,
kernel_size=kernel_size,
stride=1,
padding=padding,
dilation=dilation,
bias=False)
self.bn = BatchNorm(planes)
self.relu = nn.ReLU()
self._init_weight()
def forward(self, x):
x = self.atrous_conv(x)
x = self.bn(x)
return self.relu(x)
def _init_weight(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
torch.nn.init.kaiming_normal_(m.weight)
elif isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
class ASPP(nn.Module):
    """Atrous Spatial Pyramid Pooling: four dilated branches (1/6/12/18)
    plus a global-average-pool branch, fused by a 1x1 conv."""

    def __init__(self, inplanes, mid_channels=256, BatchNorm=nn.BatchNorm2d):
        super(ASPP, self).__init__()
        dilations = [1, 6, 12, 18]
        self.aspp1 = _ASPPModule(inplanes,
                                 mid_channels,
                                 1,
                                 padding=0,
                                 dilation=dilations[0],
                                 BatchNorm=BatchNorm)
        self.aspp2 = _ASPPModule(inplanes,
                                 mid_channels,
                                 3,
                                 padding=dilations[1],
                                 dilation=dilations[1],
                                 BatchNorm=BatchNorm)
        self.aspp3 = _ASPPModule(inplanes,
                                 mid_channels,
                                 3,
                                 padding=dilations[2],
                                 dilation=dilations[2],
                                 BatchNorm=BatchNorm)
        self.aspp4 = _ASPPModule(inplanes,
                                 mid_channels,
                                 3,
                                 padding=dilations[3],
                                 dilation=dilations[3],
                                 BatchNorm=BatchNorm)
        self.global_avg_pool = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Conv2d(inplanes, mid_channels, 1, stride=1, bias=False),
            BatchNorm(mid_channels),
            nn.ReLU(),
        )
        # Fuse the five concatenated branches back to mid_channels.
        self.conv1 = nn.Conv2d(int(mid_channels * 5),
                               mid_channels,
                               1,
                               bias=False)
        self.bn1 = BatchNorm(mid_channels)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        self._init_weight()

    def forward(self, x):
        branches = [self.aspp1(x), self.aspp2(x), self.aspp3(x),
                    self.aspp4(x)]
        # Broadcast the pooled branch back to the spatial resolution of
        # the conv branches before concatenation.
        pooled = self.global_avg_pool(x)
        pooled = F.interpolate(pooled,
                               size=branches[-1].size()[2:],
                               mode='bilinear',
                               align_corners=True)
        branches.append(pooled)
        fused = self.conv1(torch.cat(branches, dim=1))
        fused = self.relu(self.bn1(fused))
        return self.dropout(fused)

    def _init_weight(self):
        # Kaiming init for convs; unit scale / zero shift for batch norms.
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                torch.nn.init.kaiming_normal_(module.weight)
            elif isinstance(module, nn.BatchNorm2d):
                module.weight.data.fill_(1)
                module.bias.data.zero_()
class Mlp(nn.Module):
    """Two-layer perceptron with activation and dropout after each layer."""

    def __init__(self,
                 in_features,
                 hidden_features=None,
                 out_features=None,
                 act_layer=nn.ReLU,
                 drop=0.0):
        super().__init__()
        # Hidden/output widths default to the input width.
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.drop1 = nn.Dropout(drop)
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop2 = nn.Dropout(drop)

    def forward(self, x):
        hidden = self.drop1(self.act(self.fc1(x)))
        return self.drop2(self.fc2(hidden))
class SELayer(nn.Module):
    """Squeeze-and-excitation style gating: scales x by a sigmoid gate
    computed from a conditioning tensor x_se."""

    def __init__(self, channels, act_layer=nn.ReLU, gate_layer=nn.Sigmoid):
        super().__init__()
        self.conv_reduce = nn.Conv2d(channels, channels, 1, bias=True)
        self.act1 = act_layer()
        self.conv_expand = nn.Conv2d(channels, channels, 1, bias=True)
        self.gate = gate_layer()

    def forward(self, x, x_se):
        gate_input = self.conv_expand(self.act1(self.conv_reduce(x_se)))
        return x * self.gate(gate_input)
class DepthNet(nn.Module):
    """Predicts per-pixel depth logits and context features from image
    features, conditioning both branches on camera parameters via SE
    layers (the "camera-aware" part of BEVDepth).
    """
    def __init__(self, in_channels, mid_channels, context_channels,
                 depth_channels):
        super(DepthNet, self).__init__()
        # 3x3 conv reducing the backbone features to mid_channels.
        self.reduce_conv = nn.Sequential(
            nn.Conv2d(in_channels,
                      mid_channels,
                      kernel_size=3,
                      stride=1,
                      padding=1),
            nn.BatchNorm2d(mid_channels),
            nn.ReLU(inplace=True),
        )
        self.context_conv = nn.Conv2d(mid_channels,
                                      context_channels,
                                      kernel_size=1,
                                      stride=1,
                                      padding=0)
        # Normalizes the 27-dim camera-parameter vector built in forward().
        self.bn = nn.BatchNorm1d(27)
        self.depth_mlp = Mlp(27, mid_channels, mid_channels)
        self.depth_se = SELayer(mid_channels)  # NOTE: add camera-aware
        self.context_mlp = Mlp(27, mid_channels, mid_channels)
        self.context_se = SELayer(mid_channels)  # NOTE: add camera-aware
        # Depth head: residual blocks + ASPP + deformable conv + 1x1 conv
        # producing one logit per depth bin.
        self.depth_conv = nn.Sequential(
            BasicBlock(mid_channels, mid_channels),
            BasicBlock(mid_channels, mid_channels),
            BasicBlock(mid_channels, mid_channels),
            ASPP(mid_channels, mid_channels),
            build_conv_layer(cfg=dict(
                type='DCN',
                in_channels=mid_channels,
                out_channels=mid_channels,
                kernel_size=3,
                padding=1,
                groups=4,
                im2col_step=128,
            )),
            nn.Conv2d(mid_channels,
                      depth_channels,
                      kernel_size=1,
                      stride=1,
                      padding=0),
        )
    def forward(self, x, mats_dict):
        """Return depth logits and context features concatenated on dim 1.

        ``mats_dict`` provides intrinsics, image-aug (ida), bev-aug (bda)
        and sensor-to-ego matrices; only sweep index 0 is used.
        """
        intrins = mats_dict['intrin_mats'][:, 0:1, ..., :3, :3]
        batch_size = intrins.shape[0]
        num_cams = intrins.shape[2]
        ida = mats_dict['ida_mats'][:, 0:1, ...]
        sensor2ego = mats_dict['sensor2ego_mats'][:, 0:1, ..., :3, :]
        bda = mats_dict['bda_mat'].view(batch_size, 1, 1, 4,
                                        4).repeat(1, 1, num_cams, 1, 1)
        # 15 scalars (focal lengths, principal point, selected ida/bda
        # entries) plus the flattened 3x4 extrinsics = 27 values/camera,
        # matching the BatchNorm1d(27) above.
        mlp_input = torch.cat(
            [
                torch.stack(
                    [
                        intrins[:, 0:1, ..., 0, 0],
                        intrins[:, 0:1, ..., 1, 1],
                        intrins[:, 0:1, ..., 0, 2],
                        intrins[:, 0:1, ..., 1, 2],
                        ida[:, 0:1, ..., 0, 0],
                        ida[:, 0:1, ..., 0, 1],
                        ida[:, 0:1, ..., 0, 3],
                        ida[:, 0:1, ..., 1, 0],
                        ida[:, 0:1, ..., 1, 1],
                        ida[:, 0:1, ..., 1, 3],
                        bda[:, 0:1, ..., 0, 0],
                        bda[:, 0:1, ..., 0, 1],
                        bda[:, 0:1, ..., 1, 0],
                        bda[:, 0:1, ..., 1, 1],
                        bda[:, 0:1, ..., 2, 2],
                    ],
                    dim=-1,
                ),
                sensor2ego.view(batch_size, 1, num_cams, -1),
            ],
            -1,
        )
        mlp_input = self.bn(mlp_input.reshape(-1, mlp_input.shape[-1]))
        x = self.reduce_conv(x)
        # Each branch is modulated by its own camera-conditioned SE gate.
        context_se = self.context_mlp(mlp_input)[..., None, None]
        context = self.context_se(x, context_se)
        context = self.context_conv(context)
        depth_se = self.depth_mlp(mlp_input)[..., None, None]
        depth = self.depth_se(x, depth_se)
        depth = self.depth_conv(depth)
        return torch.cat([depth, context], dim=1)
class DepthAggregation(nn.Module):
    """Pixel cloud feature extraction: a small residual conv refiner."""

    def __init__(self, in_channels, mid_channels, out_channels):
        super(DepthAggregation, self).__init__()
        # Entry projection to the working width.
        self.reduce_conv = nn.Sequential(
            nn.Conv2d(in_channels, mid_channels, kernel_size=3, stride=1,
                      padding=1, bias=False),
            nn.BatchNorm2d(mid_channels),
            nn.ReLU(inplace=True),
        )
        # Two conv-bn-relu stages used as a residual branch in forward().
        self.conv = nn.Sequential(
            nn.Conv2d(mid_channels, mid_channels, kernel_size=3, stride=1,
                      padding=1, bias=False),
            nn.BatchNorm2d(mid_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(mid_channels, mid_channels, kernel_size=3, stride=1,
                      padding=1, bias=False),
            nn.BatchNorm2d(mid_channels),
            nn.ReLU(inplace=True),
        )
        # Final projection; bias on, no norm or activation afterwards.
        self.out_conv = nn.Sequential(
            nn.Conv2d(mid_channels, out_channels, kernel_size=3, stride=1,
                      padding=1, bias=True),
        )

    @autocast(False)
    def forward(self, x):
        """Reduce, add a two-conv residual, then project. Runs in fp32."""
        reduced = self.reduce_conv(x)
        refined = reduced + self.conv(reduced)
        return self.out_conv(refined)
class BaseLSSFPN(nn.Module):
    def __init__(self,
                 x_bound,
                 y_bound,
                 z_bound,
                 d_bound,
                 final_dim,
                 downsample_factor,
                 output_channels,
                 img_backbone_conf,
                 img_neck_conf,
                 depth_net_conf,
                 use_da=False):
        """Modified from `https://github.com/nv-tlabs/lift-splat-shoot`.
        Args:
            x_bound (list): Boundaries for x.
            y_bound (list): Boundaries for y.
            z_bound (list): Boundaries for z.
            d_bound (list): Boundaries for d.
            final_dim (list): Dimension for input images.
            downsample_factor (int): Downsample factor between feature map
                and input image.
            output_channels (int): Number of channels for the output
                feature map.
            img_backbone_conf (dict): Config for image backbone.
            img_neck_conf (dict): Config for image neck.
            depth_net_conf (dict): Config for depth net.
            use_da (bool): Whether to refine features with the depth
                aggregation net. Defaults to False.
        """
        super(BaseLSSFPN, self).__init__()
        self.downsample_factor = downsample_factor
        self.d_bound = d_bound
        self.final_dim = final_dim
        self.output_channels = output_channels
        # Per-axis voxel edge length (the step entry of each bound).
        self.register_buffer(
            'voxel_size',
            torch.Tensor([row[2] for row in [x_bound, y_bound, z_bound]]))
        # Center coordinate of the first voxel on each axis.
        self.register_buffer(
            'voxel_coord',
            torch.Tensor([
                row[0] + row[2] / 2.0 for row in [x_bound, y_bound, z_bound]
            ]))
        # Number of voxels along each axis.
        self.register_buffer(
            'voxel_num',
            torch.LongTensor([(row[1] - row[0]) / row[2]
                              for row in [x_bound, y_bound, z_bound]]))
        self.register_buffer('frustum', self.create_frustum())
        # Frustum's leading dim is the number of discrete depth bins.
        self.depth_channels, _, _, _ = self.frustum.shape
        self.img_backbone = build_backbone(img_backbone_conf)
        self.img_neck = build_neck(img_neck_conf)
        self.depth_net = self._configure_depth_net(depth_net_conf)

        self.img_neck.init_weights()
        self.img_backbone.init_weights()
        self.use_da = use_da
        if self.use_da:
            self.depth_aggregation_net = self._configure_depth_aggregation_net(
            )

    def _configure_depth_net(self, depth_net_conf):
        """Build the depth net; split out so subclasses can override."""
        return DepthNet(
            depth_net_conf['in_channels'],
            depth_net_conf['mid_channels'],
            self.output_channels,
            self.depth_channels,
        )

    def _configure_depth_aggregation_net(self):
        """build pixel cloud feature extractor"""
        return DepthAggregation(self.output_channels, self.output_channels,
                                self.output_channels)

    def _forward_voxel_net(self, img_feat_with_depth):
        """Optionally refine depth-weighted features; no-op unless use_da."""
        if self.use_da:
            # BEVConv2D [n, c, d, h, w] -> [n, h, c, w, d]
            img_feat_with_depth = img_feat_with_depth.permute(
                0, 3, 1, 4,
                2).contiguous()  # [n, c, d, h, w] -> [n, h, c, w, d]
            n, h, c, w, d = img_feat_with_depth.shape
            # Fold (n, h) together so the 2D aggregation net sees 4-D input.
            img_feat_with_depth = img_feat_with_depth.view(-1, c, w, d)
            img_feat_with_depth = (
                self.depth_aggregation_net(img_feat_with_depth).view(
                    n, h, c, w, d).permute(0, 2, 4, 1, 3).contiguous())
        return img_feat_with_depth

    def create_frustum(self):
        """Generate frustum"""
        # make grid in image plane
        ogfH, ogfW = self.final_dim
        fH, fW = ogfH // self.downsample_factor, ogfW // self.downsample_factor
        d_coords = torch.arange(*self.d_bound,
                                dtype=torch.float).view(-1, 1,
                                                        1).expand(-1, fH, fW)
        D, _, _ = d_coords.shape
        x_coords = torch.linspace(0, ogfW - 1, fW, dtype=torch.float).view(
            1, 1, fW).expand(D, fH, fW)
        y_coords = torch.linspace(0, ogfH - 1, fH,
                                  dtype=torch.float).view(1, fH,
                                                          1).expand(D, fH, fW)
        # Homogeneous coordinate so 4x4 matrices apply directly.
        paddings = torch.ones_like(d_coords)

        # D x H x W x 3
        frustum = torch.stack((x_coords, y_coords, d_coords, paddings), -1)
        return frustum

    def get_geometry(self, sensor2ego_mat, intrin_mat, ida_mat, bda_mat):
        """Transfer points from camera coord to ego coord.

        Args:
            sensor2ego_mat(Tensor): Transformation matrix from camera to
                ego with shape of (B, num_cameras, 4, 4).
            intrin_mat(Tensor): Intrinsic matrix with shape of
                (B, num_cameras, 4, 4).
            ida_mat(Tensor): Transformation matrix for image data
                augmentation with shape of (B, num_cameras, 4, 4).
            bda_mat(Tensor or None): Transformation matrix for bev data
                augmentation with shape of (B, 4, 4).

        Returns:
            Tensors: points ego coord.
        """
        batch_size, num_cams, _, _ = sensor2ego_mat.shape

        # undo post-transformation
        # B x N x D x H x W x 3
        points = self.frustum
        ida_mat = ida_mat.view(batch_size, num_cams, 1, 1, 1, 4, 4)
        points = ida_mat.inverse().matmul(points.unsqueeze(-1))
        # cam_to_ego: scale pixel coords by depth so unprojection through
        # the inverse intrinsics yields metric camera coordinates.
        points = torch.cat(
            (points[:, :, :, :, :, :2] * points[:, :, :, :, :, 2:3],
             points[:, :, :, :, :, 2:]), 5)

        combine = sensor2ego_mat.matmul(torch.inverse(intrin_mat))
        points = combine.view(batch_size, num_cams, 1, 1, 1, 4,
                              4).matmul(points)
        if bda_mat is not None:
            bda_mat = bda_mat.unsqueeze(1).repeat(1, num_cams, 1, 1).view(
                batch_size, num_cams, 1, 1, 1, 4, 4)
            points = (bda_mat @ points).squeeze(-1)
        else:
            points = points.squeeze(-1)
        # Drop the homogeneous coordinate.
        return points[..., :3]

    def get_cam_feats(self, imgs):
        """Get feature maps from images."""
        batch_size, num_sweeps, num_cams, num_channels, imH, imW = imgs.shape

        # Fold batch/sweep/cam dims so the 2D backbone sees (N, C, H, W).
        imgs = imgs.flatten().view(batch_size * num_sweeps * num_cams,
                                   num_channels, imH, imW)
        img_feats = self.img_neck(self.img_backbone(imgs))[0]
        img_feats = img_feats.reshape(batch_size, num_sweeps, num_cams,
                                      img_feats.shape[1], img_feats.shape[2],
                                      img_feats.shape[3])
        return img_feats

    def _forward_depth_net(self, feat, mats_dict):
        """Run the depth net; split out so subclasses can override."""
        return self.depth_net(feat, mats_dict)

    def _forward_single_sweep(self,
                              sweep_index,
                              sweep_imgs,
                              mats_dict,
                              is_return_depth=False):
        """Forward function for single sweep.

        Args:
            sweep_index (int): Index of sweeps.
            sweep_imgs (Tensor): Input images.
            mats_dict (dict):
                sensor2ego_mats(Tensor): Transformation matrix from
                    camera to ego with shape of (B, num_sweeps,
                    num_cameras, 4, 4).
                intrin_mats(Tensor): Intrinsic matrix with shape
                    of (B, num_sweeps, num_cameras, 4, 4).
                ida_mats(Tensor): Transformation matrix for ida with
                    shape of (B, num_sweeps, num_cameras, 4, 4).
                sensor2sensor_mats(Tensor): Transformation matrix
                    from key frame camera to sweep frame camera with
                    shape of (B, num_sweeps, num_cameras, 4, 4).
                bda_mat(Tensor): Rotation matrix for bda with shape
                    of (B, 4, 4).
            is_return_depth (bool, optional): Whether to return depth.
                Default: False.

        Returns:
            Tensor: BEV feature map.
        """
        batch_size, num_sweeps, num_cams, num_channels, img_height, \
            img_width = sweep_imgs.shape
        img_feats = self.get_cam_feats(sweep_imgs)
        source_features = img_feats[:, 0, ...]
        depth_feature = self._forward_depth_net(
            source_features.reshape(batch_size * num_cams,
                                    source_features.shape[2],
                                    source_features.shape[3],
                                    source_features.shape[4]),
            mats_dict,
        )
        depth = depth_feature[:, :self.depth_channels].softmax(
            dim=1, dtype=depth_feature.dtype)
        geom_xyz = self.get_geometry(
            mats_dict['sensor2ego_mats'][:, sweep_index, ...],
            mats_dict['intrin_mats'][:, sweep_index, ...],
            mats_dict['ida_mats'][:, sweep_index, ...],
            mats_dict.get('bda_mat', None),
        )
        # Convert ego-frame points to integer voxel indices.
        geom_xyz = ((geom_xyz - (self.voxel_coord - self.voxel_size / 2.0)) /
                    self.voxel_size).int()
        if self.training or self.use_da:
            # Outer product of depth distribution and context features:
            # (B*N, 1, D, H, W) * (B*N, C, 1, H, W) -> (B*N, C, D, H, W).
            img_feat_with_depth = depth.unsqueeze(
                1) * depth_feature[:, self.depth_channels:(
                    self.depth_channels + self.output_channels)].unsqueeze(2)

            img_feat_with_depth = self._forward_voxel_net(img_feat_with_depth)

            img_feat_with_depth = img_feat_with_depth.reshape(
                batch_size,
                num_cams,
                img_feat_with_depth.shape[1],
                img_feat_with_depth.shape[2],
                img_feat_with_depth.shape[3],
                img_feat_with_depth.shape[4],
            )

            img_feat_with_depth = img_feat_with_depth.permute(0, 1, 3, 4, 5, 2)

            feature_map = voxel_pooling_train(geom_xyz,
                                              img_feat_with_depth.contiguous(),
                                              self.voxel_num.cuda())
        else:
            # Inference kernel fuses the outer product with the pooling.
            feature_map = voxel_pooling_inference(
                geom_xyz, depth, depth_feature[:, self.depth_channels:(
                    self.depth_channels + self.output_channels)].contiguous(),
                self.voxel_num.cuda())
        if is_return_depth:
            # final_depth has to be fp32, otherwise the depth
            # loss will collapse during the training process.
            return feature_map.contiguous(
            ), depth_feature[:, :self.depth_channels].softmax(dim=1)
        return feature_map.contiguous()

    def forward(self,
                sweep_imgs,
                mats_dict,
                timestamps=None,
                is_return_depth=False):
        """Forward function.

        Args:
            sweep_imgs(Tensor): Input images with shape of (B, num_sweeps,
                num_cameras, 3, H, W).
            mats_dict(dict):
                sensor2ego_mats(Tensor): Transformation matrix from
                    camera to ego with shape of (B, num_sweeps,
                    num_cameras, 4, 4).
                intrin_mats(Tensor): Intrinsic matrix with shape
                    of (B, num_sweeps, num_cameras, 4, 4).
                ida_mats(Tensor): Transformation matrix for ida with
                    shape of (B, num_sweeps, num_cameras, 4, 4).
                sensor2sensor_mats(Tensor): Transformation matrix
                    from key frame camera to sweep frame camera with
                    shape of (B, num_sweeps, num_cameras, 4, 4).
                bda_mat(Tensor): Rotation matrix for bda with shape
                    of (B, 4, 4).
            timestamps(Tensor): Timestamp for all images with the shape of(B,
                num_sweeps, num_cameras).
            is_return_depth (bool): Whether to also return the key frame
                depth distribution. Default: False.

        Return:
            Tensor: bev feature map.
        """
        batch_size, num_sweeps, num_cams, num_channels, img_height, \
            img_width = sweep_imgs.shape

        key_frame_res = self._forward_single_sweep(
            0,
            sweep_imgs[:, 0:1, ...],
            mats_dict,
            is_return_depth=is_return_depth)
        if num_sweeps == 1:
            return key_frame_res

        key_frame_feature = key_frame_res[
            0] if is_return_depth else key_frame_res

        ret_feature_list = [key_frame_feature]
        for sweep_index in range(1, num_sweeps):
            # Past sweeps only provide context; no gradients through them.
            with torch.no_grad():
                feature_map = self._forward_single_sweep(
                    sweep_index,
                    sweep_imgs[:, sweep_index:sweep_index + 1, ...],
                    mats_dict,
                    is_return_depth=False)
                ret_feature_list.append(feature_map)

        if is_return_depth:
            return torch.cat(ret_feature_list, 1), key_frame_res[1]
        else:
            return torch.cat(ret_feature_list, 1)
================================================
FILE: bevdepth/layers/backbones/bevstereo_lss_fpn.py
================================================
# Copyright (c) Megvii Inc. All rights reserved.
import math
import numpy as np
import torch
import torch.nn.functional as F
from mmcv.cnn import build_conv_layer
from mmdet.models.backbones.resnet import BasicBlock
from scipy.special import erf
from scipy.stats import norm
from torch import nn
from bevdepth.layers.backbones.base_lss_fpn import (ASPP, BaseLSSFPN, Mlp,
SELayer)
try:
from bevdepth.ops.voxel_pooling_inference import voxel_pooling_inference
from bevdepth.ops.voxel_pooling_train import voxel_pooling_train
except ImportError:
print('Import VoxelPooling fail.')
__all__ = ['BEVStereoLSSFPN']
class ConvBnReLU3D(nn.Module):
    """3D convolution followed by batch normalization and ReLU."""

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int = 3,
        stride: int = 1,
        pad: int = 1,
        dilation: int = 1,
    ) -> None:
        """Build the conv3d + batchnorm submodules.

        Args:
            in_channels: input channel number of convolution layer
            out_channels: output channel number of convolution layer
            kernel_size: kernel size of convolution layer
            stride: stride of convolution layer
            pad: pad of convolution layer
            dilation: dilation of convolution layer
        """
        super(ConvBnReLU3D, self).__init__()
        # Conv bias would be cancelled by the following BatchNorm.
        self.conv = nn.Conv3d(in_channels,
                              out_channels,
                              kernel_size,
                              stride=stride,
                              padding=pad,
                              dilation=dilation,
                              bias=False)
        self.bn = nn.BatchNorm3d(out_channels)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply conv, normalize, then ReLU in place."""
        out = self.conv(x)
        out = self.bn(out)
        return F.relu(out, inplace=True)
class DepthNet(nn.Module):
    """Camera-aware depth head used by BEVStereo.

    Besides context features it predicts, per depth range, Gaussian
    parameters (mu, sigma) and a range score for the stereo matching
    stage, plus a monocular depth distribution.
    """

    def __init__(self,
                 in_channels,
                 mid_channels,
                 context_channels,
                 depth_channels,
                 d_bound,
                 num_ranges=4):
        """
        Args:
            in_channels (int): Channels of the input image features.
            mid_channels (int): Channels of the intermediate features.
            context_channels (int): Channels of the context output.
            depth_channels (int): Number of discrete depth bins.
            d_bound (list): Boundaries for d, i.e. [min, max, step].
            num_ranges (int): Number of split depth ranges. Defaults to 4.
        """
        super(DepthNet, self).__init__()
        self.reduce_conv = nn.Sequential(
            nn.Conv2d(in_channels,
                      mid_channels,
                      kernel_size=3,
                      stride=1,
                      padding=1),
            nn.BatchNorm2d(mid_channels),
            nn.ReLU(inplace=True),
        )
        self.context_conv = nn.Conv2d(mid_channels,
                                      context_channels,
                                      kernel_size=1,
                                      stride=1,
                                      padding=0)
        # 27 = 15 selected matrix entries + flattened 3x4 sensor2ego
        # matrix; see the stack/cat in forward().
        self.bn = nn.BatchNorm1d(27)
        self.depth_mlp = Mlp(27, mid_channels, mid_channels)
        self.depth_se = SELayer(mid_channels)  # NOTE: add camera-aware
        self.context_mlp = Mlp(27, mid_channels, mid_channels)
        self.context_se = SELayer(mid_channels)  # NOTE: add camera-aware
        self.depth_feat_conv = nn.Sequential(
            BasicBlock(mid_channels, mid_channels),
            BasicBlock(mid_channels, mid_channels),
            ASPP(mid_channels, mid_channels),
            build_conv_layer(cfg=dict(
                type='DCN',
                in_channels=mid_channels,
                out_channels=mid_channels,
                kernel_size=3,
                padding=1,
                groups=4,
                im2col_step=128,
            )),
        )
        # Upsamples 4x and predicts mu/sigma/range-score (3 per range).
        self.mu_sigma_range_net = nn.Sequential(
            BasicBlock(mid_channels, mid_channels),
            nn.ConvTranspose2d(mid_channels,
                               mid_channels,
                               3,
                               stride=2,
                               padding=1,
                               output_padding=1),
            nn.BatchNorm2d(mid_channels),
            nn.ReLU(inplace=True),
            nn.ConvTranspose2d(mid_channels,
                               mid_channels,
                               3,
                               stride=2,
                               padding=1,
                               output_padding=1),
            nn.BatchNorm2d(mid_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(mid_channels,
                      num_ranges * 3,
                      kernel_size=1,
                      stride=1,
                      padding=0),
        )
        self.mono_depth_net = nn.Sequential(
            BasicBlock(mid_channels, mid_channels),
            nn.Conv2d(mid_channels,
                      depth_channels,
                      kernel_size=1,
                      stride=1,
                      padding=0),
        )
        self.d_bound = d_bound
        self.num_ranges = num_ranges

    def forward(self, x, mats_dict, scale_depth_factor=1000.0):
        """Predict context features and stereo depth parameters.

        Args:
            x (Tensor): Image features; callers flatten them to
                (B * num_cams, C, H, W).
            mats_dict (dict): Requires 'intrin_mats', 'ida_mats',
                'sensor2ego_mats' and 'bda_mat'; only the key frame
                (index 0) matrices are read.
            scale_depth_factor (float): Unused; kept for interface
                compatibility.

        Returns:
            tuple: (reduced feature, context, mu, sigma, range_score,
            mono_depth).
        """
        intrins = mats_dict['intrin_mats'][:, 0:1, ..., :3, :3]
        batch_size = intrins.shape[0]
        num_cams = intrins.shape[2]
        ida = mats_dict['ida_mats'][:, 0:1, ...]
        sensor2ego = mats_dict['sensor2ego_mats'][:, 0:1, ..., :3, :]
        bda = mats_dict['bda_mat'].view(batch_size, 1, 1, 4,
                                        4).repeat(1, 1, num_cams, 1, 1)
        # 27-d camera parameter vector: focal lengths, principal point,
        # selected ida/bda entries, plus the flattened sensor2ego matrix.
        mlp_input = torch.cat(
            [
                torch.stack(
                    [
                        intrins[:, 0:1, ..., 0, 0],
                        intrins[:, 0:1, ..., 1, 1],
                        intrins[:, 0:1, ..., 0, 2],
                        intrins[:, 0:1, ..., 1, 2],
                        ida[:, 0:1, ..., 0, 0],
                        ida[:, 0:1, ..., 0, 1],
                        ida[:, 0:1, ..., 0, 3],
                        ida[:, 0:1, ..., 1, 0],
                        ida[:, 0:1, ..., 1, 1],
                        ida[:, 0:1, ..., 1, 3],
                        bda[:, 0:1, ..., 0, 0],
                        bda[:, 0:1, ..., 0, 1],
                        bda[:, 0:1, ..., 1, 0],
                        bda[:, 0:1, ..., 1, 1],
                        bda[:, 0:1, ..., 2, 2],
                    ],
                    dim=-1,
                ),
                sensor2ego.view(batch_size, 1, num_cams, -1),
            ],
            -1,
        )
        # Flatten to (B * num_cams, 27) before BatchNorm1d.
        mlp_input = self.bn(mlp_input.reshape(-1, mlp_input.shape[-1]))
        x = self.reduce_conv(x)
        context_se = self.context_mlp(mlp_input)[..., None, None]
        context = self.context_se(x, context_se)
        context = self.context_conv(context)
        depth_se = self.depth_mlp(mlp_input)[..., None, None]
        depth_feat = self.depth_se(x, depth_se)
        depth_feat = self.depth_feat_conv(depth_feat)
        mono_depth = self.mono_depth_net(depth_feat)
        mu_sigma_score = self.mu_sigma_range_net(depth_feat)
        # NOTE: a dead per-pixel depth-grid tensor previously built here
        # (and forced onto CUDA every call) has been removed; it was
        # never read after construction.
        mu = mu_sigma_score[:, 0:self.num_ranges, ...]
        sigma = mu_sigma_score[:, self.num_ranges:2 * self.num_ranges, ...]
        range_score = mu_sigma_score[:,
                                     2 * self.num_ranges:3 * self.num_ranges,
                                     ...]
        # ELU + 1 (+ eps) keeps sigma strictly positive.
        sigma = F.elu(sigma) + 1.0 + 1e-10
        return x, context, mu, sigma, range_score, mono_depth
class BEVStereoLSSFPN(BaseLSSFPN):
    def __init__(self,
                 x_bound,
                 y_bound,
                 z_bound,
                 d_bound,
                 final_dim,
                 downsample_factor,
                 output_channels,
                 img_backbone_conf,
                 img_neck_conf,
                 depth_net_conf,
                 use_da=False,
                 sampling_range=3,
                 num_samples=3,
                 stereo_downsample_factor=4,
                 em_iteration=3,
                 min_sigma=1,
                 num_groups=8,
                 num_ranges=4,
                 range_list=[[2, 8], [8, 16], [16, 28], [28, 58]],
                 k_list=None,
                 use_mask=True):
        """Modified from `https://github.com/nv-tlabs/lift-splat-shoot`.
        Args:
            x_bound (list): Boundaries for x.
            y_bound (list): Boundaries for y.
            z_bound (list): Boundaries for z.
            d_bound (list): Boundaries for d.
            final_dim (list): Dimension for input images.
            downsample_factor (int): Downsample factor between feature map
                and input image.
            output_channels (int): Number of channels for the output
                feature map.
            img_backbone_conf (dict): Config for image backbone.
            img_neck_conf (dict): Config for image neck.
            depth_net_conf (dict): Config for depth net.
            use_da (bool): Whether to use the depth aggregation net.
                Defaults to False.
            sampling_range (int): The base range of sampling candidates.
                Defaults to 3.
            num_samples (int): Number of samples. Defaults to 3.
            stereo_downsample_factor (int): Downsample factor from input image
                and stereo depth. Defaults to 4.
            em_iteration (int): Number of iterations for em. Defaults to 3.
            min_sigma (float): Minimal value for sigma. Defaults to 1.
            num_groups (int): Number of groups to keep after inner product.
                Defaults to 8.
            num_ranges (int): Number of split ranges. Defaults to 4.
            range_list (list): Start and end of every range. Defaults to
                [[2, 8], [8, 16], [16, 28], [28, 58]].
                NOTE(review): this is a mutable default argument; safe
                only while it is never mutated in place.
            k_list (list): Depth of all candidates inside the range.
                Defaults to None (computed via depth_sampling()).
            use_mask (bool): Whether to use mask_net. Defaults to True.
        """
        # Must be set before super().__init__(), which calls
        # _configure_depth_net()/depth_sampling() and reads them.
        self.num_ranges = num_ranges
        self.sampling_range = sampling_range
        self.num_samples = num_samples
        super(BEVStereoLSSFPN,
              self).__init__(x_bound, y_bound, z_bound, d_bound, final_dim,
                             downsample_factor, output_channels,
                             img_backbone_conf, img_neck_conf, depth_net_conf,
                             use_da)
        self.depth_channels, _, _, _ = self.frustum.shape
        self.use_mask = use_mask
        # k_list holds the sampling offsets (in sigma units) around mu.
        if k_list is None:
            self.register_buffer('k_list', torch.Tensor(self.depth_sampling()))
        else:
            self.register_buffer('k_list', torch.Tensor(k_list))
        self.stereo_downsample_factor = stereo_downsample_factor
        self.em_iteration = em_iteration
        self.register_buffer(
            'depth_values',
            torch.arange((self.d_bound[1] - self.d_bound[0]) / self.d_bound[2],
                         dtype=torch.float))
        self.num_groups = num_groups
        # Scores group-wise correlation volumes; outputs one score per
        # depth sample.
        self.similarity_net = nn.Sequential(
            ConvBnReLU3D(in_channels=num_groups,
                         out_channels=16,
                         kernel_size=1,
                         stride=1,
                         pad=0),
            ConvBnReLU3D(in_channels=16,
                         out_channels=8,
                         kernel_size=1,
                         stride=1,
                         pad=0),
            nn.Conv3d(in_channels=8,
                      out_channels=1,
                      kernel_size=1,
                      stride=1,
                      padding=0),
        )
        if range_list is None:
            # Split [d_min, d_max] into equally long ranges.
            range_length = (d_bound[1] - d_bound[0]) / num_ranges
            self.range_list = [[
                d_bound[0] + range_length * i,
                d_bound[0] + range_length * (i + 1)
            ] for i in range(num_ranges)]
        else:
            assert len(range_list) == num_ranges
            self.range_list = range_list
        self.min_sigma = min_sigma
        self.depth_downsample_net = nn.Sequential(
            nn.Conv2d(self.depth_channels, 256, 3, 2, 1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.Conv2d(256, 256, 3, 2, 1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.Conv2d(256, self.depth_channels, 1, 1, 0),
        )
        self.context_downsample_net = nn.Identity()
        if self.use_mask:
            # NOTE(review): the 224 input channels appear to assume
            # depth_channels == 112 (two concatenated mono depths) --
            # confirm before changing d_bound.
            self.mask_net = nn.Sequential(
                nn.Conv2d(224, 64, 3, 1, 1),
                nn.BatchNorm2d(64),
                nn.ReLU(inplace=True),
                BasicBlock(64, 64),
                BasicBlock(64, 64),
                nn.Conv2d(64, 1, 1, 1, 0),
                nn.Sigmoid(),
            )
def depth_sampling(self):
"""Generate sampling range of candidates.
Returns:
list[float]: List of all candidates.
"""
P_total = erf(self.sampling_range /
np.sqrt(2)) # Probability covered by the sampling range
idx_list = np.arange(0, self.num_samples + 1)
p_list = (1 - P_total) / 2 + ((idx_list / self.num_samples) * P_total)
k_list = norm.ppf(p_list)
k_list = (k_list[1:] + k_list[:-1]) / 2
return list(k_list)
def _generate_cost_volume(
self,
sweep_index,
stereo_feats_all_sweeps,
mats_dict,
depth_sample,
depth_sample_frustum,
sensor2sensor_mats,
):
"""Generate cost volume based on depth sample.
Args:
sweep_index (int): Index of sweep.
stereo_feats_all_sweeps (list[Tensor]): Stereo feature
of all sweeps.
mats_dict (dict):
sensor2ego_mats (Tensor): Transformation matrix from
camera to ego with shape of (B, num_sweeps,
num_cameras, 4, 4).
intrin_mats (Tensor): Intrinsic matrix with shape
of (B, num_sweeps, num_cameras, 4, 4).
ida_mats (Tensor): Transformation matrix for ida with
shape of (B, num_sweeps, num_cameras, 4, 4).
sensor2sensor_mats (Tensor): Transformation matrix
from key frame camera to sweep frame camera with
shape of (B, num_sweeps, num_cameras, 4, 4).
bda_mat (Tensor): Rotation matrix for bda with shape
of (B, 4, 4).
depth_sample (Tensor): Depth map of all candidates.
depth_sample_frustum (Tensor): Pre-generated frustum.
sensor2sensor_mats (Tensor): Transformation matrix from reference
sensor to source sensor.
Returns:
Tensor: Depth score for all sweeps.
"""
batch_size, num_channels, height, width = stereo_feats_all_sweeps[
0].shape
num_sweeps = len(stereo_feats_all_sweeps)
depth_score_all_sweeps = list()
for idx in range(num_sweeps):
if idx == sweep_index:
continue
warped_stereo_fea = self.homo_warping(
stereo_feats_all_sweeps[idx],
mats_dict['intrin_mats'][:, sweep_index, ...],
mats_dict['intrin_mats'][:, idx, ...],
sensor2sensor_mats[idx],
mats_dict['ida_mats'][:, sweep_index, ...],
mats_dict['ida_mats'][:, idx, ...],
depth_sample,
depth_sample_frustum.type_as(stereo_feats_all_sweeps[idx]),
)
warped_stereo_fea = warped_stereo_fea.reshape(
batch_size, self.num_groups, num_channels // self.num_groups,
self.num_samples, height, width)
ref_stereo_feat = stereo_feats_all_sweeps[sweep_index].reshape(
batch_size, self.num_groups, num_channels // self.num_groups,
height, width)
feat_cost = torch.mean(
(ref_stereo_feat.unsqueeze(3) * warped_stereo_fea), axis=2)
depth_score = self.similarity_net(feat_cost).squeeze(1)
depth_score_all_sweeps.append(depth_score)
return torch.stack(depth_score_all_sweeps).mean(0)
    def homo_warping(
        self,
        stereo_feat,
        key_intrin_mats,
        sweep_intrin_mats,
        sensor2sensor_mats,
        key_ida_mats,
        sweep_ida_mats,
        depth_sample,
        frustum,
    ):
        """Used for mvs method to transfer sweep image feature to
        key image feature.

        Args:
            stereo_feat(Tensor): Sweep image features with shape
                (B * num_cams, C, H, W).
            key_intrin_mats(Tensor): Intrin matrix for key sensor.
            sweep_intrin_mats(Tensor): Intrin matrix for sweep sensor.
            sensor2sensor_mats(Tensor): Transformation matrix from key
                sensor to sweep sensor.
            key_ida_mats(Tensor): Ida matrix for key frame.
            sweep_ida_mats(Tensor): Ida matrix for sweep frame.
            depth_sample (Tensor): Depth map of all candidates.
            frustum (Tensor): Pre-generated frustum.
                NOTE(review): mutated in place below; callers appear to
                always pass a freshly created frustum -- confirm before
                reusing one.

        Returns:
            Tensor: Warped sweep features with shape
            (B * num_cams, C, num_depth, H, W).
        """
        batch_size_with_num_cams, channels = stereo_feat.shape[
            0], stereo_feat.shape[1]
        height, width = stereo_feat.shape[2], stereo_feat.shape[3]
        with torch.no_grad():
            points = frustum
            points = points.reshape(points.shape[0], -1, points.shape[-1])
            # Force homogeneous pixel coordinates (z = 1) before undoing
            # the image augmentation.
            points[..., 2] = 1
            # Undo ida for key frame.
            points = key_ida_mats.reshape(batch_size_with_num_cams, *
                                          key_ida_mats.shape[2:]).inverse(
                                          ).unsqueeze(1) @ points.unsqueeze(-1)
            # Convert points from pixel coord to key camera coord.
            points[..., :3, :] *= depth_sample.reshape(
                batch_size_with_num_cams, -1, 1, 1)
            num_depth = frustum.shape[1]
            points = (key_intrin_mats.reshape(
                batch_size_with_num_cams, *
                key_intrin_mats.shape[2:]).inverse().unsqueeze(1) @ points)
            points = (sensor2sensor_mats.reshape(
                batch_size_with_num_cams, *
                sensor2sensor_mats.shape[2:]).unsqueeze(1) @ points)
            # points in sweep sensor coord.
            points = (sweep_intrin_mats.reshape(
                batch_size_with_num_cams, *
                sweep_intrin_mats.shape[2:]).unsqueeze(1) @ points)
            # points in sweep pixel coord.
            points[..., :2, :] = points[..., :2, :] / points[
                ..., 2:3, :]  # [B, 2, Ndepth, H*W]

            points = (sweep_ida_mats.reshape(
                batch_size_with_num_cams, *
                sweep_ida_mats.shape[2:]).unsqueeze(1) @ points).squeeze(-1)
            # Points that project behind the sweep camera are pushed out
            # of the image so grid_sample's zero padding discards them.
            neg_mask = points[..., 2] < 1e-3
            points[..., 0][neg_mask] = width * self.stereo_downsample_factor
            points[..., 1][neg_mask] = height * self.stereo_downsample_factor
            points[..., 2][neg_mask] = 1
            # Normalize pixel coordinates to [-1, 1] for grid_sample.
            proj_x_normalized = points[..., 0] / (
                (width * self.stereo_downsample_factor - 1) / 2) - 1
            proj_y_normalized = points[..., 1] / (
                (height * self.stereo_downsample_factor - 1) / 2) - 1
            grid = torch.stack([proj_x_normalized, proj_y_normalized],
                               dim=2)  # [B, Ndepth, H*W, 2]
        warped_stereo_fea = F.grid_sample(
            stereo_feat,
            grid.view(batch_size_with_num_cams, num_depth * height, width, 2),
            mode='bilinear',
            padding_mode='zeros',
        )
        warped_stereo_fea = warped_stereo_fea.view(batch_size_with_num_cams,
                                                   channels, num_depth, height,
                                                   width)
        return warped_stereo_fea
    def _forward_stereo(
        self,
        sweep_index,
        stereo_feats_all_sweeps,
        mono_depth_all_sweeps,
        mats_dict,
        sensor2sensor_mats,
        mu_all_sweeps,
        sigma_all_sweeps,
        range_score_all_sweeps,
        depth_feat_all_sweeps,
    ):
        """Forward function to generate stereo depth.

        Args:
            sweep_index (int): Index of sweep.
            stereo_feats_all_sweeps (list[Tensor]): Stereo feature
                of all sweeps.
            mono_depth_all_sweeps (list[Tensor]): Monocular depth
                predictions for all sweeps.
            mats_dict (dict):
                sensor2ego_mats (Tensor): Transformation matrix from
                    camera to ego with shape of (B, num_sweeps,
                    num_cameras, 4, 4).
                intrin_mats (Tensor): Intrinsic matrix with shape
                    of (B, num_sweeps, num_cameras, 4, 4).
                ida_mats (Tensor): Transformation matrix for ida with
                    shape of (B, num_sweeps, num_cameras, 4, 4).
                sensor2sensor_mats (Tensor): Transformation matrix
                    from key frame camera to sweep frame camera with
                    shape of (B, num_sweeps, num_cameras, 4, 4).
                bda_mat (Tensor): Rotation matrix for bda with shape
                    of (B, 4, 4).
            sensor2sensor_mats(Tensor): Transformation matrix from key
                sensor to sweep sensor.
            mu_all_sweeps (list[Tensor]): List of mu for all sweeps.
            sigma_all_sweeps (list[Tensor]): List of sigma for all sweeps.
            range_score_all_sweeps (list[Tensor]): List of all range score
                for all sweeps.
            depth_feat_all_sweeps (list[Tensor]): List of all depth feat for
                all sweeps.

        Returns:
            Tensor: stereo_depth (plus the mask score when ``use_mask``).
        """
        batch_size_with_cams, _, feat_height, feat_width = \
            stereo_feats_all_sweeps[0].shape
        device = stereo_feats_all_sweeps[0].device
        # Discrete depth grid shared by all ranges.
        d_coords = torch.arange(*self.d_bound,
                                dtype=torch.float,
                                device=device).reshape(1, -1, 1, 1)
        d_coords = d_coords.repeat(batch_size_with_cams, 1, feat_height,
                                   feat_width)
        stereo_depth = stereo_feats_all_sweeps[0].new_zeros(
            batch_size_with_cams, self.depth_channels, feat_height, feat_width)
        mask_score = stereo_feats_all_sweeps[0].new_zeros(
            batch_size_with_cams,
            self.depth_channels,
            feat_height * self.stereo_downsample_factor //
            self.downsample_factor,
            feat_width * self.stereo_downsample_factor //
            self.downsample_factor,
        )
        score_all_ranges = list()
        range_score = range_score_all_sweeps[sweep_index].softmax(1)
        for range_idx in range(self.num_ranges):
            # Map mu to the corresponding interval.
            range_start = self.range_list[range_idx][0]
            mu_all_sweeps_single_range = [
                mu[:, range_idx:range_idx + 1, ...].sigmoid() *
                (self.range_list[range_idx][1] - self.range_list[range_idx][0])
                + range_start for mu in mu_all_sweeps
            ]
            sigma_all_sweeps_single_range = [
                sigma[:, range_idx:range_idx + 1, ...]
                for sigma in sigma_all_sweeps
            ]
            batch_size_with_cams, _, feat_height, feat_width =\
                stereo_feats_all_sweeps[0].shape
            mu = mu_all_sweeps_single_range[sweep_index]
            sigma = sigma_all_sweeps_single_range[sweep_index]
            # EM-style refinement: sample depths around mu, score them by
            # stereo matching, then update mu and sigma from the scores.
            for _ in range(self.em_iteration):
                depth_sample = torch.cat([mu + sigma * k for k in self.k_list],
                                         1)
                depth_sample_frustum = self.create_depth_sample_frustum(
                    depth_sample, self.stereo_downsample_factor)
                mu_score = self._generate_cost_volume(
                    sweep_index,
                    stereo_feats_all_sweeps,
                    mats_dict,
                    depth_sample,
                    depth_sample_frustum,
                    sensor2sensor_mats,
                )
                mu_score = mu_score.softmax(1)
                # Shrink sigma when the central sample dominates the score.
                scale_factor = torch.clamp(
                    0.5 / (1e-4 + mu_score[:, self.num_samples //
                                           2:self.num_samples // 2 + 1, ...]),
                    min=0.1,
                    max=10)
                sigma = torch.clamp(sigma * scale_factor, min=0.1, max=10)
                mu = (depth_sample * mu_score).sum(1, keepdim=True)
                del depth_sample
                del depth_sample_frustum
            range_length = int(
                (self.range_list[range_idx][1] - self.range_list[range_idx][0])
                // self.d_bound[2])
            if self.use_mask:
                # Score cross-sweep mono-depth agreement at the refined mu.
                depth_sample = F.avg_pool2d(
                    mu,
                    self.downsample_factor // self.stereo_downsample_factor,
                    self.downsample_factor // self.stereo_downsample_factor,
                )
                depth_sample_frustum = self.create_depth_sample_frustum(
                    depth_sample, self.downsample_factor)
                mask = self._forward_mask(
                    sweep_index,
                    mono_depth_all_sweeps,
                    mats_dict,
                    depth_sample,
                    depth_sample_frustum,
                    sensor2sensor_mats,
                )
                mask_score[:,
                           int((range_start - self.d_bound[0]) //
                               self.d_bound[2]):range_length +
                           int((range_start - self.d_bound[0]) //
                               self.d_bound[2]), ..., ] += mask
                del depth_sample
                del depth_sample_frustum
            sigma = torch.clamp(sigma, self.min_sigma)
            mu_repeated = mu.repeat(1, range_length, 1, 1)
            eps = 1e-6
            # Evaluate the Gaussian N(mu, sigma) on this range's depth bins.
            depth_score_single_range = (-1 / 2 * (
                (d_coords[:,
                          int((range_start - self.d_bound[0]) //
                              self.d_bound[2]):range_length + int(
                                  (range_start - self.d_bound[0]) //
                                  self.d_bound[2]), ..., ] - mu_repeated) /
                torch.sqrt(sigma))**2)
            depth_score_single_range = depth_score_single_range.exp()
            score_all_ranges.append(mu_score.sum(1).unsqueeze(1))
            depth_score_single_range = depth_score_single_range / (
                sigma * math.sqrt(2 * math.pi) + eps)
            # Blend the per-range density by this range's softmax score.
            stereo_depth[:,
                         int((range_start - self.d_bound[0]) //
                             self.d_bound[2]):range_length +
                         int((range_start - self.d_bound[0]) //
                             self.d_bound[2]), ..., ] = (
                             depth_score_single_range *
                             range_score[:, range_idx:range_idx + 1, ...])
            del depth_score_single_range
            del mu_repeated
        if self.use_mask:
            return stereo_depth, mask_score
        else:
            return stereo_depth
def create_depth_sample_frustum(self, depth_sample, downsample_factor=16):
"""Generate frustum"""
# make grid in image plane
ogfH, ogfW = self.final_dim
fH, fW = ogfH // downsample_factor, ogfW // downsample_factor
batch_size, num_depth, _, _ = depth_sample.shape
x_coords = (torch.linspace(0,
ogfW - 1,
fW,
dtype=torch.float,
device=depth_sample.device).view(
1, 1, 1,
fW).expand(batch_size, num_depth, fH,
fW))
y_coords = (torch.linspace(0,
ogfH - 1,
fH,
dtype=torch.float,
device=depth_sample.device).view(
1, 1, fH,
1).expand(batch_size, num_depth, fH,
fW))
paddings = torch.ones_like(depth_sample)
# D x H x W x 3
frustum = torch.stack((x_coords, y_coords, depth_sample, paddings), -1)
return frustum
def _configure_depth_net(self, depth_net_conf):
return DepthNet(
depth_net_conf['in_channels'],
depth_net_conf['mid_channels'],
self.output_channels,
self.depth_channels,
self.d_bound,
self.num_ranges,
)
def get_cam_feats(self, imgs):
"""Get feature maps from images."""
batch_size, num_sweeps, num_cams, num_channels, imH, imW = imgs.shape
imgs = imgs.flatten().view(batch_size * num_sweeps * num_cams,
num_channels, imH, imW)
backbone_feats = self.img_backbone(imgs)
img_feats = self.img_neck(backbone_feats)[0]
img_feats_reshape = img_feats.reshape(batch_size, num_sweeps, num_cams,
img_feats.shape[1],
img_feats.shape[2],
img_feats.shape[3])
return img_feats_reshape, backbone_feats[0].detach()
def _forward_mask(
self,
sweep_index,
mono_depth_all_sweeps,
mats_dict,
depth_sample,
depth_sample_frustum,
sensor2sensor_mats,
):
"""Forward function to generate mask.
Args:
sweep_index (int): Index of sweep.
mono_depth_all_sweeps (list[Tensor]): List of mono_depth for
all sweeps.
mats_dict (dict):
sensor2ego_mats (Tensor): Transformation matrix from
camera to ego with shape of (B, num_sweeps,
num_cameras, 4, 4).
intrin_mats (Tensor): Intrinsic matrix with shape
of (B, num_sweeps, num_cameras, 4, 4).
ida_mats (Tensor): Transformation matrix for ida with
shape of (B, num_sweeps, num_cameras, 4, 4).
sensor2sensor_mats (Tensor): Transformation matrix
from key frame camera to sweep frame camera with
shape of (B, num_sweeps, num_cameras, 4, 4).
bda_mat (Tensor): Rotation matrix for bda with shape
of (B, 4, 4).
depth_sample (Tensor): Depth map of all candidates.
depth_sample_frustum (Tensor): Pre-generated frustum.
sensor2sensor_mats (Tensor): Transformation matrix from reference
sensor to source sensor.
Returns:
Tensor: Generated mask.
"""
num_sweeps = len(mono_depth_all_sweeps)
mask_all_sweeps = list()
for idx in range(num_sweeps):
if idx == sweep_index:
continue
warped_mono_depth = self.homo_warping(
mono_depth_all_sweeps[idx],
mats_dict['intrin_mats'][:, sweep_index, ...],
mats_dict['intrin_mats'][:, idx, ...],
sensor2sensor_mats[idx],
mats_dict['ida_mats'][:, sweep_index, ...],
mats_dict['ida_mats'][:, idx, ...],
depth_sample,
depth_sample_frustum.type_as(mono_depth_all_sweeps[idx]),
)
mask = self.mask_net(
torch.cat([
mono_depth_all_sweeps[sweep_index].detach(),
warped_mono_depth.mean(2).detach()
], 1))
mask_all_sweeps.append(mask)
return torch.stack(mask_all_sweeps).mean(0)
def _forward_single_sweep(self,
                          sweep_index,
                          context,
                          mats_dict,
                          depth_score,
                          is_return_depth=False):
    """Project one sweep's context features into a BEV feature map.

    Args:
        sweep_index (int): Index of the sweep.
        context (Tensor): Context features with shape
            (B, num_cams, C, H, W).
        mats_dict (dict): Holds sensor2ego_mats, intrin_mats, ida_mats
            with shape (B, num_sweeps, num_cameras, 4, 4) and an
            optional bda_mat of shape (B, 4, 4).
        depth_score (Tensor): Per-pixel depth distribution.
        is_return_depth (bool, optional): Whether to also return the
            depth score. Default: False.

    Returns:
        Tensor: BEV feature map (plus the depth score when requested).
    """
    batch_size, num_cams = context.shape[:2]
    context = context.reshape(batch_size * num_cams, *context.shape[2:])
    depth = depth_score
    geom_xyz = self.get_geometry(
        mats_dict['sensor2ego_mats'][:, sweep_index, ...],
        mats_dict['intrin_mats'][:, sweep_index, ...],
        mats_dict['ida_mats'][:, sweep_index, ...],
        mats_dict.get('bda_mat', None),
    )
    # Metric coordinates -> integer voxel indices.
    geom_xyz = ((geom_xyz - (self.voxel_coord - self.voxel_size / 2.0)) /
                self.voxel_size).int()
    if self.training or self.use_da:
        # Outer product of the depth distribution and context features.
        feat_with_depth = depth.unsqueeze(1) * context.unsqueeze(2)
        feat_with_depth = self._forward_voxel_net(feat_with_depth)
        feat_with_depth = feat_with_depth.reshape(batch_size, num_cams,
                                                  *feat_with_depth.shape[1:])
        feat_with_depth = feat_with_depth.permute(0, 1, 3, 4, 5, 2)
        feature_map = voxel_pooling_train(geom_xyz,
                                          feat_with_depth.contiguous(),
                                          self.voxel_num.cuda())
    else:
        # Inference kernel consumes depth and context separately.
        feature_map = voxel_pooling_inference(geom_xyz, depth.contiguous(),
                                              context.contiguous(),
                                              self.voxel_num.cuda())
    if is_return_depth:
        return feature_map.contiguous(), depth
    return feature_map.contiguous()
def forward(self,
            sweep_imgs,
            mats_dict,
            timestamps=None,
            is_return_depth=False):
    """Forward function.
    Args:
        sweep_imgs(Tensor): Input images with shape of (B, num_sweeps,
            num_cameras, 3, H, W).
        mats_dict(dict):
            sensor2ego_mats(Tensor): Transformation matrix from
                camera to ego with shape of (B, num_sweeps,
                num_cameras, 4, 4).
            intrin_mats(Tensor): Intrinsic matrix with shape
                of (B, num_sweeps, num_cameras, 4, 4).
            ida_mats(Tensor): Transformation matrix for ida with
                shape of (B, num_sweeps, num_cameras, 4, 4).
            sensor2sensor_mats(Tensor): Transformation matrix
                from key frame camera to sweep frame camera with
                shape of (B, num_sweeps, num_cameras, 4, 4).
            bda_mat(Tensor): Rotation matrix for bda with shape
                of (B, 4, 4).
        timestamps(Tensor): Timestamp for all images with the shape of(B,
            num_sweeps, num_cameras).
        is_return_depth (bool, optional): Whether to also return the
            key-frame depth distribution. Default: False.
    Return:
        Tensor: bev feature map.
    """
    batch_size, num_sweeps, num_cams, num_channels, img_height, \
        img_width = sweep_imgs.shape
    context_all_sweeps = list()
    depth_feat_all_sweeps = list()
    img_feats_all_sweeps = list()
    stereo_feats_all_sweeps = list()
    mu_all_sweeps = list()
    sigma_all_sweeps = list()
    mono_depth_all_sweeps = list()
    range_score_all_sweeps = list()
    # Stage 1: per-sweep image features, context and monocular depth.
    # Only the key frame (sweep 0) keeps gradients; all later sweeps are
    # computed under no_grad.
    for sweep_index in range(0, num_sweeps):
        if sweep_index > 0:
            with torch.no_grad():
                img_feats, stereo_feats = self.get_cam_feats(
                    sweep_imgs[:, sweep_index:sweep_index + 1, ...])
                img_feats_all_sweeps.append(
                    img_feats.view(batch_size * num_cams,
                                   *img_feats.shape[3:]))
                stereo_feats_all_sweeps.append(stereo_feats)
                depth_feat, context, mu, sigma, range_score, mono_depth =\
                    self.depth_net(img_feats.view(batch_size * num_cams,
                                                  *img_feats.shape[3:]),
                                   mats_dict)
                context_all_sweeps.append(
                    self.context_downsample_net(
                        context.reshape(batch_size * num_cams,
                                        *context.shape[1:])))
                depth_feat_all_sweeps.append(depth_feat)
        else:
            img_feats, stereo_feats = self.get_cam_feats(
                sweep_imgs[:, sweep_index:sweep_index + 1, ...])
            img_feats_all_sweeps.append(
                img_feats.view(batch_size * num_cams,
                               *img_feats.shape[3:]))
            stereo_feats_all_sweeps.append(stereo_feats)
            depth_feat, context, mu, sigma, range_score, mono_depth =\
                self.depth_net(img_feats.view(batch_size * num_cams,
                                              *img_feats.shape[3:]),
                               mats_dict)
            depth_feat_all_sweeps.append(depth_feat)
            context_all_sweeps.append(
                self.context_downsample_net(
                    context.reshape(batch_size * num_cams,
                                    *context.shape[1:])))
        mu_all_sweeps.append(mu)
        sigma_all_sweeps.append(sigma)
        mono_depth_all_sweeps.append(mono_depth)
        range_score_all_sweeps.append(range_score)
    # Stage 2: stereo matching per reference sweep. For each ref_idx,
    # build the ref->src sensor transforms by going through the key frame.
    depth_score_all_sweeps = list()
    final_depth = None
    for ref_idx in range(num_sweeps):
        sensor2sensor_mats = list()
        for src_idx in range(num_sweeps):
            ref2keysensor_mats = mats_dict[
                'sensor2sensor_mats'][:, ref_idx, ...].inverse()
            key2srcsensor_mats = mats_dict['sensor2sensor_mats'][:,
                                                                 src_idx,
                                                                 ...]
            ref2srcsensor_mats = key2srcsensor_mats @ ref2keysensor_mats
            sensor2sensor_mats.append(ref2srcsensor_mats)
        if ref_idx == 0:
            # last iteration on stage 1 does not have propagation
            # (photometric consistency filtering)
            if self.use_mask:
                stereo_depth, mask = self._forward_stereo(
                    ref_idx,
                    stereo_feats_all_sweeps,
                    mono_depth_all_sweeps,
                    mats_dict,
                    sensor2sensor_mats,
                    mu_all_sweeps,
                    sigma_all_sweeps,
                    range_score_all_sweeps,
                    depth_feat_all_sweeps,
                )
            else:
                stereo_depth = self._forward_stereo(
                    ref_idx,
                    stereo_feats_all_sweeps,
                    mono_depth_all_sweeps,
                    mats_dict,
                    sensor2sensor_mats,
                    mu_all_sweeps,
                    sigma_all_sweeps,
                    range_score_all_sweeps,
                    depth_feat_all_sweeps,
                )
        else:
            # Non-key reference sweeps contribute no gradients.
            with torch.no_grad():
                # last iteration on stage 1 does not have
                # propagation (photometric consistency filtering)
                if self.use_mask:
                    stereo_depth, mask = self._forward_stereo(
                        ref_idx,
                        stereo_feats_all_sweeps,
                        mono_depth_all_sweeps,
                        mats_dict,
                        sensor2sensor_mats,
                        mu_all_sweeps,
                        sigma_all_sweeps,
                        range_score_all_sweeps,
                        depth_feat_all_sweeps,
                    )
                else:
                    stereo_depth = self._forward_stereo(
                        ref_idx,
                        stereo_feats_all_sweeps,
                        mono_depth_all_sweeps,
                        mats_dict,
                        sensor2sensor_mats,
                        mu_all_sweeps,
                        sigma_all_sweeps,
                        range_score_all_sweeps,
                        depth_feat_all_sweeps,
                    )
        # Combine mono and (optionally masked) stereo depth into a
        # per-pixel depth distribution.
        if self.use_mask:
            depth_score = (
                mono_depth_all_sweeps[ref_idx] +
                self.depth_downsample_net(stereo_depth) * mask).softmax(
                    1, dtype=stereo_depth.dtype)
        else:
            depth_score = (
                mono_depth_all_sweeps[ref_idx] +
                self.depth_downsample_net(stereo_depth)).softmax(
                    1, dtype=stereo_depth.dtype)
        depth_score_all_sweeps.append(depth_score)
        if ref_idx == 0:
            # final_depth has to be fp32, otherwise the
            # depth loss will collapse during the training process.
            final_depth = (
                mono_depth_all_sweeps[ref_idx] +
                self.depth_downsample_net(stereo_depth)).softmax(1)
    # Stage 3: lift each sweep's context into BEV; only the key frame
    # keeps gradients.
    key_frame_res = self._forward_single_sweep(
        0,
        context_all_sweeps[0].reshape(batch_size, num_cams,
                                      *context_all_sweeps[0].shape[1:]),
        mats_dict,
        depth_score_all_sweeps[0],
        is_return_depth=is_return_depth,
    )
    if num_sweeps == 1:
        return key_frame_res
    key_frame_feature = key_frame_res[
        0] if is_return_depth else key_frame_res
    ret_feature_list = [key_frame_feature]
    for sweep_index in range(1, num_sweeps):
        with torch.no_grad():
            feature_map = self._forward_single_sweep(
                sweep_index,
                context_all_sweeps[sweep_index].reshape(
                    batch_size, num_cams,
                    *context_all_sweeps[sweep_index].shape[1:]),
                mats_dict,
                depth_score_all_sweeps[sweep_index],
                is_return_depth=False,
            )
        ret_feature_list.append(feature_map)
    # Sweeps are concatenated along the channel dimension.
    if is_return_depth:
        return torch.cat(ret_feature_list, 1), final_depth
    else:
        return torch.cat(ret_feature_list, 1)
================================================
FILE: bevdepth/layers/backbones/fusion_lss_fpn.py
================================================
# Copyright (c) Megvii Inc. All rights reserved.
import torch
import torch.nn as nn
from mmdet.models.backbones.resnet import BasicBlock
try:
from bevdepth.ops.voxel_pooling_inference import voxel_pooling_inference
from bevdepth.ops.voxel_pooling_train import voxel_pooling_train
except ImportError:
print('Import VoxelPooling fail.')
from .base_lss_fpn import ASPP, BaseLSSFPN, Mlp, SELayer
__all__ = ['FusionLSSFPN']
class DepthNet(nn.Module):
    """Depth/context head that fuses image features with lidar depth.

    The output tensor's first ``depth_channels`` channels are depth
    logits and the remaining ``context_channels`` channels are context
    features (concatenated along dim 1).
    """

    def __init__(self, in_channels, mid_channels, context_channels,
                 depth_channels):
        super(DepthNet, self).__init__()
        self.reduce_conv = nn.Sequential(
            nn.Conv2d(in_channels,
                      mid_channels,
                      kernel_size=3,
                      stride=1,
                      padding=1),
            nn.BatchNorm2d(mid_channels),
            nn.ReLU(inplace=True),
        )
        self.context_conv = nn.Conv2d(mid_channels,
                                      context_channels,
                                      kernel_size=1,
                                      stride=1,
                                      padding=0)
        self.mlp = Mlp(1, mid_channels, mid_channels)
        self.se = SELayer(mid_channels)  # NOTE: add camera-aware
        # Embeds the single-channel lidar depth map before fusion.
        self.depth_gt_conv = nn.Sequential(
            nn.Conv2d(1, mid_channels, kernel_size=1, stride=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(mid_channels, mid_channels, kernel_size=1, stride=1),
        )
        self.depth_conv = nn.Sequential(
            BasicBlock(mid_channels, mid_channels),
            BasicBlock(mid_channels, mid_channels),
            BasicBlock(mid_channels, mid_channels),
        )
        self.aspp = ASPP(mid_channels, mid_channels)
        self.depth_pred = nn.Conv2d(mid_channels,
                                    depth_channels,
                                    kernel_size=1,
                                    stride=1,
                                    padding=0)

    def forward(self, x, mats_dict, lidar_depth, scale_depth_factor=1000.0):
        x = self.reduce_conv(x)
        context = self.context_conv(x)
        # Camera-aware conditioning: per-camera pixel size from the
        # inverse intrinsics' focal terms, corrected for the image
        # augmentation scale, drives the SE layer.
        inv_intrinsics = torch.inverse(mats_dict['intrin_mats'][:, 0:1, ...])
        pixel_size = torch.norm(torch.stack(
            [inv_intrinsics[..., 0, 0], inv_intrinsics[..., 1, 1]], dim=-1),
                                dim=-1).reshape(-1, 1)
        # NOTE(review): both terms square ida_mats[..., 0, 0]; the second
        # was presumably meant to be ida_mats[..., 0, 1] — confirm upstream.
        aug_scale = torch.sqrt(mats_dict['ida_mats'][:, 0, :, 0, 0]**2 +
                               mats_dict['ida_mats'][:, 0, :, 0,
                                                     0]**2).reshape(-1, 1)
        scaled_pixel_size = pixel_size * scale_depth_factor / aug_scale
        x_se = self.mlp(scaled_pixel_size)[..., None, None]
        x = self.se(x, x_se)
        # Fuse the lidar depth embedding with image features before the
        # depth prediction branch.
        depth = self.depth_gt_conv(lidar_depth)
        depth = self.depth_conv(x + depth)
        depth = self.aspp(depth)
        depth = self.depth_pred(depth)
        return torch.cat([depth, context], dim=1)
class FusionLSSFPN(BaseLSSFPN):
    """BaseLSSFPN variant whose depth net additionally consumes lidar depth."""

    def _configure_depth_net(self, depth_net_conf):
        # Use the lidar-fusing DepthNet defined in this module.
        return DepthNet(
            depth_net_conf['in_channels'],
            depth_net_conf['mid_channels'],
            self.output_channels,
            self.depth_channels,
        )

    def _forward_depth_net(self, feat, mats_dict, lidar_depth):
        """Run the depth net on image features and downsampled lidar depth."""
        return self.depth_net(feat, mats_dict, lidar_depth)

    def _forward_single_sweep(self,
                              sweep_index,
                              sweep_imgs,
                              mats_dict,
                              sweep_lidar_depth,
                              is_return_depth=False):
        """Forward function for single sweep.
        Args:
            sweep_index (int): Index of sweeps.
            sweep_imgs (Tensor): Input images.
            mats_dict (dict):
                sensor2ego_mats(Tensor): Transformation matrix from
                    camera to ego with shape of (B, num_sweeps,
                    num_cameras, 4, 4).
                intrin_mats(Tensor): Intrinsic matrix with shape
                    of (B, num_sweeps, num_cameras, 4, 4).
                ida_mats(Tensor): Transformation matrix for ida with
                    shape of (B, num_sweeps, num_cameras, 4, 4).
                sensor2sensor_mats(Tensor): Transformation matrix
                    from key frame camera to sweep frame camera with
                    shape of (B, num_sweeps, num_cameras, 4, 4).
                bda_mat(Tensor): Rotation matrix for bda with shape
                    of (B, 4, 4).
            sweep_lidar_depth (Tensor): Depth generated by lidar.
            is_return_depth (bool, optional): Whether to return depth.
                Default: False.
        Returns:
            Tensor: BEV feature map.
        """
        batch_size, num_sweeps, num_cams, num_channels, img_height, \
            img_width = sweep_imgs.shape
        img_feats = self.get_cam_feats(sweep_imgs)
        sweep_lidar_depth = sweep_lidar_depth.reshape(
            batch_size * num_cams, *sweep_lidar_depth.shape[2:])
        source_features = img_feats[:, 0, ...]
        depth_feature = self._forward_depth_net(
            source_features.reshape(batch_size * num_cams,
                                    source_features.shape[2],
                                    source_features.shape[3],
                                    source_features.shape[4]), mats_dict,
            sweep_lidar_depth)
        # First depth_channels channels are depth logits; normalize them.
        depth = depth_feature[:, :self.depth_channels].softmax(
            dim=1, dtype=depth_feature.dtype)
        geom_xyz = self.get_geometry(
            mats_dict['sensor2ego_mats'][:, sweep_index, ...],
            mats_dict['intrin_mats'][:, sweep_index, ...],
            mats_dict['ida_mats'][:, sweep_index, ...],
            mats_dict.get('bda_mat', None),
        )
        # Metric coordinates -> integer voxel indices.
        geom_xyz = ((geom_xyz - (self.voxel_coord - self.voxel_size / 2.0)) /
                    self.voxel_size).int()
        if self.training or self.use_da:
            # Outer product of depth distribution and context channels.
            img_feat_with_depth = depth.unsqueeze(
                1) * depth_feature[:, self.depth_channels:(
                    self.depth_channels + self.output_channels)].unsqueeze(2)
            img_feat_with_depth = self._forward_voxel_net(img_feat_with_depth)
            img_feat_with_depth = img_feat_with_depth.reshape(
                batch_size,
                num_cams,
                img_feat_with_depth.shape[1],
                img_feat_with_depth.shape[2],
                img_feat_with_depth.shape[3],
                img_feat_with_depth.shape[4],
            )
            img_feat_with_depth = img_feat_with_depth.permute(0, 1, 3, 4, 5, 2)
            feature_map = voxel_pooling_train(geom_xyz,
                                              img_feat_with_depth.contiguous(),
                                              self.voxel_num.cuda())
        else:
            # Inference kernel consumes depth and context separately.
            feature_map = voxel_pooling_inference(
                geom_xyz, depth, depth_feature[:, self.depth_channels:(
                    self.depth_channels + self.output_channels)].contiguous(),
                self.voxel_num.cuda())
        if is_return_depth:
            return feature_map.contiguous(), depth.float()
        return feature_map.contiguous()

    def forward(self,
                sweep_imgs,
                mats_dict,
                lidar_depth,
                timestamps=None,
                is_return_depth=False):
        """Forward function.
        Args:
            sweep_imgs(Tensor): Input images with shape of (B, num_sweeps,
                num_cameras, 3, H, W).
            mats_dict(dict):
                sensor2ego_mats(Tensor): Transformation matrix from
                    camera to ego with shape of (B, num_sweeps,
                    num_cameras, 4, 4).
                intrin_mats(Tensor): Intrinsic matrix with shape
                    of (B, num_sweeps, num_cameras, 4, 4).
                ida_mats(Tensor): Transformation matrix for ida with
                    shape of (B, num_sweeps, num_cameras, 4, 4).
                sensor2sensor_mats(Tensor): Transformation matrix
                    from key frame camera to sweep frame camera with
                    shape of (B, num_sweeps, num_cameras, 4, 4).
                bda_mat(Tensor): Rotation matrix for bda with shape
                    of (B, 4, 4).
            lidar_depth (Tensor): Depth generated by lidar.
            timestamps(Tensor): Timestamp for all images with the shape of(B,
                num_sweeps, num_cameras).
        Return:
            Tensor: bev feature map.
        """
        batch_size, num_sweeps, num_cams, num_channels, img_height, \
            img_width = sweep_imgs.shape
        lidar_depth = self.get_downsampled_lidar_depth(lidar_depth)
        # Key frame (sweep 0) keeps gradients; later sweeps run no_grad.
        key_frame_res = self._forward_single_sweep(
            0,
            sweep_imgs[:, 0:1, ...],
            mats_dict,
            lidar_depth[:, 0, ...],
            is_return_depth=is_return_depth)
        if num_sweeps == 1:
            return key_frame_res
        key_frame_feature = key_frame_res[
            0] if is_return_depth else key_frame_res
        ret_feature_list = [key_frame_feature]
        for sweep_index in range(1, num_sweeps):
            with torch.no_grad():
                feature_map = self._forward_single_sweep(
                    sweep_index,
                    sweep_imgs[:, sweep_index:sweep_index + 1, ...],
                    mats_dict,
                    lidar_depth[:, sweep_index, ...],
                    is_return_depth=False)
            ret_feature_list.append(feature_map)
        # Only the key frame's depth is returned.
        if is_return_depth:
            return torch.cat(ret_feature_list, 1), key_frame_res[1]
        else:
            return torch.cat(ret_feature_list, 1)

    def get_downsampled_lidar_depth(self, lidar_depth):
        """Min-pool lidar depth to feature resolution and normalize.

        Args:
            lidar_depth (Tensor): Depth of shape
                (B, num_sweeps, num_cams, H, W).

        Returns:
            Tensor: Depth of shape (B, num_sweeps, num_cams, 1,
                H // downsample_factor, W // downsample_factor), divided
                by the upper depth bound ``d_bound[1]``.
        """
        batch_size, num_sweeps, num_cams, height, width = lidar_depth.shape
        # Split H and W into (patch, downsample_factor) pairs.
        lidar_depth = lidar_depth.view(
            batch_size * num_sweeps * num_cams,
            height // self.downsample_factor,
            self.downsample_factor,
            width // self.downsample_factor,
            self.downsample_factor,
            1,
        )
        lidar_depth = lidar_depth.permute(0, 1, 3, 5, 2, 4).contiguous()
        lidar_depth = lidar_depth.view(
            -1, self.downsample_factor * self.downsample_factor)
        # Zeros mean "no lidar return"; replace them with the global max
        # so the min below picks the closest valid return in each patch.
        gt_depths_tmp = torch.where(lidar_depth == 0.0, lidar_depth.max(),
                                    lidar_depth)
        lidar_depth = torch.min(gt_depths_tmp, dim=-1).values
        lidar_depth = lidar_depth.view(batch_size, num_sweeps, num_cams, 1,
                                       height // self.downsample_factor,
                                       width // self.downsample_factor)
        # Normalize by the upper depth bound.
        lidar_depth = lidar_depth / self.d_bound[1]
        return lidar_depth
================================================
FILE: bevdepth/layers/backbones/matrixvt.py
================================================
# Copyright (c) Megvii Inc. All rights reserved.
import torch
from torch import nn
from torch.cuda.amp import autocast
from bevdepth.layers.backbones.base_lss_fpn import BaseLSSFPN
class HoriConv(nn.Module):
def __init__(self, in_channels, mid_channels, out_channels, cat_dim=0):
"""HoriConv that reduce the image feature
in height dimension and refine it.
Args:
in_channels (int): in_channels
mid_channels (int): mid_channels
out_channels (int): output channels
cat_dim (int, optional): channels of position
embedding. Defaults to 0.
"""
super().__init__()
self.merger = nn.Sequential(
nn.Conv2d(in_channels + cat_dim,
in_channels,
kernel_size=1,
bias=True),
nn.Sigmoid(),
nn.Conv2d(in_channels, in_channels, kernel_size=1, bias=True),
)
self.reduce_conv = nn.Sequential(
nn.Conv1d(
in_channels,
mid_channels,
kernel_size=3,
stride=1,
padding=1,
bias=False,
),
nn.BatchNorm1d(mid_channels),
nn.ReLU(inplace=True),
)
self.conv1 = nn.Sequential(
nn.Conv1d(
mid_channels,
mid_channels,
kernel_size=3,
stride=1,
padding=1,
bias=False,
),
nn.BatchNorm1d(mid_channels),
nn.ReLU(inplace=True),
nn.Conv1d(
mid_channels,
mid_channels,
kernel_size=3,
stride=1,
padding=1,
bias=False,
),
nn.BatchNorm1d(mid_channels),
nn.ReLU(inplace=True),
)
self.conv2 = nn.Sequential(
nn.Conv1d(
mid_channels,
mid_channels,
kernel_size=3,
stride=1,
padding=1,
bias=False,
),
nn.BatchNorm1d(mid_channels),
nn.ReLU(inplace=True),
nn.Conv1d(
mid_channels,
mid_channels,
kernel_size=3,
stride=1,
padding=1,
bias=False,
),
nn.BatchNorm1d(mid_channels),
nn.ReLU(inplace=True),
)
self.out_conv = nn.Sequential(
nn.Conv1d(
mid_channels,
out_channels,
kernel_size=3,
stride=1,
padding=1,
bias=True,
),
nn.BatchNorm1d(out_channels),
nn.ReLU(inplace=True),
)
@autocast(False)
def forward(self, x, pe=None):
# [N,C,H,W]
if pe is not None:
x = self.merger(torch.cat([x, pe], 1))
else:
x = self.merger(x)
x = x.max(2)[0]
x = self.reduce_conv(x)
x = self.conv1(x) + x
x = self.conv2(x) + x
x = self.out_conv(x)
return x
class DepthReducer(nn.Module):
def __init__(self, img_channels, mid_channels):
"""Module that compresses the predicted
categorical depth in height dimension
Args:
img_channels (int): in_channels
mid_channels (int): mid_channels
"""
super().__init__()
self.vertical_weighter = nn.Sequential(
nn.Conv2d(img_channels,
mid_channels,
kernel_size=3,
stride=1,
padding=1),
nn.BatchNorm2d(mid_channels),
nn.ReLU(inplace=True),
nn.Conv2d(mid_channels, 1, kernel_size=3, stride=1, padding=1),
)
@autocast(False)
def forward(self, feat, depth):
vert_weight = self.vertical_weighter(feat).softmax(2) # [N,1,H,W]
depth = (depth * vert_weight).sum(2)
return depth
# NOTE Modified Lift-Splat
class MatrixVT(BaseLSSFPN):
    """MatrixVT view transformer (modified Lift-Splat).

    Replaces voxel pooling with matrix products against pre-computed
    Ring (circle) and Ray matrices built from the camera geometry.
    """

    def __init__(
        self,
        x_bound,
        y_bound,
        z_bound,
        d_bound,
        final_dim,
        downsample_factor,
        output_channels,
        img_backbone_conf,
        img_neck_conf,
        depth_net_conf,
    ):
        """Modified from LSSFPN.

        Args:
            x_bound (list): Boundaries for x.
            y_bound (list): Boundaries for y.
            z_bound (list): Boundaries for z.
            d_bound (list): Boundaries for d.
            final_dim (list): Dimension for input images.
            downsample_factor (int): Downsample factor between feature map
                and input image.
            output_channels (int): Number of channels for the output
                feature map.
            img_backbone_conf (dict): Config for image backbone.
            img_neck_conf (dict): Config for image neck.
            depth_net_conf (dict): Config for depth net.
        """
        super().__init__(
            x_bound,
            y_bound,
            z_bound,
            d_bound,
            final_dim,
            downsample_factor,
            output_channels,
            img_backbone_conf,
            img_neck_conf,
            depth_net_conf,
            use_da=False,
        )
        self.register_buffer('bev_anchors',
                             self.create_bev_anchors(x_bound, y_bound))
        self.horiconv = HoriConv(self.output_channels, 512,
                                 self.output_channels)
        self.depth_reducer = DepthReducer(self.output_channels,
                                          self.output_channels)
        # May be set externally for deployment with static intrin/extrin
        # mats; when not None it is returned by get_proj_mat as-is.
        self.static_mat = None

    def create_bev_anchors(self, x_bound, y_bound, ds_rate=1):
        """Create anchors in BEV space.

        Args:
            x_bound (list): xbound in meters [start, end, step].
            y_bound (list): ybound in meters [start, end, step].
            ds_rate (int, optional): downsample rate. Defaults to 1.

        Returns:
            Tensor: anchor cell centers in [W, H, 2].
        """
        x_coords = ((torch.linspace(
            x_bound[0],
            x_bound[1] - x_bound[2] * ds_rate,
            self.voxel_num[0] // ds_rate,
            dtype=torch.float,
        ) + x_bound[2] * ds_rate / 2).view(self.voxel_num[0] // ds_rate,
                                           1).expand(
                                               self.voxel_num[0] // ds_rate,
                                               self.voxel_num[1] // ds_rate))
        y_coords = ((torch.linspace(
            y_bound[0],
            y_bound[1] - y_bound[2] * ds_rate,
            self.voxel_num[1] // ds_rate,
            dtype=torch.float,
        ) + y_bound[2] * ds_rate / 2).view(
            1,
            self.voxel_num[1] // ds_rate).expand(self.voxel_num[0] // ds_rate,
                                                 self.voxel_num[1] // ds_rate))
        anchors = torch.stack([x_coords, y_coords]).permute(1, 2, 0)
        return anchors

    def get_proj_mat(self, mats_dict=None):
        """Create the Ring Matrix and Ray Matrix.

        Args:
            mats_dict (dict, optional): dictionary that
                contains intrin- and extrin- parameters.
                Defaults to None.

        Returns:
            tuple: Ring Matrix in [B, D, L*L] and Ray Matrix in
                [B, Nc*W, L*L] (or ``self.static_mat`` when it is set).
        """
        if self.static_mat is not None:
            return self.static_mat
        bev_size = int(self.voxel_num[0])  # only consider square BEV
        geom_sep = self.get_geometry(
            mats_dict['sensor2ego_mats'][:, 0, ...],
            mats_dict['intrin_mats'][:, 0, ...],
            mats_dict['ida_mats'][:, 0, ...],
            mats_dict.get('bda_mat', None),
        )
        # Metric coordinates -> fractional BEV grid coordinates.
        geom_sep = (
            geom_sep -
            (self.voxel_coord - self.voxel_size / 2.0)) / self.voxel_size
        geom_sep = geom_sep.mean(3).permute(0, 1, 3, 2,
                                            4).contiguous()  # B,Ncam,W,D,2
        B, Nc, W, D, _ = geom_sep.shape
        geom_sep = geom_sep.long().view(B, Nc * W, D, -1)[..., :2]
        # Redirect out-of-range cells to the (later zeroed) center cell.
        invalid1 = torch.logical_or((geom_sep < 0)[..., 0],
                                    (geom_sep < 0)[..., 1])
        invalid2 = torch.logical_or((geom_sep > (bev_size - 1))[..., 0],
                                    (geom_sep > (bev_size - 1))[..., 1])
        geom_sep[(invalid1 | invalid2)] = int(bev_size / 2)
        geom_idx = geom_sep[..., 1] * bev_size + geom_sep[..., 0]
        geom_uni = self.bev_anchors[None].repeat([B, 1, 1, 1])  # B,128,128,2
        L = geom_uni.shape[1]  # square BEV side length
        circle_map = geom_uni.new_zeros((B, D, L * L))
        ray_map = geom_uni.new_zeros((B, Nc * W, L * L))
        for b in range(B):
            # Mark the BEV cells hit by each camera ray and by each
            # depth ring.
            for ray_idx in range(Nc * W):
                ray_map[b, ray_idx, geom_idx[b, ray_idx]] += 1
            for d in range(D):
                circle_map[b, d, geom_idx[b, :, d]] += 1
        # Zero the dump cell that collected all invalid projections.
        null_point = int((bev_size / 2) * (bev_size + 1))
        circle_map[..., null_point] = 0
        ray_map[..., null_point] = 0
        circle_map = circle_map.view(B, D, L * L)
        ray_map = ray_map.view(B, -1, L * L)
        circle_map /= circle_map.max(1)[0].clip(min=1)[:, None]
        ray_map /= ray_map.max(1)[0].clip(min=1)[:, None]
        return circle_map, ray_map

    @autocast(False)
    def reduce_and_project(self, feature, depth, mats_dict):
        """Reduce feature and depth along height and project to BEV.

        Args:
            feature (Tensor): image feature in [B*N, C, H, W].
            depth (Tensor): Depth Prediction in [B*N, D, H, W].
            mats_dict (dict): dictionary that contains intrin-
                and extrin- parameters.

        Returns:
            Tensor: BEV feature in [B, C, L, L].
        """
        # Height-compressed depth: [B*N, D, W].
        depth = self.depth_reducer(feature, depth)
        B = mats_dict['intrin_mats'].shape[0]
        # Height-compressed feature: [B*N, C, W].
        feature = self.horiconv(feature)
        depth = depth.permute(0, 2, 1).reshape(B, -1, self.depth_channels)
        feature = feature.permute(0, 2, 1).reshape(B, -1,
                                                   self.output_channels)
        circle_map, ray_map = self.get_proj_mat(mats_dict)
        # Depth x Ring matrix, masked by the Ray matrix, gives the
        # pixel-column-to-BEV-cell projection weights.
        proj_mat = depth.matmul(circle_map)
        proj_mat = (proj_mat * ray_map).permute(0, 2, 1)
        img_feat_with_depth = proj_mat.matmul(feature)
        img_feat_with_depth = img_feat_with_depth.permute(0, 2, 1).reshape(
            B, -1, *self.voxel_num[:2])
        return img_feat_with_depth

    def _forward_single_sweep(self,
                              sweep_index,
                              sweep_imgs,
                              mats_dict,
                              is_return_depth=False):
        """Forward one sweep and return its BEV feature.

        Args:
            sweep_index (int): Index of the sweep.
            sweep_imgs (Tensor): Images of shape
                (B, num_sweeps, num_cams, C, H, W).
            mats_dict (dict): Transformation matrices.
            is_return_depth (bool, optional): Whether to also return the
                depth distribution. Default: False.

        Returns:
            Tensor: BEV feature (plus depth when requested).
        """
        (
            batch_size,
            num_sweeps,
            num_cams,
            num_channels,
            img_height,
            img_width,
        ) = sweep_imgs.shape
        img_feats = self.get_cam_feats(sweep_imgs)
        source_features = img_feats[:, 0, ...]
        depth_feature = self.depth_net(
            source_features.reshape(
                batch_size * num_cams,
                source_features.shape[2],
                source_features.shape[3],
                source_features.shape[4],
            ),
            mats_dict,
        )
        with autocast(enabled=False):
            # Split the prediction into context features and depth logits.
            feature = depth_feature[:, self.depth_channels:(
                self.depth_channels + self.output_channels)].float()
            depth = depth_feature[:, :self.depth_channels].float().softmax(1)
            img_feat_with_depth = self.reduce_and_project(
                feature, depth, mats_dict)  # [b*n, c, d, w]
        if is_return_depth:
            return img_feat_with_depth.contiguous(), depth
        return img_feat_with_depth.contiguous()
if __name__ == '__main__':
    # Smoke test: build MatrixVT with a nuScenes-style config, run random
    # inputs through it and print the output shapes.
    backbone_conf = {
        'x_bound': [-51.2, 51.2, 0.8],  # BEV grids bounds and size (m)
        'y_bound': [-51.2, 51.2, 0.8],  # BEV grids bounds and size (m)
        'z_bound': [-5, 3, 8],  # BEV grids bounds and size (m)
        'd_bound': [2.0, 58.0,
                    0.5],  # Categorical Depth bounds and division (m)
        'final_dim': (256, 704),  # img size for model input (pix)
        'output_channels':
        80,  # BEV feature channels
        'downsample_factor':
        16,  # ds factor of the feature to be projected to BEV (e.g. 256x704 -> 16x44) # noqa
        'img_backbone_conf':
        dict(
            type='ResNet',
            depth=50,
            frozen_stages=0,
            out_indices=[0, 1, 2, 3],
            norm_eval=False,
            init_cfg=dict(type='Pretrained',
                          checkpoint='torchvision://resnet50'),
        ),
        'img_neck_conf':
        dict(
            type='SECONDFPN',
            in_channels=[256, 512, 1024, 2048],
            upsample_strides=[0.25, 0.5, 1, 2],
            out_channels=[128, 128, 128, 128],
        ),
        'depth_net_conf':
        dict(in_channels=512, mid_channels=512),
    }
    model = MatrixVT(**backbone_conf)
    # for inference and deployment where intrin & extrin mats are static
    # model.static_mat = model.get_proj_mat(mats_dict)
    bev_feature, depth = model(
        torch.rand((2, 1, 6, 3, 256, 704)), {
            'sensor2ego_mats': torch.rand((2, 1, 6, 4, 4)),
            'intrin_mats': torch.rand((2, 1, 6, 4, 4)),
            'ida_mats': torch.rand((2, 1, 6, 4, 4)),
            'sensor2sensor_mats': torch.rand((2, 1, 6, 4, 4)),
            'bda_mat': torch.rand((2, 4, 4)),
        },
        is_return_depth=True)
    print(bev_feature.shape, depth.shape)
================================================
FILE: bevdepth/layers/heads/__init__.py
================================================
from .bev_depth_head import BEVDepthHead
__all__ = ['BEVDepthHead']
================================================
FILE: bevdepth/layers/heads/bev_depth_head.py
================================================
"""Inherited from `https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/models/dense_heads/centerpoint_head.py`""" # noqa
import numba
import numpy as np
import torch
from mmdet3d.core import draw_heatmap_gaussian, gaussian_radius
from mmdet3d.models import build_neck
from mmdet3d.models.dense_heads.centerpoint_head import CenterHead, circle_nms
from mmdet3d.models.utils import clip_sigmoid
from mmdet.core import reduce_mean
from mmdet.models import build_backbone
from torch.cuda.amp import autocast
__all__ = ['BEVDepthHead']
# Default BEV encoder trunk: ResNet-18-style backbone over the 80-channel
# BEV feature map (used as BEVDepthHead's default bev_backbone_conf).
bev_backbone_conf = dict(
    type='ResNet',
    in_channels=80,
    depth=18,
    num_stages=3,
    strides=(1, 2, 2),
    dilations=(1, 1, 1),
    out_indices=[0, 1, 2],
    norm_eval=False,
    base_channels=160,
)

# Default neck fusing the three trunk stages into a single-scale map
# (64 + 64 + 128 = 256 output channels).
bev_neck_conf = dict(type='SECONDFPN',
                     in_channels=[160, 320, 640],
                     upsample_strides=[2, 4, 8],
                     out_channels=[64, 64, 128])
@numba.jit(nopython=True)
def size_aware_circle_nms(dets, thresh_scale, post_max_size=83):
    """Size-aware circular NMS.

    An object is only counted as positive if no other center with a
    higher confidence exists within a distance threshold derived from
    both boxes' extents and yaws, using a bird-eye view distance metric.

    Args:
        dets (np.ndarray): Detection results with columns
            [x, y, dx, dy, yaw, ..., score] — score must be last.
        thresh_scale (float): Scale applied to the size-derived
            suppression thresholds.
        post_max_size (int): Max number of prediction to be kept. Defaults
            to 83.

    Returns:
        list[int]: Indexes of the detections to be kept.
    """
    x1 = dets[:, 0]
    y1 = dets[:, 1]
    dx1 = dets[:, 2]
    dy1 = dets[:, 3]
    yaws = dets[:, 4]
    scores = dets[:, -1]
    order = scores.argsort()[::-1].astype(np.int32)  # highest->lowest
    ndets = dets.shape[0]
    suppressed = np.zeros((ndets), dtype=np.int32)
    keep = []
    for _i in range(ndets):
        i = order[_i]  # start with highest score box
        if suppressed[
                i] == 1:  # if any box have enough iou with this, remove it
            continue
        keep.append(i)
        for _j in range(_i + 1, ndets):
            j = order[_j]
            if suppressed[j] == 1:
                continue
            # calculate center distance between i and j box
            dist_x = abs(x1[i] - x1[j])
            dist_y = abs(y1[i] - y1[j])
            # Axis-aligned extent of both rotated boxes along x and y;
            # half of it (times thresh_scale) is the suppression radius.
            dist_x_th = (abs(dx1[i] * np.cos(yaws[i])) +
                         abs(dx1[j] * np.cos(yaws[j])) +
                         abs(dy1[i] * np.sin(yaws[i])) +
                         abs(dy1[j] * np.sin(yaws[j])))
            dist_y_th = (abs(dx1[i] * np.sin(yaws[i])) +
                         abs(dx1[j] * np.sin(yaws[j])) +
                         abs(dy1[i] * np.cos(yaws[i])) +
                         abs(dy1[j] * np.cos(yaws[j])))
            # ovr = inter / areas[j]
            if dist_x <= dist_x_th * thresh_scale / 2 and \
                    dist_y <= dist_y_th * thresh_scale / 2:
                suppressed[j] = 1
    return keep[:post_max_size]
class BEVDepthHead(CenterHead):
"""Head for BevDepth.
Args:
in_channels(int): Number of channels after bev_neck.
tasks(dict): Tasks for head.
bbox_coder(dict): Config of bbox coder.
common_heads(dict): Config of head for each task.
loss_cls(dict): Config of classification loss.
loss_bbox(dict): Config of regression loss.
gaussian_overlap(float): Gaussian overlap used for `get_targets`.
min_radius(int): Min radius used for `get_targets`.
train_cfg(dict): Config used in the training process.
test_cfg(dict): Config used in the test process.
bev_backbone_conf(dict): Cnfig of bev_backbone.
bev_neck_conf(dict): Cnfig of bev_neck.
"""
def __init__(
    self,
    in_channels=256,
    tasks=None,
    bbox_coder=None,
    common_heads=dict(),
    loss_cls=dict(type='GaussianFocalLoss', reduction='mean'),
    loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25),
    gaussian_overlap=0.1,
    min_radius=2,
    train_cfg=None,
    test_cfg=None,
    bev_backbone_conf=bev_backbone_conf,
    bev_neck_conf=bev_neck_conf,
    separate_head=dict(type='SeparateHead',
                       init_bias=-2.19,
                       final_kernel=3),
):
    """Initialize the head; see the class docstring for parameters.

    NOTE(review): the dict defaults above are mutable default arguments;
    they are only passed through to CenterHead here, but confirm nothing
    downstream mutates them across instances.
    """
    super(BEVDepthHead, self).__init__(
        in_channels=in_channels,
        tasks=tasks,
        bbox_coder=bbox_coder,
        common_heads=common_heads,
        loss_cls=loss_cls,
        loss_bbox=loss_bbox,
        separate_head=separate_head,
    )
    # BEV encoder applied before the CenterHead: ResNet trunk + FPN neck.
    self.trunk = build_backbone(bev_backbone_conf)
    self.trunk.init_weights()
    self.neck = build_neck(bev_neck_conf)
    self.neck.init_weights()
    # The stem maxpool is dropped — presumably to keep BEV resolution;
    # forward() below never calls it.
    del self.trunk.maxpool
    self.gaussian_overlap = gaussian_overlap
    self.min_radius = min_radius
    self.train_cfg = train_cfg
    self.test_cfg = test_cfg
@autocast(False)
def forward(self, x):
    """Forward pass.

    Args:
        x (torch.Tensor): BEV feature map.

    Returns:
        tuple(list[dict]): Output results for tasks.
    """
    x = x.float()
    # Collect the input plus selected trunk stage outputs for the neck.
    trunk_outs = [x]
    if self.trunk.deep_stem:
        x = self.trunk.stem(x)
    else:
        x = self.trunk.relu(self.trunk.norm1(self.trunk.conv1(x)))
    for stage_idx, layer_name in enumerate(self.trunk.res_layers):
        x = getattr(self.trunk, layer_name)(x)
        if stage_idx in self.trunk.out_indices:
            trunk_outs.append(x)
    fpn_output = self.neck(trunk_outs)
    # Delegate the per-task heads to CenterHead.
    return super().forward(fpn_output)
    def get_targets_single(self, gt_bboxes_3d, gt_labels_3d):
        """Generate training targets for a single sample.

        Args:
            gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): Ground truth gt boxes.
            gt_labels_3d (torch.Tensor): Labels of boxes.

        Returns:
            tuple[list[torch.Tensor]]: Tuple of target including \
                the following results in order.

                - list[torch.Tensor]: Heatmap scores.
                - list[torch.Tensor]: Ground truth boxes.
                - list[torch.Tensor]: Indexes indicating the position \
                    of the valid boxes.
                - list[torch.Tensor]: Masks indicating which boxes \
                    are valid.
        """
        max_objs = self.train_cfg['max_objs'] * self.train_cfg['dense_reg']
        grid_size = torch.tensor(self.train_cfg['grid_size'])
        pc_range = torch.tensor(self.train_cfg['point_cloud_range'])
        voxel_size = torch.tensor(self.train_cfg['voxel_size'])
        # Heatmap resolution: grid cells divided by the output stride.
        feature_map_size = grid_size[:2] // self.train_cfg['out_size_factor']
        # reorganize the gt_dict by tasks: one index mask per class, grouped
        # into one list per task head.
        task_masks = []
        flag = 0
        for class_name in self.class_names:
            task_masks.append([
                torch.where(gt_labels_3d == class_name.index(i) + flag)
                for i in class_name
            ])
            flag += len(class_name)
        task_boxes = []
        task_classes = []
        flag2 = 0
        for idx, mask in enumerate(task_masks):
            task_box = []
            task_class = []
            for m in mask:
                task_box.append(gt_bboxes_3d[m])
                # 0 is background for each task, so we need to add 1 here.
                task_class.append(gt_labels_3d[m] + 1 - flag2)
            task_boxes.append(
                torch.cat(task_box, axis=0).to(gt_bboxes_3d.device))
            task_classes.append(
                torch.cat(task_class).long().to(gt_bboxes_3d.device))
            flag2 += len(mask)
        draw_gaussian = draw_heatmap_gaussian
        heatmaps, anno_boxes, inds, masks = [], [], [], []
        for idx, task_head in enumerate(self.task_heads):
            # Per-class gaussian heatmap for this task; note (y, x) layout.
            heatmap = gt_bboxes_3d.new_zeros(
                (len(self.class_names[idx]), feature_map_size[1],
                 feature_map_size[0]),
                device='cuda')
            # Regression targets, one row per (potential) object slot.
            anno_box = gt_bboxes_3d.new_zeros(
                (max_objs, len(self.train_cfg['code_weights'])),
                dtype=torch.float32,
                device='cuda')
            # Flattened heatmap index of each valid object's center cell.
            ind = gt_labels_3d.new_zeros((max_objs),
                                         dtype=torch.int64,
                                         device='cuda')
            # 1 where the corresponding anno_box/ind row is valid.
            mask = gt_bboxes_3d.new_zeros((max_objs),
                                          dtype=torch.uint8,
                                          device='cuda')
            num_objs = min(task_boxes[idx].shape[0], max_objs)
            for k in range(num_objs):
                cls_id = task_classes[idx][k] - 1
                width = task_boxes[idx][k][3]
                length = task_boxes[idx][k][4]
                # Convert box extents from metres to heatmap cells.
                width = width / voxel_size[0] / self.train_cfg[
                    'out_size_factor']
                length = length / voxel_size[1] / self.train_cfg[
                    'out_size_factor']
                if width > 0 and length > 0:
                    radius = gaussian_radius(
                        (length, width),
                        min_overlap=self.train_cfg['gaussian_overlap'])
                    radius = max(self.train_cfg['min_radius'], int(radius))
                    # be really careful for the coordinate system of
                    # your box annotation.
                    x, y, z = task_boxes[idx][k][0], task_boxes[idx][k][
                        1], task_boxes[idx][k][2]
                    coor_x = (
                        x - pc_range[0]
                    ) / voxel_size[0] / self.train_cfg['out_size_factor']
                    coor_y = (
                        y - pc_range[1]
                    ) / voxel_size[1] / self.train_cfg['out_size_factor']
                    center = torch.tensor([coor_x, coor_y],
                                          dtype=torch.float32,
                                          device='cuda')
                    center_int = center.to(torch.int32)
                    # throw out not in range objects to avoid out of array
                    # area when creating the heatmap
                    if not (0 <= center_int[0] < feature_map_size[0]
                            and 0 <= center_int[1] < feature_map_size[1]):
                        continue
                    draw_gaussian(heatmap[cls_id], center_int, radius)
                    new_idx = k
                    x, y = center_int[0], center_int[1]
                    assert y * feature_map_size[0] + x < feature_map_size[
                        0] * feature_map_size[1]
                    ind[new_idx] = y * feature_map_size[0] + x
                    mask[new_idx] = 1
                    # TODO: support other outdoor dataset
                    if len(task_boxes[idx][k]) > 7:
                        # Boxes with >7 dims carry velocity (vx, vy).
                        vx, vy = task_boxes[idx][k][7:]
                    rot = task_boxes[idx][k][6]
                    box_dim = task_boxes[idx][k][3:6]
                    if self.norm_bbox:
                        # Dimensions are regressed in log space.
                        box_dim = box_dim.log()
                    if len(task_boxes[idx][k]) > 7:
                        # Target layout: (dx, dy) sub-cell offset, z, dims,
                        # sin/cos of yaw, then velocity.
                        anno_box[new_idx] = torch.cat([
                            center - torch.tensor([x, y], device='cuda'),
                            z.unsqueeze(0),
                            box_dim,
                            torch.sin(rot).unsqueeze(0),
                            torch.cos(rot).unsqueeze(0),
                            vx.unsqueeze(0),
                            vy.unsqueeze(0),
                        ])
                    else:
                        anno_box[new_idx] = torch.cat([
                            center - torch.tensor([x, y], device='cuda'),
                            z.unsqueeze(0), box_dim,
                            torch.sin(rot).unsqueeze(0),
                            torch.cos(rot).unsqueeze(0)
                        ])
            heatmaps.append(heatmap)
            anno_boxes.append(anno_box)
            masks.append(mask)
            inds.append(ind)
        return heatmaps, anno_boxes, inds, masks
    def loss(self, targets, preds_dicts, **kwargs):
        """Loss function for BEVDepthHead.

        Args:
            targets (tuple[list[torch.Tensor]]): Targets as produced by
                ``get_targets`` (heatmaps, anno_boxes, inds, masks).
            preds_dicts (dict): Output of forward function.

        Returns:
            torch.Tensor: Summed heatmap and bbox loss over all tasks.
        """
        heatmaps, anno_boxes, inds, masks = targets
        return_loss = 0
        for task_id, preds_dict in enumerate(preds_dicts):
            # heatmap focal loss
            # NOTE: clip_sigmoid result is written back into preds_dict,
            # so this call mutates the predictions in place.
            preds_dict[0]['heatmap'] = clip_sigmoid(preds_dict[0]['heatmap'])
            num_pos = heatmaps[task_id].eq(1).float().sum().item()
            # Average over positives across all workers (distributed mean).
            cls_avg_factor = torch.clamp(reduce_mean(
                heatmaps[task_id].new_tensor(num_pos)),
                                         min=1).item()
            loss_heatmap = self.loss_cls(preds_dict[0]['heatmap'],
                                         heatmaps[task_id],
                                         avg_factor=cls_avg_factor)
            target_box = anno_boxes[task_id]
            # reconstruct the anno_box from multiple reg heads
            if 'vel' in preds_dict[0].keys():
                preds_dict[0]['anno_box'] = torch.cat(
                    (preds_dict[0]['reg'], preds_dict[0]['height'],
                     preds_dict[0]['dim'], preds_dict[0]['rot'],
                     preds_dict[0]['vel']),
                    dim=1,
                )
            else:
                preds_dict[0]['anno_box'] = torch.cat(
                    (preds_dict[0]['reg'], preds_dict[0]['height'],
                     preds_dict[0]['dim'], preds_dict[0]['rot']),
                    dim=1,
                )
            # Regression loss for dimension, offset, height, rotation
            num = masks[task_id].float().sum()
            ind = inds[task_id]
            # Gather per-object predictions at the target heatmap positions.
            pred = preds_dict[0]['anno_box'].permute(0, 2, 3, 1).contiguous()
            pred = pred.view(pred.size(0), -1, pred.size(3))
            pred = self._gather_feat(pred, ind)
            mask = masks[task_id].unsqueeze(2).expand_as(target_box).float()
            num = torch.clamp(reduce_mean(target_box.new_tensor(num)),
                              min=1e-4).item()
            # Zero out NaN targets so they never contribute to the loss.
            isnotnan = (~torch.isnan(target_box)).float()
            mask *= isnotnan
            code_weights = self.train_cfg['code_weights']
            bbox_weights = mask * mask.new_tensor(code_weights)
            loss_bbox = self.loss_bbox(pred,
                                       target_box,
                                       bbox_weights,
                                       avg_factor=num)
            return_loss += loss_bbox
            return_loss += loss_heatmap
        return return_loss
    def get_bboxes(self, preds_dicts, img_metas, img=None, rescale=False):
        """Generate bboxes from bbox head predictions.

        Args:
            preds_dicts (tuple[list[dict]]): Prediction results.
            img_metas (list[dict]): Point cloud and image's meta info.

        Returns:
            list[dict]: Decoded bbox, scores and labels after nms.
        """
        rets = []
        for task_id, preds_dict in enumerate(preds_dicts):
            num_class_with_bg = self.num_classes[task_id]
            batch_size = preds_dict[0]['heatmap'].shape[0]
            batch_heatmap = preds_dict[0]['heatmap'].sigmoid()
            batch_reg = preds_dict[0]['reg']
            batch_hei = preds_dict[0]['height']
            if self.norm_bbox:
                # Dimensions were regressed in log space; undo it here.
                batch_dim = torch.exp(preds_dict[0]['dim'])
            else:
                batch_dim = preds_dict[0]['dim']
            # Rotation is predicted as (sin, cos) channels.
            batch_rots = preds_dict[0]['rot'][:, 0].unsqueeze(1)
            batch_rotc = preds_dict[0]['rot'][:, 1].unsqueeze(1)
            if 'vel' in preds_dict[0]:
                batch_vel = preds_dict[0]['vel']
            else:
                batch_vel = None
            temp = self.bbox_coder.decode(batch_heatmap,
                                          batch_rots,
                                          batch_rotc,
                                          batch_hei,
                                          batch_dim,
                                          batch_vel,
                                          reg=batch_reg,
                                          task_id=task_id)
            assert self.test_cfg['nms_type'] in [
                'size_aware_circle', 'circle', 'rotate'
            ]
            batch_reg_preds = [box['bboxes'] for box in temp]
            batch_cls_preds = [box['scores'] for box in temp]
            batch_cls_labels = [box['labels'] for box in temp]
            if self.test_cfg['nms_type'] == 'circle':
                # Circle NMS: suppress by center distance only.
                ret_task = []
                for i in range(batch_size):
                    boxes3d = temp[i]['bboxes']
                    scores = temp[i]['scores']
                    labels = temp[i]['labels']
                    centers = boxes3d[:, [0, 1]]
                    boxes = torch.cat([centers, scores.view(-1, 1)], dim=1)
                    keep = torch.tensor(circle_nms(
                        boxes.detach().cpu().numpy(),
                        self.test_cfg['min_radius'][task_id],
                        post_max_size=self.test_cfg['post_max_size']),
                                        dtype=torch.long,
                                        device=boxes.device)
                    boxes3d = boxes3d[keep]
                    scores = scores[keep]
                    labels = labels[keep]
                    ret = dict(bboxes=boxes3d, scores=scores, labels=labels)
                    ret_task.append(ret)
                rets.append(ret_task)
            elif self.test_cfg['nms_type'] == 'size_aware_circle':
                # Size-aware circle NMS also uses box extents and yaw.
                ret_task = []
                for i in range(batch_size):
                    boxes3d = temp[i]['bboxes']
                    scores = temp[i]['scores']
                    labels = temp[i]['labels']
                    boxes_2d = boxes3d[:, [0, 1, 3, 4, 6]]
                    boxes = torch.cat([boxes_2d, scores.view(-1, 1)], dim=1)
                    keep = torch.tensor(
                        size_aware_circle_nms(
                            boxes.detach().cpu().numpy(),
                            self.test_cfg['thresh_scale'][task_id],
                            post_max_size=self.test_cfg['post_max_size'],
                        ),
                        dtype=torch.long,
                        device=boxes.device,
                    )
                    boxes3d = boxes3d[keep]
                    scores = scores[keep]
                    labels = labels[keep]
                    ret = dict(bboxes=boxes3d, scores=scores, labels=labels)
                    ret_task.append(ret)
                rets.append(ret_task)
            else:
                # 'rotate': fall back to the inherited rotated-box NMS path.
                rets.append(
                    self.get_task_detections(num_class_with_bg,
                                             batch_cls_preds, batch_reg_preds,
                                             batch_cls_labels, img_metas))
        # Merge branches results
        num_samples = len(rets[0])
        ret_list = []
        for i in range(num_samples):
            for k in rets[0][i].keys():
                if k == 'bboxes':
                    bboxes = torch.cat([ret[i][k] for ret in rets])
                elif k == 'scores':
                    scores = torch.cat([ret[i][k] for ret in rets])
                elif k == 'labels':
                    # Offset each task's labels so they index into the
                    # global (concatenated) class list.
                    flag = 0
                    for j, num_class in enumerate(self.num_classes):
                        rets[j][i][k] += flag
                        flag += num_class
                    labels = torch.cat([ret[i][k].int() for ret in rets])
            ret_list.append([bboxes, scores, labels])
        return ret_list
================================================
FILE: bevdepth/models/base_bev_depth.py
================================================
from torch import nn
from bevdepth.layers.backbones.base_lss_fpn import BaseLSSFPN
from bevdepth.layers.heads.bev_depth_head import BEVDepthHead
__all__ = ['BaseBEVDepth']
class BaseBEVDepth(nn.Module):
    """Source code of `BEVDepth`, `https://arxiv.org/abs/2112.11790`.

    Args:
        backbone_conf (dict): Config of backbone.
        head_conf (dict): Config of head.
        is_train_depth (bool): Whether to return depth.
            Default: False.
    """

    # TODO: Reduce grid_conf and data_aug_conf
    def __init__(self, backbone_conf, head_conf, is_train_depth=False):
        super(BaseBEVDepth, self).__init__()
        self.backbone = BaseLSSFPN(**backbone_conf)
        self.head = BEVDepthHead(**head_conf)
        self.is_train_depth = is_train_depth

    def forward(
        self,
        x,
        mats_dict,
        timestamps=None,
    ):
        """Forward function for BEVDepth.

        Args:
            x (Tensor): Input feature map.
            mats_dict(dict):
                sensor2ego_mats(Tensor): Transformation matrix from
                    camera to ego with shape of (B, num_sweeps,
                    num_cameras, 4, 4).
                intrin_mats(Tensor): Intrinsic matrix with shape
                    of (B, num_sweeps, num_cameras, 4, 4).
                ida_mats(Tensor): Transformation matrix for ida with
                    shape of (B, num_sweeps, num_cameras, 4, 4).
                sensor2sensor_mats(Tensor): Transformation matrix
                    from key frame camera to sweep frame camera with
                    shape of (B, num_sweeps, num_cameras, 4, 4).
                bda_mat(Tensor): Rotation matrix for bda with shape
                    of (B, 4, 4).
            timestamps (long): Timestamp.
                Default: None.

        Returns:
            tuple(list[dict]): Output results for tasks; during depth
            training the depth prediction is returned as well.
        """
        # Depth supervision is only produced while training.
        wants_depth = self.is_train_depth and self.training
        if wants_depth:
            bev_feature, depth_pred = self.backbone(x,
                                                    mats_dict,
                                                    timestamps,
                                                    is_return_depth=True)
            return self.head(bev_feature), depth_pred
        bev_feature = self.backbone(x, mats_dict, timestamps)
        return self.head(bev_feature)

    def get_targets(self, gt_boxes, gt_labels):
        """Generate training targets for a single sample.

        Args:
            gt_boxes (:obj:`LiDARInstance3DBoxes`): Ground truth gt boxes.
            gt_labels (torch.Tensor): Labels of boxes.

        Returns:
            tuple[list[torch.Tensor]]: Heatmaps, gt boxes, valid-box
            indexes and valid-box masks (delegated to the head).
        """
        return self.head.get_targets(gt_boxes, gt_labels)

    def loss(self, targets, preds_dicts):
        """Loss function for BEVDepth (delegated to the head).

        Args:
            targets (tuple[list[torch.Tensor]]): Output of ``get_targets``.
            preds_dicts (dict): Output of forward function.

        Returns:
            dict[str:torch.Tensor]: Loss of heatmap and bbox of each task.
        """
        return self.head.loss(targets, preds_dicts)

    def get_bboxes(self, preds_dicts, img_metas=None, img=None, rescale=False):
        """Generate bboxes from bbox head predictions (delegated).

        Args:
            preds_dicts (tuple[list[dict]]): Prediction results.
            img_metas (list[dict]): Point cloud and image's meta info.

        Returns:
            list[dict]: Decoded bbox, scores and labels after nms.
        """
        return self.head.get_bboxes(preds_dicts, img_metas, img, rescale)
================================================
FILE: bevdepth/models/bev_stereo.py
================================================
from bevdepth.layers.backbones.bevstereo_lss_fpn import BEVStereoLSSFPN
from bevdepth.models.base_bev_depth import BaseBEVDepth
__all__ = ['BEVStereo']
class BEVStereo(BaseBEVDepth):
    """Source code of `BEVStereo`, `https://arxiv.org/abs/2209.10248`.

    Args:
        backbone_conf (dict): Config of backbone.
        head_conf (dict): Config of head.
        is_train_depth (bool): Whether to return depth.
            Default: False.
    """

    # TODO: Reduce grid_conf and data_aug_conf
    def __init__(self, backbone_conf, head_conf, is_train_depth=False):
        super(BEVStereo, self).__init__(backbone_conf, head_conf,
                                        is_train_depth)
        # Swap the default LSS backbone for the stereo variant built from
        # the same configuration.
        self.backbone = BEVStereoLSSFPN(**backbone_conf)
================================================
FILE: bevdepth/models/fusion_bev_depth.py
================================================
from bevdepth.layers.backbones.fusion_lss_fpn import FusionLSSFPN
from bevdepth.layers.heads.bev_depth_head import BEVDepthHead
from .base_bev_depth import BaseBEVDepth
__all__ = ['FusionBEVDepth']
class FusionBEVDepth(BaseBEVDepth):
    """Source code of `BEVDepth`, `https://arxiv.org/abs/2112.11790`.

    Args:
        backbone_conf (dict): Config of backbone.
        head_conf (dict): Config of head.
        is_train_depth (bool): Whether to return depth.
            Default: False.
    """

    # TODO: Reduce grid_conf and data_aug_conf
    def __init__(self, backbone_conf, head_conf, is_train_depth=False):
        # Deliberately skip BaseBEVDepth.__init__ (it would construct an
        # unused BaseLSSFPN backbone); initialize nn.Module directly and
        # build the fusion backbone instead.
        super(BaseBEVDepth, self).__init__()
        self.backbone = FusionLSSFPN(**backbone_conf)
        self.head = BEVDepthHead(**head_conf)
        self.is_train_depth = is_train_depth

    def forward(
        self,
        x,
        mats_dict,
        lidar_depth,
        timestamps=None,
    ):
        """Forward function for BEVDepth.

        Args:
            x (Tensor): Input feature map.
            mats_dict(dict):
                sensor2ego_mats(Tensor): Transformation matrix from
                    camera to ego with shape of (B, num_sweeps,
                    num_cameras, 4, 4).
                intrin_mats(Tensor): Intrinsic matrix with shape
                    of (B, num_sweeps, num_cameras, 4, 4).
                ida_mats(Tensor): Transformation matrix for ida with
                    shape of (B, num_sweeps, num_cameras, 4, 4).
                sensor2sensor_mats(Tensor): Transformation matrix
                    from key frame camera to sweep frame camera with
                    shape of (B, num_sweeps, num_cameras, 4, 4).
                bda_mat(Tensor): Rotation matrix for bda with shape
                    of (B, 4, 4).
            lidar_depth (Tensor): Depth generated by lidar.
            timestamps (long): Timestamp.
                Default: None.

        Returns:
            tuple(list[dict]): Output results for tasks.
        """
        # Train and eval paths are identical here (the original code had two
        # byte-identical branches on `is_train_depth and self.training`):
        # the fusion backbone consumes lidar depth directly and never
        # returns a depth prediction, so unlike BaseBEVDepth.forward there
        # is no separate depth-training output.
        bev_feature = self.backbone(x, mats_dict, lidar_depth, timestamps)
        return self.head(bev_feature)
================================================
FILE: bevdepth/models/matrixvt_det.py
================================================
from bevdepth.layers.backbones.matrixvt import MatrixVT
from bevdepth.models.base_bev_depth import BaseBEVDepth
class MatrixVT_Det(BaseBEVDepth):
    """Implementation of MatrixVT for Object Detection.

    Args:
        backbone_conf (dict): Config of backbone.
        head_conf (dict): Config of head.
        is_train_depth (bool): Whether to return depth.
            Default: False.
    """

    def __init__(self, backbone_conf, head_conf, is_train_depth=False):
        super().__init__(backbone_conf, head_conf, is_train_depth)
        # Replace the default LSS backbone with the MatrixVT view
        # transformer, built from the same backbone configuration.
        self.backbone = MatrixVT(**backbone_conf)
================================================
FILE: bevdepth/ops/voxel_pooling_inference/__init__.py
================================================
from .voxel_pooling_inference import voxel_pooling_inference
__all__ = ['voxel_pooling_inference']
================================================
FILE: bevdepth/ops/voxel_pooling_inference/src/voxel_pooling_inference_forward.cpp
================================================
// Copyright (c) Megvii Inc. All rights reserved.
#include
#include
#include
#include
#include
#include
#include
#define CHECK_CUDA(x) \
TORCH_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
int voxel_pooling_inference_forward_wrapper(
int batch_size, int num_cams, int num_depth, int num_height, int num_width,
int num_channels, int num_voxel_x, int num_voxel_y, int num_voxel_z,
at::Tensor geom_xyz_tensor, at::Tensor depth_features_tensor,
at::Tensor context_features_tensor, at::Tensor output_features_tensor);
void voxel_pooling_inference_forward_kernel_launcher(
int batch_size, int num_cams, int num_depth, int num_height, int num_width,
int num_channels, int num_voxel_x, int num_voxel_y, int num_voxel_z,
const int *geom_xyz, const float *depth_features,
const float *context_features, float *output_features, cudaStream_t stream);
void voxel_pooling_inference_forward_kernel_launcher(
int batch_size, int num_cams, int num_depth, int num_height, int num_width,
int num_channels, int num_voxel_x, int num_voxel_y, int num_voxel_z,
const int *geom_xyz, const half *depth_features,
const half *context_features, half *output_features, cudaStream_t stream);
int voxel_pooling_inference_forward_wrapper(
int batch_size, int num_cams, int num_depth, int num_height, int num_width,
int num_channels, int num_voxel_x, int num_voxel_y, int num_voxel_z,
at::Tensor geom_xyz_tensor, at::Tensor depth_features_tensor,
at::Tensor context_features_tensor, at::Tensor output_features_tensor) {
CHECK_INPUT(geom_xyz_tensor);
CHECK_INPUT(depth_features_tensor);
CHECK_INPUT(context_features_tensor);
cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
const int *geom_xyz = geom_xyz_tensor.data_ptr();
if (depth_features_tensor.dtype() == at::kFloat) {
const float *depth_features = depth_features_tensor.data_ptr();
const float *context_features = context_features_tensor.data_ptr();
float *output_features = output_features_tensor.data_ptr();
voxel_pooling_inference_forward_kernel_launcher(
batch_size, num_cams, num_depth, num_height, num_width, num_channels,
num_voxel_x, num_voxel_y, num_voxel_z, geom_xyz, depth_features,
context_features, output_features, stream);
} else if (depth_features_tensor.dtype() == at::kHalf) {
assert(num_channels % 2 == 0);
const half *depth_features =
(half *)depth_features_tensor.data_ptr();
const half *context_features =
(half *)context_features_tensor.data_ptr();
half *output_features = (half *)output_features_tensor.data_ptr();
voxel_pooling_inference_forward_kernel_launcher(
batch_size, num_cams, num_depth, num_height, num_width, num_channels,
num_voxel_x, num_voxel_y, num_voxel_z, geom_xyz, depth_features,
context_features, output_features, stream);
}
return 1;
}
// Expose the wrapper to Python as the extension module's single entry point.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("voxel_pooling_inference_forward_wrapper",
        &voxel_pooling_inference_forward_wrapper,
        "voxel_pooling_inference_forward_wrapper");
}
================================================
FILE: bevdepth/ops/voxel_pooling_inference/src/voxel_pooling_inference_forward_cuda.cu
================================================
// Copyright (c) Megvii Inc. All rights reserved.
#include
#include
#include
#include
#define THREADS_BLOCK_X 32
#define THREADS_BLOCK_Y 4
#define THREADS_PER_BLOCK THREADS_BLOCK_X *THREADS_BLOCK_Y
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
// Half-precision voxel pooling: for every in-range frustum sample, multiply
// its depth weight with the per-pixel context features and atomically
// accumulate the product into the matching BEV cell.
__global__ void voxel_pooling_inference_forward_kernel(
    int batch_size, int num_cams, int num_depth, int num_height, int num_width,
    int num_channels, int num_voxel_x, int num_voxel_y, int num_voxel_z,
    const int *geom_xyz, const half *depth_features,
    const half *context_features, half *output_features) {
  const int bidx = blockIdx.x;
  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  // Each block is responsible for THREADS_PER_BLOCK consecutive samples.
  const int sample_dim = THREADS_PER_BLOCK;
  const int idx_in_block = tidy * THREADS_BLOCK_X + tidx;
  const int batch_size_with_cams = batch_size * num_cams;
  const int block_sample_idx = bidx * sample_dim;
  const int thread_sample_idx = block_sample_idx + idx_in_block;
  const int total_samples =
      batch_size_with_cams * num_depth * num_height * num_width;
  // Stage this block's (x, y, z) voxel coordinates in shared memory so the
  // accumulation loop below reads them without re-hitting global memory.
  __shared__ int geom_xyz_shared[THREADS_PER_BLOCK * 3];
  if (thread_sample_idx < total_samples) {
    const int sample_x = geom_xyz[thread_sample_idx * 3 + 0];
    const int sample_y = geom_xyz[thread_sample_idx * 3 + 1];
    const int sample_z = geom_xyz[thread_sample_idx * 3 + 2];
    geom_xyz_shared[idx_in_block * 3 + 0] = sample_x;
    geom_xyz_shared[idx_in_block * 3 + 1] = sample_y;
    geom_xyz_shared[idx_in_block * 3 + 2] = sample_z;
  }
  __syncthreads();
  // Rows of threads (tidy) walk the staged samples; columns (tidx) walk
  // the feature channels.
  for (int i = tidy;
       i < THREADS_PER_BLOCK && block_sample_idx + i < total_samples;
       i += THREADS_BLOCK_Y) {
    const int sample_x = geom_xyz_shared[i * 3 + 0];
    const int sample_y = geom_xyz_shared[i * 3 + 1];
    const int sample_z = geom_xyz_shared[i * 3 + 2];
    // Skip samples that fall outside the voxel grid.
    if (sample_x < 0 || sample_x >= num_voxel_x || sample_y < 0 ||
        sample_y >= num_voxel_y || sample_z < 0 || sample_z >= num_voxel_z) {
      continue;
    }
    const int sample_idx = block_sample_idx + i;
    // Decompose the flat sample index into (batch, cam, height, width);
    // samples are laid out as [B, cams, depth, H, W].
    const int batch_idx =
        sample_idx / (num_cams * num_depth * num_height * num_width);
    const int width_idx = sample_idx % num_width;
    const int height_idx = (sample_idx / num_width) % num_height;
    const int cam_idx =
        sample_idx / (num_depth * num_height * num_width) % num_cams;
    // if(batch_idx > 0 || cam_idx > 0)
    // printf("batch_idx: %d, width_idx: %d, height_idx: %d, cam_idx: %d \n",
    // batch_idx, width_idx, height_idx, cam_idx);
    const half depth_val = depth_features[sample_idx];
    half res;
    for (int j = tidx; j < num_channels; j += THREADS_BLOCK_X) {
      const half context_val = context_features
          [batch_idx * (num_cams * num_channels * num_height * num_width) +
           cam_idx * (num_channels * num_height * num_width) +
           j * (num_height * num_width) + height_idx * num_width + width_idx];
      res = __hmul(depth_val, context_val);
      // Multiple samples can land in the same BEV cell: accumulate with
      // atomicAdd into the channel-last output layout.
      atomicAdd(&output_features[(batch_idx * num_voxel_y * num_voxel_x +
                                  sample_y * num_voxel_x + sample_x) *
                                     num_channels +
                                 j],
                res);
    }
  }
}
// Float variant of the inference voxel-pooling kernel; mirrors the half
// version above but multiplies/accumulates in fp32.
__global__ void voxel_pooling_inference_forward_kernel(
    int batch_size, int num_cams, int num_depth, int num_height, int num_width,
    int num_channels, int num_voxel_x, int num_voxel_y, int num_voxel_z,
    const int *geom_xyz, const float *depth_features,
    const float *context_features, float *output_features) {
  const int bidx = blockIdx.x;
  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  // Each block is responsible for THREADS_PER_BLOCK consecutive samples.
  const int sample_dim = THREADS_PER_BLOCK;
  const int idx_in_block = tidy * THREADS_BLOCK_X + tidx;
  const int batch_size_with_cams = batch_size * num_cams;
  const int block_sample_idx = bidx * sample_dim;
  const int thread_sample_idx = block_sample_idx + idx_in_block;
  const int total_samples =
      batch_size_with_cams * num_depth * num_height * num_width;
  // printf("Total sample:%d num_cams: %d, num_depth: %d num_height: %d
  // num_width: %d\n", total_samples, num_cams, num_depth, num_height,
  // num_width);
  // Stage this block's voxel coordinates in shared memory.
  __shared__ int geom_xyz_shared[THREADS_PER_BLOCK * 3];
  if (thread_sample_idx < total_samples) {
    const int sample_x = geom_xyz[thread_sample_idx * 3 + 0];
    const int sample_y = geom_xyz[thread_sample_idx * 3 + 1];
    const int sample_z = geom_xyz[thread_sample_idx * 3 + 2];
    geom_xyz_shared[idx_in_block * 3 + 0] = sample_x;
    geom_xyz_shared[idx_in_block * 3 + 1] = sample_y;
    geom_xyz_shared[idx_in_block * 3 + 2] = sample_z;
  }
  __syncthreads();
  for (int i = tidy;
       i < THREADS_PER_BLOCK && block_sample_idx + i < total_samples;
       i += THREADS_BLOCK_Y) {
    const int sample_x = geom_xyz_shared[i * 3 + 0];
    const int sample_y = geom_xyz_shared[i * 3 + 1];
    const int sample_z = geom_xyz_shared[i * 3 + 2];
    // Skip samples that fall outside the voxel grid.
    if (sample_x < 0 || sample_x >= num_voxel_x || sample_y < 0 ||
        sample_y >= num_voxel_y || sample_z < 0 || sample_z >= num_voxel_z) {
      continue;
    }
    const int sample_idx = block_sample_idx + i;
    // Decompose the flat sample index, laid out as [B, cams, depth, H, W].
    const int batch_idx =
        sample_idx / (num_cams * num_depth * num_height * num_width);
    const int width_idx = sample_idx % num_width;
    const int height_idx = (sample_idx / num_width) % num_height;
    const int cam_idx =
        sample_idx / (num_depth * num_height * num_width) % num_cams;
    const float depth_val = depth_features[sample_idx];
    for (int j = tidx; j < num_channels; j += THREADS_BLOCK_X) {
      const float context_val = context_features
          [batch_idx * (num_cams * num_channels * num_height * num_width) +
           cam_idx * (num_channels * num_height * num_width) +
           j * (num_height * num_width) + height_idx * num_width + width_idx];
      // Accumulate depth-weighted context into the channel-last BEV cell.
      atomicAdd(&output_features[(batch_idx * num_voxel_y * num_voxel_x +
                                  sample_y * num_voxel_x + sample_x) *
                                     num_channels +
                                 j],
                depth_val * context_val);
    }
  }
}
// Launch the float inference kernel with one block per THREADS_PER_BLOCK
// frustum samples on the caller-provided stream.
// Fix: the launch configuration was corrupted to "<<>>" (angle brackets
// stripped); restored the CUDA execution-configuration syntax
// `<<<blocks, threads, sharedMem, stream>>>`.
void voxel_pooling_inference_forward_kernel_launcher(
    int batch_size, int num_cams, int num_depth, int num_height, int num_width,
    int num_channels, int num_voxel_x, int num_voxel_y, int num_voxel_z,
    const int *geom_xyz, const float *depth_features,
    const float *context_features, float *output_features,
    cudaStream_t stream) {
  cudaError_t err;
  dim3 blocks(DIVUP(batch_size * num_cams * num_depth * num_height * num_width,
                    THREADS_PER_BLOCK));
  dim3 threads(THREADS_BLOCK_X, THREADS_BLOCK_Y);
  voxel_pooling_inference_forward_kernel<<<blocks, threads, 0, stream>>>(
      batch_size, num_cams, num_depth, num_height, num_width, num_channels,
      num_voxel_x, num_voxel_y, num_voxel_z, geom_xyz, depth_features,
      context_features, output_features);
  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}
// Half-precision overload of the launcher; identical grid layout.
// Fix: restored the corrupted "<<>>" to the CUDA execution-configuration
// syntax `<<<blocks, threads, 0, stream>>>`.
void voxel_pooling_inference_forward_kernel_launcher(
    int batch_size, int num_cams, int num_depth, int num_height, int num_width,
    int num_channels, int num_voxel_x, int num_voxel_y, int num_voxel_z,
    const int *geom_xyz, const half *depth_features,
    const half *context_features, half *output_features, cudaStream_t stream) {
  cudaError_t err;
  dim3 blocks(DIVUP(batch_size * num_cams * num_depth * num_height * num_width,
                    THREADS_PER_BLOCK));
  dim3 threads(THREADS_BLOCK_X, THREADS_BLOCK_Y);
  voxel_pooling_inference_forward_kernel<<<blocks, threads, 0, stream>>>(
      batch_size, num_cams, num_depth, num_height, num_width, num_channels,
      num_voxel_x, num_voxel_y, num_voxel_z, geom_xyz, depth_features,
      context_features, output_features);
  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}
================================================
FILE: bevdepth/ops/voxel_pooling_inference/voxel_pooling_inference.py
================================================
# Copyright (c) Megvii Inc. All rights reserved.
import torch
from torch.autograd import Function
from . import voxel_pooling_inference_ext
class VoxelPoolingInference(Function):

    @staticmethod
    def forward(ctx, geom_xyz: torch.Tensor, depth_features: torch.Tensor,
                context_features: torch.Tensor,
                voxel_num: torch.Tensor) -> torch.Tensor:
        """Pool depth-weighted camera features into a BEV grid.

        Inference-only op: no backward pass is defined.

        Args:
            geom_xyz (Tensor): Integer voxel coordinates per frustum
                sample; the first five dims are read as
                (B, num_cams, num_depth, H, W).
            depth_features (Tensor): Per-sample depth weights.
            context_features (Tensor): Context features; dim 1 is the
                channel count C.
            voxel_num (Tensor): Number of voxels per axis, shape [3].

        Returns:
            Tensor: (B, C, H, W) bev feature map.
        """
        assert geom_xyz.is_contiguous()
        assert depth_features.is_contiguous()
        assert context_features.is_contiguous()
        # Geometry carries no gradient (and the op has no backward anyway).
        ctx.mark_non_differentiable(geom_xyz)
        batch_size, num_cams, num_depth, num_height, num_width = \
            geom_xyz.shape[:5]
        num_channels = context_features.shape[1]
        # The CUDA kernel writes a channel-last (B, Y, X, C) buffer.
        output_features = depth_features.new_zeros(
            (batch_size, voxel_num[1], voxel_num[0], num_channels))
        voxel_pooling_inference_ext.voxel_pooling_inference_forward_wrapper(
            batch_size,
            num_cams,
            num_depth,
            num_height,
            num_width,
            num_channels,
            voxel_num[0],
            voxel_num[1],
            voxel_num[2],
            geom_xyz,
            depth_features,
            context_features,
            output_features,
        )
        # Return channel-first (B, C, H, W) as PyTorch convention expects.
        return output_features.permute(0, 3, 1, 2)
voxel_pooling_inference = VoxelPoolingInference.apply
================================================
FILE: bevdepth/ops/voxel_pooling_train/__init__.py
================================================
from .voxel_pooling_train import voxel_pooling_train
__all__ = ['voxel_pooling_train']
================================================
FILE: bevdepth/ops/voxel_pooling_train/src/voxel_pooling_train_forward.cpp
================================================
// Copyright (c) Megvii Inc. All rights reserved.
#include
#include
#include
#include
#include
#include
#include
#define CHECK_CUDA(x) \
TORCH_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
int voxel_pooling_train_forward_wrapper(int batch_size, int num_points,
int num_channels, int num_voxel_x,
int num_voxel_y, int num_voxel_z,
at::Tensor geom_xyz_tensor,
at::Tensor input_features_tensor,
at::Tensor output_features_tensor,
at::Tensor pos_memo_tensor);
void voxel_pooling_train_forward_kernel_launcher(
int batch_size, int num_points, int num_channels, int num_voxel_x,
int num_voxel_y, int num_voxel_z, const int *geom_xyz,
const float *input_features, float *output_features, int *pos_memo,
cudaStream_t stream);
void voxel_pooling_train_forward_kernel_launcher(
int batch_size, int num_points, int num_channels, int num_voxel_x,
int num_voxel_y, int num_voxel_z, const int *geom_xyz,
const half *input_features, half *output_features, int *pos_memo,
cudaStream_t stream);
int voxel_pooling_train_forward_wrapper(int batch_size, int num_points,
int num_channels, int num_voxel_x,
int num_voxel_y, int num_voxel_z,
at::Tensor geom_xyz_tensor,
at::Tensor input_features_tensor,
at::Tensor output_features_tensor,
at::Tensor pos_memo_tensor) {
CHECK_INPUT(geom_xyz_tensor);
CHECK_INPUT(input_features_tensor);
const int *geom_xyz = geom_xyz_tensor.data_ptr();
int *pos_memo = pos_memo_tensor.data_ptr();
cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
if (input_features_tensor.dtype() == at::kFloat) {
const float *input_features = input_features_tensor.data_ptr();
float *output_features = output_features_tensor.data_ptr();
voxel_pooling_train_forward_kernel_launcher(
batch_size, num_points, num_channels, num_voxel_x, num_voxel_y,
num_voxel_z, geom_xyz, input_features, output_features, pos_memo,
stream);
}
else if (input_features_tensor.dtype() == at::kHalf) {
assert(num_channels % 2 == 0);
const half *input_features =
(half *)(input_features_tensor.data_ptr());
half *output_features =
(half *)(output_features_tensor.data_ptr());
voxel_pooling_train_forward_kernel_launcher(
batch_size, num_points, num_channels, num_voxel_x, num_voxel_y,
num_voxel_z, geom_xyz, input_features, output_features, pos_memo,
stream);
}
return 1;
}
// Expose the wrapper to Python as the extension module's single entry point.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("voxel_pooling_train_forward_wrapper",
        &voxel_pooling_train_forward_wrapper,
        "voxel_pooling_train_forward_wrapper");
}
================================================
FILE: bevdepth/ops/voxel_pooling_train/src/voxel_pooling_train_forward_cuda.cu
================================================
// Copyright (c) Megvii Inc. All rights reserved.
#include
#include
#include
#include
#define THREADS_BLOCK_X 32
#define THREADS_BLOCK_Y 4
#define THREADS_PER_BLOCK THREADS_BLOCK_X *THREADS_BLOCK_Y
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
// Scatter-add point features into the BEV voxel grid and record each valid
// point's (batch, y, x) position in pos_memo for the backward pass.
// Fix: the template parameter list was corrupted to a bare "template"
// (angle brackets stripped); restored `template <typename T>` so the
// feature type parameter T resolves.
template <typename T>
__global__ void voxel_pooling_train_forward_kernel(
    int batch_size, int num_points, int num_channels, int num_voxel_x,
    int num_voxel_y, int num_voxel_z, const int *geom_xyz,
    const T *input_features, T *output_features, int *pos_memo) {
  const int bidx = blockIdx.x;
  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  // Each block handles THREADS_PER_BLOCK consecutive points.
  const int sample_dim = THREADS_PER_BLOCK;
  const int idx_in_block = tidy * THREADS_BLOCK_X + tidx;
  const int block_sample_idx = bidx * sample_dim;
  const int thread_sample_idx = block_sample_idx + idx_in_block;
  const int total_samples = batch_size * num_points;
  // Stage this block's (x, y, z) voxel coordinates in shared memory.
  __shared__ int geom_xyz_shared[THREADS_PER_BLOCK * 3];
  if (thread_sample_idx < total_samples) {
    const int sample_x = geom_xyz[thread_sample_idx * 3 + 0];
    const int sample_y = geom_xyz[thread_sample_idx * 3 + 1];
    const int sample_z = geom_xyz[thread_sample_idx * 3 + 2];
    geom_xyz_shared[idx_in_block * 3 + 0] = sample_x;
    geom_xyz_shared[idx_in_block * 3 + 1] = sample_y;
    geom_xyz_shared[idx_in_block * 3 + 2] = sample_z;
    // Only in-grid points get a pos_memo entry; out-of-range entries keep
    // whatever the caller initialized them to.
    if ((sample_x >= 0 && sample_x < num_voxel_x) &&
        (sample_y >= 0 && sample_y < num_voxel_y) &&
        (sample_z >= 0 && sample_z < num_voxel_z)) {
      pos_memo[thread_sample_idx * 3 + 0] = thread_sample_idx / num_points;
      pos_memo[thread_sample_idx * 3 + 1] = sample_y;
      pos_memo[thread_sample_idx * 3 + 2] = sample_x;
    }
  }
  __syncthreads();
  // Rows of threads (tidy) walk the staged points; columns (tidx) walk
  // the feature channels.
  for (int i = tidy;
       i < THREADS_PER_BLOCK && block_sample_idx + i < total_samples;
       i += THREADS_BLOCK_Y) {
    const int sample_x = geom_xyz_shared[i * 3 + 0];
    const int sample_y = geom_xyz_shared[i * 3 + 1];
    const int sample_z = geom_xyz_shared[i * 3 + 2];
    // Skip points that fall outside the voxel grid.
    if (sample_x < 0 || sample_x >= num_voxel_x || sample_y < 0 ||
        sample_y >= num_voxel_y || sample_z < 0 || sample_z >= num_voxel_z) {
      continue;
    }
    const int batch_idx = (block_sample_idx + i) / num_points;
    for (int j = tidx; j < num_channels; j += THREADS_BLOCK_X) {
      // Multiple points can land in the same BEV cell: accumulate with
      // atomicAdd into the channel-last output layout.
      atomicAdd(&output_features[(batch_idx * num_voxel_y * num_voxel_x +
                                  sample_y * num_voxel_x + sample_x) *
                                     num_channels +
                                 j],
                input_features[(block_sample_idx + i) * num_channels + j]);
    }
  }
}
// Launch the pooling kernel for float features on the given stream.
// NOTE: the launch configuration `<<<blocks, threads, 0, stream>>>` had
// been garbled to `<<>>` by the extraction; restored.
void voxel_pooling_train_forward_kernel_launcher(
    int batch_size, int num_points, int num_channels, int num_voxel_x,
    int num_voxel_y, int num_voxel_z, const int *geom_xyz,
    const float *input_features, float *output_features, int *pos_memo,
    cudaStream_t stream) {
  cudaError_t err;
  // One block per THREADS_PER_BLOCK input points.
  dim3 blocks(DIVUP(batch_size * num_points, THREADS_PER_BLOCK));
  dim3 threads(THREADS_BLOCK_X, THREADS_BLOCK_Y);

  voxel_pooling_train_forward_kernel<<<blocks, threads, 0, stream>>>(
      batch_size, num_points, num_channels, num_voxel_x, num_voxel_y,
      num_voxel_z, geom_xyz, input_features, output_features, pos_memo);

  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}
// Launch the pooling kernel for half-precision features on the given
// stream. The wrapper asserts num_channels is even before calling this.
// NOTE: the launch configuration `<<<blocks, threads, 0, stream>>>` had
// been garbled to `<<>>` by the extraction; restored.
void voxel_pooling_train_forward_kernel_launcher(
    int batch_size, int num_points, int num_channels, int num_voxel_x,
    int num_voxel_y, int num_voxel_z, const int *geom_xyz,
    const half *input_features, half *output_features, int *pos_memo,
    cudaStream_t stream) {
  cudaError_t err;
  // One block per THREADS_PER_BLOCK input points.
  dim3 blocks(DIVUP(batch_size * num_points, THREADS_PER_BLOCK));
  dim3 threads(THREADS_BLOCK_X, THREADS_BLOCK_Y);

  voxel_pooling_train_forward_kernel<<<blocks, threads, 0, stream>>>(
      batch_size, num_points, num_channels, num_voxel_x, num_voxel_y,
      num_voxel_z, geom_xyz, input_features, output_features, pos_memo);

  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}
================================================
FILE: bevdepth/ops/voxel_pooling_train/voxel_pooling_train.py
================================================
# Copyright (c) Megvii Inc. All rights reserved.
import torch
from torch.autograd import Function
from . import voxel_pooling_train_ext
class VoxelPoolingTrain(Function):
    """Autograd wrapper around the CUDA voxel-pooling forward kernel.

    Sums the features of all points that fall into the same BEV grid
    cell, and implements a hand-written backward pass based on the
    point->cell mapping (`pos_memo`) recorded by the kernel.
    """

    @staticmethod
    def forward(ctx, geom_xyz: torch.Tensor, input_features: torch.Tensor,
                voxel_num: torch.Tensor) -> torch.Tensor:
        """Forward function for `voxel pooling.

        Args:
            geom_xyz (Tensor): xyz coord for each voxel with the shape
                of [B, N, 3].
            input_features (Tensor): feature for each voxel with the
                shape of [B, N, C].
            voxel_num (Tensor): Number of voxels for each dim with the
                shape of [3].

        Returns:
            Tensor: (B, C, H, W) bev feature map.
        """
        # The CUDA wrapper indexes raw data pointers, so both inputs must
        # be contiguous.
        assert geom_xyz.is_contiguous()
        assert input_features.is_contiguous()
        # no gradient for input_features and geom_feats
        ctx.mark_non_differentiable(geom_xyz)
        # Pre-allocated gradient buffer; filled in backward().
        grad_input_features = torch.zeros_like(input_features)
        geom_xyz = geom_xyz.reshape(geom_xyz.shape[0], -1, geom_xyz.shape[-1])
        input_features = input_features.reshape(
            (geom_xyz.shape[0], -1, input_features.shape[-1]))
        assert geom_xyz.shape[1] == input_features.shape[1]
        batch_size = input_features.shape[0]
        num_points = input_features.shape[1]
        num_channels = input_features.shape[2]
        # BEV grid is (H, W) = (voxel_num[1], voxel_num[0]); accumulated
        # channels-last here, permuted to channels-first on return.
        output_features = input_features.new_zeros(batch_size, voxel_num[1],
                                                   voxel_num[0], num_channels)
        # Save the position of bev_feature_map for each input point.
        # Rows stay -1 for points outside the grid; the kernel overwrites
        # in-range rows with (batch_idx, y, x).
        pos_memo = geom_xyz.new_ones(batch_size, num_points, 3) * -1
        voxel_pooling_train_ext.voxel_pooling_train_forward_wrapper(
            batch_size,
            num_points,
            num_channels,
            voxel_num[0],
            voxel_num[1],
            voxel_num[2],
            geom_xyz,
            input_features,
            output_features,
            pos_memo,
        )
        # save grad_input_features and pos_memo for backward
        ctx.save_for_backward(grad_input_features, pos_memo)
        return output_features.permute(0, 3, 1, 2)

    @staticmethod
    def backward(ctx, grad_output_features):
        """Scatter the BEV-map gradient back to every contributing point."""
        (grad_input_features, pos_memo) = ctx.saved_tensors
        # Points that landed inside the grid (their pos_memo row was
        # overwritten, so its first entry is != -1).
        kept = (pos_memo != -1)[..., 0]
        grad_input_features_shape = grad_input_features.shape
        grad_input_features = grad_input_features.reshape(
            grad_input_features.shape[0], -1, grad_input_features.shape[-1])
        # Forward summed features per cell, so each point's gradient is
        # exactly the gradient of its target cell, looked up by
        # (batch, y, x) from pos_memo.
        grad_input_features[kept] = grad_output_features[
            pos_memo[kept][..., 0].long(), :, pos_memo[kept][..., 1].long(),
            pos_memo[kept][..., 2].long()]
        grad_input_features = grad_input_features.reshape(
            grad_input_features_shape)
        return None, grad_input_features, None


# Public functional alias used by the model layers.
voxel_pooling_train = VoxelPoolingTrain.apply
================================================
FILE: bevdepth/utils/torch_dist.py
================================================
"""
@author: zeming li
@contact: zengarden2009@gmail.com
"""
from torch import distributed as dist
def get_rank() -> int:
    """Rank of this process, or 0 when not running distributed."""
    if dist.is_available() and dist.is_initialized():
        return dist.get_rank()
    return 0
def get_world_size() -> int:
    """Number of distributed processes, or 1 when not running distributed."""
    if dist.is_available() and dist.is_initialized():
        return dist.get_world_size()
    return 1
def synchronize():
    """Helper function to synchronize (barrier)
    among all processes when using distributed training"""
    if not (dist.is_available() and dist.is_initialized()):
        return
    # A single-process group has nothing to wait for.
    if dist.get_world_size() > 1:
        dist.barrier()
def all_gather_object(obj):
    """Gather `obj` from every rank; single-process runs return [obj]."""
    world_size = get_world_size()
    if world_size < 2:
        return [obj]
    gathered = [None] * world_size
    dist.all_gather_object(gathered, obj)
    return gathered
def is_available() -> bool:
    """Whether torch.distributed is usable in this build."""
    return dist.is_available()
================================================
FILE: requirements-dev.txt
================================================
# code formatter
# force to use same version of the formatter, can be changed only by maintainer.
anybadge
autoflake==1.4
black==20.8b1
flake8
gitlint
isort==4.3.21
nbsphinx
pre-commit
pylint==2.3.1
pytest
pytest-cov
radon==4.2.0
recommonmark
seed-isort-config
setuptools
# ----- document usage
sphinx==3.5.4
sphinx-material
sphinx_markdown_tables
================================================
FILE: requirements.txt
================================================
numba
numpy
nuscenes-devkit
opencv-python-headless
pandas
pytorch-lightning==1.6.0
scikit-image
scipy
setuptools==59.5.0
tensorboardX
torch==1.9.0
torchvision==0.10.0
================================================
FILE: scripts/gen_info.py
================================================
import mmcv
import numpy as np
from nuscenes.nuscenes import NuScenes
from nuscenes.utils import splits
from tqdm import tqdm
def generate_info(nusc, scenes, max_cam_sweeps=6, max_lidar_sweeps=10):
    """Collect per-sample camera/lidar/annotation records for some scenes.

    Args:
        nusc (NuScenes): Loaded nuScenes database.
        scenes (list[str]): Names of the scenes to keep.
        max_cam_sweeps (int): Max number of previous (non-key) camera
            frames recorded per sample.
        max_lidar_sweeps (int): Max number of previous lidar frames
            recorded per sample.

    Returns:
        list[dict]: One dict per key sample with 'cam_infos',
        'lidar_infos', 'cam_sweeps', 'lidar_sweeps' and 'ann_infos'.
    """
    # Sensor channels to record; loop-invariant, so hoisted out of the
    # per-sample loop.
    cam_names = [
        'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_RIGHT', 'CAM_BACK',
        'CAM_BACK_LEFT', 'CAM_FRONT_LEFT'
    ]
    lidar_names = ['LIDAR_TOP']
    infos = list()
    for cur_scene in tqdm(nusc.scene):
        if cur_scene['name'] not in scenes:
            continue
        first_sample_token = cur_scene['first_sample_token']
        cur_sample = nusc.get('sample', first_sample_token)
        # Walk the scene's sample chain until 'next' is empty.
        while True:
            info = dict()
            cam_datas = list()
            lidar_datas = list()
            info['sample_token'] = cur_sample['token']
            info['timestamp'] = cur_sample['timestamp']
            info['scene_token'] = cur_sample['scene_token']
            cam_infos = dict()
            lidar_infos = dict()
            # Key-frame camera records.
            for cam_name in cam_names:
                cam_data = nusc.get('sample_data',
                                    cur_sample['data'][cam_name])
                cam_datas.append(cam_data)
                sweep_cam_info = dict()
                sweep_cam_info['sample_token'] = cam_data['sample_token']
                sweep_cam_info['ego_pose'] = nusc.get(
                    'ego_pose', cam_data['ego_pose_token'])
                sweep_cam_info['timestamp'] = cam_data['timestamp']
                sweep_cam_info['is_key_frame'] = cam_data['is_key_frame']
                sweep_cam_info['height'] = cam_data['height']
                sweep_cam_info['width'] = cam_data['width']
                sweep_cam_info['filename'] = cam_data['filename']
                sweep_cam_info['calibrated_sensor'] = nusc.get(
                    'calibrated_sensor', cam_data['calibrated_sensor_token'])
                cam_infos[cam_name] = sweep_cam_info
            # Key-frame lidar records.
            for lidar_name in lidar_names:
                lidar_data = nusc.get('sample_data',
                                      cur_sample['data'][lidar_name])
                lidar_datas.append(lidar_data)
                sweep_lidar_info = dict()
                sweep_lidar_info['sample_token'] = lidar_data['sample_token']
                sweep_lidar_info['ego_pose'] = nusc.get(
                    'ego_pose', lidar_data['ego_pose_token'])
                sweep_lidar_info['timestamp'] = lidar_data['timestamp']
                sweep_lidar_info['filename'] = lidar_data['filename']
                sweep_lidar_info['calibrated_sensor'] = nusc.get(
                    'calibrated_sensor',
                    lidar_data['calibrated_sensor_token'])
                lidar_infos[lidar_name] = sweep_lidar_info
            lidar_sweeps = [dict() for _ in range(max_lidar_sweeps)]
            cam_sweeps = [dict() for _ in range(max_cam_sweeps)]
            info['cam_infos'] = cam_infos
            info['lidar_infos'] = lidar_infos
            # Previous (sweep) camera frames, walking the 'prev' chain and
            # stopping once a frame belongs to another sample.
            for k, cam_data in enumerate(cam_datas):
                sweep_cam_data = cam_data
                for j in range(max_cam_sweeps):
                    if sweep_cam_data['prev'] == '':
                        break
                    sweep_cam_data = nusc.get('sample_data',
                                              sweep_cam_data['prev'])
                    sweep_cam_info = dict()
                    sweep_cam_info['sample_token'] = sweep_cam_data[
                        'sample_token']
                    if sweep_cam_info['sample_token'] != cam_data[
                            'sample_token']:
                        break
                    # BUGFIX: use the sweep frame's own ego pose and
                    # calibration tokens; the original read both from the
                    # key frame's `cam_data`, so every sweep inherited the
                    # key frame's ego pose.
                    sweep_cam_info['ego_pose'] = nusc.get(
                        'ego_pose', sweep_cam_data['ego_pose_token'])
                    sweep_cam_info['timestamp'] = sweep_cam_data['timestamp']
                    sweep_cam_info['is_key_frame'] = sweep_cam_data[
                        'is_key_frame']
                    sweep_cam_info['height'] = sweep_cam_data['height']
                    sweep_cam_info['width'] = sweep_cam_data['width']
                    sweep_cam_info['filename'] = sweep_cam_data['filename']
                    sweep_cam_info['calibrated_sensor'] = nusc.get(
                        'calibrated_sensor',
                        sweep_cam_data['calibrated_sensor_token'])
                    cam_sweeps[j][cam_names[k]] = sweep_cam_info
            # Previous (sweep) lidar frames.
            for k, lidar_data in enumerate(lidar_datas):
                sweep_lidar_data = lidar_data
                for j in range(max_lidar_sweeps):
                    if sweep_lidar_data['prev'] == '':
                        break
                    sweep_lidar_data = nusc.get('sample_data',
                                                sweep_lidar_data['prev'])
                    sweep_lidar_info = dict()
                    sweep_lidar_info['sample_token'] = sweep_lidar_data[
                        'sample_token']
                    if sweep_lidar_info['sample_token'] != lidar_data[
                            'sample_token']:
                        break
                    sweep_lidar_info['ego_pose'] = nusc.get(
                        'ego_pose', sweep_lidar_data['ego_pose_token'])
                    sweep_lidar_info['timestamp'] = sweep_lidar_data[
                        'timestamp']
                    sweep_lidar_info['is_key_frame'] = sweep_lidar_data[
                        'is_key_frame']
                    sweep_lidar_info['filename'] = sweep_lidar_data[
                        'filename']
                    # BUGFIX: the original fetched the calibration of the
                    # *camera* left over from the loop above (`cam_data`);
                    # use this lidar sweep's own calibration token.
                    sweep_lidar_info['calibrated_sensor'] = nusc.get(
                        'calibrated_sensor',
                        sweep_lidar_data['calibrated_sensor_token'])
                    lidar_sweeps[j][lidar_names[k]] = sweep_lidar_info
            # Drop trailing empty sweep slots.
            for i, sweep in enumerate(cam_sweeps):
                if len(sweep.keys()) == 0:
                    cam_sweeps = cam_sweeps[:i]
                    break
            for i, sweep in enumerate(lidar_sweeps):
                if len(sweep.keys()) == 0:
                    lidar_sweeps = lidar_sweeps[:i]
                    break
            info['cam_sweeps'] = cam_sweeps
            info['lidar_sweeps'] = lidar_sweeps
            ann_infos = list()
            if 'anns' in cur_sample:
                for ann in cur_sample['anns']:
                    ann_info = nusc.get('sample_annotation', ann)
                    velocity = nusc.box_velocity(ann_info['token'])
                    # box_velocity yields NaNs when it cannot be estimated;
                    # fall back to zero velocity.
                    if np.any(np.isnan(velocity)):
                        velocity = np.zeros(3)
                    ann_info['velocity'] = velocity
                    ann_infos.append(ann_info)
            info['ann_infos'] = ann_infos
            infos.append(info)
            if cur_sample['next'] == '':
                break
            cur_sample = nusc.get('sample', cur_sample['next'])
    return infos
def main():
    """Generate and dump the nuScenes info pickles for all three splits."""
    # Trainval database serves both the train and the val split.
    trainval_nusc = NuScenes(version='v1.0-trainval',
                             dataroot='./data/nuScenes/',
                             verbose=True)
    train_infos = generate_info(trainval_nusc, splits.train)
    val_infos = generate_info(trainval_nusc, splits.val)
    mmcv.dump(train_infos, './data/nuScenes/nuscenes_infos_train.pkl')
    mmcv.dump(val_infos, './data/nuScenes/nuscenes_infos_val.pkl')
    # Test split lives in a separate database version.
    test_nusc = NuScenes(version='v1.0-test',
                         dataroot='./data/nuScenes/',
                         verbose=True)
    test_infos = generate_info(test_nusc, splits.test)
    mmcv.dump(test_infos, './data/nuScenes/nuscenes_infos_test.pkl')
# Script entry point: build the nuScenes info pickles.
if __name__ == '__main__':
    main()
================================================
FILE: scripts/visualize_nusc.py
================================================
import os
from argparse import ArgumentParser
import cv2
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import mmcv
import numpy as np
from nuscenes.utils.data_classes import Box, LidarPointCloud
from pyquaternion import Quaternion
from bevdepth.datasets.nusc_det_dataset import \
map_name_from_general_to_detection
def parse_args():
    """Parse the index / result-file / output-path CLI arguments."""
    parser = ArgumentParser(add_help=False)
    parser.add_argument('idx',
                        type=int,
                        help='Index of the dataset to be visualized.')
    parser.add_argument('result_path', help='Path of the result json file.')
    parser.add_argument('target_path',
                        help='Target path to save the visualization result.')
    return parser.parse_args()
def get_ego_box(box_dict, ego2global_rotation, ego2global_translation):
    """Convert a global-frame result box dict into an ego-frame vector.

    Returns:
        np.ndarray: [x, y, z, dx, dy, dz, yaw, vx, vy] in the ego frame.
    """
    box = Box(
        box_dict['translation'],
        box_dict['size'],
        Quaternion(box_dict['rotation']),
    )
    # Undo the ego->global transform: translate back, then rotate back.
    box.translate(-np.array(ego2global_translation))
    box.rotate(Quaternion(ego2global_rotation).inverse)
    center = np.array(box.center)
    # Reorder wlh -> (dx, dy, dz).
    dims = np.array(box.wlh)[[1, 0, 2]]
    yaw = np.array([box.orientation.yaw_pitch_roll[0]])
    velocity_xy = np.array(box.velocity[:2])
    return np.concatenate([center, dims, yaw, velocity_xy])
def rotate_points_along_z(points, angle):
    """Rotate batched points about the z-axis.

    Args:
        points: (B, N, 3 + C) array; only the first three channels rotate,
            any extra channels pass through unchanged.
        angle: (B,) rotation angles in radians (x rotates towards y).

    Returns:
        (B, N, 3 + C) rotated points.
    """
    cos_a, sin_a = np.cos(angle), np.sin(angle)
    zero, one = np.zeros_like(cos_a), np.ones_like(cos_a)
    # Row-vector convention: p' = p @ R.
    rot = np.stack(
        (cos_a, sin_a, zero, -sin_a, cos_a, zero, zero, zero, one),
        axis=1).reshape(-1, 3, 3)
    rotated_xyz = points[:, :, 0:3] @ rot
    return np.concatenate((rotated_xyz, points[:, :, 3:]), axis=-1)
def get_corners(boxes3d):
    """Compute the 8 corner coordinates of heading-aligned 3D boxes.

            7 -------- 4
           /|         /|
          6 -------- 5 .
          | |        | |
          . 3 -------- 0
          |/         |/
          2 -------- 1

    Args:
        boxes3d: (N, 7) [x, y, z, dx, dy, dz, heading], with (x, y, z)
            at the box center.

    Returns:
        (N, 8, 3) corner coordinates.
    """
    # Half-extent sign pattern for each of the 8 corners.
    signs = (np.array((
        [1, 1, -1],
        [1, -1, -1],
        [-1, -1, -1],
        [-1, 1, -1],
        [1, 1, 1],
        [1, -1, 1],
        [-1, -1, 1],
        [-1, 1, 1],
    )) / 2)
    # Scale the unit corners by each box's dimensions, then rotate by
    # heading and translate to the box center.
    corners = boxes3d[:, None, 3:6].repeat(8, axis=1) * signs[None, :, :]
    corners = rotate_points_along_z(corners.reshape(-1, 8, 3),
                                    boxes3d[:, 6]).reshape(-1, 8, 3)
    return corners + boxes3d[:, None, 0:3]
def get_bev_lines(corners):
    """Return the 4 BEV footprint edges as ([x0, x1], [y0, y1]) pairs."""
    lines = []
    for start in range(4):
        end = (start + 1) % 4
        lines.append([[corners[start, 0], corners[end, 0]],
                      [corners[start, 1], corners[end, 1]]])
    return lines
def get_3d_lines(corners):
    """Return the 12 projected box edges whose two endpoints both lie in
    front of the camera (positive value in the last column)."""
    edges = [[0, 1], [1, 2], [2, 3], [3, 0], [4, 5], [5, 6], [6, 7],
             [7, 4], [0, 4], [1, 5], [2, 6], [3, 7]]
    return [[[corners[st, 0], corners[ed, 0]],
             [corners[st, 1], corners[ed, 1]]]
            for st, ed in edges
            if corners[st, -1] > 0 and corners[ed, -1] > 0]
def get_cam_corners(corners, translation, rotation, cam_intrinsics):
    """Project ego-frame corners into image pixel coordinates.

    Corners that end up behind the camera (non-positive depth) are zeroed
    so that drawing code skips them.
    """
    projected = corners.copy()
    # Ego -> camera frame: translate, then apply the inverse rotation.
    projected -= np.array(translation)
    projected = projected @ Quaternion(rotation).inverse.rotation_matrix.T
    # Camera frame -> pixels via the intrinsic matrix.
    projected = projected @ np.array(cam_intrinsics).T
    # Depth test before the perspective division.
    in_front = projected[:, -1] > 0
    projected /= projected[:, 2:3]
    projected[~in_front] = 0
    return projected
def demo(
    idx,
    nusc_results_file,
    dump_file,
    threshold=0.0,
    show_range=60,
    # NOTE(review): mutable default argument; harmless here because it is
    # only read, but a tuple would be safer.
    show_classes=[
        'car',
        'truck',
        'construction_vehicle',
        'bus',
        'trailer',
        'barrier',
        'motorcycle',
        'bicycle',
        'pedestrian',
        'traffic_cone',
    ],
):
    """Render one validation sample: six camera panels with projected 3D
    prediction boxes, plus a BEV panel showing the lidar point cloud,
    ground-truth boxes (red) and predictions (green); saved to dump_file.

    Args:
        idx: Index into the cached validation info list.
        nusc_results_file: Path of a nuScenes-format result json.
        dump_file: Output image path.
        threshold: Minimum detection score for a box to be drawn.
        show_range: Max BEV distance (m) from ego for boxes to be drawn.
        show_classes: Detection class names to draw.
    """
    # Set cameras
    IMG_KEYS = [
        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_RIGHT',
        'CAM_BACK', 'CAM_BACK_LEFT'
    ]
    infos = mmcv.load('data/nuScenes/nuscenes_12hz_infos_val.pkl')
    assert idx < len(infos)
    # Get data from dataset
    results = mmcv.load(nusc_results_file)['results']
    info = infos[idx]
    lidar_path = info['lidar_infos']['LIDAR_TOP']['filename']
    # Points are stored 5 floats each (nuScenes layout); keep first 4
    # channels (x, y, z, intensity).
    lidar_points = np.fromfile(os.path.join('data/nuScenes', lidar_path),
                               dtype=np.float32,
                               count=-1).reshape(-1, 5)[..., :4]
    lidar_calibrated_sensor = info['lidar_infos']['LIDAR_TOP'][
        'calibrated_sensor']
    # Get point cloud
    pts = lidar_points.copy()
    # Approximate one ego pose by averaging the six cameras' poses.
    ego2global_rotation = np.mean(
        [info['cam_infos'][cam]['ego_pose']['rotation'] for cam in IMG_KEYS],
        0)
    ego2global_translation = np.mean([
        info['cam_infos'][cam]['ego_pose']['translation'] for cam in IMG_KEYS
    ], 0)
    # Move lidar points from the sensor frame into the ego frame.
    lidar_points = LidarPointCloud(lidar_points.T)
    lidar_points.rotate(
        Quaternion(lidar_calibrated_sensor['rotation']).rotation_matrix)
    lidar_points.translate(np.array(lidar_calibrated_sensor['translation']))
    pts = lidar_points.points.T
    # Get GT corners
    gt_corners = []
    for i in range(len(info['ann_infos'])):
        if map_name_from_general_to_detection[
                info['ann_infos'][i]['category_name']] in show_classes:
            box = get_ego_box(
                dict(
                    size=info['ann_infos'][i]['size'],
                    rotation=info['ann_infos'][i]['rotation'],
                    translation=info['ann_infos'][i]['translation'],
                ), ego2global_rotation, ego2global_translation)
            if np.linalg.norm(box[:2]) <= show_range:
                corners = get_corners(box[None])[0]
                gt_corners.append(corners)
    # Get prediction corners
    pred_corners, pred_class = [], []
    for box in results[info['sample_token']]:
        if box['detection_score'] >= threshold and box[
                'detection_name'] in show_classes:
            box3d = get_ego_box(box, ego2global_rotation,
                                ego2global_translation)
            box3d[2] += 0.5 * box3d[5]  # NOTE: shifts z by half the box
            # height -- presumably bottom-center to volume center; confirm.
            if np.linalg.norm(box3d[:2]) <= show_range:
                corners = get_corners(box3d[None])[0]
                pred_corners.append(corners)
                pred_class.append(box['detection_name'])
    # Set figure size
    plt.figure(figsize=(24, 8))
    for i, k in enumerate(IMG_KEYS):
        # Draw camera views
        # Cameras fill slots 1-3 and 5-7 of a 2x4 grid; the right-hand
        # column (slots 4/8) is left for the BEV panel below.
        fig_idx = i + 1 if i < 3 else i + 2
        plt.subplot(2, 4, fig_idx)
        # Set camera attributes
        plt.title(k)
        plt.axis('off')
        plt.xlim(0, 1600)
        plt.ylim(900, 0)
        img = mmcv.imread(
            os.path.join('data/nuScenes', info['cam_infos'][k]['filename']))
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        # Draw images
        plt.imshow(img)
        # Draw 3D predictions
        for corners, cls in zip(pred_corners, pred_class):
            cam_corners = get_cam_corners(
                corners,
                info['cam_infos'][k]['calibrated_sensor']['translation'],
                info['cam_infos'][k]['calibrated_sensor']['rotation'],
                info['cam_infos'][k]['calibrated_sensor']['camera_intrinsic'])
            lines = get_3d_lines(cam_corners)
            for line in lines:
                plt.plot(line[0],
                         line[1],
                         c=cm.get_cmap('tab10')(show_classes.index(cls)))
    # Draw BEV
    plt.subplot(1, 4, 4)
    # Set BEV attributes
    plt.title('LIDAR_TOP')
    plt.axis('equal')
    plt.xlim(-40, 40)
    plt.ylim(-40, 40)
    # Draw point cloud
    plt.scatter(-pts[:, 1], pts[:, 0], s=0.01, c=pts[:, -1], cmap='gray')
    # Draw BEV GT boxes
    for corners in gt_corners:
        lines = get_bev_lines(corners)
        for line in lines:
            plt.plot([-x for x in line[1]],
                     line[0],
                     c='r',
                     label='ground truth')
    # Draw BEV predictions
    for corners in pred_corners:
        lines = get_bev_lines(corners)
        for line in lines:
            plt.plot([-x for x in line[1]], line[0], c='g', label='prediction')
    # Set legend
    # Deduplicate legend entries: one handle per label, not one per line.
    handles, labels = plt.gca().get_legend_handles_labels()
    by_label = dict(zip(labels, handles))
    plt.legend(by_label.values(),
               by_label.keys(),
               loc='upper right',
               framealpha=1)
    # Save figure
    plt.tight_layout(w_pad=0, h_pad=2)
    plt.savefig(dump_file)
# CLI entry point: visualize one sample's predictions against the GT.
if __name__ == '__main__':
    args = parse_args()
    demo(
        args.idx,
        args.result_path,
        args.target_path,
    )
================================================
FILE: setup.py
================================================
import os
import torch
from setuptools import find_packages, setup
from torch.utils.cpp_extension import (BuildExtension, CppExtension,
CUDAExtension)
# The package's long description comes straight from the README.
with open('README.md', 'r') as fh:
    long_description = fh.read()
def make_cuda_ext(name,
                  module,
                  sources,
                  sources_cuda=None,
                  extra_args=None,
                  extra_include_path=None):
    """Create a Cpp/CUDA extension for ``module.name``.

    Builds a CUDAExtension when CUDA is available (or FORCE_CUDA=1),
    otherwise falls back to a plain CppExtension without the .cu sources.

    Args:
        name (str): Extension name (last dotted component).
        module (str): Dotted package path that holds the source files.
        sources (list[str]): C++ sources, relative to the package dir.
        sources_cuda (list[str] | None): CUDA sources, compiled only on
            the CUDA path.
        extra_args (list[str] | None): Extra compiler flags.
        extra_include_path (list[str] | None): Extra include directories.

    Returns:
        setuptools.Extension: The configured extension module.
    """
    # Fix the original's mutable default arguments, and copy so the
    # caller's lists are never mutated (the old `sources += sources_cuda`
    # appended the CUDA files into the caller's `sources` list).
    sources = list(sources)
    sources_cuda = list(sources_cuda) if sources_cuda else []
    extra_args = list(extra_args) if extra_args else []
    extra_include_path = list(extra_include_path) if extra_include_path else []
    define_macros = []
    extra_compile_args = {'cxx': list(extra_args)}
    if torch.cuda.is_available() or os.getenv('FORCE_CUDA', '0') == '1':
        define_macros += [('WITH_CUDA', None)]
        extension = CUDAExtension
        # Disable the half-precision operator shortcuts for nvcc builds.
        extra_compile_args['nvcc'] = extra_args + [
            '-D__CUDA_NO_HALF_OPERATORS__',
            '-D__CUDA_NO_HALF_CONVERSIONS__',
            '-D__CUDA_NO_HALF2_OPERATORS__',
        ]
        sources = sources + sources_cuda
    else:
        print('Compiling {} without CUDA'.format(name))
        extension = CppExtension
    return extension(
        name='{}.{}'.format(module, name),
        # Source paths are given relative to the package directory.
        sources=[os.path.join(*module.split('.'), p) for p in sources],
        include_dirs=extra_include_path,
        define_macros=define_macros,
        extra_compile_args=extra_compile_args,
    )
# Package definition: metadata plus the two CUDA voxel-pooling extension
# modules, compiled via torch's BuildExtension.
setup(
    name='BEVDepth',
    version='0.0.1',
    author='Megvii',
    author_email='liyinhao@megvii.com',
    description='Code for BEVDepth',
    long_description=long_description,
    long_description_content_type='text/markdown',
    url=None,
    packages=find_packages(),
    classifiers=[
        'Programming Language :: Python :: 3',
        'Operating System :: OS Independent',
    ],
    install_requires=[],
    ext_modules=[
        # Training-time voxel pooling (records the point->cell map used
        # by the custom backward pass).
        make_cuda_ext(
            name='voxel_pooling_train_ext',
            module='bevdepth.ops.voxel_pooling_train',
            sources=['src/voxel_pooling_train_forward.cpp'],
            sources_cuda=['src/voxel_pooling_train_forward_cuda.cu'],
        ),
        # Inference-only voxel pooling variant.
        make_cuda_ext(
            name='voxel_pooling_inference_ext',
            module='bevdepth.ops.voxel_pooling_inference',
            sources=['src/voxel_pooling_inference_forward.cpp'],
            sources_cuda=['src/voxel_pooling_inference_forward_cuda.cu'],
        ),
    ],
    cmdclass={'build_ext': BuildExtension},
)
================================================
FILE: test/test_dataset/test_nusc_mv_det_dataset.py
================================================
import unittest
import numpy as np
import torch
from bevdepth.datasets.nusc_det_dataset import NuscDetDataset
# Detection class names shared by the nuScenes experiments.
CLASSES = [
    'car',
    'truck',
    'construction_vehicle',
    'bus',
    'trailer',
    'barrier',
    'motorcycle',
    'bicycle',
    'pedestrian',
    'traffic_cone',
]
# Raw nuScenes camera resolution in pixels.
H = 900
W = 1600
# Network input resolution after image augmentation.
final_dim = (256, 704)
# Image normalization constants (standard ImageNet statistics) plus
# BGR->RGB conversion.
img_conf = dict(img_mean=[123.675, 116.28, 103.53],
                img_std=[58.395, 57.12, 57.375],
                to_rgb=True)
# Image data augmentation: fixed resize/rotation, random horizontal flip.
ida_aug_conf = {
    'resize_lim': (0.4, 0.4),
    'final_dim': final_dim,
    'rot_lim': (0, 0),
    'H': H,
    'W': W,
    'rand_flip': True,
    'bot_pct_lim': (0.0, 0.0),
    'cams': [
        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
        'CAM_BACK', 'CAM_BACK_RIGHT'
    ],
    'Ncams': 6,
}
# BEV augmentation disabled: identity rotation/scale, no flips.
bda_aug_conf = {
    'rot_lim': (0, 0),
    'scale_lim': (1, 1),
    'flip_dx_ratio': 0,
    'flip_dy_ratio': 0
}
class TestNuscMVDetData(unittest.TestCase):
    """Regression test for NuscDetDataset item loading.

    Uses the small fixture dataset under ./test/data/nuscenes; the
    expected means below were recorded from a known-good run.
    """

    def test_voxel_pooling(self):
        # Fix the RNGs so the random IDA/BDA augmentations are
        # reproducible and the recorded means stay valid.
        np.random.seed(0)
        torch.random.manual_seed(0)
        nusc = NuscDetDataset(ida_aug_conf,
                              bda_aug_conf,
                              CLASSES,
                              './test/data/nuscenes',
                              './test/data/nuscenes/infos.pkl',
                              True,
                              sweep_idxes=[4])
        ret_list = nusc[0]
        # Mean-value fingerprints of the first six returned tensors
        # (presumably images and transform matrices -- confirm the exact
        # order against NuscDetDataset.__getitem__).
        assert torch.isclose(ret_list[0].mean(),
                             torch.tensor(-0.4667),
                             rtol=1e-3)
        assert torch.isclose(ret_list[1].mean(),
                             torch.tensor(0.1678),
                             rtol=1e-3)
        assert torch.isclose(ret_list[2].mean(),
                             torch.tensor(230.0464),
                             rtol=1e-3)
        assert torch.isclose(ret_list[3].mean(),
                             torch.tensor(8.3250),
                             rtol=1e-3)
        assert torch.isclose(ret_list[4].mean(), torch.tensor(0.25), rtol=1e-3)
        assert torch.isclose(ret_list[5].mean(), torch.tensor(0.25), rtol=1e-3)
================================================
FILE: test/test_layers/test_backbone.py
================================================
import unittest
import pytest
import torch
from bevdepth.layers.backbones.base_lss_fpn import BaseLSSFPN
class TestLSSFPN(unittest.TestCase):
    """Shape test for the BaseLSSFPN camera backbone / view transformer."""

    def setUp(self) -> None:
        # Deliberately tiny configuration (20 m x 20 m grid at 0.5 m,
        # ResNet-18 with 8 base channels) so the test runs quickly.
        backbone_conf = {
            'x_bound': [-10, 10, 0.5],
            'y_bound': [-10, 10, 0.5],
            'z_bound': [-5, 3, 8],
            'd_bound': [2.0, 22, 1.0],
            'final_dim': [64, 64],
            'output_channels': 10,
            'downsample_factor': 16,
            'img_backbone_conf': dict(type='ResNet',
                                      depth=18,
                                      frozen_stages=0,
                                      out_indices=[0, 1, 2, 3],
                                      norm_eval=False,
                                      base_channels=8),
            'img_neck_conf': dict(
                type='SECONDFPN',
                in_channels=[8, 16, 32, 64],
                upsample_strides=[0.25, 0.5, 1, 2],
                out_channels=[16, 16, 16, 16],
            ),
            'depth_net_conf': dict(in_channels=64, mid_channels=64),
        }
        self.lss_fpn = BaseLSSFPN(**backbone_conf).cuda()

    @pytest.mark.skipif(torch.cuda.is_available() is False,
                        reason='No gpu available.')
    def test_forward(self):
        # Batch of 2, 2 sweeps, 6 cameras, 3x64x64 images, plus 4x4
        # transform matrices per camera and sweep.
        sweep_imgs = torch.rand(2, 2, 6, 3, 64, 64).cuda()
        sensor2ego_mats = torch.rand(2, 2, 6, 4, 4).cuda()
        intrin_mats = torch.rand(2, 2, 6, 4, 4).cuda()
        ida_mats = torch.rand(2, 2, 6, 4, 4).cuda()
        sensor2sensor_mats = torch.rand(2, 2, 6, 4, 4).cuda()
        bda_mat = torch.rand(2, 4, 4).cuda()
        mats_dict = dict()
        mats_dict['sensor2ego_mats'] = sensor2ego_mats
        mats_dict['intrin_mats'] = intrin_mats
        mats_dict['ida_mats'] = ida_mats
        mats_dict['sensor2sensor_mats'] = sensor2sensor_mats
        mats_dict['bda_mat'] = bda_mat
        preds = self.lss_fpn.forward(sweep_imgs, mats_dict)
        # 20 channels (10 output channels x 2 sweeps) on the 40x40 BEV
        # grid implied by the 20 m span / 0.5 m voxel bounds above.
        assert preds.shape == torch.Size([2, 20, 40, 40])
================================================
FILE: test/test_layers/test_head.py
================================================
import unittest
import pytest
import torch
from mmdet3d.core.bbox.structures.lidar_box3d import LiDARInstance3DBoxes
from bevdepth.layers.heads.bev_depth_head import BEVDepthHead
class TestLSSFPN(unittest.TestCase):
    """Shape tests for BEVDepthHead forward, target assignment, and
    bbox decoding, using a scaled-down CenterPoint-style config."""

    def setUp(self) -> None:
        # Small BEV backbone/neck so the head builds and runs quickly.
        bev_backbone = dict(
            type='ResNet',
            in_channels=10,
            depth=18,
            num_stages=3,
            strides=(1, 2, 2),
            dilations=(1, 1, 1),
            out_indices=[0, 1, 2],
            norm_eval=False,
            base_channels=20,
        )
        bev_neck = dict(type='SECONDFPN',
                        in_channels=[10, 20, 40, 80],
                        upsample_strides=[1, 2, 4, 8],
                        out_channels=[8, 8, 8, 8])
        # One detection head per task group of classes.
        TASKS = [
            dict(num_class=1, class_names=['car']),
            dict(num_class=2, class_names=['truck', 'construction_vehicle']),
            dict(num_class=2, class_names=['bus', 'trailer']),
            dict(num_class=1, class_names=['barrier']),
            dict(num_class=2, class_names=['motorcycle', 'bicycle']),
            dict(num_class=2, class_names=['pedestrian', 'traffic_cone']),
        ]
        # Per-branch (out_channels, num_conv) specs.
        common_heads = dict(reg=(2, 2),
                            height=(1, 2),
                            dim=(3, 2),
                            rot=(2, 2),
                            vel=(2, 2))
        bbox_coder = dict(
            type='CenterPointBBoxCoder',
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            max_num=500,
            score_threshold=0.1,
            out_size_factor=32,
            voxel_size=[0.2, 0.2, 8],
            pc_range=[-51.2, -51.2, -5, 51.2, 51.2, 3],
            code_size=9,
        )
        train_cfg = dict(
            point_cloud_range=[-51.2, -51.2, -5, 51.2, 51.2, 3],
            grid_size=[512, 512, 1],
            voxel_size=[0.2, 0.2, 8],
            out_size_factor=32,
            dense_reg=1,
            gaussian_overlap=0.1,
            max_objs=500,
            min_radius=2,
            code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.5, 0.5],
        )
        test_cfg = dict(
            post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            max_per_img=500,
            max_pool_nms=False,
            min_radius=[4, 12, 10, 1, 0.85, 0.175],
            score_threshold=0.1,
            out_size_factor=4,
            voxel_size=[0.2, 0.2, 8],
            nms_type='circle',
            pre_max_size=1000,
            post_max_size=83,
            nms_thr=0.2,
        )
        head_conf = {
            'bev_backbone_conf': bev_backbone,
            'bev_neck_conf': bev_neck,
            'tasks': TASKS,
            'common_heads': common_heads,
            'bbox_coder': bbox_coder,
            'train_cfg': train_cfg,
            'test_cfg': test_cfg,
            'in_channels': 32,  # Equal to bev_neck output_channels.
            'loss_cls': dict(type='GaussianFocalLoss', reduction='mean'),
            'loss_bbox': dict(type='L1Loss',
                              reduction='mean',
                              loss_weight=0.25),
            'gaussian_overlap': 0.1,
            'min_radius': 2,
        }
        self.bevdet_head = BEVDepthHead(**head_conf).cuda()

    @pytest.mark.skipif(torch.cuda.is_available() is False,
                        reason='No gpu available.')
    def test_forward(self):
        # One prediction dict per task, each with the five regression
        # branches plus a per-class heatmap.
        x = torch.rand(2, 10, 32, 32).cuda()
        ret_results = self.bevdet_head.forward(x)
        assert len(ret_results) == 6
        assert ret_results[0][0]['reg'].shape == torch.Size([2, 2, 32, 32])
        assert ret_results[0][0]['height'].shape == torch.Size([2, 1, 32, 32])
        assert ret_results[0][0]['dim'].shape == torch.Size([2, 3, 32, 32])
        assert ret_results[0][0]['rot'].shape == torch.Size([2, 2, 32, 32])
        assert ret_results[0][0]['vel'].shape == torch.Size([2, 2, 32, 32])
        assert ret_results[0][0]['heatmap'].shape == torch.Size([2, 1, 32, 32])

    @pytest.mark.skipif(torch.cuda.is_available() is False,
                        reason='No gpu available.')
    def test_get_targets(self):
        # Random 9-DoF boxes; xy scaled up so centers spread over the grid.
        gt_boxes_3d_0 = torch.rand(10, 9).cuda()
        gt_boxes_3d_1 = torch.rand(15, 9).cuda()
        gt_boxes_3d_0[:, :2] *= 10
        gt_boxes_3d_1[:, :2] *= 10
        gt_labels_3d_0 = torch.randint(0, 10, (10, )).cuda()
        gt_labels_3d_1 = torch.randint(0, 10, (15, )).cuda()
        gt_boxes_3d = [gt_boxes_3d_0, gt_boxes_3d_1]
        gt_labels_3d = [gt_labels_3d_0, gt_labels_3d_1]
        heatmaps, anno_boxes, inds, masks = self.bevdet_head.get_targets(
            gt_boxes_3d, gt_labels_3d)
        # One target set per task; max_objs=500 rows per sample.
        assert len(heatmaps) == 6
        assert len(anno_boxes) == 6
        assert len(inds) == 6
        assert len(masks) == 6
        assert heatmaps[0].shape == torch.Size([2, 1, 16, 16])
        assert anno_boxes[0].shape == torch.Size([2, 500, 10])
        assert inds[0].shape == torch.Size([2, 500])
        assert masks[0].shape == torch.Size([2, 500])

    @pytest.mark.skipif(torch.cuda.is_available() is False,
                        reason='No gpu available.')
    def test_get_bboxes(self):
        x = torch.rand(2, 10, 32, 32).cuda()
        ret_results = self.bevdet_head.forward(x)
        img_metas = [
            dict(box_type_3d=LiDARInstance3DBoxes),
            dict(box_type_3d=LiDARInstance3DBoxes)
        ]
        pred_bboxes = self.bevdet_head.get_bboxes(ret_results,
                                                  img_metas=img_metas)
        # Two samples, each decoded to (boxes, scores, labels).
        assert len(pred_bboxes) == 2
        assert len(pred_bboxes[0]) == 3
        assert pred_bboxes[0][1].shape == torch.Size([498])
        assert pred_bboxes[0][2].shape == torch.Size([498])
================================================
FILE: test/test_layers/test_matrixvt.py
================================================
import unittest
import torch
from bevdepth.layers.backbones.matrixvt import MatrixVT
class TestMatrixVT(unittest.TestCase):
    """Smoke test for the MatrixVT view-transformer backbone."""

    def setUp(self) -> None:
        """Build a small MatrixVT model and store it on the test case.

        The original returned the model instead of assigning it, so the
        model built by unittest's automatic setUp() call was discarded and
        test methods had to call self.setUp() again by hand.
        """
        backbone_conf = {
            'x_bound': [-10, 10, 0.5],
            'y_bound': [-10, 10, 0.5],
            'z_bound': [-5, 3, 8],
            'd_bound': [2.0, 22, 1.0],
            'final_dim': [64, 64],
            'output_channels': 10,
            'downsample_factor': 16,
            'img_backbone_conf': dict(
                type='ResNet',
                depth=18,
                frozen_stages=0,
                out_indices=[0, 1, 2, 3],
                norm_eval=False,
                base_channels=8,
            ),
            'img_neck_conf': dict(
                type='SECONDFPN',
                in_channels=[8, 16, 32, 64],
                upsample_strides=[0.25, 0.5, 1, 2],
                out_channels=[16, 16, 16, 16],
            ),
            'depth_net_conf': dict(in_channels=64, mid_channels=64),
        }
        self.model = MatrixVT(**backbone_conf)

    def test_forward(self):
        """Forward a dummy batch and check BEV-feature / depth shapes."""
        bev_feature, depth = self.model(
            torch.rand((2, 1, 6, 3, 64, 64)),
            {
                'sensor2ego_mats': torch.rand((2, 1, 6, 4, 4)),
                'intrin_mats': torch.rand((2, 1, 6, 4, 4)),
                'ida_mats': torch.rand((2, 1, 6, 4, 4)),
                'sensor2sensor_mats': torch.rand((2, 1, 6, 4, 4)),
                'bda_mat': torch.rand((2, 4, 4)),
            },
            is_return_depth=True,
        )
        assert bev_feature.shape == torch.Size([2, 10, 40, 40])
        assert depth.shape == torch.Size([12, 20, 4, 4])
================================================
FILE: test/test_ops/test_voxel_pooling.py
================================================
import unittest
import pytest
import torch
from bevdepth.ops.voxel_pooling_train import voxel_pooling_train
class TestLSSFPN(unittest.TestCase):
    """Checks the CUDA voxel-pooling op against a pure-PyTorch reference."""

    @pytest.mark.skipif(condition=torch.cuda.is_available() is False,
                        reason='No gpu available.')
    def test_voxel_pooling(self):
        import numpy as np
        np.random.seed(0)
        torch.manual_seed(0)
        # Random voxel coordinates in [-80, 80); z squashed towards zero
        # so most points land in the single z-slice.
        geom_xyz = torch.rand([2, 6, 10, 10, 10, 3]) * 160 - 80
        geom_xyz[..., 2] /= 100
        geom_xyz = geom_xyz.reshape(2, -1, 3)
        features = torch.rand([2, 6, 10, 10, 10, 80]) - 0.5
        flat_features = features.reshape(2, -1, 80)
        # Reference: scatter-add every in-range point into its BEV cell.
        expected = features.new_zeros(2, 128, 128, 80)
        for b in range(2):
            for p in range(geom_xyz.shape[1]):
                vx = geom_xyz[b, p, 0].int()
                vy = geom_xyz[b, p, 1].int()
                vz = geom_xyz[b, p, 2].int()
                if 0 <= vx < 128 and 0 <= vy < 128 and 0 <= vz < 1:
                    expected[b, vy, vx, :] += flat_features[b, p, :]
        expected = expected.permute(0, 3, 1, 2).cuda()
        actual = voxel_pooling_train(
            geom_xyz.cuda().int(), features.cuda(),
            torch.tensor([128, 128, 1], dtype=torch.int, device='cuda'))
        assert torch.allclose(expected.cuda(), actual, rtol=1e-3)