Showing preview only (9,959K chars total). Download the full file or copy to clipboard to get everything.
Repository: huang-yh/SelfOcc
Branch: main
Commit: 1658c2afd464
Files: 181
Total size: 9.5 MB
Directory structure:
gitextract_ciq3vmv7/
├── .gitignore
├── LICENSE
├── README.md
├── assets/
│ └── .placeholder
├── config/
│ ├── _base_/
│ │ ├── dataset_v1.py
│ │ ├── optimizer.py
│ │ └── schedule.py
│ ├── kitti/
│ │ ├── kitti_novel_depth.py
│ │ └── kitti_occ.py
│ ├── kitti_raw/
│ │ └── kitti_raw_depth.py
│ ├── nuscenes/
│ │ ├── nuscenes_depth.py
│ │ ├── nuscenes_novel_depth.py
│ │ ├── nuscenes_occ.py
│ │ └── nuscenes_occ_bev.py
│ └── openseed/
│ └── openseed_swint_lang.yaml
├── dataset/
│ ├── __init__.py
│ ├── dataset_one_frame_eval.py
│ ├── dataset_one_frame_sweeps_dist.py
│ ├── dataset_one_frame_sweeps_dist_vis.py
│ ├── dataset_wrapper_temporal.py
│ ├── dataset_wrapper_vis.py
│ ├── kitti/
│ │ ├── helpers.py
│ │ ├── io_data.py
│ │ ├── kitti_dataset_eval.py
│ │ ├── kitti_dataset_one_frame.py
│ │ ├── params.py
│ │ └── semantic-kitti.yaml
│ ├── kitti_raw/
│ │ ├── kitti_raw_dataset.py
│ │ ├── kitti_raw_dataset_stereo.py
│ │ ├── orb-slam_poses/
│ │ │ ├── 2011_09_26/
│ │ │ │ ├── 2011_09_26_drive_0001_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0002_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0005_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0009_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0011_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0013_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0014_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0015_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0017_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0018_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0019_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0020_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0022_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0023_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0027_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0028_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0029_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0032_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0035_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0036_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0039_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0046_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0048_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0051_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0052_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0056_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0057_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0059_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0060_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0061_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0064_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0070_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0079_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0084_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0086_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0087_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0091_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0093_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0095_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0096_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0101_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0104_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0106_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0113_sync.txt
│ │ │ │ └── 2011_09_26_drive_0117_sync.txt
│ │ │ ├── 2011_09_28/
│ │ │ │ ├── 2011_09_28_drive_0001_sync.txt
│ │ │ │ └── 2011_09_28_drive_0002_sync.txt
│ │ │ ├── 2011_09_29/
│ │ │ │ ├── 2011_09_29_drive_0004_sync.txt
│ │ │ │ ├── 2011_09_29_drive_0026_sync.txt
│ │ │ │ └── 2011_09_29_drive_0071_sync.txt
│ │ │ ├── 2011_09_30/
│ │ │ │ ├── 2011_09_30_drive_0016_sync.txt
│ │ │ │ ├── 2011_09_30_drive_0018_sync.txt
│ │ │ │ ├── 2011_09_30_drive_0020_sync.txt
│ │ │ │ ├── 2011_09_30_drive_0027_sync.txt
│ │ │ │ ├── 2011_09_30_drive_0028_sync.txt
│ │ │ │ ├── 2011_09_30_drive_0033_sync.txt
│ │ │ │ └── 2011_09_30_drive_0034_sync.txt
│ │ │ └── 2011_10_03/
│ │ │ ├── 2011_10_03_drive_0027_sync.txt
│ │ │ ├── 2011_10_03_drive_0034_sync.txt
│ │ │ ├── 2011_10_03_drive_0042_sync.txt
│ │ │ └── 2011_10_03_drive_0047_sync.txt
│ │ └── splits/
│ │ ├── eigen/
│ │ │ └── test_files.txt
│ │ ├── eigen_zhou/
│ │ │ ├── test_files.txt
│ │ │ ├── train_files.txt
│ │ │ └── val_files.txt
│ │ └── tulsiani/
│ │ ├── test_files.txt
│ │ ├── train_files.txt
│ │ └── val_files.txt
│ ├── loading.py
│ ├── sampler.py
│ ├── transform_3d.py
│ └── utils.py
├── docs/
│ ├── get_started.md
│ ├── installation.md
│ ├── prepare_data.md
│ └── visualization.md
├── eval_depth.py
├── eval_iou.py
├── eval_iou_kitti.py
├── eval_novel_depth.py
├── eval_novel_depth_kitti.py
├── examine_sweeps.py
├── generate_videos.py
├── loss/
│ ├── __init__.py
│ ├── base_loss.py
│ ├── edge_loss_3d_ms.py
│ ├── eikonal_loss.py
│ ├── multi_loss.py
│ ├── reproj_loss_mono_multi_new.py
│ ├── reproj_loss_mono_multi_new_combine.py
│ ├── rgb_loss_ms.py
│ ├── second_grad_loss.py
│ └── sparsity_loss.py
├── model/
│ ├── __init__.py
│ ├── backbone/
│ │ ├── __init__.py
│ │ └── unet2d.py
│ ├── encoder/
│ │ ├── __init__.py
│ │ ├── base_encoder.py
│ │ ├── bevformer/
│ │ │ ├── __init__.py
│ │ │ ├── attention/
│ │ │ │ ├── __init__.py
│ │ │ │ └── image_cross_attention.py
│ │ │ ├── bevformer_encoder.py
│ │ │ ├── bevformer_encoder_layer.py
│ │ │ ├── bevformer_pos_embed.py
│ │ │ ├── mappings.py
│ │ │ ├── mappings_old.py
│ │ │ └── utils.py
│ │ └── tpvformer/
│ │ ├── __init__.py
│ │ ├── attention/
│ │ │ ├── __init__.py
│ │ │ ├── cross_view_hybrid_attention.py
│ │ │ └── image_cross_attention.py
│ │ ├── modules/
│ │ │ ├── __init__.py
│ │ │ ├── camera_se_net.py
│ │ │ ├── split_fpn.py
│ │ │ └── split_norm.py
│ │ ├── tpvformer_encoder.py
│ │ ├── tpvformer_encoder_layer.py
│ │ ├── tpvformer_pos_embed.py
│ │ └── utils.py
│ ├── head/
│ │ ├── __init__.py
│ │ ├── base_head.py
│ │ ├── nerfacc_head/
│ │ │ ├── bev_nerf.py
│ │ │ ├── estimator.py
│ │ │ ├── img2lidar.py
│ │ │ ├── nerfacc_head.py
│ │ │ ├── ray_sampler.py
│ │ │ └── rendering.py
│ │ ├── neus_head/
│ │ │ └── neus_head.py
│ │ └── utils/
│ │ └── sh_render.py
│ ├── lifter/
│ │ ├── __init__.py
│ │ ├── base_lifter.py
│ │ ├── bev_query_lifter.py
│ │ ├── tpv_pos_lifter.py
│ │ └── tpv_query_lifter.py
│ ├── neck/
│ │ ├── __init__.py
│ │ └── identity_neck.py
│ └── segmentor/
│ ├── __init__.py
│ ├── base_segmentor.py
│ └── tpv_segmentor.py
├── train.py
├── utils/
│ ├── __init__.py
│ ├── config_tools.py
│ ├── feat_tools.py
│ ├── metric_util.py
│ ├── misc.py
│ ├── openseed_utils.py
│ ├── scenerf_metric.py
│ ├── tb_wrapper.py
│ └── temporal_pkl.py
├── vis_3d.py
├── vis_3d_scene.py
└── vis_pics.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
ckpts
data
.DS_Store
out
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: README.md
================================================
# SelfOcc: Self-Supervised Vision-Based 3D Occupancy Prediction
### [Paper](https://arxiv.org/pdf/2311.12754) | [Project Page](https://huang-yh.github.io/SelfOcc/)
> SelfOcc: Self-Supervised Vision-Based 3D Occupancy Prediction, CVPR 2024
> [Yuanhui Huang](https://scholar.google.com/citations?hl=zh-CN&user=LKVgsk4AAAAJ)*, [Wenzhao Zheng](https://wzzheng.net/)\* $\dagger$, [Borui Zhang](https://boruizhang.site/), [Jie Zhou](https://scholar.google.com/citations?user=6a79aPwAAAAJ&hl=en&authuser=1), [Jiwen Lu](http://ivg.au.tsinghua.edu.cn/Jiwen_Lu/)$\ddagger$
\* Equal contribution $\dagger$ Project leader $\ddagger$ Corresponding author
SelfOcc empowers 3D autonomous driving world models (e.g., [OccWorld](https://github.com/wzzheng/OccWorld)) with scalable 3D representations, paving the way for **interpretable end-to-end large driving models**.
## News
- **[2024/9/5]** Check out our recent [GaussianFormer](https://github.com/huang-yh/GaussianFormer) for object-centric 3D occupancy prediction.
- **[2024/4/30]** Visualization code for 2D, 3D and video is released.
- **[2024/2/26]** SelfOcc is accepted to CVPR 2024!
- **[2023/12/16]** Training code release.
- **[2023/11/28]** Evaluation code release.
- **[2023/11/20]** Paper released on [arXiv](https://arxiv.org/abs/2311.12754).
- **[2023/11/20]** Demo release.
## Demo
### Trained using only video sequences and poses:

### Trained using an additional off-the-shelf 2D segmentor ([OpenSeeD](https://github.com/IDEA-Research/OpenSeeD)):


### More demo videos can be downloaded [here](https://cloud.tsinghua.edu.cn/d/640283b528f7436193a4/).
## Overview

- We first transform the images into the 3D space (e.g., bird's eye view, [tri-perspective view](https://github.com/wzzheng/TPVFormer)) to obtain 3D representation of the scene. We directly impose constraints on the 3D representations by treating them as signed distance fields. We can then render 2D images of previous and future frames as self-supervision signals to learn the 3D representations.
- Our SelfOcc outperforms the previous best method SceneRF by 58.7% using a single frame as input on SemanticKITTI and is the first self-supervised work that produces reasonable 3D occupancy for surround cameras on nuScenes.
- SelfOcc produces high-quality depth and achieves state-of-the-art results on **novel depth synthesis**, **monocular depth estimation**, and **surround-view depth estimation** on the SemanticKITTI, KITTI-2015, and nuScenes, respectively.
## Results
<img src=./assets/results.png height="480px" width="563px" />
## Getting Started
### Installation
Follow detailed instructions in [Installation](docs/installation.md).
### Preparing Dataset
Follow detailed instructions in [Prepare Dataset](docs/prepare_data.md).
We also provide our code for synchronizing sweep data according to keyframe samples.
### Run
[23/12/16 Update] Please update the timm package to 0.9.2 to run the training script.
#### 3D Occupancy Prediction
Download model weights [HERE](https://cloud.tsinghua.edu.cn/f/831c104c82a244e9878a/) and put it under out/nuscenes/occ/
```bash
# train
python train.py --py-config config/nuscenes/nuscenes_occ.py --work-dir out/nuscenes/occ_train --depth-metric
# eval
python eval_iou.py --py-config config/nuscenes/nuscenes_occ.py --work-dir out/nuscenes/occ --resume-from out/nuscenes/occ/model_state_dict.pth --occ3d --resolution 0.4 --sem --use-mask --scene-size 4
```
#### Novel Depth Synthesis
Download model weights [HERE](https://cloud.tsinghua.edu.cn/f/2d217cd298a34ed19039/) and put it under out/nuscenes/novel_depth/
```bash
# train
python train.py --py-config config/nuscenes/nuscenes_novel_depth.py --work-dir out/nuscenes/novel_depth_train --depth-metric
# eval
python eval_novel_depth.py --py-config config/nuscenes/nuscenes_novel_depth.py --work-dir out/nuscenes/novel_depth --resume-from out/nuscenes/novel_depth/model_state_dict.pth
```
#### Depth Estimation
Download model weights [HERE](https://cloud.tsinghua.edu.cn/f/1a722b9139234542ae1e/) and put it under out/nuscenes/depth/
```bash
# train
python train.py --py-config config/nuscenes/nuscenes_depth.py --work-dir out/nuscenes/depth_train --depth-metric
# eval
python eval_depth.py --py-config config/nuscenes/nuscenes_depth.py --work-dir out/nuscenes/depth --resume-from out/nuscenes/depth/model_state_dict.pth --depth-metric --batch 90000
```
Note that evaluating at a resolution (450\*800) of 1:2 against the raw image (900\*1600) takes about 90 min, because we batchify rays for rendering due to GPU memory limit. You can change the rendering resolution by the variable *NUM_RAYS* in utils/config_tools.py
More details on more datasets are detailed in [Run and Eval](docs/get_started.md).
### Visualization
Follow detailed instructions in [Visualization](docs/visualization.md).
## Related Projects
Our code is based on [TPVFormer](https://github.com/wzzheng/TPVFormer) and [PointOcc](https://github.com/wzzheng/PointOcc).
Also thanks to these excellent open-sourced repos:
[SurroundOcc](https://github.com/weiyithu/SurroundOcc)
[OccFormer](https://github.com/zhangyp15/OccFormer)
[BEVFormer](https://github.com/fundamentalvision/BEVFormer)
A pioneering work on object-centric 3D occupancy prediction: [GaussianFormer](https://github.com/huang-yh/GaussianFormer).
## Citation
If you find this project helpful, please consider citing the following paper:
```
@article{huang2023self,
title={SelfOcc: Self-Supervised Vision-Based 3D Occupancy Prediction},
author={Huang, Yuanhui and Zheng, Wenzhao and Zhang, Borui and Zhou, Jie and Lu, Jiwen},
journal={arXiv preprint arXiv:2311.12754},
year={2023}
}
```
================================================
FILE: assets/.placeholder
================================================
================================================
FILE: config/_base_/dataset_v1.py
================================================
# Base dataset configuration for nuScenes (v1.0-trainval).
data_path = 'data/nuscenes/'

# Dataset wrapper settings: images are downscaled to half resolution and
# channel-swap photometric augmentation is disabled.
train_wrapper_config = {
    'phase': 'train',
    'scale_rate': 0.5,
    'photometric_aug': {'use_swap_channel': False},
}

val_wrapper_config = {
    'phase': 'val',
    'scale_rate': 0.5,
    'photometric_aug': {'use_swap_channel': False},
}

# Arguments forwarded to the nuScenes devkit constructor.
nusc = {
    'version': 'v1.0-trainval',
    'dataroot': data_path,
}

# Dataloader settings: single-sample batches, one worker each.
train_loader = {
    'batch_size': 1,
    'shuffle': True,
    'num_workers': 1,
}

val_loader = {
    'batch_size': 1,
    'shuffle': False,
    'num_workers': 1,
}
================================================
FILE: config/_base_/optimizer.py
================================================
# Base optimizer configuration: AdamW, with the image backbone trained at a
# 0.1x learning-rate multiplier (via paramwise_cfg custom_keys).
optimizer = {
    'optimizer': {
        'type': 'AdamW',
        'lr': 2e-5,
        'weight_decay': 0.0001,
    },
    'paramwise_cfg': {
        'custom_keys': {'img_backbone': {'lr_mult': 0.1}},
    },
}

# Gradient-clipping threshold (max gradient norm).
grad_max_norm = 35
================================================
FILE: config/_base_/schedule.py
================================================
# Base training schedule.
print_freq = 50   # log every 50 iterations
max_epochs = 12   # total training epochs (configs may override this)
================================================
FILE: config/kitti/kitti_novel_depth.py
================================================
# SelfOcc: KITTI novel-depth-synthesis configuration (extends the _base_ files).
_base_ = [
    '../_base_/dataset_v1.py',
    '../_base_/optimizer.py',
    '../_base_/schedule.py',
]

# Input crop size (H, W) and the ray grid rendered per image.
img_size = [370, 1216]
num_rays = [55, 190]

amp = False          # mixed-precision training disabled
max_epochs = 24      # overrides the base schedule's 12 epochs
warmup_iters = 1000
num_cams = 1         # KITTI provides a single (monocular) camera

# Override the base optimizer: larger LR and weight decay; the image backbone
# still trains at a 0.1x learning-rate multiplier.
optimizer = {
    'optimizer': {
        'type': 'AdamW',
        'lr': 1e-3,
        'weight_decay': 0.01,
        # eps=1e-4 was tried and left disabled
    },
    'paramwise_cfg': {
        'custom_keys': {'img_backbone': {'lr_mult': 0.1}},
    },
}

data_path = 'data/kitti/'

# `_delete_` tells the config system to discard the inherited dataset config
# entirely instead of merging into it.
train_dataset_config = {
    '_delete_': True,
    'type': 'Kitti_One_Frame',
    'split': 'train',
    'root': data_path,
    'preprocess_root': data_path + 'preprocess',
    'cur_prob': 0.5,        # presumably the sampling probability of the current frame — confirm in Kitti_One_Frame
    'crop_size': img_size,
    'strict': True,
    'prev_prob': 0.0,
    'choose_nearest': True,
}

val_dataset_config = {
    '_delete_': True,
    'type': 'Kitti_One_Frame',
    'split': 'val',
    'root': data_path,
    'preprocess_root': data_path + 'preprocess',
    'cur_prob': 1.0,        # validation always uses the current frame
    'crop_size': img_size,
    'strict': False,
    'prev_prob': 0.5,
    'choose_nearest': True,
    'return_depth': True,   # validation additionally returns depth for metrics
}
# Dataset wrappers: no resizing (scale_rate=1.0); images are padded to
# 384x1216 (pad_scale_rate presumably tracks the padded/original height
# ratio, 384/370 ~= 1.038 — confirm in the wrapper). Normalization uses
# ImageNet-style statistics on RGB input.
train_wrapper_config = {
    'type': 'tpvformer_dataset_nuscenes_temporal',
    'phase': 'train',
    'scale_rate': 1.0,
    'pad_img_size': [384, 1216],
    'pad_scale_rate': [1.038, 1.0],
    'photometric_aug': {'use_swap_channel': False},
    'img_norm_cfg': {
        'mean': [124.16, 116.74, 103.94],
        'std': [58.624, 57.344, 57.6],
        'to_rgb': True,
    },
}

val_wrapper_config = {
    'type': 'tpvformer_dataset_nuscenes_temporal',
    'phase': 'val',
    'scale_rate': 1.0,
    'pad_img_size': [384, 1216],
    'pad_scale_rate': [1.038, 1.0],
    'photometric_aug': {'use_swap_channel': False},
    'img_norm_cfg': {
        'mean': [124.16, 116.74, 103.94],
        'std': [58.624, 57.344, 57.6],
        'to_rgb': True,
    },
}

# Single-sample loaders with one worker each; only training shuffles.
train_loader = {
    'batch_size': 1,
    'shuffle': True,
    'num_workers': 1,
}

val_loader = {
    'batch_size': 1,
    'shuffle': False,
    'num_workers': 1,
}
# Loss configuration: MultiLoss combines several weighted terms. Each
# sub-loss's input_dict maps the loss's argument names (keys) to keys of the
# model output dict (values); see loss_input_convertion below.
loss = dict(
type='MultiLoss',
loss_cfgs=[
# Photometric reprojection loss against temporally adjacent frames,
# evaluated on the num_rays grid. Presumably warps prev/next images into the
# current view via rendered depth — confirm in ReprojLossMonoMultiNew.
dict(
type='ReprojLossMonoMultiNew',
weight=1.0,
no_ssim=False,  # SSIM term enabled
img_size=img_size,
ray_resize=num_rays,
input_dict={
'curr_imgs': 'curr_imgs',
'prev_imgs': 'prev_imgs',
'next_imgs': 'next_imgs',
'ray_indices': 'ray_indices',
'weights': 'weights',
'ts': 'ts',
'metas': 'metas',
'ms_rays': 'ms_rays',
# 'deltas': 'deltas'
}),
# RGB reconstruction loss between rendered colors and ground-truth pixels.
dict(
type='RGBLossMS',
weight=0.1,
img_size=img_size,
no_ssim=False,
ray_resize=num_rays,
input_dict={
'ms_colors': 'ms_colors',
'ms_rays': 'ms_rays',
'gt_imgs': 'color_imgs'}),
# Eikonal regularizer on SDF gradients (consumes eik_grad below).
dict(
type='EikonalLoss',
weight=0.1,),
# Regularizer on second-order SDF gradients (consumes second_grad below).
dict(
type='SecondGradLoss',
weight=0.01),
# dict(
# type='SparsityLoss',
# weight=0.001,
# scale=0.1,
# input_dict={
# 'density': 'uniform_sdf'}),
])
# Maps loss-input names (left) to keys in the model's output dict (right);
# the training loop uses this table to assemble the loss arguments.
loss_input_convertion = dict(
ms_depths='ms_depths',
ms_rays='ms_rays',
# ms_accs='ms_accs',
ms_colors='ms_colors',
ray_indices='ray_indices',
weights='weights',
ts='ts',
eik_grad='eik_grad',
second_grad='second_grad',
# uniform_sdf='uniform_sdf',
# deltas='deltas'
)
load_from = ''   # no checkpoint to resume/initialize from

# Transformer widths.
_dim_ = 96
_ffn_dim_ = _dim_ * 2
num_heads = 6

# Grid-to-metric mapping of the TPV planes: linear spacing, sizes in voxels
# and ranges in meters ("half" presumably means the axis is mirrored around
# zero — confirm in the mappings module).
mapping_args = {
    'nonlinear_mode': 'linear',
    'h_size': [256, 0],
    'h_range': [51.2, 0],
    'h_half': True,
    'w_size': [128, 0],
    'w_range': [25.6, 0],
    'w_half': False,
    'd_size': [32, 0],
    'd_range': [-2.0, 4.4, 4.4],
}

# TPV plane resolutions: one extra row/column on top of the grid sizes above.
tpv_h = 256 + 1       # = 257
tpv_w = 2 * 128 + 1   # = 257
tpv_z = 32 + 1        # = 33

# Scene bounds [x_min, y_min, z_min, x_max, y_max, z_max] in meters.
point_cloud_range = [-25.6, 0.0, -2.0, 25.6, 51.2, 4.4]

# Deformable-attention sampling points: per-plane counts for image
# cross-attention, and the count for cross-view self-attention.
num_points_cross = [48, 48, 8]
num_points_self = 12
# One TPVFormer encoder layer: cross-view hybrid attention between the three
# TPV planes (self-attn slot), deformable cross-attention into multi-level
# image features, then an FFN; a norm follows each sub-module.
self_cross_layer = {
    'type': 'TPVFormerLayer',
    'attn_cfgs': [
        {
            'type': 'CrossViewHybridAttention',
            'embed_dims': _dim_,
            'num_heads': num_heads,
            'num_levels': 3,           # one level per TPV plane
            'num_points': num_points_self,
            'dropout': 0.1,
            'batch_first': True,
        },
        {
            'type': 'TPVCrossAttention',
            'embed_dims': _dim_,
            'num_cams': num_cams,
            'dropout': 0.1,
            'batch_first': True,
            'num_heads': num_heads,
            'num_levels': 4,           # four FPN feature levels
            'num_points': num_points_cross,
        },
    ],
    'feedforward_channels': _ffn_dim_,
    'ffn_dropout': 0.1,
    'operation_order': ('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm'),
}
# Model: TPV-based segmentor — ResNet-50 backbone + FPN neck lift images to
# multi-scale features, a query lifter provides TPV plane queries, the
# TPVFormer encoder aggregates image features into the planes, and a
# NeuS-style head treats the representation as an SDF and volume-renders it.
model = dict(
type='TPVSegmentor',
img_backbone_out_indices=[0, 1, 2, 3],
# ResNet-50 backbone; all four stage outputs are exported, nothing frozen.
img_backbone=dict(
type='ResNet',
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=0,
norm_eval=False,
style='pytorch',
pretrained='./ckpts/resnet50-0676ba61.pth'),
# FPN neck projects every backbone level to the common width _dim_.
img_neck=dict(
type='FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=_dim_,
start_level=0,
add_extra_convs='on_output',
num_outs=4,
relu_before_extra_convs=True),
# Learnable query embeddings, one set per TPV plane.
lifter=dict(
type='TPVQueryLifter',
tpv_h=tpv_h,
tpv_w=tpv_w,
tpv_z=tpv_z,
dim=_dim_),
# 4-layer TPVFormer encoder; each layer is the self_cross_layer spec above.
encoder=dict(
type='TPVFormerEncoder',
# bev_inner=bev_inner,
# bev_outer=bev_outer,
# range_inner=range_inner,
# range_outer=range_outer,
# nonlinear_mode=nonlinear_mode,
# z_inner=z_inner,
# z_outer=z_outer,
# z_ranges=z_ranges,
mapping_args=mapping_args,
embed_dims=_dim_,
num_cams=num_cams,
num_feature_levels=4,
positional_encoding=dict(
type='TPVPositionalEncoding',
num_freqs=[12] * 3,
embed_dims=_dim_,
tot_range=point_cloud_range),
num_points_cross=num_points_cross,
num_points_self=[num_points_self] * 3,
transformerlayers=[
self_cross_layer,
self_cross_layer,
self_cross_layer,
self_cross_layer],
num_layers=4),
# NeuS-style SDF rendering head; uniform sampling only (no importance
# resampling: num_samples_importance=0, num_up_sample_steps=0).
head=dict(
type='NeuSHead',
roi_aabb=point_cloud_range,
resolution=0.4,
near_plane=0.0,
far_plane=1e10,
num_samples=256,
num_samples_importance=0,
num_up_sample_steps=0,
base_variance=4,
beta_init=0.1,
beta_max=0.195,
total_iters=3516*11,  # presumably iters/epoch * epochs for the beta schedule — confirm
beta_hand_tune=False,
use_numerical_gradients=False,
sample_gradient=True,
return_uniform_sdf=False,
return_second_grad=True,  # feeds SecondGradLoss via loss_input_convertion
# rays args
ray_sample_mode='cellular', # fixed, cellular
ray_number=num_rays, # 192 * 400
ray_img_size=img_size,
ray_upper_crop=0,
# img2lidar args
trans_kw='temImg2lidar',
novel_view=None,
# render args
render_bkgd='random',
# bev nerf
# bev_inner=bev_inner,
# bev_outer=bev_outer,
# range_inner=range_inner,
# range_outer=range_outer,
# nonlinear_mode=nonlinear_mode,
# z_inner=z_inner,
# z_outer=z_outer,
# z_ranges=z_ranges,
mapping_args=mapping_args,
# mlp decoder
embed_dims=_dim_,
color_dims=3,
density_layers=2,
sh_deg=0,
sh_act='relu',
two_split=False,
tpv=True))
================================================
FILE: config/kitti/kitti_occ.py
================================================
# SelfOcc self-supervised occupancy config for KITTI (mmcv-style config;
# these values are merged on top of the _base_ files).
_base_ = [
    '../_base_/dataset_v1.py',
    '../_base_/optimizer.py',
    '../_base_/schedule.py',
]
img_size = [352, 1216]   # input image size (H, W)
num_rays = [55, 190]     # rendered ray grid (rows, cols)
amp = False              # no mixed-precision training
max_epochs = 24
warmup_iters = 1000
num_cams = 1             # KITTI is monocular
optimizer = dict(
    optimizer=dict(
        type='AdamW',
        lr=1e-3,
        weight_decay=0.01,
        # eps=1e-4
    ),
    # Image backbone trains with a 10x smaller learning rate.
    paramwise_cfg=dict(
        custom_keys={
            'img_backbone': dict(lr_mult=0.1),}
    ),
)
data_path = 'data/kitti/'
# Temporal one-frame KITTI dataset; _delete_ replaces (rather than merges with)
# the _base_ dataset config.
train_dataset_config = dict(
    _delete_=True,
    type='Kitti_One_Frame',
    split = 'train',
    root = data_path,
    preprocess_root = data_path + 'preprocess',
    frames_interval=0.4,
    sequence_distance=[10, 40],
    cur_prob = 0.333,
    crop_size = img_size,
    strict = True,
    prev_prob = 0.2,
    choose_nearest = True,
)
val_dataset_config = dict(
    _delete_=True,
    type='Kitti_One_Frame',
    split = 'val',
    root = data_path,
    preprocess_root = data_path + 'preprocess',
    frames_interval=0.4,
    sequence_distance=[10, 40],
    cur_prob = 1.0,
    crop_size = img_size,
    strict = False,
    prev_prob = 0.2,
    choose_nearest = True,
    return_depth = True   # validation also returns depth for evaluation
)
train_wrapper_config = dict(
    type='tpvformer_dataset_nuscenes_temporal',
    phase='train',
    scale_rate=1.0,
    photometric_aug=dict(
        use_swap_channel=False,
    ),
    img_norm_cfg=dict(
        mean=[124.16, 116.74, 103.94],
        std=[58.624, 57.344, 57.6], to_rgb=True)
)
val_wrapper_config = dict(
    type='tpvformer_dataset_nuscenes_temporal',
    phase='val',
    scale_rate=1.0,
    photometric_aug=dict(
        use_swap_channel=False,
    ),
    img_norm_cfg=dict(
        mean=[124.16, 116.74, 103.94],
        std=[58.624, 57.344, 57.6], to_rgb=True)
)
train_loader = dict(
    batch_size = 1,
    shuffle = True,
    num_workers = 1,
)
val_loader = dict(
    batch_size = 1,
    shuffle = False,
    num_workers = 1,
)
# Self-supervision: photometric reprojection + rendered RGB + SDF regularizers
# (eikonal, second-order gradient, soft sparsity on the uniformly sampled SDF).
loss = dict(
    type='MultiLoss',
    loss_cfgs=[
        dict(
            type='ReprojLossMonoMultiNew',
            weight=1.0,
            no_ssim=False,
            img_size=img_size,
            ray_resize=num_rays,
            input_dict={
                'curr_imgs': 'curr_imgs',
                'prev_imgs': 'prev_imgs',
                'next_imgs': 'next_imgs',
                'ray_indices': 'ray_indices',
                'weights': 'weights',
                'ts': 'ts',
                'metas': 'metas',
                'ms_rays': 'ms_rays',
                # 'deltas': 'deltas'
            }),
        dict(
            type='RGBLossMS',
            weight=0.1,
            img_size=img_size,
            no_ssim=False,
            ray_resize=num_rays,
            input_dict={
                'ms_colors': 'ms_colors',
                'ms_rays': 'ms_rays',
                'gt_imgs': 'color_imgs'}),
        dict(
            type='EikonalLoss',
            weight=0.1,),
        dict(
            type='SecondGradLoss',
            weight=0.1),
        dict(
            type='SoftSparsityLoss',
            weight=0.005,
            input_dict={
                'density': 'uniform_sdf'})
        # dict(
        #     type='SparsityLoss',
        #     weight=0.001,
        #     scale=0.1,
        #     input_dict={
        #         'density': 'uniform_sdf'}),
    ])
# Maps model output keys to the loss-input keys (identity mapping here).
loss_input_convertion = dict(
    ms_depths='ms_depths',
    ms_rays='ms_rays',
    ms_accs='ms_accs',
    ms_colors='ms_colors',
    ray_indices='ray_indices',
    weights='weights',
    ts='ts',
    eik_grad='eik_grad',
    second_grad='second_grad',
    uniform_sdf='uniform_sdf',
    # deltas='deltas'
)
load_from = ''
_dim_ = 96               # shared embedding width
_ffn_dim_ = 2 * _dim_
num_heads = 6
# TPV grid <-> metric-space mapping; [inner, outer] size/range pairs whose exact
# semantics are defined by the encoder's mapping module -- confirm there.
mapping_args = dict(
    nonlinear_mode='linear',
    h_size=[256, 0],
    h_range=[51.2, 0],
    h_half=True,
    w_size=[128, 0],
    w_range=[25.6, 0],
    w_half=False,
    d_size=[32, 0],
    d_range=[-2.0, 4.4, 4.4]
)
# Legacy non-uniform grid parameters, superseded by mapping_args.
# bev_inner = 160
# bev_outer = 1
# range_inner = 80.0
# range_outer = 1.0
# nonlinear_mode = 'linear_upscale'
# z_inner = 20
# z_outer = 10
# z_ranges = [-4.0, 4.0, 12.0]
# TPV plane resolutions (presumably mapping_args sizes plus one boundary vertex).
tpv_h = 1 + 256
tpv_w = 1 + 2 * 128
tpv_z = 1 + 32 + 0
# Scene bounds in meters; used as positional-encoding tot_range and head roi_aabb.
point_cloud_range = [-25.6, 0.0, -2.0, 25.6, 51.2, 4.4]
num_points_cross = [48, 48, 8]
num_points_self = 12
# One encoder layer: cross-view self-attention over the three TPV planes,
# then deformable cross-attention into 4 levels of image features.
self_cross_layer = dict(
    type='TPVFormerLayer',
    attn_cfgs=[
        dict(
            type='CrossViewHybridAttention',
            embed_dims=_dim_,
            num_heads=num_heads,
            num_levels=3,
            num_points=num_points_self,
            dropout=0.1,
            batch_first=True),
        dict(
            type='TPVCrossAttention',
            embed_dims=_dim_,
            num_cams=num_cams,
            dropout=0.1,
            batch_first=True,
            num_heads=num_heads,
            num_levels=4,
            num_points=num_points_cross)
    ],
    feedforward_channels=_ffn_dim_,
    ffn_dropout=0.1,
    operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm')
)
model = dict(
    type='TPVSegmentor',
    img_backbone_out_indices=[0, 1, 2, 3],
    # ResNet-50 image backbone, initialized from a local torchvision checkpoint.
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=0,
        norm_eval=False,
        style='pytorch',
        pretrained='./ckpts/resnet50-0676ba61.pth'),
    img_neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=4,
        relu_before_extra_convs=True),
    # Learnable query embeddings for the three TPV planes.
    lifter=dict(
        type='TPVQueryLifter',
        tpv_h=tpv_h,
        tpv_w=tpv_w,
        tpv_z=tpv_z,
        dim=_dim_),
    encoder=dict(
        type='TPVFormerEncoder',
        # bev_inner=bev_inner,
        # bev_outer=bev_outer,
        # range_inner=range_inner,
        # range_outer=range_outer,
        # nonlinear_mode=nonlinear_mode,
        # z_inner=z_inner,
        # z_outer=z_outer,
        # z_ranges=z_ranges,
        mapping_args=mapping_args,
        embed_dims=_dim_,
        num_cams=num_cams,
        num_feature_levels=4,
        positional_encoding=dict(
            type='TPVPositionalEncoding',
            num_freqs=[12] * 3,
            embed_dims=_dim_,
            tot_range=point_cloud_range),
        num_points_cross=num_points_cross,
        num_points_self=[num_points_self] * 3,
        transformerlayers=[
            self_cross_layer,
            self_cross_layer,
            self_cross_layer,
            self_cross_layer],
        num_layers=4),
    # NeuS-style SDF rendering head; this variant also returns the uniformly
    # sampled SDF (for SoftSparsityLoss) and second-order gradients.
    head=dict(
        type='NeuSHead',
        roi_aabb=point_cloud_range,
        resolution=0.4,
        near_plane=0.0,
        far_plane=1e10,
        num_samples=256,
        num_samples_importance=0,
        num_up_sample_steps=0,
        base_variance=4,
        beta_init=0.1,
        beta_max=0.195,
        # NOTE(review): 3516 matches the nuScenes per-epoch iteration count used
        # in the nuScenes schedules -- confirm this beta schedule is intended here.
        total_iters=3516*11,
        beta_hand_tune=False,
        use_numerical_gradients=False,
        sample_gradient=True,
        use_compact_2nd_grad=True,
        return_uniform_sdf=True,
        return_second_grad=True,
        # rays args
        ray_sample_mode='cellular', # fixed, cellular
        ray_number=num_rays, # 192 * 400
        ray_img_size=img_size,
        ray_upper_crop=0,
        # img2lidar args
        trans_kw='temImg2lidar',
        novel_view=None,
        # render args
        render_bkgd='random',
        # bev nerf
        # bev_inner=bev_inner,
        # bev_outer=bev_outer,
        # range_inner=range_inner,
        # range_outer=range_outer,
        # nonlinear_mode=nonlinear_mode,
        # z_inner=z_inner,
        # z_outer=z_outer,
        # z_ranges=z_ranges,
        mapping_args=mapping_args,
        # mlp decoder
        embed_dims=_dim_,
        color_dims=3,
        density_layers=2,
        sh_deg=0,
        sh_act='relu',
        two_split=False,
        tpv=True))
================================================
FILE: config/kitti_raw/kitti_raw_depth.py
================================================
# SelfOcc self-supervised depth config for KITTI raw (eigen_zhou split, stereo
# pairs included); mmcv-style config merged on top of the _base_ files.
_base_ = [
    '../_base_/dataset_v1.py',
    '../_base_/optimizer.py',
    '../_base_/schedule.py',
]
img_size = [370, 1216]   # input image size (H, W)
num_rays = [44, 152]     # rendered ray grid (rows, cols)
amp = False
max_epochs = 24
warmup_iters = 1000
num_cams = 1             # monocular input
multisteplr = True
# Step LR decay at iteration 4976 * 18 (t_in_epochs=False => counted in iterations).
multisteplr_config = dict(
    decay_t = [4976 * 18],
    decay_rate = 0.1,
    warmup_t = warmup_iters,
    warmup_lr_init = 1e-6,
    t_in_epochs = False
)
optimizer = dict(
    optimizer=dict(
        type='AdamW',
        lr=1e-4,
        weight_decay=0.01,
        # eps=1e-4
    ),
    # Image backbone trains with a 10x smaller learning rate.
    paramwise_cfg=dict(
        custom_keys={
            'img_backbone': dict(lr_mult=0.1),}
    ),
)
data_path = 'data/kitti_raw/'
# KITTI raw stereo dataset with ORB-SLAM poses; _delete_ replaces the _base_ dataset.
train_dataset_config = dict(
    _delete_=True,
    type = 'Kitti_Raw_Stereo',
    root = data_path,
    pose_path = 'dataset/kitti_raw/orb-slam_poses',
    split_path = 'dataset/kitti_raw/splits/eigen_zhou/train_files.txt',
    frames_interval=1.0,
    sequence_distance=5.0,
    cur_prob=1.0,
    crop_size=img_size,
    strict=True,
    return_depth=False,
    prev_prob=0,
    choose_nearest = True,
    eval_depth = 80,
    include_stereo = True
)
val_dataset_config = dict(
    _delete_=True,
    type = 'Kitti_Raw_Stereo',
    root = data_path,
    pose_path = 'dataset/kitti_raw/orb-slam_poses',
    split_path = 'dataset/kitti_raw/splits/eigen_zhou/test_files.txt',
    frames_interval=1.0,
    sequence_distance=5.0,
    cur_prob=1.0,
    crop_size=img_size,
    strict=False,
    return_depth=True,   # evaluation needs ground-truth depth
    prev_prob=0.5,
    choose_nearest = True,
    eval_depth = 80,     # depth evaluation capped at 80 m
    include_stereo = True
)
# Images scaled by 0.84 then padded to [320, 1024] (pad_scale_rate gives the
# per-axis effective scale after padding).
train_wrapper_config = dict(
    type='tpvformer_dataset_nuscenes_temporal',
    phase='train',
    scale_rate=0.84,
    pad_img_size=[320, 1024],
    pad_scale_rate=[0.8649, 0.8421],
    photometric_aug=dict(
        use_swap_channel=False,
    ),
    img_norm_cfg=dict(
        mean=[124.16, 116.74, 103.94],
        std=[58.624, 57.344, 57.6], to_rgb=True)
)
val_wrapper_config = dict(
    type='tpvformer_dataset_nuscenes_temporal',
    phase='val',
    scale_rate=0.84,
    pad_img_size=[320, 1024],
    pad_scale_rate=[0.8649, 0.8421],
    photometric_aug=dict(
        use_swap_channel=False,
    ),
    img_norm_cfg=dict(
        mean=[124.16, 116.74, 103.94],
        std=[58.624, 57.344, 57.6], to_rgb=True)
)
train_loader = dict(
    batch_size = 1,
    shuffle = True,
    num_workers = 1,
)
val_loader = dict(
    batch_size = 1,
    shuffle = False,
    num_workers = 1,
)
# Depth-only variant: photometric reprojection + eikonal + 3D edge smoothness
# (RGB / second-grad / sparsity terms disabled below).
loss = dict(
    type='MultiLoss',
    loss_cfgs=[
        dict(
            type='ReprojLossMonoMultiNew',
            weight=1.0,
            no_ssim=False,
            img_size=img_size,
            ray_resize=num_rays,
            input_dict={
                'curr_imgs': 'curr_imgs',
                'prev_imgs': 'prev_imgs',
                'next_imgs': 'next_imgs',
                'ray_indices': 'ray_indices',
                'weights': 'weights',
                'ts': 'ts',
                'metas': 'metas',
                'ms_rays': 'ms_rays',
                # 'deltas': 'deltas'
            }),
        # dict(
        #     type='RGBLossMS',
        #     weight=0.1,
        #     img_size=img_size,
        #     no_ssim=False,
        #     ray_resize=num_rays,
        #     input_dict={
        #         'ms_colors': 'ms_colors',
        #         'ms_rays': 'ms_rays',
        #         'gt_imgs': 'color_imgs'}),
        dict(
            type='EikonalLoss',
            weight=0.1,),
        dict(
            type='EdgeLoss3DMS',
            weight=0.01,
            img_size=img_size,
            ray_resize=num_rays,
        )
        # dict(
        #     type='SecondGradLoss',
        #     weight=0.01),
        # dict(
        #     type='SparsityLoss',
        #     weight=0.001,
        #     scale=0.1,
        #     input_dict={
        #         'density': 'uniform_sdf'}),
    ])
# Maps model output keys to the loss-input keys (identity mapping here);
# commented entries correspond to the disabled losses above.
loss_input_convertion = dict(
    ms_depths='ms_depths',
    ms_rays='ms_rays',
    # ms_accs='ms_accs',
    # ms_colors='ms_colors',
    ray_indices='ray_indices',
    weights='weights',
    ts='ts',
    eik_grad='eik_grad',
    # second_grad='second_grad',
    # uniform_sdf='uniform_sdf',
    # deltas='deltas'
)
load_from = ''
_dim_ = 96               # shared embedding width
_ffn_dim_ = 2 * _dim_
num_heads = 6
# TPV grid <-> metric-space mapping; [inner, outer] size/range pairs whose exact
# semantics are defined by the encoder's mapping module -- confirm there.
mapping_args = dict(
    nonlinear_mode='linear',
    h_size=[256, 0],
    h_range=[51.2, 0],
    h_half=True,
    w_size=[128, 0],
    w_range=[25.6, 0],
    w_half=False,
    d_size=[32, 0],
    d_range=[-2.0, 4.4, 4.4]
)
# Legacy non-uniform grid parameters, superseded by mapping_args.
# bev_inner = 160
# bev_outer = 1
# range_inner = 80.0
# range_outer = 1.0
# nonlinear_mode = 'linear_upscale'
# z_inner = 20
# z_outer = 10
# z_ranges = [-4.0, 4.0, 12.0]
# TPV plane resolutions (presumably mapping_args sizes plus one boundary vertex).
tpv_h = 1 + 256
tpv_w = 1 + 2 * 128
tpv_z = 1 + 32 + 0
# Scene bounds in meters; used as positional-encoding tot_range and head roi_aabb.
point_cloud_range = [-25.6, 0.0, -2.0, 25.6, 51.2, 4.4]
num_points_cross = [48, 48, 8]
num_points_self = 12
# One encoder layer: cross-view self-attention over the three TPV planes,
# then deformable cross-attention into 4 levels of image features.
self_cross_layer = dict(
    type='TPVFormerLayer',
    attn_cfgs=[
        dict(
            type='CrossViewHybridAttention',
            embed_dims=_dim_,
            num_heads=num_heads,
            num_levels=3,
            num_points=num_points_self,
            dropout=0.1,
            batch_first=True),
        dict(
            type='TPVCrossAttention',
            embed_dims=_dim_,
            num_cams=num_cams,
            dropout=0.1,
            batch_first=True,
            num_heads=num_heads,
            num_levels=4,
            num_points=num_points_cross)
    ],
    feedforward_channels=_ffn_dim_,
    ffn_dropout=0.1,
    operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm')
)
model = dict(
    type='TPVSegmentor',
    img_backbone_out_indices=[0, 1, 2, 3],
    # ResNet-50 image backbone, initialized from a local torchvision checkpoint.
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=0,
        norm_eval=False,
        style='pytorch',
        pretrained='./ckpts/resnet50-0676ba61.pth'),
    img_neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=4,
        relu_before_extra_convs=True),
    # Learnable query embeddings for the three TPV planes.
    lifter=dict(
        type='TPVQueryLifter',
        tpv_h=tpv_h,
        tpv_w=tpv_w,
        tpv_z=tpv_z,
        dim=_dim_),
    encoder=dict(
        type='TPVFormerEncoder',
        # bev_inner=bev_inner,
        # bev_outer=bev_outer,
        # range_inner=range_inner,
        # range_outer=range_outer,
        # nonlinear_mode=nonlinear_mode,
        # z_inner=z_inner,
        # z_outer=z_outer,
        # z_ranges=z_ranges,
        mapping_args=mapping_args,
        embed_dims=_dim_,
        num_cams=num_cams,
        num_feature_levels=4,
        positional_encoding=dict(
            type='TPVPositionalEncoding',
            num_freqs=[12] * 3,
            embed_dims=_dim_,
            tot_range=point_cloud_range),
        num_points_cross=num_points_cross,
        num_points_self=[num_points_self] * 3,
        transformerlayers=[
            self_cross_layer,
            self_cross_layer,
            self_cross_layer,
            self_cross_layer],
        num_layers=4),
    # NeuS-style SDF rendering head; depth-only (no uniform-SDF / second-grad
    # outputs, no color branch: color_dims=0).
    head=dict(
        type='NeuSHead',
        roi_aabb=point_cloud_range,
        resolution=0.4,
        near_plane=0.0,
        far_plane=1e10,
        num_samples=256,
        num_samples_importance=0,
        num_up_sample_steps=0,
        base_variance=4,
        beta_init=0.1,
        beta_max=0.195,
        # NOTE(review): 3516 matches the nuScenes per-epoch iteration count while
        # this file's LR schedule uses 4976 iters/epoch -- confirm intended.
        total_iters=3516*11,
        beta_hand_tune=False,
        use_numerical_gradients=False,
        sample_gradient=True,
        return_uniform_sdf=False,
        return_second_grad=False,
        # rays args
        ray_sample_mode='cellular', # fixed, cellular
        ray_number=num_rays, # 192 * 400
        ray_img_size=img_size,
        ray_upper_crop=0,
        # img2lidar args
        trans_kw='temImg2lidar',
        novel_view=None,
        # render args
        render_bkgd='random',
        # bev nerf
        # bev_inner=bev_inner,
        # bev_outer=bev_outer,
        # range_inner=range_inner,
        # range_outer=range_outer,
        # nonlinear_mode=nonlinear_mode,
        # z_inner=z_inner,
        # z_outer=z_outer,
        # z_ranges=z_ranges,
        mapping_args=mapping_args,
        # mlp decoder
        embed_dims=_dim_,
        color_dims=0,
        density_layers=2,
        sh_deg=0,
        sh_act='relu',
        two_split=False,
        tpv=True))
================================================
FILE: config/nuscenes/nuscenes_depth.py
================================================
# SelfOcc self-supervised depth config for nuScenes (6 surround cameras,
# sweep-based temporal supervision); mmcv-style config merged over _base_.
_base_ = [
    '../_base_/dataset_v1.py',
    '../_base_/optimizer.py',
    '../_base_/schedule.py',
]
# img_size = [768, 1600]
img_size = [896, 1600]   # network input crop (H, W)
crop_size = [900, 1600]  # full image crop used for ray/loss computation
num_rays = [48, 100]     # rendered ray grid (rows, cols)
amp = False
max_epochs = 24
warmup_iters = 1000
multisteplr = True
# Step LR decay at iteration 3516 * 18 (t_in_epochs=False => counted in iterations).
multisteplr_config = dict(
    decay_t = [3516 * 18],
    decay_rate = 0.1,
    warmup_t = warmup_iters,
    warmup_lr_init = 1e-6,
    t_in_epochs = False
)
optimizer = dict(
    optimizer=dict(
        type='AdamW',
        lr=1e-4,
        weight_decay=0.01,
        # eps=1e-4
    ),
    # Image backbone trains with a 10x smaller learning rate.
    paramwise_cfg=dict(
        custom_keys={
            'img_backbone': dict(lr_mult=0.1),}
    ),
)
data_path = 'data/nuscenes/'
# Sweep-distance-sampled one-frame dataset; _delete_ replaces the _base_ dataset.
train_dataset_config = dict(
    _delete_=True,
    type='nuScenes_One_Frame_Sweeps_Dist',
    data_path = data_path,
    imageset = 'data/nuscenes_infos_train_sweeps.pkl',
    crop_size = crop_size,
    input_img_crop_size = img_size,
    min_dist = 0.4,
    max_dist = 6.0,
    strict = True,
    return_depth = False,
    eval_depth = 80,
    cur_prob = 1.0,
    prev_prob = 0.5,
    choose_nearest = True,
    ref_sensor = 'CAM_FRONT',
    composite_prev_next=True,
    sensor_mus=[0.5, 0.5],
    sensor_sigma=0.5,
)
val_dataset_config = dict(
    _delete_=True,
    type='nuScenes_One_Frame_Sweeps_Dist',
    data_path = data_path,
    imageset = 'data/nuscenes_infos_val_sweeps.pkl',
    crop_size = crop_size,
    input_img_crop_size = img_size,
    min_dist = 0.4,
    max_dist = 6.0,
    strict = False,
    return_depth = True,   # evaluation needs ground-truth depth
    eval_depth = 80,       # depth evaluation capped at 80 m
    cur_prob = 1,
    prev_prob = 0.5,
    choose_nearest = True,
    ref_sensor = 'CAM_FRONT',
    composite_prev_next=True,
    sensor_mus=[0.5, 0.5],
    sensor_sigma=0.5,
)
train_wrapper_config = dict(
    type='tpvformer_dataset_nuscenes_temporal',
    phase='train',
    scale_rate=0.5,
    photometric_aug=dict(
        use_swap_channel=False,
    ),
    # pad_img_size=[416, 864],
    # random_scale=[0.92, 1.08],
    # pad_scale_rate=[0.5417, 0.54],
    # use_flip=True
)
val_wrapper_config = dict(
    type='tpvformer_dataset_nuscenes_temporal',
    phase='val',
    scale_rate=0.5,
    photometric_aug=dict(
        use_swap_channel=False,
    ),
    # pad_img_size=[416, 864],
    # pad_scale_rate=[0.5417, 0.54],
)
train_loader = dict(
    batch_size = 1,
    shuffle = True,
    num_workers = 1,
)
val_loader = dict(
    batch_size = 1,
    shuffle = False,
    num_workers = 1,
)
# Depth-only variant: combined photometric reprojection + eikonal + 3D edge
# smoothness; note reprojection operates at crop_size, not the network img_size.
loss = dict(
    type='MultiLoss',
    loss_cfgs=[
        dict(
            type='ReprojLossMonoMultiNewCombine',
            weight=1.0,
            no_ssim=False,
            img_size=crop_size,
            ray_resize=num_rays,
            no_automask=False,
            input_dict={
                'curr_imgs': 'curr_imgs',
                'prev_imgs': 'prev_imgs',
                'next_imgs': 'next_imgs',
                'ray_indices': 'ray_indices',
                'weights': 'weights',
                'ts': 'ts',
                'metas': 'metas',
                'ms_rays': 'ms_rays',
                # 'deltas': 'deltas'
            }),
        # dict(
        #     type='RGBLossMS',
        #     weight=0.1,
        #     img_size=img_size,
        #     no_ssim=False,
        #     ray_resize=num_rays,
        #     input_dict={
        #         'ms_colors': 'ms_colors',
        #         'ms_rays': 'ms_rays',
        #         'gt_imgs': 'color_imgs'}),
        dict(
            type='EikonalLoss',
            weight=0.1,),
        dict(
            type='EdgeLoss3DMS',
            weight=0.01,
            img_size=crop_size,
            ray_resize=num_rays,
        ),
        # dict(
        #     type='SecondGradLoss',
        #     weight=0.01),
        # dict(
        #     type='SparsityLoss',
        #     weight=0.001,
        #     scale=0.1,
        #     input_dict={
        #         'density': 'uniform_sdf'}),
    ])
# Maps model output keys to the loss-input keys (identity mapping here);
# commented entries correspond to the disabled losses above.
loss_input_convertion = dict(
    ms_depths='ms_depths',
    ms_rays='ms_rays',
    # ms_accs='ms_accs',
    # ms_colors='ms_colors',
    ray_indices='ray_indices',
    weights='weights',
    ts='ts',
    eik_grad='eik_grad',
    # second_grad='second_grad',
    # uniform_sdf='uniform_sdf',
    # deltas='deltas'
)
load_from = ''
_dim_ = 96               # shared embedding width
_ffn_dim_ = 2 * _dim_
num_heads = 6
# TPV grid <-> metric-space mapping; [inner, outer] size/range pairs whose exact
# semantics are defined by the encoder's mapping module -- confirm there.
mapping_args = dict(
    nonlinear_mode='linear',
    h_size=[128, 0],
    h_range=[51.2, 0],
    h_half=False,
    w_size=[128, 0],
    w_range=[51.2, 0],
    w_half=False,
    d_size=[30, 0],
    d_range=[-4.0, 5.0, 5.0]
)
# Legacy non-uniform grid parameters, superseded by mapping_args.
# bev_inner = 160
# bev_outer = 1
# range_inner = 80.0
# range_outer = 1.0
# nonlinear_mode = 'linear_upscale'
# z_inner = 20
# z_outer = 10
# z_ranges = [-4.0, 4.0, 12.0]
# TPV plane resolutions (presumably mapping_args sizes plus one boundary vertex).
tpv_h = 1 + 2 * 128
tpv_w = 1 + 2 * 128
tpv_z = 1 + 30
# Scene bounds in meters; used as positional-encoding tot_range and head roi_aabb.
point_cloud_range = [-51.2, -51.2, -4.0, 51.2, 51.2, 5.0]
num_points_cross = [48, 48, 8]
num_points_self = 12
# One encoder layer: cross-view self-attention over the three TPV planes,
# then deformable cross-attention into 4 levels of features from 6 cameras.
self_cross_layer = dict(
    type='TPVFormerLayer',
    attn_cfgs=[
        dict(
            type='CrossViewHybridAttention',
            embed_dims=_dim_,
            num_heads=num_heads,
            num_levels=3,
            num_points=num_points_self,
            dropout=0.1,
            batch_first=True),
        dict(
            type='TPVCrossAttention',
            embed_dims=_dim_,
            num_cams=6,
            dropout=0.1,
            batch_first=True,
            num_heads=num_heads,
            num_levels=4,
            num_points=num_points_cross)
    ],
    feedforward_channels=_ffn_dim_,
    ffn_dropout=0.1,
    operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm')
)
model = dict(
    type='TPVSegmentor',
    img_backbone_out_indices=[0, 1, 2, 3],
    # ResNet-50 image backbone, initialized from a local torchvision checkpoint.
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=0,
        norm_eval=False,
        style='pytorch',
        pretrained='./ckpts/resnet50-0676ba61.pth'),
    img_neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=4,
        relu_before_extra_convs=True),
    # Learnable query embeddings for the three TPV planes.
    lifter=dict(
        type='TPVQueryLifter',
        tpv_h=tpv_h,
        tpv_w=tpv_w,
        tpv_z=tpv_z,
        dim=_dim_),
    encoder=dict(
        type='TPVFormerEncoder',
        # bev_inner=bev_inner,
        # bev_outer=bev_outer,
        # range_inner=range_inner,
        # range_outer=range_outer,
        # nonlinear_mode=nonlinear_mode,
        # z_inner=z_inner,
        # z_outer=z_outer,
        # z_ranges=z_ranges,
        mapping_args=mapping_args,
        # camera_aware=True,
        # camera_aware_mid_channels=96,
        embed_dims=_dim_,
        num_cams=6,
        num_feature_levels=4,
        positional_encoding=dict(
            type='TPVPositionalEncoding',
            num_freqs=[12] * 3,
            embed_dims=_dim_,
            tot_range=point_cloud_range),
        num_points_cross=num_points_cross,
        num_points_self=[num_points_self] * 3,
        transformerlayers=[
            self_cross_layer,
            self_cross_layer,
            self_cross_layer,
            self_cross_layer],
        num_layers=4),
    # NeuS-style SDF rendering head; depth-only (no color branch: color_dims=0).
    # Rays are generated at crop_size rather than the network input size.
    head=dict(
        type='NeuSHead',
        roi_aabb=point_cloud_range,
        resolution=0.4,
        near_plane=0.0,
        far_plane=1e10,
        num_samples=256,
        num_samples_importance=0,
        num_up_sample_steps=0,
        base_variance=4,
        beta_init=0.1,
        beta_max=0.195,
        total_iters=3516*11,   # beta annealing horizon in iterations (3516 per epoch)
        beta_hand_tune=False,
        use_numerical_gradients=False,
        sample_gradient=True,
        return_uniform_sdf=False,
        return_second_grad=False,
        # rays args
        ray_sample_mode='cellular', # fixed, cellular
        ray_number=num_rays, # 192 * 400
        ray_img_size=crop_size,
        ray_upper_crop=0,
        # img2lidar args
        trans_kw='temImg2lidar',
        novel_view=None,
        # render args
        render_bkgd='random',
        # bev nerf
        # bev_inner=bev_inner,
        # bev_outer=bev_outer,
        # range_inner=range_inner,
        # range_outer=range_outer,
        # nonlinear_mode=nonlinear_mode,
        # z_inner=z_inner,
        # z_outer=z_outer,
        # z_ranges=z_ranges,
        mapping_args=mapping_args,
        # mlp decoder
        embed_dims=_dim_,
        color_dims=0,
        density_layers=2,
        sh_deg=0,
        sh_act='relu',
        two_split=False,
        tpv=True))
================================================
FILE: config/nuscenes/nuscenes_novel_depth.py
================================================
# SelfOcc novel-view depth synthesis config for nuScenes (6 surround cameras);
# mmcv-style config merged on top of the _base_ files. Differs from the plain
# depth config mainly in cur_prob=0.333 (rendering from non-current frames),
# a larger max_dist, and enabled RGB / second-gradient losses.
_base_ = [
    '../_base_/dataset_v1.py',
    '../_base_/optimizer.py',
    '../_base_/schedule.py',
]
img_size = [768, 1600]   # input image size (H, W)
num_rays = [48, 100]     # rendered ray grid (rows, cols)
amp = False
max_epochs = 24
warmup_iters = 1000
multisteplr = True
# Step LR decay at iteration 3516 * 18 (t_in_epochs=False => counted in iterations).
multisteplr_config = dict(
    decay_t = [3516 * 18],
    decay_rate = 0.1,
    warmup_t = warmup_iters,
    warmup_lr_init = 1e-6,
    t_in_epochs = False
)
optimizer = dict(
    optimizer=dict(
        type='AdamW',
        lr=1e-4,
        weight_decay=0.01,
        # eps=1e-4
    ),
    # Image backbone trains with a 10x smaller learning rate.
    paramwise_cfg=dict(
        custom_keys={
            'img_backbone': dict(lr_mult=0.1),}
    ),
)
data_path = 'data/nuscenes/'
# Sweep-distance-sampled one-frame dataset; _delete_ replaces the _base_ dataset.
train_dataset_config = dict(
    _delete_=True,
    type='nuScenes_One_Frame_Sweeps_Dist',
    data_path = data_path,
    imageset = 'data/nuscenes_infos_train_sweeps.pkl',
    crop_size = img_size,
    min_dist = 0.4,
    max_dist = 10.0,
    strict = True,
    return_depth = False,
    eval_depth = 80,
    cur_prob = 0.333,   # only 1/3 of samples supervise from the current frame
    prev_prob = 0.5,
    choose_nearest = True,
    ref_sensor = 'CAM_FRONT',
    composite_prev_next=True,
    sensor_mus=[0.5, 0.5],
    sensor_sigma=0.5,
)
val_dataset_config = dict(
    _delete_=True,
    type='nuScenes_One_Frame_Sweeps_Dist',
    data_path = data_path,
    imageset = 'data/nuscenes_infos_val_sweeps.pkl',
    crop_size = img_size,
    min_dist = 0.4,
    max_dist = 10.0,
    strict = False,
    return_depth = True,   # evaluation needs ground-truth depth
    eval_depth = 80,
    cur_prob = 1,
    prev_prob = 0.5,
    choose_nearest = True,
    ref_sensor = 'CAM_FRONT',
    composite_prev_next=True,
    sensor_mus=[0.5, 0.5],
    sensor_sigma=0.5,
)
train_wrapper_config = dict(
    type='tpvformer_dataset_nuscenes_temporal',
    phase='train',
    scale_rate=0.5,
    photometric_aug=dict(
        use_swap_channel=False,
    )
)
val_wrapper_config = dict(
    type='tpvformer_dataset_nuscenes_temporal',
    phase='val',
    scale_rate=0.5,
    photometric_aug=dict(
        use_swap_channel=False,
    )
)
train_loader = dict(
    batch_size = 1,
    shuffle = True,
    num_workers = 1,
)
val_loader = dict(
    batch_size = 1,
    shuffle = False,
    num_workers = 1,
)
# Novel-view variant: combined photometric reprojection + rendered RGB +
# eikonal + second-order SDF gradient regularization.
loss = dict(
    type='MultiLoss',
    loss_cfgs=[
        dict(
            type='ReprojLossMonoMultiNewCombine',
            weight=1.0,
            no_ssim=False,
            img_size=img_size,
            ray_resize=num_rays,
            input_dict={
                'curr_imgs': 'curr_imgs',
                'prev_imgs': 'prev_imgs',
                'next_imgs': 'next_imgs',
                'ray_indices': 'ray_indices',
                'weights': 'weights',
                'ts': 'ts',
                'metas': 'metas',
                'ms_rays': 'ms_rays',
                # 'deltas': 'deltas'
            }),
        dict(
            type='RGBLossMS',
            weight=0.1,
            img_size=img_size,
            no_ssim=False,
            ray_resize=num_rays,
            input_dict={
                'ms_colors': 'ms_colors',
                'ms_rays': 'ms_rays',
                'gt_imgs': 'color_imgs'}),
        dict(
            type='EikonalLoss',
            weight=0.1,),
        dict(
            type='SecondGradLoss',
            weight=0.01),
        # dict(
        #     type='SparsityLoss',
        #     weight=0.001,
        #     scale=0.1,
        #     input_dict={
        #         'density': 'uniform_sdf'}),
    ])
# Maps model output keys to the loss-input keys (identity mapping here).
loss_input_convertion = dict(
    ms_depths='ms_depths',
    ms_rays='ms_rays',
    # ms_accs='ms_accs',
    ms_colors='ms_colors',
    ray_indices='ray_indices',
    weights='weights',
    ts='ts',
    eik_grad='eik_grad',
    second_grad='second_grad',
    # uniform_sdf='uniform_sdf',
    # deltas='deltas'
)
load_from = ''
_dim_ = 96               # shared embedding width
_ffn_dim_ = 2 * _dim_
num_heads = 6
# TPV grid <-> metric-space mapping; [inner, outer] size/range pairs whose exact
# semantics are defined by the encoder's mapping module -- confirm there.
mapping_args = dict(
    nonlinear_mode='linear',
    h_size=[128, 0],
    h_range=[51.2, 0],
    h_half=False,
    w_size=[128, 0],
    w_range=[51.2, 0],
    w_half=False,
    d_size=[30, 0],
    d_range=[-4.0, 5.0, 5.0]
)
# Legacy non-uniform grid parameters, superseded by mapping_args.
# bev_inner = 160
# bev_outer = 1
# range_inner = 80.0
# range_outer = 1.0
# nonlinear_mode = 'linear_upscale'
# z_inner = 20
# z_outer = 10
# z_ranges = [-4.0, 4.0, 12.0]
# TPV plane resolutions (presumably mapping_args sizes plus one boundary vertex).
tpv_h = 1 + 2 * 128
tpv_w = 1 + 2 * 128
tpv_z = 1 + 30
# Scene bounds in meters; used as positional-encoding tot_range and head roi_aabb.
point_cloud_range = [-51.2, -51.2, -4.0, 51.2, 51.2, 5.0]
num_points_cross = [48, 48, 8]
num_points_self = 12
# One encoder layer: cross-view self-attention over the three TPV planes,
# then deformable cross-attention into 4 levels of features from 6 cameras.
self_cross_layer = dict(
    type='TPVFormerLayer',
    attn_cfgs=[
        dict(
            type='CrossViewHybridAttention',
            embed_dims=_dim_,
            num_heads=num_heads,
            num_levels=3,
            num_points=num_points_self,
            dropout=0.1,
            batch_first=True),
        dict(
            type='TPVCrossAttention',
            embed_dims=_dim_,
            num_cams=6,
            dropout=0.1,
            batch_first=True,
            num_heads=num_heads,
            num_levels=4,
            num_points=num_points_cross)
    ],
    feedforward_channels=_ffn_dim_,
    ffn_dropout=0.1,
    operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm')
)
model = dict(
    type='TPVSegmentor',
    img_backbone_out_indices=[0, 1, 2, 3],
    # ResNet-50 image backbone, initialized from a local torchvision checkpoint.
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=0,
        norm_eval=False,
        style='pytorch',
        pretrained='./ckpts/resnet50-0676ba61.pth'),
    img_neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=4,
        relu_before_extra_convs=True),
    # Learnable query embeddings for the three TPV planes.
    lifter=dict(
        type='TPVQueryLifter',
        tpv_h=tpv_h,
        tpv_w=tpv_w,
        tpv_z=tpv_z,
        dim=_dim_),
    encoder=dict(
        type='TPVFormerEncoder',
        # bev_inner=bev_inner,
        # bev_outer=bev_outer,
        # range_inner=range_inner,
        # range_outer=range_outer,
        # nonlinear_mode=nonlinear_mode,
        # z_inner=z_inner,
        # z_outer=z_outer,
        # z_ranges=z_ranges,
        mapping_args=mapping_args,
        embed_dims=_dim_,
        num_cams=6,
        num_feature_levels=4,
        positional_encoding=dict(
            type='TPVPositionalEncoding',
            num_freqs=[12] * 3,
            embed_dims=_dim_,
            tot_range=point_cloud_range),
        num_points_cross=num_points_cross,
        num_points_self=[num_points_self] * 3,
        transformerlayers=[
            self_cross_layer,
            self_cross_layer,
            self_cross_layer,
            self_cross_layer],
        num_layers=4),
    # NeuS-style SDF rendering head with an RGB branch (color_dims=3) and
    # second-order gradient output for SecondGradLoss.
    head=dict(
        type='NeuSHead',
        roi_aabb=point_cloud_range,
        resolution=0.4,
        near_plane=0.0,
        far_plane=1e10,
        num_samples=256,
        num_samples_importance=0,
        num_up_sample_steps=0,
        base_variance=4,
        beta_init=0.1,
        beta_max=0.195,
        total_iters=3516*11,   # beta annealing horizon in iterations (3516 per epoch)
        beta_hand_tune=False,
        use_numerical_gradients=False,
        sample_gradient=True,
        return_uniform_sdf=False,
        return_second_grad=True,
        # rays args
        ray_sample_mode='cellular', # fixed, cellular
        ray_number=num_rays, # 192 * 400
        ray_img_size=img_size,
        ray_upper_crop=0,
        # img2lidar args
        trans_kw='temImg2lidar',
        novel_view=None,
        # render args
        render_bkgd='random',
        # bev nerf
        # bev_inner=bev_inner,
        # bev_outer=bev_outer,
        # range_inner=range_inner,
        # range_outer=range_outer,
        # nonlinear_mode=nonlinear_mode,
        # z_inner=z_inner,
        # z_outer=z_outer,
        # z_ranges=z_ranges,
        mapping_args=mapping_args,
        # mlp decoder
        embed_dims=_dim_,
        color_dims=3,
        density_layers=2,
        sh_deg=0,
        sh_act='relu',
        two_split=False,
        tpv=True))
================================================
FILE: config/nuscenes/nuscenes_occ.py
================================================
# SelfOcc semantic occupancy config for nuScenes (6 surround cameras,
# sem=True enables the semantic branch); mmcv-style config merged over _base_.
_base_ = [
    '../_base_/dataset_v1.py',
    '../_base_/optimizer.py',
    '../_base_/schedule.py',
]
img_size = [768, 1600]   # input image size (H, W)
num_rays = [48, 100]     # rendered ray grid (rows, cols)
amp = False
max_epochs = 12
warmup_iters = 1000
sem = True               # train with semantic supervision (SemCELossMS below)
multisteplr = True
# Step LR decay at iteration 3516 * 9 (t_in_epochs=False => counted in iterations).
multisteplr_config = dict(
    decay_t = [3516 * 9],
    decay_rate = 0.1,
    warmup_t = warmup_iters,
    warmup_lr_init = 1e-6,
    t_in_epochs = False
)
optimizer = dict(
    optimizer=dict(
        type='AdamW',
        lr=1e-4,
        weight_decay=0.01,
        # eps=1e-4
    ),
    # Image backbone trains with a 10x smaller learning rate.
    paramwise_cfg=dict(
        custom_keys={
            'img_backbone': dict(lr_mult=0.1),}
    ),
)
data_path = 'data/nuscenes/'
# Sweep-distance-sampled one-frame dataset in ego-centric coordinates;
# _delete_ replaces the _base_ dataset.
train_dataset_config = dict(
    _delete_=True,
    type='nuScenes_One_Frame_Sweeps_Dist',
    data_path = data_path,
    imageset = 'data/nuscenes_infos_train_sweeps.pkl',
    crop_size = img_size,
    min_dist = 0.4,
    max_dist = 30.0,
    strict = True,
    return_depth = False,
    eval_depth = 80,
    cur_prob = 0.333,   # only 1/3 of samples supervise from the current frame
    prev_prob = 0.5,
    choose_nearest = True,
    ref_sensor = 'CAM_FRONT',
    composite_prev_next=True,
    sensor_mus=[0.5, 0.5],
    sensor_sigma=0.5,
    ego_centric=True,
)
val_dataset_config = dict(
    _delete_=True,
    type='nuScenes_One_Frame_Sweeps_Dist',
    data_path = data_path,
    imageset = 'data/nuscenes_infos_val_sweeps.pkl',
    crop_size = img_size,
    min_dist = 0.4,
    max_dist = 30.0,
    strict = False,
    return_depth = True,   # evaluation needs ground-truth depth
    eval_depth = 80,
    cur_prob = 1,
    prev_prob = 0.5,
    choose_nearest = True,
    ref_sensor = 'CAM_FRONT',
    composite_prev_next=True,
    sensor_mus=[0.5, 0.5],
    sensor_sigma=0.5,
    ego_centric=True
)
train_wrapper_config = dict(
    type='tpvformer_dataset_nuscenes_temporal',
    phase='train',
    scale_rate=0.5,
    photometric_aug=dict(
        use_swap_channel=False,
    )
)
val_wrapper_config = dict(
    type='tpvformer_dataset_nuscenes_temporal',
    phase='val',
    scale_rate=0.5,
    photometric_aug=dict(
        use_swap_channel=False,
    )
)
train_loader = dict(
    batch_size = 1,
    shuffle = True,
    num_workers = 1,
)
val_loader = dict(
    batch_size = 1,
    shuffle = False,
    num_workers = 1,
)
# Occupancy variant: photometric reprojection + rendered RGB + eikonal +
# second-order gradient + semantic cross-entropy on rendered semantics.
loss = dict(
    type='MultiLoss',
    loss_cfgs=[
        dict(
            type='ReprojLossMonoMultiNewCombine',
            weight=1.0,
            no_ssim=False,
            img_size=img_size,
            ray_resize=num_rays,
            input_dict={
                'curr_imgs': 'curr_imgs',
                'prev_imgs': 'prev_imgs',
                'next_imgs': 'next_imgs',
                'ray_indices': 'ray_indices',
                'weights': 'weights',
                'ts': 'ts',
                'metas': 'metas',
                'ms_rays': 'ms_rays',
                # 'deltas': 'deltas'
            }),
        dict(
            type='RGBLossMS',
            weight=0.1,
            img_size=img_size,
            no_ssim=False,
            ray_resize=num_rays,
            input_dict={
                'ms_colors': 'ms_colors',
                'ms_rays': 'ms_rays',
                'gt_imgs': 'color_imgs'}),
        dict(
            type='EikonalLoss',
            weight=0.1,),
        dict(
            type='SecondGradLoss',
            weight=0.01),
        dict(
            type='SemCELossMS',
            weight=0.1,
            img_size=img_size,
            ray_resize=num_rays,
            input_dict={
                'sem': 'sem',
                'metas': 'metas',
                'ms_rays': 'ms_rays'}),
        # dict(
        #     type='AdaptiveSparsityLoss',
        #     weight=0.001,
        #     slack=4.0,
        #     input_dict={
        #         'sdfs': 'sdfs',
        #         'ts': 'ts',
        #         'ms_depths': 'ms_depths'})
        # dict(
        #     type='SparsityLoss',
        #     weight=0.001,
        #     scale=0.1,
        #     input_dict={
        #         'density': 'uniform_sdf'}),
    ])
# Maps model output keys to the loss-input keys (identity mapping here).
loss_input_convertion = dict(
    ms_depths='ms_depths',
    ms_rays='ms_rays',
    # ms_accs='ms_accs',
    ms_colors='ms_colors',
    ray_indices='ray_indices',
    weights='weights',
    ts='ts',
    eik_grad='eik_grad',
    second_grad='second_grad',
    sem='sem',
    # sdfs='sample_sdf'
    # uniform_sdf='uniform_sdf',
    # deltas='deltas'
)
load_from = ''
_dim_ = 96               # shared embedding width
_ffn_dim_ = 2 * _dim_
num_heads = 6
# TPV grid <-> metric-space mapping; 80 m x 80 m x 6.4 m occupancy volume.
# [inner, outer] size/range semantics are defined by the encoder's mapping
# module -- confirm there.
mapping_args = dict(
    nonlinear_mode='linear',
    h_size=[128, 0],
    h_range=[40.0, 0],
    h_half=False,
    w_size=[128, 0],
    w_range=[40.0, 0],
    w_half=False,
    d_size=[24, 0],
    d_range=[-1.0, 5.4, 5.4]
)
# Legacy non-uniform grid parameters, superseded by mapping_args.
# bev_inner = 160
# bev_outer = 1
# range_inner = 80.0
# range_outer = 1.0
# nonlinear_mode = 'linear_upscale'
# z_inner = 20
# z_outer = 10
# z_ranges = [-4.0, 4.0, 12.0]
# TPV plane resolutions (presumably mapping_args sizes plus one boundary vertex).
tpv_h = 1 + 2 * 128
tpv_w = 1 + 2 * 128
tpv_z = 1 + 24
# Scene bounds in meters; used as positional-encoding tot_range and head roi_aabb.
point_cloud_range = [-40.0, -40.0, -1.0, 40.0, 40.0, 5.4]
num_points_cross = [48, 48, 8]
num_points_self = 12
# One encoder layer: cross-view self-attention over the three TPV planes,
# then deformable cross-attention into 4 levels of features from 6 cameras.
self_cross_layer = dict(
    type='TPVFormerLayer',
    attn_cfgs=[
        dict(
            type='CrossViewHybridAttention',
            embed_dims=_dim_,
            num_heads=num_heads,
            num_levels=3,
            num_points=num_points_self,
            dropout=0.1,
            batch_first=True),
        dict(
            type='TPVCrossAttention',
            embed_dims=_dim_,
            num_cams=6,
            dropout=0.1,
            batch_first=True,
            num_heads=num_heads,
            num_levels=4,
            num_points=num_points_cross)
    ],
    feedforward_channels=_ffn_dim_,
    ffn_dropout=0.1,
    operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm')
)
model = dict(
    type='TPVSegmentor',
    img_backbone_out_indices=[0, 1, 2, 3],
    # ResNet-50 image backbone, initialized from a local torchvision checkpoint.
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=0,
        norm_eval=False,
        style='pytorch',
        pretrained='./ckpts/resnet50-0676ba61.pth'),
    img_neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=4,
        relu_before_extra_convs=True),
    # Learnable query embeddings for the three TPV planes.
    lifter=dict(
        type='TPVQueryLifter',
        tpv_h=tpv_h,
        tpv_w=tpv_w,
        tpv_z=tpv_z,
        dim=_dim_),
    encoder=dict(
        type='TPVFormerEncoder',
        # bev_inner=bev_inner,
        # bev_outer=bev_outer,
        # range_inner=range_inner,
        # range_outer=range_outer,
        # nonlinear_mode=nonlinear_mode,
        # z_inner=z_inner,
        # z_outer=z_outer,
        # z_ranges=z_ranges,
        mapping_args=mapping_args,
        embed_dims=_dim_,
        num_cams=6,
        num_feature_levels=4,
        positional_encoding=dict(
            type='TPVPositionalEncoding',
            num_freqs=[12] * 3,
            embed_dims=_dim_,
            tot_range=point_cloud_range),
        num_points_cross=num_points_cross,
        num_points_self=[num_points_self] * 3,
        transformerlayers=[
            self_cross_layer,
            self_cross_layer,
            self_cross_layer,
            self_cross_layer],
        num_layers=4),
    # NeuS-style SDF rendering head with a semantic branch (return_sem=True).
    head=dict(
        type='NeuSHead',
        roi_aabb=point_cloud_range,
        resolution=0.4,
        near_plane=0.0,
        far_plane=1e10,
        num_samples=256,
        num_samples_importance=0,
        num_up_sample_steps=0,
        base_variance=4,
        beta_init=0.1,
        beta_max=0.195,
        total_iters=3516*11,   # beta annealing horizon in iterations (3516 per epoch)
        beta_hand_tune=False,
        use_numerical_gradients=False,
        sample_gradient=True,
        return_uniform_sdf=False,
        return_second_grad=True,
        return_sem=True,
        return_sample_sdf=False,
        # rays args
        ray_sample_mode='cellular', # fixed, cellular
        ray_number=num_rays, # 192 * 400
        ray_img_size=img_size,
        ray_upper_crop=0,
        # img2lidar args
        trans_kw='temImg2lidar',
        novel_view=None,
        # render args
        render_bkgd='random',
        # bev nerf
        # bev_inner=bev_inner,
        # bev_outer=bev_outer,
        # range_inner=range_inner,
        # range_outer=range_outer,
        # nonlinear_mode=nonlinear_mode,
        # z_inner=z_inner,
        # z_outer=z_outer,
        # z_ranges=z_ranges,
        mapping_args=mapping_args,
        # mlp decoder
        # 24 decoder output channels -- presumably the per-class semantic logits
        # given sem=True; confirm against the NeuSHead decoder.
        embed_dims=_dim_,
        color_dims=24,
        density_layers=2,
        sh_deg=0,
        sh_act='relu',
        two_split=False,
        tpv=True))
================================================
FILE: config/nuscenes/nuscenes_occ_bev.py
================================================
# SelfOcc BEV variant: same self-supervised NeuS rendering pipeline as the TPV
# config, but with a single BEV plane (BEVFormer-style encoder) instead of
# three TPV planes.
_base_ = [
    '../_base_/dataset_v1.py',
    '../_base_/optimizer.py',
    '../_base_/schedule.py',
]

# ---- input / schedule -------------------------------------------------------
img_size = [768, 1600]   # (H, W) crop of the raw nuScenes images
num_rays = [48, 100]     # rays rendered per image, as an (H, W) grid
amp = False
max_epochs = 12
warmup_iters = 1000
sem = True               # enable semantic supervision branch

multisteplr = True
multisteplr_config = dict(
    decay_t = [3516 * 9],     # presumably 3516 iters per epoch, decay after epoch 9 -- TODO confirm
    decay_rate = 0.1,
    warmup_t = warmup_iters,
    warmup_lr_init = 1e-6,
    t_in_epochs = False       # decay_t / warmup_t are in iterations, not epochs
)

optimizer = dict(
    optimizer=dict(
        type='AdamW',
        lr=1e-4,
        weight_decay=0.01,
        # eps=1e-4
    ),
    paramwise_cfg=dict(
        custom_keys={
            'img_backbone': dict(lr_mult=0.1),}   # pretrained backbone trains at 10% lr
    ),
)

# ---- datasets ---------------------------------------------------------------
data_path = 'data/nuscenes/'

train_dataset_config = dict(
    _delete_=True,
    type='nuScenes_One_Frame_Sweeps_Dist',
    data_path = data_path,
    imageset = 'data/nuscenes_infos_train_sweeps.pkl',
    crop_size = img_size,
    min_dist = 0.4,            # temporal neighbors must be >0.4m away
    max_dist = 30.0,           # ... and within 30m
    strict = True,             # train: do not fall back to the sample itself
    return_depth = False,
    eval_depth = 80,
    cur_prob = 0.333,          # probability of supervising with the current frame
    prev_prob = 0.5,           # else: prev vs next split
    choose_nearest = True,
    ref_sensor = 'CAM_FRONT',
    composite_prev_next=True,
    sensor_mus=[0.5, 0.5],     # [front/back mu, side-camera mu] for neighbor-distance sampling
    sensor_sigma=0.5,
    ego_centric=True,
)

val_dataset_config = dict(
    _delete_=True,
    type='nuScenes_One_Frame_Sweeps_Dist',
    data_path = data_path,
    imageset = 'data/nuscenes_infos_val_sweeps.pkl',
    crop_size = img_size,
    min_dist = 0.4,
    max_dist = 30.0,
    strict = False,            # val: the sample itself is an allowed neighbor
    return_depth = True,       # val also returns LiDAR depth for evaluation
    eval_depth = 80,
    cur_prob = 1,              # val always supervises with the current frame
    prev_prob = 0.5,
    choose_nearest = True,
    ref_sensor = 'CAM_FRONT',
    composite_prev_next=True,
    sensor_mus=[0.5, 0.5],
    sensor_sigma=0.5,
    ego_centric=True
)

train_wrapper_config = dict(
    type='tpvformer_dataset_nuscenes_temporal',
    phase='train',
    scale_rate=0.5,            # images downscaled by 2 before the network
    photometric_aug=dict(
        use_swap_channel=False,
    )
)

val_wrapper_config = dict(
    type='tpvformer_dataset_nuscenes_temporal',
    phase='val',
    scale_rate=0.5,
    photometric_aug=dict(
        use_swap_channel=False,
    )
)

train_loader = dict(
    batch_size = 1,
    shuffle = True,
    num_workers = 1,
)

val_loader = dict(
    batch_size = 1,
    shuffle = False,
    num_workers = 1,
)

# ---- losses -----------------------------------------------------------------
loss = dict(
    type='MultiLoss',
    loss_cfgs=[
        dict(
            # photometric reprojection loss against temporally adjacent frames
            type='ReprojLossMonoMultiNewCombine',
            weight=1.0,
            no_ssim=False,
            img_size=img_size,
            ray_resize=num_rays,
            input_dict={
                'curr_imgs': 'curr_imgs',
                'prev_imgs': 'prev_imgs',
                'next_imgs': 'next_imgs',
                'ray_indices': 'ray_indices',
                'weights': 'weights',
                'ts': 'ts',
                'metas': 'metas',
                'ms_rays': 'ms_rays',
                # 'deltas': 'deltas'
            }),
        dict(
            # rendered-color RGB loss
            type='RGBLossMS',
            weight=0.1,
            img_size=img_size,
            no_ssim=False,
            ray_resize=num_rays,
            input_dict={
                'ms_colors': 'ms_colors',
                'ms_rays': 'ms_rays',
                'gt_imgs': 'color_imgs'}),
        dict(
            # unit-norm SDF gradient regularizer
            type='EikonalLoss',
            weight=0.1,),
        dict(
            type='SecondGradLoss',
            weight=0.01),
        dict(
            # 2D semantic segmentation supervision on rendered semantics
            type='SemLossMS',
            weight=0.1,
            img_size=img_size,
            ray_resize=num_rays,
            input_dict={
                'sem': 'sem',
                'metas': 'metas',
                'ms_rays': 'ms_rays'}),
        dict(
            # sparsity prior on uniformly sampled SDF values
            type='SoftSparsityLoss',
            weight=0.001,
            input_dict={
                'density': 'uniform_sdf'})
        # dict(
        #     type='SparsityLoss',
        #     weight=0.001,
        #     scale=0.1,
        #     input_dict={
        #         'density': 'uniform_sdf'}),
    ])

# maps head outputs to the names the losses consume
loss_input_convertion = dict(
    ms_depths='ms_depths',
    ms_rays='ms_rays',
    # ms_accs='ms_accs',
    ms_colors='ms_colors',
    ray_indices='ray_indices',
    weights='weights',
    ts='ts',
    eik_grad='eik_grad',
    second_grad='second_grad',
    sem='sem',
    # sdfs='sample_sdf'
    uniform_sdf='uniform_sdf',
    # deltas='deltas'
)

# ---- model hyperparameters --------------------------------------------------
load_from = ''
_dim_ = 96
_ffn_dim_ = 2 * _dim_
num_heads = 6

# nonlinear grid-to-metric mapping of the BEV volume
mapping_args = dict(
    nonlinear_mode='linear',
    h_size=[128, 0],
    h_range=[40.0, 0],
    h_half=False,
    w_size=[128, 0],
    w_range=[40.0, 0],
    w_half=False,
    d_size=[24, 0],
    d_range=[-1.0, 5.4, 5.4]
)
# bev_inner = 160
# bev_outer = 1
# range_inner = 80.0
# range_outer = 1.0
# nonlinear_mode = 'linear_upscale'
# z_inner = 20
# z_outer = 10
# z_ranges = [-4.0, 4.0, 12.0]

tpv_h = 1 + 2 * 128   # 257: center cell + 128 cells each side
tpv_w = 1 + 2 * 128
tpv_z = 1 + 24
point_cloud_range = [-40.0, -40.0, -1.0, 40.0, 40.0, 5.4]
num_points_cross = 8
num_points_self = 12

# one encoder layer = deformable self-attention + image cross-attention + FFN
self_cross_layer = dict(
    type='BEVFormerLayer',
    attn_cfgs=[
        dict(
            type='MultiScaleDeformableAttention',
            embed_dims=_dim_,
            num_heads=num_heads,
            num_levels=1,
            num_points=num_points_self,
            dropout=0.1,
            batch_first=True),
        dict(
            type='BEVCrossAttention',
            embed_dims=_dim_,
            num_cams=6,
            dropout=0.1,
            batch_first=True,
            deformable_attention=dict(
                type='BEVDeformableAttention',
                embed_dims=_dim_,
                num_heads=num_heads,
                num_levels=4,
                num_points=num_points_cross,
                dropout=0.1,
                batch_first=True))
    ],
    feedforward_channels=_ffn_dim_,
    ffn_dropout=0.1,
    operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm')
)

# ---- full model -------------------------------------------------------------
model = dict(
    type='TPVSegmentor',
    img_backbone_out_indices=[0, 1, 2, 3],
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=0,
        norm_eval=False,
        style='pytorch',
        pretrained='./ckpts/resnet50-0676ba61.pth'),
    img_neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=4,
        relu_before_extra_convs=True),
    lifter=dict(
        # learnable queries for the single BEV plane
        type='BEVQueryLifter',
        bev_h=tpv_h,
        bev_w=tpv_w,
        dim=_dim_),
    encoder=dict(
        type='BEVFormerEncoder',
        # bev_inner=bev_inner,
        # bev_outer=bev_outer,
        # range_inner=range_inner,
        # range_outer=range_outer,
        # nonlinear_mode=nonlinear_mode,
        # z_inner=z_inner,
        # z_outer=z_outer,
        # z_ranges=z_ranges,
        mapping_args=mapping_args,
        # camera_aware=True,
        # camera_aware_mid_channels=96,
        embed_dims=_dim_,
        num_cams=6,
        num_feature_levels=4,
        positional_encoding=dict(
            type='BEVPositionalEncoding',
            num_freqs=12,
            embed_dims=_dim_,
            tot_range=point_cloud_range),
        num_points_cross=num_points_cross,
        num_points_self=num_points_self,
        transformerlayers=[
            self_cross_layer,
            self_cross_layer,
            self_cross_layer,
            self_cross_layer],
        num_layers=4),
    head=dict(
        type='NeuSHead',
        roi_aabb=point_cloud_range,
        resolution=0.4,
        near_plane=0.0,
        far_plane=1e10,
        num_samples=256,
        num_samples_importance=0,
        num_up_sample_steps=0,
        base_variance=4,
        beta_init=0.1,
        beta_max=0.195,
        total_iters=3516*11,       # presumably iters-per-epoch * 11 -- TODO confirm
        beta_hand_tune=False,
        use_numerical_gradients=False,
        sample_gradient=True,
        return_uniform_sdf=True,   # needed by SoftSparsityLoss above
        return_second_grad=True,
        return_sem=True,
        return_sample_sdf=False,
        # rays args
        ray_sample_mode='cellular', # fixed, cellular
        ray_number=num_rays, # 192 * 400
        ray_img_size=img_size,
        ray_upper_crop=0,
        # img2lidar args
        trans_kw='temImg2lidar',
        novel_view=None,
        # render args
        render_bkgd='random',
        # bev nerf
        # bev_inner=bev_inner,
        # bev_outer=bev_outer,
        # range_inner=range_inner,
        # range_outer=range_outer,
        # nonlinear_mode=nonlinear_mode,
        # z_inner=z_inner,
        # z_outer=z_outer,
        # z_ranges=z_ranges,
        mapping_args=mapping_args,
        # mlp decoder
        embed_dims=_dim_,
        color_dims=24,
        density_layers=2,
        sh_deg=0,
        sh_act='relu',
        two_split=False,
        tpv=False))                # single BEV plane, not TPV
================================================
FILE: config/openseed/openseed_swint_lang.yaml
================================================
# --------------------------------------------------------
# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
# ------------------------------------------------------------------------
# OpenSeeD
# Copyright (c) 2023 IDEA. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# Modified from MaskDINO https://github.com/IDEA-Research/MaskDINO by Feng Li and Hao Zhang.
# ------------------------------------------------------------------------
##################
# Task settings
##################
WEIGHT: ''
PORT: 53711
VERBOSE: true
OUTPUT_DIR: '../../data/output/test'
# misc
LOADER:
JOINT: True
KEY_DATASET: 'coco'
# model
MODEL:
NAME: openseed_model
HEAD: openseed_head
MASK_ON: false
KEYPOINT_ON: false
LOAD_PROPOSALS: false
DIM_PROJ: 512
BACKBONE_DIM: 768
BACKGROUND: False
WEIGHTS: ''
TEXT:
ARCH: encoder
NAME: transformer
TOKENIZER: clip
CONTEXT_LENGTH: 18 # 18
WIDTH: 512
HEADS: 8
LAYERS: 12
    AUTOGRESSIVE: True  # (sic) upstream OpenSeeD key name -- presumably "autoregressive"; the consuming code reads this exact key, do not rename
BACKBONE:
NAME: swin
PRETRAINED: 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth'
LOAD_PRETRAINED: true
SWIN:
PRETRAIN_IMG_SIZE: 224
PATCH_SIZE: 4
EMBED_DIM: 96
DEPTHS: [ 2, 2, 6, 2 ]
NUM_HEADS: [ 3, 6, 12, 24 ]
WINDOW_SIZE: 7
MLP_RATIO: 4.0
QKV_BIAS: true
QK_SCALE: ~
DROP_RATE: 0.0
ATTN_DROP_RATE: 0.0
DROP_PATH_RATE: 0.3
APE: false
PATCH_NORM: true
USE_CHECKPOINT: false
OUT_FEATURES: [ 'res2', 'res3', 'res4', 'res5' ]
ENCODER:
NAME: encoder_deform
IGNORE_VALUE: 255
NUM_CLASSES: 133
LOSS_WEIGHT: 1.0
CONVS_DIM: 256
MASK_DIM: 256
NORM: "GN"
IN_FEATURES: [ "res2", "res3", "res4", "res5" ]
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: [ "res3", "res4", "res5" ]
COMMON_STRIDE: 4
TRANSFORMER_ENC_LAYERS: 6
TOTAL_NUM_FEATURE_LEVELS: 4
NUM_FEATURE_LEVELS: 3
FEATURE_ORDER: "low2high"
DECODER:
NAME: openseed_decoder
TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
MASK: True
BOX: True
GROUNDING:
ENABLED: False
MAX_LEN: 5
TEXT_WEIGHT: 2.0
CLASS_WEIGHT: 0.5
CAPTION:
ENABLED: False
PHRASE_PROB: 0.0
SIM_THRES: 0.95
CAPTIONING:
ENABLED: False
STEP: 50
RETRIEVAL:
ENABLED: False
DIM_IMG: 768
ENSEMBLE: True
OPENIMAGE:
ENABLED: False
NEGATIVE_SAMPLES: 5
GROUNDING:
ENABLED: False
MAX_LEN: 5
DEEP_SUPERVISION: True
NO_OBJECT_WEIGHT: 0.1
CLASS_WEIGHT: 4.0
MASK_WEIGHT: 5.0
DICE_WEIGHT: 5.0
BOX_WEIGHT: 5.0
GIOU_WEIGHT: 2.0
COST_CLASS_WEIGHT: 4.0
COST_DICE_WEIGHT: 5.0
COST_MASK_WEIGHT: 5.0
COST_BOX_WEIGHT: 5.0
COST_GIOU_WEIGHT: 2.0
HIDDEN_DIM: 256
NUM_OBJECT_QUERIES: 300
NHEADS: 8
DROPOUT: 0.0
DIM_FEEDFORWARD: 2048
ENC_LAYERS: 0
PRE_NORM: False
ENFORCE_INPUT_PROJ: False
SIZE_DIVISIBILITY: 32
DEC_LAYERS: 9 # 9 decoder layers, add one for the loss on learnable query
TRAIN_NUM_POINTS: 12544
OVERSAMPLE_RATIO: 3.0
IMPORTANCE_SAMPLE_RATIO: 0.75
TWO_STAGE: True
INITIALIZE_BOX_TYPE: 'no'
DN: seg
DN_NOISE_SCALE: 0.4
DN_NUM: 100
INITIAL_PRED: True
LEARN_TGT: False
TOTAL_NUM_FEATURE_LEVELS: 4
SEMANTIC_CE_LOSS: False
PANO_BOX_LOSS: False
COCO: True
O365: False
TEST:
SEMANTIC_ON: True
INSTANCE_ON: True
PANOPTIC_ON: True
OVERLAP_THRESHOLD: 0.8
OBJECT_MASK_THRESHOLD: 0.25
SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: false
    TEST_FOUCUS_ON_BOX: False  # (sic) upstream OpenSeeD key name -- presumably "focus"; keep the spelling as consumed
PANO_TRANSFORM_EVAL: True
PANO_TEMPERATURE: 0.06
TEST:
EVAL_PERIOD: 5000
PRECISE_BN:
NUM_ITER: 1
ENABLED: False
AUG:
ENABLED: False
COCO:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 0.1
MAX_SCALE: 2.0
DATASET_MAPPER_NAME: "coco_panoptic_lsj"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
DATASET:
DATASET: 'coco'
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 8
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 16
BATCH_SIZE_PER_GPU: 2
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 2
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
VLP:
INPUT:
IMAGE_SIZE: 224
DATASET_MAPPER_NAME: "vlpretrain"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
TRAIN:
BATCH_SIZE_TOTAL: 2
BATCH_SIZE_PER_GPU: 2
TEST:
BATCH_SIZE_TOTAL: 256
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 16
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
DATASETS:
TRAIN: ["coco_2017_train_panoptic"]
TEST: ["coco_2017_val_panoptic_with_sem_seg"]
CLASS_CONCAT: false
SIZE_DIVISIBILITY: 32
PROPOSAL_FILES_TRAIN: []
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 16
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
# Detectron2 training config for optimizer and lr scheduler
SOLVER:
BASE_LR_END: 0.0
MOMENTUM: 0.9
NESTEROV: False
CHECKPOINT_PERIOD: 5000
IMS_PER_BATCH: 16
REFERENCE_WORLD_SIZE: 0
BIAS_LR_FACTOR: 1.0
WEIGHT_DECAY_BIAS: None
# original
BASE_LR: 0.0001
STEPS: [327778, 355092]
MAX_ITER: 368750
GAMMA: 0.1
WARMUP_FACTOR: 1.0
WARMUP_ITERS: 10
WARMUP_METHOD: "linear"
WEIGHT_DECAY: 0.05
OPTIMIZER: "ADAMW"
LR_SCHEDULER_NAME: "WarmupMultiStepLR"
LR_MULTIPLIER:
backbone: 0.1
lang_encoder: 0.1
WEIGHT_DECAY_NORM: 0.0
WEIGHT_DECAY_EMBED: 0.0
CLIP_GRADIENTS:
ENABLED: True
CLIP_TYPE: "full_model"
CLIP_VALUE: 0.01
NORM_TYPE: 2.0
AMP:
ENABLED: True
# Evaluation Dataset
ADE20K:
INPUT:
MIN_SIZE_TRAIN: [320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 1088, 1152, 1216, 1280]
MIN_SIZE_TRAIN_SAMPLING: "choice"
MIN_SIZE_TEST: 640
MAX_SIZE_TRAIN: 2560
MAX_SIZE_TEST: 2560
MASK_FORMAT: "polygon"
CROP:
ENABLED: True
TYPE: "absolute"
SIZE: [640, 640]
SINGLE_CATEGORY_MAX_AREA: 1.0
IGNORE_VALUE: 255
COLOR_AUG_SSD: True
SIZE_DIVISIBILITY: 640 # used in dataset mapper
DATASET_MAPPER_NAME: "mask_former_panoptic"
FORMAT: "RGB"
DATASET:
DATASET: 'ade'
TRAIN:
ASPECT_RATIO_GROUPING: true
BATCH_SIZE_TOTAL: 16
BATCH_SIZE_PER_GPU: 2
SHUFFLE: true
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 8
MODEL_FILE: ''
AUG:
ENABLED: False
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 8
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
REF:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 512
MAX_SIZE_TEST: 1024
FORMAT: "RGB"
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
SUN:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 512
MAX_SIZE_TEST: 1024
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
SCAN:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 512
MAX_SIZE_TEST: 1024
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
BDD:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
CITY:
INPUT:
MIN_SIZE_TRAIN: [ 512, 614, 716, 819, 921, 1024, 1126, 1228, 1331, 1433, 1536, 1638, 1740, 1843, 1945, 2048 ]
MIN_SIZE_TRAIN_SAMPLING: "choice"
MIN_SIZE_TEST: 1024
MAX_SIZE_TRAIN: 4096
MAX_SIZE_TEST: 2048
CROP:
ENABLED: True
TYPE: "absolute"
SIZE: [ 512, 1024 ]
SINGLE_CATEGORY_MAX_AREA: 1.0
IGNORE_VALUE: 255
COLOR_AUG_SSD: True
SIZE_DIVISIBILITY: -1
FORMAT: "RGB"
DATASET_MAPPER_NAME: "mask_former_panoptic"
MASK_FORMAT: "polygon"
TEST:
EVAL_PERIOD: 5000
BATCH_SIZE_TOTAL: 1
AUG:
ENABLED: False
MIN_SIZES: [ 512, 768, 1024, 1280, 1536, 1792 ]
MAX_SIZE: 4096
FLIP: True
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: True
NUM_WORKERS: 2
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
TRAIN:
ASPECT_RATIO_GROUPING: true
BATCH_SIZE_TOTAL: 2
BATCH_SIZE_PER_GPU: 2
SHUFFLE: true
================================================
FILE: dataset/__init__.py
================================================
from mmengine.registry import Registry
OPENOCC_DATASET = Registry('openocc_dataset')
OPENOCC_DATAWRAPPER = Registry('openocc_datawrapper')
from .dataset_one_frame_sweeps_dist import nuScenes_One_Frame_Sweeps_Dist
from .dataset_one_frame_sweeps_dist_vis import nuScenes_One_Frame_Sweeps_Dist_Vis
from .dataset_wrapper_vis import tpvformer_dataset_nuscenes_vis
from .dataset_one_frame_eval import nuScenes_One_Frame_Eval
from .dataset_wrapper_temporal import tpvformer_dataset_nuscenes_temporal, custom_collate_fn_temporal
from .sampler import CustomDistributedSampler
from .kitti.kitti_dataset_eval import Kitti_Novel_View_Eval
from .kitti_raw.kitti_raw_dataset import Kitti_Raw
from .kitti_raw.kitti_raw_dataset_stereo import Kitti_Raw_Stereo
from .kitti.kitti_dataset_one_frame import Kitti_One_Frame
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data.dataloader import DataLoader
def get_dataloader(
    train_dataset_config,
    val_dataset_config,
    train_wrapper_config,
    val_wrapper_config,
    train_loader,
    val_loader,
    nusc=dict(
        version='v1.0-trainval',
        dataroot='data/nuscenes'),
    dist=False,
    iter_resume=False,
    train_sampler_config=dict(
        shuffle=True,
        drop_last=True),
    val_sampler_config=dict(
        shuffle=False,
        drop_last=False),
    val_only=False,
):
    """Build the train and val DataLoaders from registry configs.

    Args:
        train_dataset_config / val_dataset_config: configs built via OPENOCC_DATASET.
        train_wrapper_config / val_wrapper_config: configs built via OPENOCC_DATAWRAPPER,
            each wrapping the corresponding dataset.
        train_loader / val_loader: dicts with 'batch_size', 'shuffle', 'num_workers'.
        nusc: forwarded verbatim to the dataset builders (treated as read-only here);
            the default dict is never mutated.
        dist: if True, use DistributedSampler and force shuffle=False on the loaders.
        iter_resume: if True (and dist), use CustomDistributedSampler for training
            so iteration state can be resumed mid-epoch.
        train_sampler_config / val_sampler_config: kwargs for the samplers.
        val_only: if True, skip building the train pipeline entirely.

    Returns:
        (train_dataset_loader, val_dataset_loader); the first is None when val_only.
    """
    # NOTE: kept for reference -- the raw `nusc` dict is passed through as-is.
    # if nusc is not None:
    #     from nuscenes import NuScenes
    #     nusc = NuScenes(**nusc)

    def _build_wrapper(dataset_config, wrapper_config):
        # Build the raw dataset, then wrap it with the augmentation/format wrapper.
        dataset = OPENOCC_DATASET.build(
            dataset_config,
            default_args={'nusc': nusc})
        return OPENOCC_DATAWRAPPER.build(
            wrapper_config,
            default_args={'in_dataset': dataset})

    # The val pipeline is needed in both modes; build it exactly once
    # (previously this block was duplicated in the val_only branch).
    val_wrapper = _build_wrapper(val_dataset_config, val_wrapper_config)
    val_sampler = DistributedSampler(val_wrapper, **val_sampler_config) if dist else None
    val_dataset_loader = DataLoader(
        dataset=val_wrapper,
        batch_size=val_loader["batch_size"],
        collate_fn=custom_collate_fn_temporal,
        shuffle=False,
        sampler=val_sampler,
        num_workers=val_loader["num_workers"],
        pin_memory=True)

    if val_only:
        return None, val_dataset_loader

    train_wrapper = _build_wrapper(train_dataset_config, train_wrapper_config)
    train_sampler = None
    if dist:
        # CustomDistributedSampler supports resuming from a mid-epoch iteration.
        sampler_cls = CustomDistributedSampler if iter_resume else DistributedSampler
        train_sampler = sampler_cls(train_wrapper, **train_sampler_config)
    train_dataset_loader = DataLoader(
        dataset=train_wrapper,
        batch_size=train_loader["batch_size"],
        collate_fn=custom_collate_fn_temporal,
        # a sampler and shuffle=True are mutually exclusive in DataLoader
        shuffle=False if dist else train_loader["shuffle"],
        sampler=train_sampler,
        num_workers=train_loader["num_workers"],
        pin_memory=True)
    return train_dataset_loader, val_dataset_loader
================================================
FILE: dataset/dataset_one_frame_eval.py
================================================
import os, numpy as np, pickle, random
from mmcv.image.io import imread
from pyquaternion import Quaternion
from copy import deepcopy
from . import OPENOCC_DATASET
if 'HFAI' in os.environ:
hfai = True
from dataset.loading import LoadMultiViewImageFromFilesHF, \
LoadPtsFromFilesHF
else:
hfai = False
@OPENOCC_DATASET.register_module()
class nuScenes_One_Frame_Eval:
    """nuScenes evaluation dataset: one keyframe plus its temporal neighbors.

    Each item returns (input_imgs, anchor_imgs, img_metas), where
    `anchor_imgs` are surround images of the precomputed 'nice_neighbor'
    prev/next frames and `img_metas` carries the projection matrices and the
    LiDAR depth ground truth projected into each anchor view.
    """

    def __init__(
        self,
        data_path,
        imageset='train',                  # path to a pickle with 'infos' (+ 'scene_info')
        nusc=None,
        crop_size=[768, 1600],             # (H, W); mutable default, but only read
        cam_types=None,
        eval_depth=80,                     # max depth (m) kept for evaluation
        scene_name=None,                   # restrict to one scene, e.g. 'scene-0001'
        **kwargs):
        with open(imageset, 'rb') as f:
            data = pickle.load(f)
        self.nusc_infos = data['infos']
        if scene_name is not None:
            assert isinstance(scene_name, str) and scene_name.startswith('scene-')
            # scene_info maps scene name -> inclusive (first, last) index range
            selected_indices = data['scene_info'][scene_name]
            self.selected_indices = list(range(selected_indices[0], selected_indices[1] + 1))
        else:
            self.selected_indices = list(range(len(self.nusc_infos)))
        self.data_path = data_path
        self.nusc = nusc
        self.crop_size = crop_size
        self.cam_types = [
            'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_FRONT_LEFT',
            'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_BACK_RIGHT'
        ] if cam_types is None else cam_types
        self.eval_depth = eval_depth
        if hfai:
            # HFAI cluster: images/points are packed into ffrecord archives.
            self.img_loader = LoadMultiViewImageFromFilesHF(
                to_float32=True,
                file_client_args=dict(
                    backend='ffrecord',
                    fname=data_path+'CAM',
                    filename2idx='img_fname2idx.pkl'))
            self.pts_loader = LoadPtsFromFilesHF(
                to_float32=True,
                file_client_args=dict(
                    backend='ffrecord',
                    fname=data_path+'LIDAR',
                    filename2idx='pts_fname2idx.pkl'))
        else:
            self.img_loader = self.pts_loader = None
        self.lidar2cam_rect = np.eye(4)

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.selected_indices)

    def get_depth_from_lidar(self, lidar_path, lidar2img, image_size):
        """Project a LiDAR sweep into all cameras.

        Returns (img_points, depth, mask): normalized [0,1] image coords
        (N, n, 2), per-point depth (N, n), and a validity mask keeping points
        in front of the camera, closer than eval_depth, and inside the image.
        """
        # lidar2img: N, 4, 4
        scan = np.fromfile(lidar_path, dtype=np.float32)
        scan = scan.reshape((-1, 5))[:, :4]   # x, y, z, (intensity -> overwritten)
        scan[:, 3] = 1.0                      # homogeneous coordinates
        # points_hcoords = scan[scan[:, 0] > 0, :]
        points_hcoords = np.expand_dims(scan.T, 0) # 1, 4, n
        img_points = np.transpose(lidar2img @ points_hcoords, (0, 2, 1)) # N, n, 4
        depth = img_points[..., 2] # N, n
        img_points = img_points[..., :2] # N, n, 2
        mask = (depth < self.eval_depth) & (depth > 1e-3) # get points with depth < max_sample_depth
        img_points = img_points / np.expand_dims(depth, axis=2) # scale 2D points
        # normalize to [0, 1]; image_size is (H, W)
        img_points[..., 0] = img_points[..., 0] / image_size[1]
        img_points[..., 1] = img_points[..., 1] / image_size[0]
        # img_points = np.round(img_points).astype(int)
        mask = mask & (img_points[..., 0] > 0) & \
            (img_points[..., 1] > 0) & \
            (img_points[..., 0] < 1) & \
            (img_points[..., 1] < 1)
        return img_points, depth, mask

    def __getitem__(self, index):
        #### 2. get self, prev, next infos for the stem, and also temp_depth info
        while True:
            index = self.selected_indices[index]
            info = deepcopy(self.nusc_infos[index])
            # resample if this keyframe has no usable temporal neighbors
            if len(info['nice_neighbor_prev']) == 0 and len(info['nice_neighbor_next']) ==0:
                index = np.random.randint(len(self))
                continue
            break
        #### 3. prepare img_metas
        imgs_info = self.get_data_info(info)
        img_metas = {
            'input_imgs_path': imgs_info['img_filename'],
            'lidar2img': imgs_info['lidar2img'],
            'img2lidar': imgs_info['img2lidar'],
            'ego2lidar': imgs_info['ego2lidar'],
            'token': info['token'],
            'timestamp': info['timestamp'],
            'intrinsic': imgs_info['cam_intrinsic']}
        anchor_imgs = [] # list[list[array]]
        anchor_depth_locs = [] # list[array]
        anchor_depth_gts = [] # list[array]
        anchor_depth_masks = [] # list[array]
        temImg2lidars = [] # list[list[array]]
        anchor_frame_dist = \
            info['prev_dists'] + info['next_dists'] # list[float]
        # collect images, img->lidar transforms and projected LiDAR depth
        # for every prev/next anchor frame
        for anchor in info['nice_neighbor_prev'] + info['nice_neighbor_next']:
            anchor_info = deepcopy(self.nusc_infos[anchor])
            anchor_imgs_info = self.get_data_info_temporal(info, anchor_info)
            anchor_img_path = anchor_imgs_info['image_paths']
            temImg2lidar = anchor_imgs_info['temImg2lidar']
            anchor_img = self.read_surround_imgs(anchor_img_path)
            # NOTE(review): depth is projected with the *current* frame's
            # lidar2img but the *anchor* frame's sweep -- presumably intended
            # for temporal depth evaluation; confirm against the evaluator.
            depth_loc, depth_gt, depth_mask = self.get_depth_from_lidar(
                anchor_info['lidar_path'], img_metas['lidar2img'], self.crop_size)
            anchor_imgs.append(anchor_img)
            anchor_depth_locs.append(depth_loc)
            anchor_depth_gts.append(depth_gt)
            anchor_depth_masks.append(depth_mask)
            temImg2lidars.append(temImg2lidar)
        img_metas.update({
            'depth_loc': anchor_depth_locs,
            'depth_gt': anchor_depth_gts,
            'depth_mask': anchor_depth_masks,
            'temImg2lidars': temImg2lidars,
            'frame_dists': anchor_frame_dist
        })
        #### 4. read imgs
        input_imgs = self.read_surround_imgs(img_metas['input_imgs_path'])
        data_tuple = (input_imgs, anchor_imgs, img_metas)
        return data_tuple

    def read_surround_imgs(self, img_paths):
        """Load the six surround images and crop each to crop_size (H, W)."""
        if hfai:
            imgs = self.img_loader.load(img_paths)
        else:
            imgs = []
            for filename in img_paths:
                imgs.append(
                    imread(filename, 'unchanged').astype(np.float32))
        imgs = [img[:self.crop_size[0], :self.crop_size[1], :] for img in imgs]
        return imgs

    def get_data_info_temporal(self, info, info_tem):
        """Compute per-camera image->lidar transforms from a temporal frame
        (info_tem) into the current frame's (info) lidar coordinates."""
        image_paths = []
        temImg2lidars = []
        # current frame: lidar -> ego -> global, then inverted
        lidar2ego_r = Quaternion(info['lidar2ego_rotation']).rotation_matrix
        lidar2ego = np.eye(4)
        lidar2ego[:3, :3] = lidar2ego_r
        lidar2ego[:3, 3] = np.array(info['lidar2ego_translation']).T
        ego2global_r = Quaternion(info['ego2global_rotation']).rotation_matrix
        ego2global = np.eye(4)
        ego2global[:3, :3] = ego2global_r
        ego2global[:3, 3] = np.array(info['ego2global_translation']).T
        lidar2global = ego2global @ lidar2ego
        global2lidar = np.linalg.inv(lidar2global)
        for cam_type_tem, cam_info_tem in info_tem['cams'].items():
            image_paths.append(cam_info_tem['data_path'])
            # temporal frame: image -> camera (inverse padded intrinsics)
            temIntrinsic = cam_info_tem['cam_intrinsic']
            temImg2temSensor = np.eye(4)
            temImg2temSensor[:3, :3] = temIntrinsic
            temImg2temSensor = np.linalg.inv(temImg2temSensor)
            # temporal frame: camera -> ego -> global
            temSensor2temEgo_r = Quaternion(cam_info_tem['sensor2ego_rotation']).rotation_matrix
            temSensor2temEgo = np.eye(4)
            temSensor2temEgo[:3, :3] = temSensor2temEgo_r
            temSensor2temEgo[:3, 3] = np.array(cam_info_tem['sensor2ego_translation']).T
            temEgo2global_r = Quaternion(cam_info_tem['ego2global_rotation']).rotation_matrix
            temEgo2global = np.eye(4)
            temEgo2global[:3, :3] = temEgo2global_r
            temEgo2global[:3, 3] = np.array(cam_info_tem['ego2global_translation']).T
            temImg2global = temEgo2global @ temSensor2temEgo @ temImg2temSensor
            # chain through global into the current frame's lidar coordinates
            temImg2lidar = global2lidar @ temImg2global
            temImg2lidars.append(temImg2lidar)
        out_dict = dict(
            image_paths=image_paths,
            temImg2lidar=np.asarray(temImg2lidars)
        )
        return out_dict

    def get_data_info(self, info):
        """Collect per-camera lidar2img/img2lidar/intrinsics and ego2lidar."""
        image_paths = []
        lidar2img_rts = []
        img2lidar_rts = []
        cam_intrinsics = []
        lidar2ego_r = Quaternion(info['lidar2ego_rotation']).rotation_matrix
        lidar2ego = np.eye(4)
        lidar2ego[:3, :3] = lidar2ego_r
        lidar2ego[:3, 3] = np.array(info['lidar2ego_translation']).T
        ego2lidar = np.linalg.inv(lidar2ego)
        for cam_type, cam_info in info['cams'].items():
            image_paths.append(cam_info['data_path'])
            # lidar2cam_rt is assembled transposed (translation in the last
            # row); it is used via .T below, so lidar2img = K_pad @ [R|t]
            lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation'])
            lidar2cam_t = cam_info['sensor2lidar_translation'] @ lidar2cam_r.T
            lidar2cam_rt = np.eye(4)
            lidar2cam_rt[:3, :3] = lidar2cam_r.T
            lidar2cam_rt[3, :3] = -lidar2cam_t
            intrinsic = cam_info['cam_intrinsic']
            viewpad = np.eye(4)   # 4x4 zero-padded intrinsic matrix
            viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic
            lidar2img_rt = (viewpad @ lidar2cam_rt.T)
            img2lidar_rt = np.linalg.inv(lidar2img_rt)
            cam_intrinsics.append(viewpad)
            lidar2img_rts.append(lidar2img_rt)
            img2lidar_rts.append(img2lidar_rt)
        input_dict =dict(
            img_filename=image_paths,
            lidar2img=np.asarray(lidar2img_rts),
            img2lidar=np.asarray(img2lidar_rts),
            cam_intrinsic=np.asarray(cam_intrinsics),
            ego2lidar=ego2lidar)
        return input_dict
================================================
FILE: dataset/dataset_one_frame_sweeps_dist.py
================================================
import os, numpy as np, random, mmengine, math
from mmcv.image.io import imread
from pyquaternion import Quaternion
from copy import deepcopy
from . import OPENOCC_DATASET
if 'HFAI' in os.environ:
hfai = True
from dataset.loading import LoadMultiViewImageFromFilesHF, \
LoadPtsFromFilesHF
else:
hfai = False
def get_xyz(pose_dict):
    """Return the translation component of a pose record as a NumPy vector."""
    translation = pose_dict['translation']
    return np.array(translation)
def get_img2global(calib_dict, pose_dict):
    """Homogeneous 4x4 transform from image-plane to global coordinates.

    Composes image->camera (inverse padded intrinsics), camera->ego
    (sensor calibration) and ego->global (ego pose).
    """
    def _se3(record):
        # 4x4 rigid transform from a {'rotation': quat, 'translation': xyz} record
        mat = np.eye(4)
        mat[:3, :3] = Quaternion(record['rotation']).rotation_matrix
        mat[:3, 3] = np.asarray(record['translation']).T
        return mat

    cam2img = np.eye(4)
    cam2img[:3, :3] = np.asarray(calib_dict['camera_intrinsic'])
    img2cam = np.linalg.inv(cam2img)
    cam2ego = _se3(calib_dict)
    ego2global = _se3(pose_dict)
    return ego2global @ cam2ego @ img2cam
def get_lidar2global(calib_dict, pose_dict):
    """Homogeneous 4x4 transform from lidar to global coordinates.

    Composes lidar->ego (sensor calibration) with ego->global (ego pose).
    """
    def _se3(record):
        # 4x4 rigid transform from a {'rotation': quat, 'translation': xyz} record
        mat = np.eye(4)
        mat[:3, :3] = Quaternion(record['rotation']).rotation_matrix
        mat[:3, 3] = np.asarray(record['translation']).T
        return mat

    return _se3(pose_dict) @ _se3(calib_dict)
@OPENOCC_DATASET.register_module()
class nuScenes_One_Frame_Sweeps_Dist:
    def __init__(
        self,
        data_path,
        imageset='train',                  # path to a pickle with 'infos' and 'metadata'
        crop_size=[768, 1600],             # (H, W) supervision-image crop; read-only
        input_img_crop_size=None,          # input crop; defaults to crop_size
        min_dist=0.4,                      # neighbor must be farther than this (m)
        max_dist=10.0,                     # ... and scanning stops past this (m)
        strict=True,                       # if False, the sample itself (dist 0) is a neighbor
        return_depth=False,
        eval_depth=80,
        cur_prob=1.0,                      # P(supervise with current frame)
        prev_prob=0.5,                     # else: P(prev) vs next
        choose_nearest=False,
        ref_sensor='CAM_FRONT',            # sensor whose pose defines frame distance
        composite_prev_next=False,
        sensor_mus=[3.0, 0.5],             # [front/back mu, side mu] for composite sampling
        sensor_sigma=0.5,
        ego_centric=False,
        **kwargs):
        """Precompute, per sample, its temporal neighbors within (min_dist, max_dist]."""
        data = mmengine.load(imageset)
        self.scene_infos = data['infos']       # scene_token -> list of samples
        self.keyframes = data['metadata']      # list of (scene_token, index)
        self.data_path = data_path
        self.crop_size = crop_size
        self.input_img_crop_size = crop_size if input_img_crop_size is None else input_img_crop_size
        self.strict = strict
        self.return_depth = return_depth
        self.eval_depth = eval_depth
        self.cur_prob = cur_prob
        self.prev_prob = prev_prob
        self.choose_nearest = choose_nearest
        self.composite_prev_next = composite_prev_next
        # per-camera mean distance for composite neighbor sampling:
        # front/back cameras share sensor_mus[0], side cameras sensor_mus[1]
        self.sensor_mus = {
            'CAM_FRONT': sensor_mus[0], 'CAM_FRONT_RIGHT': sensor_mus[1], 'CAM_FRONT_LEFT': sensor_mus[1],
            'CAM_BACK': sensor_mus[0], 'CAM_BACK_LEFT': sensor_mus[1], 'CAM_BACK_RIGHT': sensor_mus[1]
        }
        self.sensor_sigma = sensor_sigma
        self.sensor_types = ['CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_FRONT_LEFT',
            'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_BACK_RIGHT']
        self.ego_centric = ego_centric
        if hfai:
            # HFAI cluster: images/points are packed into ffrecord archives.
            self.img_loader = LoadMultiViewImageFromFilesHF(
                to_float32=True,
                file_client_args=dict(
                    backend='ffrecord',
                    fname=data_path+'CAM',
                    filename2idx='img_fname2idx.pkl'))
            self.pts_loader = LoadPtsFromFilesHF(
                to_float32=True,
                file_client_args=dict(
                    backend='ffrecord',
                    fname=data_path+'LIDAR',
                    filename2idx='pts_fname2idx.pkl'))
        else:
            self.img_loader = self.pts_loader = None
        self.lidar2cam_rect = np.eye(4)
        #### collect temporal information
        for scene_idx, (scene_token, scene_samples) in enumerate(self.scene_infos.items()):
            if (scene_idx + 1) % 50 == 0:
                print(f'One_Frame_Sweeps: processed {scene_idx + 1} scenes.')
            length = len(scene_samples)
            for i, sample in enumerate(scene_samples):
                curr_xyz = get_xyz(sample['data'][ref_sensor]['pose'])
                prev_samples, next_samples = [], []
                prev_dists, next_dists = [], []
                # scan backwards; stop at the first sample beyond max_dist,
                # so the neighbor list is temporally contiguous
                for j in range(i - 1, -1, -1):
                    temp_xyz = get_xyz(scene_samples[j]['data'][ref_sensor]['pose'])
                    temp_dist = np.linalg.norm(curr_xyz - temp_xyz)
                    if temp_dist > max_dist:
                        break
                    if temp_dist > min_dist:
                        prev_samples.append((scene_token, j))
                        prev_dists.append(temp_dist)
                # same scan, forwards
                for j in range(i + 1, length, 1):
                    temp_xyz = get_xyz(scene_samples[j]['data'][ref_sensor]['pose'])
                    temp_dist = np.linalg.norm(curr_xyz - temp_xyz)
                    if temp_dist > max_dist:
                        break
                    if temp_dist > min_dist:
                        next_samples.append((scene_token, j))
                        next_dists.append(temp_dist)
                if not strict:
                    # non-strict: allow supervising with the sample itself
                    prev_samples.append((scene_token, i))
                    prev_dists.append(0.)
                    next_samples.append((scene_token, i))
                    next_dists.append(0.)
                sample.update({
                    'prev_samples': prev_samples,
                    'prev_dists': prev_dists,
                    'next_samples': next_samples,
                    'next_dists': next_dists,})
def __len__(self):
'Denotes the total number of samples'
# return len(self.nusc_infos)
return len(self.keyframes)
    def get_depth_from_lidar(self, lidar_path, lidar2img, image_size):
        """Project a LiDAR sweep (relative path under data_path) into all cameras.

        Returns (img_points, depth, mask): normalized [0,1] image coords
        (N, n, 2), per-point depth (N, n), and a validity mask keeping points
        with 1.0 < depth < eval_depth that land inside the image.
        """
        # lidar2img: N, 4, 4
        scan = np.fromfile(os.path.join(self.data_path, lidar_path), dtype=np.float32)
        scan = scan.reshape((-1, 5))[:, :4]   # x, y, z, (intensity -> overwritten)
        scan[:, 3] = 1.0                      # homogeneous coordinates
        # points_hcoords = scan[scan[:, 0] > 0, :]
        points_hcoords = np.expand_dims(scan.T, 0) # 1, 4, n
        img_points = np.transpose(lidar2img @ points_hcoords, (0, 2, 1)) # N, n, 4
        depth = img_points[..., 2] # N, n
        img_points = img_points[..., :2] # N, n, 2
        # mask = (depth < self.eval_depth) & (depth > 1e-3) # get points with depth < max_sample_depth
        # NOTE: stricter 1.0 m near cutoff here, unlike the eval dataset's 1e-3
        mask = (depth < self.eval_depth) & (depth > 1.0) # get points with depth < max_sample_depth
        img_points = img_points / np.expand_dims(depth, axis=2) # scale 2D points
        # normalize to [0, 1]; image_size is (H, W)
        img_points[..., 0] = img_points[..., 0] / image_size[1]
        img_points[..., 1] = img_points[..., 1] / image_size[0]
        # img_points = np.round(img_points).astype(int)
        mask = mask & (img_points[..., 0] > 0) & \
            (img_points[..., 1] > 0) & \
            (img_points[..., 0] < 1) & \
            (img_points[..., 1] < 1)
        return img_points, depth, mask
def composite_dict(self, anchor_info):
    """Draw one candidate frame per sensor for each temporal direction.

    For 'prev_' and 'next_' candidates of the anchor, each sensor samples a
    frame with probability proportional to a Gaussian over the frame's
    distance to the anchor (mean self.sensor_mus[sensor], std
    self.sensor_sigma), and the chosen per-sensor entries are merged into a
    composite sample.

    Returns:
        ({'data': prev_composite}, {'data': next_composite})
    """
    composites = []
    for prefix in ['prev_', 'next_']:
        dists = np.asarray(anchor_info[prefix + 'dists'])
        candidates = anchor_info[prefix + 'samples']
        merged = {}
        for sensor_type in self.sensor_types:
            mu = self.sensor_mus[sensor_type]
            sigma = self.sensor_sigma
            # Gaussian pdf over candidate distances, renormalized to sum to 1
            weights = 1 / math.sqrt(2 * math.pi) / sigma * np.exp(-1 / (2 * sigma * sigma) * ((dists - mu) ** 2))
            weights = weights / np.sum(weights)
            pick = np.random.choice(len(dists), p=weights)
            scene_token, sample_idx = candidates[pick]
            merged[sensor_type] = self.scene_infos[scene_token][sample_idx]['data'][sensor_type]
        composites.append(merged)
    return {'data': composites[0]}, {'data': composites[1]}
def __getitem__(self, index):
    """Return ([input_imgs, curr_imgs, prev_imgs, next_imgs], img_metas).

    Randomly picks which frame ('curr'/'prev'/'next' relative to the
    keyframe) supplies the photometric supervision, then gathers that
    anchor's own prev/next neighbors and all the relative projection
    matrices needed for reprojection losses.
    """
    #### 1. get color, temporal_depth choice if necessary
    # NOTE(review): two independent draws, so P('prev') is effectively
    # (1 - cur_prob) * prev_prob — confirm that is the intended weighting.
    # cur_prob / prev_prob are set in __init__ (outside this view).
    if random.random() < self.cur_prob:
        temporal_supervision = 'curr'
    elif random.random() < self.prev_prob:
        temporal_supervision = 'prev'
    else:
        temporal_supervision = 'next'
    #### 2. get self, prev, next infos for the stem, and also temp_depth info
    # Loop until a sample with usable temporal neighbors is found; on
    # failure a new random keyframe index is drawn and the loop retries.
    while True:
        scene_token, index = self.keyframes[index]
        info = deepcopy(self.scene_infos[scene_token][index])
        if temporal_supervision == 'curr':
            anchor_info = deepcopy(info)
        elif temporal_supervision == 'prev':
            if len(info['prev_samples']) == 0:
                index = np.random.randint(len(self))
                continue
            anchor_scene_token, anchor_info_id = info['prev_samples'][np.random.randint(len(info['prev_samples']))]
            # anchor_scene_token, anchor_info_id = np.random.choice(info['prev_samples'])
            assert anchor_scene_token == scene_token and anchor_info_id <= index
            anchor_info = deepcopy(self.scene_infos[scene_token][anchor_info_id])
        else:
            if len(info['next_samples']) == 0:
                index = np.random.randint(len(self))
                continue
            anchor_scene_token, anchor_info_id = info['next_samples'][np.random.randint(len(info['next_samples']))]
            # anchor_scene_token, anchor_info_id = np.random.choice(info['next_samples'])
            assert anchor_scene_token == scene_token and anchor_info_id >= index
            anchor_info = deepcopy(self.scene_infos[scene_token][anchor_info_id])
        # The anchor itself must have neighbors on both sides to supervise.
        if len(anchor_info['prev_samples']) == 0 or \
                len(anchor_info['next_samples']) == 0:
            index = np.random.randint(len(self))
            continue
        if self.composite_prev_next:
            # per-sensor Gaussian-weighted mixing of neighbor frames
            anchor_prev, anchor_next = self.composite_dict(anchor_info)
        else:
            if self.choose_nearest:
                # candidates were appended nearest-first during preparation
                anchor_prev_scene_token, anchor_prev_idx = anchor_info['prev_samples'][0]
                anchor_next_scene_token, anchor_next_idx = anchor_info['next_samples'][0]
            else:
                anchor_prev_scene_token, anchor_prev_idx = anchor_info['prev_samples'][np.random.randint(len(anchor_info['prev_samples']))]
                anchor_next_scene_token, anchor_next_idx = anchor_info['next_samples'][np.random.randint(len(anchor_info['next_samples']))]
            # anchor_prev_scene_token, anchor_prev_idx = np.random.choice(anchor_info['prev_samples'])
            # anchor_next_scene_token, anchor_next_idx = np.random.choice(anchor_info['next_samples'])
            assert anchor_prev_scene_token == scene_token and \
                anchor_next_scene_token == scene_token
            anchor_prev = deepcopy(self.scene_infos[scene_token][anchor_prev_idx])
            anchor_next = deepcopy(self.scene_infos[scene_token][anchor_next_idx])
        break
    #### 3. prepare img_metas
    imgs_info = self.get_data_info(info)
    anchor_dict = self.get_data_info_anchor(info, anchor_info)
    prev_dict = self.get_data_info_temporal(anchor_info, anchor_prev)
    next_dict = self.get_data_info_temporal(anchor_info, anchor_next)
    img_metas = {
        'input_imgs_path': imgs_info['img_filename'],
        'curr_imgs_path': anchor_dict['image_paths'],
        'prev_imgs_path': prev_dict['image_paths'],
        'next_imgs_path': next_dict['image_paths'],
        'lidar2img': imgs_info['lidar2img'],
        'img2lidar': imgs_info['img2lidar'],
        'intrinsic': imgs_info['cam_intrinsic'],
        'cam2ego': imgs_info['cam2ego'],
        'temImg2lidar': anchor_dict['temImg2lidar'],
        'ego2lidar': imgs_info['ego2lidar'],
        'token': info['token'],
        'timestamp': info['timestamp'],
        'img2prevImg': prev_dict['img2temImg'],
        'img2nextImg': next_dict['img2temImg'],}
    if self.return_depth:
        # sparse lidar depth supervision at self.crop_size resolution
        depth_loc, depth_gt, depth_mask = self.get_depth_from_lidar(
            info['data']['LIDAR_TOP']['filename'], img_metas['lidar2img'], self.crop_size)
        img_metas.update({
            'depth_loc': depth_loc,
            'depth_gt': depth_gt,
            'depth_mask': depth_mask})
    if self.ego_centric:
        # re-express all projection matrices in the ego frame; ego2lidar
        # then becomes the identity by construction
        ego2lidar = img_metas['ego2lidar']
        lidar2ego = np.linalg.inv(ego2lidar)
        ego2img = img_metas['lidar2img'] @ ego2lidar[None, ...]
        img2ego = lidar2ego[None, ...] @ img_metas['img2lidar']
        temImg2ego = lidar2ego[None, ...] @ img_metas['temImg2lidar']
        img_metas.update({
            'lidar2img': ego2img,
            'img2lidar': img2ego,
            'temImg2lidar': temImg2ego,
            'ego2lidar': np.eye(4)})
    #### 4. read imgs
    input_imgs = self.read_surround_imgs(img_metas['input_imgs_path'], self.input_img_crop_size)
    curr_imgs = self.read_surround_imgs(img_metas['curr_imgs_path'], self.crop_size)
    prev_imgs = self.read_surround_imgs(img_metas['prev_imgs_path'], self.crop_size)
    next_imgs = self.read_surround_imgs(img_metas['next_imgs_path'], self.crop_size)
    data_tuple = (
        [input_imgs, curr_imgs, prev_imgs, next_imgs], img_metas)
    return data_tuple
def read_surround_imgs(self, img_paths, crop_size):
    """Load the surround-view images and crop each to crop_size (H, W).

    Uses the hfai bulk loader when available, otherwise reads each file
    with mmcv's imread ('unchanged' flag) as float32.
    """
    if hfai:
        loaded = self.img_loader.load(img_paths)
    else:
        loaded = [imread(path, 'unchanged').astype(np.float32)
                  for path in img_paths]
    return [im[:crop_size[0], :crop_size[1], :] for im in loaded]
def get_data_info_temporal(self, info, info_tem):
    """Relate each camera of `info` to the same camera in frame `info_tem`.

    Returns:
        dict with 'image_paths' (temporal frame image files) and
        'img2temImg' (N, 4, 4): current-image -> temporal-image transforms
        composed through the global frame.
    """
    image_paths = []
    rel_transforms = []
    for cam_type in self.sensor_types:
        tem_cam = info_tem['data'][cam_type]
        cur_cam = info['data'][cam_type]
        image_paths.append(os.path.join(self.data_path, tem_cam['filename']))
        tem2global = get_img2global(tem_cam['calib'], tem_cam['pose'])
        cur2global = get_img2global(cur_cam['calib'], cur_cam['pose'])
        rel_transforms.append(np.linalg.inv(tem2global) @ cur2global)
    return dict(
        image_paths=image_paths,
        img2temImg=np.asarray(rel_transforms))
def get_data_info_anchor(self, info, info_tem):
    """Map each anchor-frame camera into the keyframe's lidar frame.

    Returns:
        dict with 'image_paths' (anchor image files) and 'temImg2lidar'
        (N, 4, 4): anchor-image -> keyframe-lidar transforms.
    """
    lidar2global = get_lidar2global(info['data']['LIDAR_TOP']['calib'], info['data']['LIDAR_TOP']['pose'])
    image_paths = []
    transforms = []
    for cam_type in self.sensor_types:
        tem_cam = info_tem['data'][cam_type]
        image_paths.append(os.path.join(self.data_path, tem_cam['filename']))
        tem2global = get_img2global(tem_cam['calib'], tem_cam['pose'])
        transforms.append(np.linalg.inv(lidar2global) @ tem2global)
    return dict(
        image_paths=image_paths,
        temImg2lidar=np.asarray(transforms))
def get_data_info(self, info):
    """Collect per-camera projection data for one keyframe.

    Returns:
        dict with image file paths, lidar2img / img2lidar (N, 4, 4),
        4x4-padded intrinsics, cam2ego extrinsics, and ego2lidar.
    """
    lidar_calib = info['data']['LIDAR_TOP']['calib']
    lidar2ego = np.eye(4)
    lidar2ego[:3, :3] = Quaternion(lidar_calib['rotation']).rotation_matrix
    lidar2ego[:3, 3] = np.array(lidar_calib['translation']).T
    ego2lidar = np.linalg.inv(lidar2ego)
    lidar2global = get_lidar2global(lidar_calib, info['data']['LIDAR_TOP']['pose'])

    image_paths = []
    lidar2imgs, img2lidars = [], []
    intrinsics, cam2egos = [], []
    for cam_type in self.sensor_types:
        cam_data = info['data'][cam_type]
        image_paths.append(os.path.join(self.data_path, cam_data['filename']))
        # compose projections through the global frame so per-camera ego
        # poses (captured at slightly different timestamps) are respected
        img2global = get_img2global(cam_data['calib'], cam_data['pose'])
        lidar2imgs.append(np.linalg.inv(img2global) @ lidar2global)
        img2lidars.append(np.linalg.inv(lidar2global) @ img2global)
        cam2ego = np.eye(4)
        cam2ego[:3, :3] = Quaternion(cam_data['calib']['rotation']).rotation_matrix
        cam2ego[:3, 3] = np.array(cam_data['calib']['translation']).T
        cam2egos.append(cam2ego)
        viewpad = np.eye(4)  # intrinsics padded to homogeneous 4x4
        viewpad[:3, :3] = cam_data['calib']['camera_intrinsic']
        intrinsics.append(viewpad)
    return dict(
        img_filename=image_paths,
        lidar2img=np.asarray(lidar2imgs),
        img2lidar=np.asarray(img2lidars),
        cam_intrinsic=np.asarray(intrinsics),
        ego2lidar=ego2lidar,
        cam2ego=np.asarray(cam2egos))
# def get_data_info(self, info):
# image_paths = []
# lidar2img_rts = []
# img2lidar_rts = []
# cam_intrinsics = []
# cam2ego_rts = []
# lidar2ego_r = Quaternion(info['data']['LIDAR_TOP']['calib']['rotation']).rotation_matrix
# lidar2ego = np.eye(4)
# lidar2ego[:3, :3] = lidar2ego_r
# lidar2ego[:3, 3] = np.array(info['data']['LIDAR_TOP']['calib']['translation']).T
# ego2lidar = np.linalg.inv(lidar2ego)
# for cam_type in self.sensor_types:
# image_paths.append(os.path.join(self.data_path, info['data'][cam_type]['filename']))
# cam2ego_r = Quaternion(info['data'][cam_type]['calib']['rotation']).rotation_matrix
# cam2ego = np.eye(4)
# cam2ego[:3, :3] = cam2ego_r
# cam2ego[:3, 3] = np.array(info['data'][cam_type]['calib']['translation']).T
# ego2cam = np.linalg.inv(cam2ego)
# lidar2cam = ego2cam @ lidar2ego
# intrinsic = info['data'][cam_type]['calib']['camera_intrinsic']
# viewpad = np.eye(4)
# viewpad[:3, :3] = intrinsic
# lidar2img = viewpad @ lidar2cam
# img2lidar = np.linalg.inv(lidar2img)
# lidar2img_rts.append(lidar2img)
# img2lidar_rts.append(img2lidar)
# cam_intrinsics.append(viewpad)
# cam2ego_rts.append(cam2ego)
# input_dict =dict(
# img_filename=image_paths,
# lidar2img=np.asarray(lidar2img_rts),
# img2lidar=np.asarray(img2lidar_rts),
# cam_intrinsic=np.asarray(cam_intrinsics),
# ego2lidar=ego2lidar,
# cam2ego=np.asarray(cam2ego_rts))
# return input_dict
if __name__ == '__main__':
    # Smoke test / visualization script: loads one sample, reprojects the
    # lidar points into the current/prev/next camera images and saves
    # scatter overlays to verify the relative transforms.
    import torch
    dataset = nuScenes_One_Frame_Sweeps(
        'data/nuscenes',
        'data/nuscenes_infos_val_temporal.pkl',
    )
    batch = dataset[300]
    # NOTE(review): __getitem__ above returns a 2-tuple (imgs, img_metas)
    # and a 4-element imgs list — this 3-way unpack (with `points`) looks
    # stale relative to the current dataset; confirm before running.
    imgs, img_metas, points = batch
    curr_imgs, prev_imgs, next_imgs = imgs
    def list2tensor(imgs):
        # list of arrays -> single torch tensor
        imgs = np.asarray(imgs)
        imgs = torch.from_numpy(imgs)
        return imgs
    curr_imgs = list2tensor(curr_imgs)
    prev_imgs = list2tensor(prev_imgs)
    next_imgs = list2tensor(next_imgs)
    print('img metas contains keys: ', list(img_metas.keys()))
    l2i = list2tensor(img_metas['lidar2img'])
    img2prevImg = list2tensor(img_metas['img2prevImg'])
    img2nextImg = list2tensor(img_metas['img2nextImg'])
    def get_diagonal(trans):
        # keep only same-camera pairs from a (36, 4, 4) cross-camera matrix
        # stack, yielding 6 per-camera 4x4 transforms
        trans = trans.reshape(6, 6, 4, 4)
        trans = torch.diagonal(trans, 0, 0, 1)
        trans = trans.permute(2, 0, 1)
        return trans
    img2prevImg = get_diagonal(img2prevImg)
    img2nextImg = get_diagonal(img2nextImg)
    def filter_invalid_pts(pts):
        # drop points within 2 m of the sensor origin
        mask = torch.norm(pts, dim=-1)
        mask = mask > 2
        pts = pts[mask]
        return pts
    points = torch.from_numpy(points)
    points = filter_invalid_pts(points)
    # homogeneous coordinates for 4x4 matrix multiplication
    points = torch.cat([
        points,
        torch.ones_like(points[:, :1])
    ], dim=1)
    curr_pts_uv = torch.matmul(
        l2i.unsqueeze(1).to(points.dtype),
        points.reshape(1, -1, 4, 1))
    prev_pts_uv = torch.matmul(
        img2prevImg.unsqueeze(1).to(points.dtype),
        curr_pts_uv)
    next_pts_uv = torch.matmul(
        img2nextImg.unsqueeze(1).to(points.dtype),
        curr_pts_uv)
    curr_pts_uv = curr_pts_uv.squeeze(-1)
    prev_pts_uv = prev_pts_uv.squeeze(-1)
    next_pts_uv = next_pts_uv.squeeze(-1)
    def get_pixel(uv, eps=1e-5):
        # perspective divide with clamped denominator
        # NOTE(review): ones_like(uv[..., :1] * eps) equals
        # ones_like(uv[..., :1]) — eps has no effect, so the depth is
        # clamped at 1 rather than eps; likely meant
        # torch.ones_like(uv[..., :1]) * eps.
        uv = uv[..., :2] / torch.maximum(
            uv[..., 2:3],
            torch.ones_like(uv[..., :1] * eps))
        return uv
    curr_pts_uv = get_pixel(curr_pts_uv)
    prev_pts_uv = get_pixel(prev_pts_uv)
    next_pts_uv = get_pixel(next_pts_uv)
    # keep only points falling into a hand-picked pixel window of camera 0
    mask1 = torch.logical_and(
        curr_pts_uv[0, :, 0] > 600,
        curr_pts_uv[0, :, 0] < 1000)
    mask2 = torch.logical_and(
        curr_pts_uv[0, :, 1] > 400,
        curr_pts_uv[0, :, 1] < 500)
    mask = torch.logical_and(mask1, mask2)
    curr_pts_uv = curr_pts_uv[:, mask, :]
    prev_pts_uv = prev_pts_uv[:, mask, :]
    next_pts_uv = next_pts_uv[:, mask, :]
    def filter_invalid_points(uv):
        # clamp to the nuScenes image extent (1600 x 900)
        uv[..., 0] = torch.clamp(uv[..., 0], 0, 1600)
        uv[..., 1] = torch.clamp(uv[..., 1], 0, 900)
        return uv
    curr_pts_uv = filter_invalid_points(curr_pts_uv)
    prev_pts_uv = filter_invalid_points(prev_pts_uv)
    next_pts_uv = filter_invalid_points(next_pts_uv)
    import matplotlib.pyplot as plt
    def saveimg(i, imgs, uv, name):
        # overlay projected points on camera i and save to disk
        plt.imshow(imgs[i] / 256)
        plt.scatter(uv[i][:, 0], uv[i][:, 1], s=2)
        plt.savefig(f'{name}_{i}.jpg')
        plt.cla()
    for i in range(6):
        saveimg(i, curr_imgs, curr_pts_uv, 'curr')
        saveimg(i, prev_imgs, prev_pts_uv, 'prev')
        saveimg(i, next_imgs, next_pts_uv, 'next')
    pass
================================================
FILE: dataset/dataset_one_frame_sweeps_dist_vis.py
================================================
import os, numpy as np, random, mmengine, math
from mmcv.image.io import imread
from pyquaternion import Quaternion
from copy import deepcopy
from . import OPENOCC_DATASET
def get_xyz(pose_dict):
    """Extract the translation vector of a nuScenes pose record as ndarray."""
    translation = pose_dict['translation']
    return np.array(translation)
def get_img2global(calib_dict, pose_dict):
    """Compose the 4x4 image-plane -> global transform.

    Chains inverse intrinsics (image -> camera), the camera -> ego
    extrinsics from `calib_dict`, and the ego -> global pose from
    `pose_dict`.
    """
    def _rigid(record):
        # quaternion + translation record -> homogeneous 4x4
        mat = np.eye(4)
        mat[:3, :3] = Quaternion(record['rotation']).rotation_matrix
        mat[:3, 3] = np.asarray(record['translation']).T
        return mat

    cam2img = np.eye(4)
    cam2img[:3, :3] = np.asarray(calib_dict['camera_intrinsic'])
    return _rigid(pose_dict) @ _rigid(calib_dict) @ np.linalg.inv(cam2img)
def get_lidar2global(calib_dict, pose_dict):
    """Compose the 4x4 lidar -> global transform (lidar -> ego -> global)."""
    def _rigid(record):
        # quaternion + translation record -> homogeneous 4x4
        mat = np.eye(4)
        mat[:3, :3] = Quaternion(record['rotation']).rotation_matrix
        mat[:3, 3] = np.asarray(record['translation']).T
        return mat

    return _rigid(pose_dict) @ _rigid(calib_dict)
@OPENOCC_DATASET.register_module()
class nuScenes_One_Frame_Sweeps_Dist_Vis:
    """Single-scene nuScenes dataset for visualization.

    Each item is the six surround-view images of one frame of the selected
    scene plus the camera/lidar projection metadata needed to render them.
    """
    def __init__(
        self,
        data_path,
        imageset='train',
        crop_size=[768, 1600],
        return_depth=False,
        eval_depth=80,
        ego_centric=False,
        scene_token=None,
        **kwargs):
        # scene_token selects the single scene to visualize; it must be a
        # key of the 'infos' dict inside the pickle loaded from `imageset`.
        assert scene_token is not None
        data = mmengine.load(imageset)
        self.scene_infos = data['infos']
        assert scene_token in self.scene_infos
        self.infos = self.scene_infos[scene_token]
        self.data_path = data_path
        self.crop_size = crop_size          # (H, W) crop applied after loading
        self.return_depth = return_depth
        self.eval_depth = eval_depth
        self.sensor_types = ['CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_FRONT_LEFT',
                             'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_BACK_RIGHT']
        self.ego_centric = ego_centric
        # NOTE(review): img_loader/pts_loader and lidar2cam_rect are never
        # used in this class; presumably kept for interface parity with the
        # training datasets — confirm before removing.
        self.img_loader = self.pts_loader = None
        self.lidar2cam_rect = np.eye(4)
    def __len__(self):
        'Denotes the total number of samples'
        return len(self.infos)
    def __getitem__(self, index):
        """Return (input_imgs, img_metas) for frame `index` of the scene."""
        info = deepcopy(self.infos[index])
        imgs_info = self.get_data_info(info)
        img_metas = {
            'input_imgs_path': imgs_info['img_filename'],
            'lidar2img': imgs_info['lidar2img'],
            'img2lidar': imgs_info['img2lidar'],
            'intrinsic': imgs_info['cam_intrinsic'],
            'cam2ego': imgs_info['cam2ego'],
            'ego2lidar': imgs_info['ego2lidar'],
            # 'token': info['token'],
            'timestamp': info['timestamp'],
            'cam_positions': imgs_info['cam_positions'],
            'focal_positions': imgs_info['focal_positions']
        }
        if self.ego_centric:
            # re-express the projection matrices in the ego frame instead
            # of the lidar frame
            ego2lidar = img_metas['ego2lidar']
            lidar2ego = np.linalg.inv(ego2lidar)
            ego2img = img_metas['lidar2img'] @ ego2lidar[None, ...]
            img2ego = lidar2ego[None, ...] @ img_metas['img2lidar']
            img_metas.update({
                'lidar2img': ego2img,
                'img2lidar': img2ego})
        #### 4. read imgs
        input_imgs = self.read_surround_imgs(img_metas['input_imgs_path'])
        data_tuple = (input_imgs, img_metas)
        return data_tuple
    def read_surround_imgs(self, img_paths):
        """Load each surround image as float32 and crop to self.crop_size."""
        imgs = []
        for filename in img_paths:
            imgs.append(
                imread(filename, 'unchanged').astype(np.float32))
        imgs = [img[:self.crop_size[0], :self.crop_size[1], :] for img in imgs]
        return imgs
    def get_data_info(self, info):
        """Collect per-camera projection data plus camera/focal positions
        (in the lidar frame) used for view rendering."""
        # offset along the optical axis used to place a second point just in
        # front of each camera center — presumably a focal-length proxy for
        # the visualizer; confirm units against the renderer.
        f = 0.0055
        image_paths = []
        lidar2img_rts = []
        img2lidar_rts = []
        cam_intrinsics = []
        cam2ego_rts = []
        cam_positions = []
        focal_positions = []
        lidar2ego_r = Quaternion(info['data']['LIDAR_TOP']['calib']['rotation']).rotation_matrix
        lidar2ego = np.eye(4)
        lidar2ego[:3, :3] = lidar2ego_r
        lidar2ego[:3, 3] = np.array(info['data']['LIDAR_TOP']['calib']['translation']).T
        ego2lidar = np.linalg.inv(lidar2ego)
        lidar2global = get_lidar2global(info['data']['LIDAR_TOP']['calib'], info['data']['LIDAR_TOP']['pose'])
        for cam_type in self.sensor_types:
            image_paths.append(os.path.join(self.data_path, info['data'][cam_type]['filename']))
            # compose projections through the global frame so per-camera ego
            # poses are respected
            img2global = get_img2global(info['data'][cam_type]['calib'], info['data'][cam_type]['pose'])
            lidar2img = np.linalg.inv(img2global) @ lidar2global
            img2lidar = np.linalg.inv(lidar2global) @ img2global
            cam2ego_r = Quaternion(info['data'][cam_type]['calib']['rotation']).rotation_matrix
            cam2ego = np.eye(4)
            cam2ego[:3, :3] = cam2ego_r
            cam2ego[:3, 3] = np.array(info['data'][cam_type]['calib']['translation']).T
            intrinsic = info['data'][cam_type]['calib']['camera_intrinsic']
            viewpad = np.eye(4)  # intrinsics padded to homogeneous 4x4
            viewpad[:3, :3] = intrinsic
            # camera optical center in the lidar frame (viewpad maps the
            # homogeneous origin to itself)
            cam_position = img2lidar @ viewpad @ np.array([0., 0., 0., 1.]).reshape([4, 1])
            cam_positions.append(cam_position.flatten()[:3])
            # a point at depth f in front of the camera, also in lidar frame
            focal_position = img2lidar @ viewpad @ np.array([0., 0., f, 1.]).reshape([4, 1])
            focal_positions.append(focal_position.flatten()[:3])
            lidar2img_rts.append(lidar2img)
            img2lidar_rts.append(img2lidar)
            cam_intrinsics.append(viewpad)
            cam2ego_rts.append(cam2ego)
        input_dict =dict(
            img_filename=image_paths,
            lidar2img=np.asarray(lidar2img_rts),
            img2lidar=np.asarray(img2lidar_rts),
            cam_intrinsic=np.asarray(cam_intrinsics),
            ego2lidar=ego2lidar,
            cam2ego=np.asarray(cam2ego_rts),
            cam_positions=cam_positions,
            focal_positions=focal_positions)
        return input_dict
================================================
FILE: dataset/dataset_wrapper_temporal.py
================================================
import numpy as np, torch
from torch.utils import data
from dataset.transform_3d import PadMultiViewImage, NormalizeMultiviewImage, \
PhotoMetricDistortionMultiViewImage, RandomScaleImageMultiViewImage, \
RandomFlip
import torch.nn.functional as F
from copy import deepcopy
from mmengine import MMLogger
logger = MMLogger.get_instance('selfocc')
from . import OPENOCC_DATAWRAPPER
# img_norm_cfg = dict(
# mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
@OPENOCC_DATAWRAPPER.register_module()
class tpvformer_dataset_nuscenes_temporal(data.Dataset):
    """Augmentation/collation wrapper around a temporal nuScenes dataset.

    Wraps ``in_dataset``, whose items are either ``(imgs, img_metas)`` with
    ``imgs`` of length 3/4/5, or ``(input_imgs, anchor_imgs, img_metas)``.
    The network-input images go through photometric distortion (train only),
    normalization, optional rescaling and padding; the temporal supervision
    images are optionally normalized as well (``use_temporal_aug``).
    """
    def __init__(
        self,
        in_dataset,
        phase='train',
        scale_rate=1,
        photometric_aug=dict(
            use_swap_channel=False,
        ),
        use_temporal_aug=False,
        temporal_aug_list=[],
        img_norm_cfg=img_norm_cfg,
        supervision_img_size=None,
        supervision_scale_rate=None,
        use_flip=False,
        ref_focal_len=None,
        pad_img_size=None,
        random_scale=None,
        pad_scale_rate=None,
    ):
        'Initialization'
        self.point_cloud_dataset = in_dataset
        self.scale_rate = scale_rate
        self.use_temporal_aug = use_temporal_aug
        if use_temporal_aug:
            assert len(temporal_aug_list) > 0
        # BUGFIX: temporal_aug_list and supervision_img_size are read
        # unconditionally in deal_with_length2_dataset but were previously
        # only assigned when use_temporal_aug was True, raising
        # AttributeError in the default configuration. Define them always.
        self.temporal_aug_list = temporal_aug_list if use_temporal_aug else []
        self.supervision_img_size = supervision_img_size
        photometric = PhotoMetricDistortionMultiViewImage(**photometric_aug)
        logger.info('using photometric augmentation: ' + str(photometric_aug))
        train_transforms = [
            photometric,
            NormalizeMultiviewImage(**img_norm_cfg),
            PadMultiViewImage(size_divisor=32, size=pad_img_size)
        ]
        val_transforms = [
            NormalizeMultiviewImage(**img_norm_cfg),
            PadMultiViewImage(size_divisor=32, size=pad_img_size)
        ]
        if scale_rate != 1 or ref_focal_len is not None or random_scale is not None or pad_scale_rate is not None:
            # insert before padding so the pad operates on the rescaled images
            train_transforms.insert(2, RandomScaleImageMultiViewImage([scale_rate], ref_focal_len, random_scale, pad_scale_rate))
            val_transforms.insert(1, RandomScaleImageMultiViewImage([scale_rate], ref_focal_len, pad_scale_rate=pad_scale_rate))
        if use_flip:
            train_transforms.append(RandomFlip(0.5))
        self.transforms = train_transforms if phase == 'train' else val_transforms
        if use_temporal_aug:
            self.temporal_transforms = [
                NormalizeMultiviewImage(**img_norm_cfg),
                PadMultiViewImage(size_divisor=4)]
            # BUGFIX: guard against the default supervision_scale_rate=None;
            # the bare `!= 1` check would insert a scaler with scale None.
            if supervision_scale_rate is not None and supervision_scale_rate != 1:
                self.temporal_transforms.insert(1, RandomScaleImageMultiViewImage([supervision_scale_rate]))

    def __len__(self):
        """Number of samples in the wrapped dataset."""
        return len(self.point_cloud_dataset)

    def to_tensor(self, imgs):
        """Stack a list of HWC float images into an NCHW float32 tensor."""
        imgs = np.stack(imgs).astype(np.float32)
        imgs = torch.from_numpy(imgs)
        imgs = imgs.permute(0, 3, 1, 2)
        return imgs

    def __getitem__(self, index):
        """Fetch one sample and dispatch on its tuple arity."""
        data = self.point_cloud_dataset[index]
        if len(data) == 2:
            return self.deal_with_length2_dataset(data)
        elif len(data) == 3:
            return self.deal_with_length3_dataset(data)

    def deal_with_length3_dataset(self, data):
        """(input_imgs, anchor_imgs, img_metas) -> augmented tensors.

        Returns (input_imgs NCHW, anchor_imgs in [0, 1], img_metas).
        """
        input_imgs, anchor_imgs, img_metas = data
        img_metas['img_shape'] = input_imgs[0].shape[:2]
        # deal with img augmentations
        input_imgs, imgs_dict = forward_aug(input_imgs, img_metas, self.transforms)
        input_imgs = self.to_tensor(input_imgs)
        img_metas['scale_rate'] = self.scale_rate
        # propagate per-camera focal rescale factors if the scaler set them
        if 'focal_ratios_x' in imgs_dict:
            img_metas['focal_ratios_x'] = imgs_dict['focal_ratios_x']
        if 'focal_ratios_y' in imgs_dict:
            img_metas['focal_ratios_y'] = imgs_dict['focal_ratios_y']
        img_metas['flip'] = imgs_dict.get('flip', False)
        anchor_imgs = np.asarray(anchor_imgs).astype(np.float32)
        anchor_imgs = torch.from_numpy(anchor_imgs)
        anchor_imgs = anchor_imgs.permute(0, 1, 4, 2, 3)
        data_tuple = (input_imgs,
                      anchor_imgs / 256.,
                      img_metas)
        return data_tuple

    def deal_with_length2_dataset(self, data):
        """(imgs, img_metas) with imgs of length 3/4/5 -> augmented tensors.

        Returns (input_imgs, curr, prev, next, color in [0, 1], img_metas,
        curr_aug, prev_aug, next_aug); the *_aug entries are None unless the
        corresponding key is in self.temporal_aug_list.
        """
        imgs, img_metas = data
        if len(imgs) == 4:
            input_imgs, curr_imgs, prev_imgs, next_imgs = imgs
            color_imgs = deepcopy(curr_imgs)
        elif len(imgs) == 3:
            curr_imgs, prev_imgs, next_imgs = imgs
            input_imgs = deepcopy(curr_imgs)
            color_imgs = deepcopy(curr_imgs)
        elif len(imgs) == 5:
            input_imgs, curr_imgs, prev_imgs, next_imgs, color_imgs = imgs
        else:
            # previously fell through to an UnboundLocalError
            raise ValueError(f'expected 3, 4 or 5 image groups, got {len(imgs)}')
        img_metas['img_shape'] = input_imgs[0].shape[:2]
        # deal with img augmentations
        input_imgs, imgs_dict = forward_aug(input_imgs, img_metas, self.transforms)
        input_imgs = self.to_tensor(input_imgs)
        curr_aug = prev_aug = next_aug = None
        if 'curr_imgs' in self.temporal_aug_list:
            curr_aug, _ = forward_aug(curr_imgs, {}, self.temporal_transforms)
            curr_aug = self.to_tensor(curr_aug)
        if 'prev_imgs' in self.temporal_aug_list:
            prev_aug, _ = forward_aug(prev_imgs, {}, self.temporal_transforms)
            prev_aug = self.to_tensor(prev_aug)
        if 'next_imgs' in self.temporal_aug_list:
            next_aug, _ = forward_aug(next_imgs, {}, self.temporal_transforms)
            next_aug = self.to_tensor(next_aug)
        curr_imgs = self.to_tensor(curr_imgs)
        prev_imgs = self.to_tensor(prev_imgs)
        next_imgs = self.to_tensor(next_imgs)
        color_imgs = self.to_tensor(color_imgs)
        if self.supervision_img_size is not None:
            curr_imgs = F.interpolate(curr_imgs, size=self.supervision_img_size, mode='bilinear', align_corners=True)
            prev_imgs = F.interpolate(prev_imgs, size=self.supervision_img_size, mode='bilinear', align_corners=True)
            next_imgs = F.interpolate(next_imgs, size=self.supervision_img_size, mode='bilinear', align_corners=True)
        img_metas['scale_rate'] = self.scale_rate
        if 'focal_ratios_x' in imgs_dict:
            img_metas['focal_ratios_x'] = imgs_dict['focal_ratios_x']
        if 'focal_ratios_y' in imgs_dict:
            img_metas['focal_ratios_y'] = imgs_dict['focal_ratios_y']
        img_metas['flip'] = imgs_dict.get('flip', False)
        data_tuple = (input_imgs,
                      curr_imgs / 256.,
                      prev_imgs / 256.,
                      next_imgs / 256.,
                      color_imgs / 256.,
                      img_metas,
                      curr_aug,
                      prev_aug,
                      next_aug)
        return data_tuple
def custom_collate_fn_temporal(data):
    """Collate a batch of sample tuples, column by column.

    Tensors are stacked along a new batch dimension; dicts and strings are
    gathered into lists; None columns stay None; anything else raises.
    """
    batched = []
    for i, first in enumerate(data[0]):
        if isinstance(first, torch.Tensor):
            batched.append(torch.stack([sample[i] for sample in data]))
        elif isinstance(first, (dict, str)):
            batched.append([sample[i] for sample in data])
        elif first is None:
            batched.append(None)
        else:
            raise NotImplementedError
    return batched
def forward_aug(imgs, metas, transforms):
    """Thread imgs and metas through a pipeline of transform callables.

    Each transform receives and returns the running dict with keys 'img'
    and 'metas'; returns (final images, final dict).
    """
    state = {
        'img': imgs,
        'metas': metas,
    }
    for transform in transforms:
        state = transform(state)
    return state['img'], state
================================================
FILE: dataset/dataset_wrapper_vis.py
================================================
import numpy as np, torch
from torch.utils import data
from dataset.transform_3d import PadMultiViewImage, NormalizeMultiviewImage, \
RandomScaleImageMultiViewImage
from mmengine import MMLogger
logger = MMLogger.get_instance('selfocc')
from . import OPENOCC_DATAWRAPPER
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
@OPENOCC_DATAWRAPPER.register_module()
class tpvformer_dataset_nuscenes_vis(data.Dataset):
    """Visualization-time wrapper: normalize, optionally rescale, and pad
    the input multi-view images; no training-time augmentation."""

    def __init__(
        self,
        in_dataset,
        scale_rate=1,
        img_norm_cfg=img_norm_cfg,
        pad_img_size=None,
        pad_scale_rate=None,
        **kwargs
    ):
        'Initialization'
        self.point_cloud_dataset = in_dataset
        self.scale_rate = scale_rate
        transforms = [
            NormalizeMultiviewImage(**img_norm_cfg),
            PadMultiViewImage(size_divisor=32, size=pad_img_size)]
        if scale_rate != 1 or pad_scale_rate is not None:
            # rescale before padding so the pad matches the scaled images
            transforms.insert(1, RandomScaleImageMultiViewImage([scale_rate], pad_scale_rate=pad_scale_rate))
        self.transforms = transforms

    def __len__(self):
        """Number of samples in the wrapped dataset."""
        return len(self.point_cloud_dataset)

    def to_tensor(self, imgs):
        """Stack a list of HWC float images into an NCHW float32 tensor."""
        stacked = np.stack(imgs).astype(np.float32)
        return torch.from_numpy(stacked).permute(0, 3, 1, 2)

    def __getitem__(self, index):
        """Fetch one sample and run it through the transform pipeline."""
        return self.deal_with_length2_dataset(self.point_cloud_dataset[index])

    def deal_with_length2_dataset(self, data):
        """Apply transforms to (input_imgs, img_metas) and tensorize."""
        input_imgs, img_metas = data
        img_metas['img_shape'] = input_imgs[0].shape[:2]
        # deal with img augmentations
        input_imgs, imgs_dict = forward_aug(input_imgs, img_metas, self.transforms)
        input_imgs = self.to_tensor(input_imgs)
        img_metas['scale_rate'] = self.scale_rate
        # propagate per-camera focal rescale factors if the scaler set them
        for key in ('focal_ratios_x', 'focal_ratios_y'):
            if key in imgs_dict:
                img_metas[key] = imgs_dict[key]
        img_metas['flip'] = imgs_dict.get('flip', False)
        return (input_imgs,
                img_metas,)
def custom_collate_fn_temporal(data):
    """Collate a batch of sample tuples, column by column.

    Tensors are stacked along a new batch dimension; dicts and strings are
    gathered into lists; None columns stay None; anything else raises.
    """
    batched = []
    for i, first in enumerate(data[0]):
        if isinstance(first, torch.Tensor):
            batched.append(torch.stack([sample[i] for sample in data]))
        elif isinstance(first, (dict, str)):
            batched.append([sample[i] for sample in data])
        elif first is None:
            batched.append(None)
        else:
            raise NotImplementedError
    return batched
def forward_aug(imgs, metas, transforms):
    """Thread imgs and metas through a pipeline of transform callables.

    Each transform receives and returns the running dict with keys 'img'
    and 'metas'; returns (final images, final dict).
    """
    state = {
        'img': imgs,
        'metas': metas,
    }
    for transform in transforms:
        state = transform(state)
    return state['img'], state
================================================
FILE: dataset/kitti/helpers.py
================================================
import numpy as np
# import scenerf.data.utils.fusion as fusion
# import open3d as o3d
from PIL import Image
# def make_open3d_point_cloud(xyz, color=None):
# pcd = o3d.geometry.PointCloud()
# pcd.points = o3d.utility.Vector3dVector(xyz)
# if color is not None:
# if len(color) != len(xyz):
# color = np.tile(color, (len(xyz), 1))
# pcd.colors = o3d.utility.Vector3dVector(color)
# return pcd
def apply_transform(pts, T):
    """Apply a 4x4 homogeneous transform to an (N, 3) point array.

    Points are lifted to homogeneous coordinates, multiplied by T, and
    returned as an (N, 3) array.
    """
    n_pts = pts.shape[0]
    homo_pts = np.concatenate([pts, np.ones((n_pts, 1))], axis=1)
    moved = (T @ homo_pts.T).T
    return moved[:, :3]
def dump_xyz(P):
    """Return the translation column (x, y, z) of a 4x4 pose matrix."""
    return P[:3, 3]
def read_rgb(path):
    """Load an image as float32 RGB in [0, 1], cropped to 370x1220 (KITTI)."""
    rgb = Image.open(path).convert("RGB")
    arr = np.array(rgb, dtype=np.float32, copy=False) / 255.0
    return arr[:370, :1220, :]  # crop image
def read_poses(path):
    """Parse a KITTI pose file (one row-major 3x4 matrix per line).

    Returns a list of 4x4 homogeneous pose matrices, each padded with the
    [0, 0, 0, 1] bottom row.
    """
    poses = []
    with open(path, 'r') as f:
        for line in f.readlines():
            mat = np.fromstring(line, dtype=float, sep=' ').reshape(3, 4)
            poses.append(np.vstack((mat, [0, 0, 0, 1])))
    return poses
def read_calib(calib_path):
    """Parse a KITTI odometry calibration file.

    Modified from
    https://github.com/utiasSTARS/pykitti/blob/d3e1bb81676e831886726cc5ed79ce1f049aef2c/pykitti/utils.py#L68

    :param calib_path: Path to a calibration text file.
    :return: dict with 'P2' (3x4 left-color projection), 'Tr' (4x4
        velodyne -> camera transform) and 'T_cam0_2_cam2' (4x4 baseline
        shift derived from P2).
    """
    raw = {}
    with open(calib_path, "r") as f:
        for line in f.readlines():
            # a blank line terminates the key: value section
            if line == "\n":
                break
            key, value = line.split(":", 1)
            raw[key] = np.array([float(x) for x in value.split()])
    calib_out = {}
    calib_out["P2"] = raw["P2"].reshape(3, 4)
    tr = np.identity(4)  # 4x4 matrix
    tr[:3, :4] = raw["Tr"].reshape(3, 4)
    calib_out["Tr"] = tr
    # horizontal baseline between cam0 and cam2, recovered from P2
    T2 = np.eye(4)
    T2[0, 3] = calib_out["P2"][0, 3] / calib_out["P2"][0, 0]
    calib_out["T_cam0_2_cam2"] = T2
    return calib_out
# def compute_transformation(
# lidar_path_source, lidar_path_infer, lidar_path_target,
# pose_source, pose_infer, pose_target,
# T_velo_2_cam2, T_cam0_2_cam2):
# pts_velo_source = np.fromfile(lidar_path_source, dtype=np.float32).reshape((-1, 4))[:, :3]
# pts_velo_infer = np.fromfile(lidar_path_infer, dtype=np.float32).reshape((-1, 4))[:, :3]
# pts_velo_target = np.fromfile(lidar_path_target, dtype=np.float32).reshape((-1, 4))[:, :3]
# pts_cam2_source = apply_transform(pts_velo_source, T_velo_2_cam2)
# pts_cam2_infer = apply_transform(pts_velo_infer, T_velo_2_cam2)
# pts_cam2_target = apply_transform(pts_velo_target, T_velo_2_cam2)
# T_cam2_2_cam0 = np.linalg.inv(T_cam0_2_cam2)
# T_source2infer = T_cam0_2_cam2 @ np.linalg.inv(pose_infer) @ pose_source @ T_cam2_2_cam0
# T_source2target = T_cam0_2_cam2 @ np.linalg.inv(pose_target) @ pose_source @ T_cam2_2_cam0
# pts_cam2_source2infer = apply_transform(pts_cam2_source, T_source2infer)
# pts_cam2_source2target = apply_transform(pts_cam2_source, T_source2target)
# pcd_source2infer = make_open3d_point_cloud(pts_cam2_source2infer).voxel_down_sample(voxel_size=0.05)
# pcd_source2target = make_open3d_point_cloud(pts_cam2_source2target).voxel_down_sample(voxel_size=0.05)
# pcd_infer = make_open3d_point_cloud(pts_cam2_infer).voxel_down_sample(voxel_size=0.05)
# pcd_target = make_open3d_point_cloud(pts_cam2_target).voxel_down_sample(voxel_size=0.05)
# reg_source2infer = o3d.pipelines.registration.registration_icp(
# pcd_source2infer, pcd_infer, 0.2, np.eye(4),
# o3d.pipelines.registration.TransformationEstimationPointToPoint(),
# o3d.pipelines.registration.ICPConvergenceCriteria(max_iteration=200))
# reg_source2target = o3d.pipelines.registration.registration_icp(
# pcd_source2target, pcd_target, 0.2, np.eye(4),
# o3d.pipelines.registration.TransformationEstimationPointToPoint(),
# o3d.pipelines.registration.ICPConvergenceCriteria(max_iteration=200))
# return {
# "T_source2infer": T_source2infer @ reg_source2infer.transformation,
# "T_source2target": T_source2target @ reg_source2target.transformation
# }
# def vox2pix(cam_E, cam_K,
# vox_origin, voxel_size,
# img_W, img_H,
# scene_size):
# """
# compute the 2D projection of voxels centroids
# Parameters:
# ----------
# cam_E: 4x4
# =camera pose in case of NYUv2 dataset
# =Transformation from camera to lidar coordinate in case of SemKITTI
# cam_K: 3x3
# camera intrinsics
# vox_origin: (3,)
# world(NYU)/lidar(SemKITTI) cooridnates of the voxel at index (0, 0, 0)
# img_W: int
# image width
# img_H: int
# image height
# scene_size: (3,)
# scene size in meter: (51.2, 51.2, 6.4) for SemKITTI and (4.8, 4.8, 2.88) for NYUv2
# Returns
# -------
# projected_pix: (N, 2)
# Projected 2D positions of voxels
# fov_mask: (N,)
# Voxels mask indice voxels inside image's FOV
# sensor_distance: (N,)
# Voxels'distance to the sensor in meter
# """
# # Compute the x, y, z bounding of the scene in meter
# vol_bnds = np.zeros((3,2))
# vol_bnds[:,0] = vox_origin
# vol_bnds[:,1] = vox_origin + np.array(scene_size)
# # Compute the voxels centroids in lidar cooridnates
# vol_dim = np.ceil((vol_bnds[:,1]- vol_bnds[:,0])/ voxel_size).copy(order='C').astype(int)
# xv, yv, zv = np.meshgrid(
# range(vol_dim[0]),
# range(vol_dim[1]),
# range(vol_dim[2]),
# indexing='ij'
# )
# vox_coords = np.concatenate([
# xv.reshape(1,-1),
# yv.reshape(1,-1),
# zv.reshape(1,-1)
# ], axis=0).astype(int).T
# # Project voxels'centroid from lidar coordinates to camera coordinates
# cam_pts = fusion.TSDFVolume.vox2world(vox_origin, vox_coords, voxel_size)
# cam_pts = fusion.rigid_transform(cam_pts, cam_E)
# # Project camera coordinates to pixel positions
# projected_pix = fusion.TSDFVolume.cam2pix(cam_pts, cam_K)
# pix_x, pix_y = projected_pix[:, 0], projected_pix[:, 1]
# # Eliminate pixels outside view frustum
# sensor_distance = cam_pts[:, 2]
# fov_mask = np.logical_and(pix_x >= 0,
# np.logical_and(pix_x < img_W,
# np.logical_and(pix_y >= 0,
# np.logical_and(pix_y < img_H,
# sensor_distance > 0))))
# return projected_pix, fov_mask, sensor_distance
================================================
FILE: dataset/kitti/io_data.py
================================================
"""
Most of the code in this file is taken from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/data/io_data.py
"""
import numpy as np
import yaml
import imageio
def unpack(compressed):
    """Expand a bit-packed voxel grid into one uint8 value per bit.

    Each input byte yields 8 output values (most-significant bit first),
    so the result has ``8 * len(compressed)`` entries of 0/1.

    :param compressed: 1-D uint8 array of bit-packed voxel occupancy.
    :return: 1-D uint8 array of 0/1 flags.
    """
    # np.unpackbits performs exactly the original shift-and-mask loop
    # (MSB first), but in a single C-level pass.
    return np.unpackbits(compressed)
def img_normalize(img, mean, std):
    """Scale an 8-bit image to [0, 1] and standardize with mean/std."""
    scaled = img.astype(np.float32) / 255.0
    return (scaled - mean) / std
def pack(array):
    """Convert a boolean/binary array into a bit-packed uint8 array.

    Bits are packed MSB-first, the inverse of :func:`unpack`.

    Generalizes the original shift-and-or implementation: inputs whose
    flattened length is not a multiple of 8 are zero-padded instead of
    failing with a broadcasting error.

    :param array: array of 0/1 (any shape); flattened before packing.
    :return: 1-D uint8 array of packed bytes.
    """
    # astype(bool) makes the boolean contract explicit; np.packbits packs
    # MSB-first, matching the original manual shifts.
    return np.packbits(array.reshape(-1).astype(bool))
def get_grid_coords(dims, resolution):
    '''
    :param dims: the dimensions of the grid [x, y, z] (i.e. [256, 256, 32])
    :param resolution: voxel edge length in meters
    :return coords_grid: center coords of voxels in the grid (x/y swapped),
        plus the bin edges g_xx, g_yy, g_zz (one extra entry each so they
        can be fed to np.histogramdd).
    '''
    # The sensor is centered in X (we go to dims/2 + 1 for the histogramdd)
    g_xx = np.arange(-dims[0] / 2, dims[0] / 2 + 1)
    # The sensor is in Y=0 (we go to dims + 1 for the histogramdd)
    g_yy = np.arange(0, dims[1] + 1)
    # The sensor is at Z=1.73; the ground sits two voxel levels above the grid
    # bottom, so with the sensor at (0, 0, 0) the grid spans z in [-10, 22)
    # (we go to 22 + 1 for the histogramdd).
    # ATTENTION: 11 for old grids, 10 for new grids (v1.1)
    # (https://github.com/PRBonn/semantic-kitti-api/issues/49)
    sensor_pose = 10
    g_zz = np.arange(0 - sensor_pose, dims[2] - sensor_pose + 1)
    # Voxel centers: drop the trailing bin edge on each axis.
    xx, yy, zz = np.meshgrid(g_xx[:-1], g_yy[:-1], g_zz[:-1])
    coords_grid = np.array([xx.flatten(), yy.flatten(), zz.flatten()]).T
    # BUGFIX: np.float was removed in NumPy 1.24; the builtin float (float64)
    # is the documented replacement.
    coords_grid = coords_grid.astype(float)
    coords_grid = (coords_grid * resolution) + resolution / 2
    # Swap the x and y columns (advanced indexing copies the RHS first).
    coords_grid[:, [0, 1]] = coords_grid[:, [1, 0]]
    return coords_grid, g_xx, g_yy, g_zz
def _get_remap_lut(config_path):
    '''
    remap_lut to remap classes of semantic kitti for training...

    This function was a byte-identical duplicate of :func:`get_remap_lut`
    below; it is kept as a thin alias for backward compatibility and now
    delegates to avoid maintaining the logic twice.

    :param config_path: path to the semantic-kitti.yaml config file.
    :return: int32 lookup table mapping raw label ids to training ids.
    '''
    return get_remap_lut(config_path)
def get_inv_map(config_path="./semantic-kitti.yaml"):
    '''
    Build the inverse LUT mapping the 20 training classes back to raw
    SemanticKITTI label ids.

    :param config_path: yaml config with a 'learning_map_inv' section.
        The default preserves the previously hard-coded relative path,
        so existing zero-argument callers behave identically.
    :return: int32 array of length 20.
    '''
    dataset_config = yaml.safe_load(open(config_path, 'r'))
    # make lookup table for mapping
    inv_map = np.zeros(20, dtype=np.int32)
    inv_map[list(dataset_config['learning_map_inv'].keys())] = \
        list(dataset_config['learning_map_inv'].values())
    return inv_map
def _read_SemKITTI(path, dtype, do_unpack):
bin = np.fromfile(path, dtype=dtype) # Flattened array
if do_unpack:
bin = unpack(bin)
return bin
def _read_label_SemKITTI(path):
    """Read per-voxel semantic labels (uint16 on disk) as float32."""
    return _read_SemKITTI(path, dtype=np.uint16, do_unpack=False).astype(np.float32)
def _read_invalid_SemKITTI(path):
    """Read the bit-packed invalid-voxel mask as a flat 0/1 uint8 array."""
    return _read_SemKITTI(path, dtype=np.uint8, do_unpack=True)
def _read_occluded_SemKITTI(path):
    """Read the bit-packed occluded-voxel mask as a flat 0/1 uint8 array."""
    return _read_SemKITTI(path, dtype=np.uint8, do_unpack=True)
def _read_occupancy_SemKITTI(path):
    """Read the bit-packed occupancy grid as a flat float32 0/1 array."""
    bits = _read_SemKITTI(path, dtype=np.uint8, do_unpack=True)
    return bits.astype(np.float32)
def _read_rgb_SemKITTI(path):
    """Load an RGB image file as a numpy array."""
    return np.asarray(imageio.imread(path))
def _read_pointcloud_SemKITTI(path):
    """Return a SemanticKITTI point cloud with remissions as an (N, 4)
    float32 array of (x, y, z, intensity)."""
    raw = _read_SemKITTI(path, dtype=np.float32, do_unpack=False)
    return raw.reshape((-1, 4))
def _read_calib_SemKITTI(calib_path):
"""
:param calib_path: Path to a calibration text file.
:return: dict with calibration matrices.
"""
calib_all = {}
with open(calib_path, 'r') as f:
for line in f.readlines():
if line == '\n':
break
key, value = line.split(':', 1)
calib_all[key] = np.array([float(x) for x in value.split()])
# reshape matrices
calib_out = {}
calib_out['P2'] = calib_all['P2'].reshape(3, 4) # 3x4 projection matrix for left camera
calib_out['Tr'] = np.identity(4) # 4x4 matrix
calib_out['Tr'][:3, :4] = calib_all['Tr'].reshape(3, 4)
return calib_out
def get_remap_lut(path):
    '''
    Build a LUT remapping raw SemanticKITTI label ids to the 20 training
    classes: 0 stays 'empty', anything mapping to 0 becomes 255 ('invalid').
    '''
    cfg = yaml.safe_load(open(path, 'r'))
    learning_map = cfg['learning_map']
    # +100 hack making lut bigger just in case there are unknown labels
    lut = np.zeros((max(learning_map.keys()) + 100), dtype=np.int32)
    lut[list(learning_map.keys())] = list(learning_map.values())
    # In completion we have to distinguish empty and invalid voxels.
    # Important: for voxels 0 corresponds to "empty" and not "unlabeled".
    lut[lut == 0] = 255  # map 0 to 'invalid'
    lut[0] = 0           # only 'empty' stays 'empty'
    return lut
def data_augmentation_3Dflips(flip, data):
    """Apply one of three axis flips to a 3D grid (dims -> {XZY}).

    flip ~= 1: flip around X; flip ~= 2: flip around Y; flip ~= 3: both;
    any other value returns the input unchanged.

    The .copy() avoids the negative strides np.flip produces, which
    torch.from_numpy cannot handle (see pytorch discuss thread #3663).
    """
    if np.isclose(flip, 1):
        return np.flip(data, axis=0).copy()
    if np.isclose(flip, 2):
        return np.flip(data, 2).copy()
    if np.isclose(flip, 3):
        return np.flip(np.flip(data, axis=0), axis=2).copy()
    return data
def get_cmap_semanticKITTI20():
    """Return the SemanticKITTI RGBA palette as a (19, 4) uint8 array.

    One row per training class; the black 'unlabeled' entry is omitted,
    matching the original table.
    """
    palette = [
        (100, 150, 245, 255),
        (100, 230, 245, 255),
        (30, 60, 150, 255),
        (80, 30, 180, 255),
        (100, 80, 250, 255),
        (255, 30, 30, 255),
        (255, 40, 200, 255),
        (150, 30, 90, 255),
        (255, 0, 255, 255),
        (255, 150, 255, 255),
        (75, 0, 75, 255),
        (175, 0, 75, 255),
        (255, 200, 0, 255),
        (255, 120, 50, 255),
        (0, 175, 0, 255),
        (135, 60, 0, 255),
        (150, 240, 80, 255),
        (255, 240, 150, 255),
        (255, 0, 0, 255),
    ]
    return np.array(palette, dtype=np.uint8)
================================================
FILE: dataset/kitti/kitti_dataset_eval.py
================================================
import glob
import os
import time
import numpy as np
import torch
# from torchvision import transforms
from dataset.kitti.helpers import dump_xyz, read_calib, read_poses, read_rgb
from dataset.kitti.params import val_error_frames
# from . import io_data as SemanticKittiIO
from copy import deepcopy
from mmcv.image.io import imread
from .. import OPENOCC_DATASET
if 'HFAI' in os.environ:
hfai = True
from dataset.loading import LoadMultiViewImageFromFilesHF, LoadPtsFromFilesHF
else:
hfai = False
@OPENOCC_DATASET.register_module()
class Kitti_Novel_View_Eval:
def __init__(
self,
split,
root,
preprocess_root,
frames_interval=0.4,
sequence_distance=10,
n_sources=1,
eval_depth=80,
sequences=None,
selected_frames=None,
crop_size=[370, 1220],
**kwargs,
):
self.root = root
self.preprocess_root = preprocess_root
self.depth_preprocess_root = os.path.join(preprocess_root, "depth")
self.transform_preprocess_root = os.path.join(preprocess_root, "transform")
self.n_classes = 20
self.n_sources = n_sources
self.eval_depth = eval_depth
self.transxy = [
[0, -1., 0, 0],
[1., 0, 0, 0],
[0, 0, 1., 0],
[0, 0, 0, 1.]]
self.transxy = np.array(self.transxy)
splits = {
"train": ["00", "01", "02", "03", "04", "05", "06", "07", "09", "10"],
"val": ["08"],
"test": ["11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21"],
}
self.split = split
if sequences is not None:
self.sequences = sequences
else:
self.sequences = splits[split]
self.output_scale = 1
self.scene_size = (51.2, 51.2, 6.4)
self.vox_origin = np.array([0, -25.6, -2])
self.frames_interval = frames_interval
self.sequence_distance = sequence_distance
self.voxel_size = 0.2 # 0.2m
self.img_W = crop_size[1]
self.img_H = crop_size[0]
start_time = time.time()
self.scans = []
self.frame2scan = {}
for sequence in self.sequences:
pose_path = os.path.join(self.root, "dataset", "poses", sequence + ".txt")
gt_global_poses = read_poses(pose_path)
calib = read_calib(
os.path.join(self.root, "dataset", "sequences", sequence, "calib.txt")
)
P = calib["P2"]
T_cam0_2_cam2 = calib['T_cam0_2_cam2']
T_cam2_2_cam0 = np.linalg.inv(T_cam0_2_cam2)
T_velo_2_cam = T_cam0_2_cam2 @ calib["Tr"]
if split == "val":
glob_path = os.path.join(
self.root, "dataset", "sequences", sequence, "voxels", "*.bin"
)
else:
glob_path = os.path.join(
self.root, "dataset", "sequences", sequence, "image_2", "*.png"
)
dists_glob_path = os.path.join(
self.root, "dataset", "sequences", sequence, "image_2", "*.png"
)
seq_img_paths = glob.glob(glob_path)
dists_seq_img_paths = glob.glob(dists_glob_path)
max_length = 0
min_length = 50
paired_dists = {}
# dist_step = 1 if split == 'train' else 5
dist_step = 1
for seq_img_path in dists_seq_img_paths:
filename = os.path.basename(seq_img_path)
frame_id = os.path.splitext(filename)[0]
prev_frame_id = "{:06d}".format(int(frame_id) - dist_step)
img_path = os.path.join(
self.root, "dataset", "sequences", sequence, "image_2", prev_frame_id + ".png")
if not os.path.exists(img_path):
paired_dists[frame_id] = 0.
else:
curr_pose = gt_global_poses[int(frame_id)]
prev_pose = gt_global_poses[int(prev_frame_id)]
prev_xyz = dump_xyz(prev_pose)
curr_xyz = dump_xyz(curr_pose)
rel_distance = np.sqrt(
(prev_xyz[0] - curr_xyz[0]) ** 2 + (prev_xyz[2] - curr_xyz[2]) ** 2)
paired_dists[frame_id] = rel_distance
for seq_img_path in seq_img_paths:
filename = os.path.basename(seq_img_path)
frame_id = os.path.splitext(filename)[0]
current_img_path = os.path.join(
self.root, "dataset", "sequences", sequence, "image_2", frame_id + ".png")
current_lid_path = os.path.join(
self.root, "dataset", "sequences", sequence, "velodyne", frame_id + ".bin")
prev_frame_ids, next_frame_ids = [], []
prev_img_paths, next_img_paths = [], []
prev_lid_paths, next_lid_paths = [], []
prev_poses, next_poses = [gt_global_poses[int(frame_id)]], [gt_global_poses[int(frame_id)]]
prev_dists, next_dists = [], []
# if self.split == 'train':
# pos_step = 1
# neg_step = -1
# else:
# pos_step = 5
# neg_step = -5
pos_step = 1
neg_step = -1
# deal with prev
step = -1
cnt = 0
dist = 0.
while True:
# cnt += step
cnt += neg_step
rel_frame_id = "{:06d}".format(int(frame_id) + cnt)
img_path = os.path.join(
self.root, "dataset", "sequences", sequence, "image_2", rel_frame_id + ".png")
if not os.path.exists(img_path):
break
prev_pose = prev_poses[-1]
prev_xyz = dump_xyz(prev_pose)
current_xyz = dump_xyz(gt_global_poses[int(rel_frame_id)])
tmp_dist = np.sqrt(
(prev_xyz[0] - current_xyz[0]) ** 2 + (prev_xyz[2] - current_xyz[2]) ** 2)
# tmp_dist = paired_dists["{:06d}".format(int(rel_frame_id) + pos_step)]
dist += tmp_dist
# if dist < frames_interval:
if tmp_dist < frames_interval:
continue
if dist > sequence_distance:
break
if split == "val" and rel_frame_id in val_error_frames:
continue
prev_frame_ids.append(rel_frame_id)
prev_img_paths.append(img_path)
lidar_path = os.path.join(
self.root, "dataset", "sequences", sequence, "velodyne", rel_frame_id + ".bin")
prev_lid_paths.append(lidar_path)
prev_poses.append(gt_global_poses[int(rel_frame_id)])
prev_dists.append(dist)
# deal with next
step = 1
cnt = 0
dist = 0.
while True:
# cnt += step
cnt += pos_step
rel_frame_id = "{:06d}".format(int(frame_id) + cnt)
img_path = os.path.join(
self.root, "dataset", "sequences", sequence, "image_2", rel_frame_id + ".png")
if not os.path.exists(img_path):
break
prev_pose = next_poses[-1]
prev_xyz = dump_xyz(prev_pose)
current_xyz = dump_xyz(gt_global_poses[int(rel_frame_id)])
tmp_dist = np.sqrt(
(prev_xyz[0] - current_xyz[0]) ** 2 + (prev_xyz[2] - current_xyz[2]) ** 2)
# tmp_dist = paired_dists[rel_frame_id]
dist += tmp_dist
# if dist < frames_interval:
if tmp_dist < frames_interval:
continue
if dist > sequence_distance:
break
if split == "val" and rel_frame_id in val_error_frames:
continue
next_frame_ids.append(rel_frame_id)
next_img_paths.append(img_path)
lidar_path = os.path.join(
self.root, "dataset", "sequences", sequence, "velodyne", rel_frame_id + ".bin")
next_lid_paths.append(lidar_path)
next_poses.append(gt_global_poses[int(rel_frame_id)])
next_dists.append(dist)
# ignore error frame
if split == "val" and frame_id in val_error_frames:
continue
is_included = len(next_frame_ids) > 0 if selected_frames is None else frame_id in selected_frames
if is_included:
# prev_next_len = len(prev_poses) + len(next_poses)
prev_next_len = len(next_frame_ids)
if prev_next_len > max_length:
max_length = prev_next_len
if prev_next_len < min_length:
min_length = prev_next_len
self.frame2scan.update({str(sequence) + "_" + frame_id: len(self.scans)})
self.scans.append(
{
"frame_id": frame_id,
"sequence": sequence,
"img_path": current_img_path,
"lid_path": current_lid_path,
"pose": gt_global_poses[int(frame_id)],
"prev_img_paths": prev_img_paths,
"prev_lid_paths": prev_lid_paths,
"next_img_paths": next_img_paths,
"next_lid_paths": next_lid_paths,
"T_velo_2_cam": T_velo_2_cam,
"P": P,
"T_cam0_2_cam2": T_cam0_2_cam2,
"T_cam2_2_cam0": T_cam2_2_cam0,
"prev_poses": prev_poses[1:],
"next_poses": next_poses[1:],
"prev_dists": prev_dists,
"next_dists": next_dists,
"prev_frame_ids": prev_frame_ids,
"next_frame_ids": next_frame_ids
}
)
print(sequence, min_length, max_length)
# self.to_tensor_normalized = transforms.Compose(
# [
# transforms.ToTensor(),
# transforms.Normalize(
# mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
# ),
# ]
# )
# self.to_tensor = transforms.Compose([
# transforms.ToTensor()
# ])
print("Preprocess time: --- %s seconds ---" % (time.time() - start_time))
def get_depth_from_lidar(self, lidar_path, lidar2img, image_size):
# lidar2img: N, 4, 4
scan = np.fromfile(lidar_path, dtype=np.float32)
scan = scan.reshape((-1, 4))
scan[:, 3] = 1.0
# points_hcoords = scan[scan[:, 0] > 0, :]
points_hcoords = np.expand_dims(self.transxy @ scan.T, 0) # 1, 4, n
img_points = np.transpose(lidar2img @ points_hcoords, (0, 2, 1)) # N, n, 4
depth = img_points[..., 2] # N, n
img_points = img_points[..., :2] # N, n, 2
mask = (depth < self.eval_depth) & (depth > 1e-3) # get points with depth < max_sample_depth
img_points = img_points / np.expand_dims(depth, axis=2) # scale 2D points
img_points[..., 0] = img_points[..., 0] / image_size[1]
img_points[..., 1] = img_points[..., 1] / image_size[0]
# img_points = np.round(img_points).astype(int)
mask = mask & (img_points[..., 0] > 0) & \
(img_points[..., 1] > 0) & \
(img_points[..., 0] < 1) & \
(img_points[..., 1] < 1)
return img_points, depth, mask
    def __len__(self):
        # Dataset length == number of indexed reference scans.
        return len(self.scans)
def prepare_temImg2lidar(self, scan, anchor):
intrinsic = np.eye(4)
intrinsic[:3, :3] = scan['P'][:3, :3]
temImg2lidar = self.transxy @ np.linalg.inv(scan['T_velo_2_cam']) @ \
scan['T_cam0_2_cam2'] @ \
np.linalg.inv(scan['pose']) @ \
scan['next_poses'][anchor] @ \
np.linalg.inv(scan['T_cam0_2_cam2']) @ \
np.linalg.inv(intrinsic)
return [temImg2lidar]
def prepare_img_metas(self, scan):
img_metas = {}
img_metas.update({
'input_imgs_path': [scan['img_path']]})
intrinsic = np.eye(4)
intrinsic[:3, :3] = scan['P'][:3, :3]
lidar2img = intrinsic @ scan['T_velo_2_cam'] @ np.linalg.inv(self.transxy)
img2lidar = np.linalg.inv(lidar2img)
img_metas.update({
'lidar2img': np.expand_dims(lidar2img, axis=0),
'img2lidar': [img2lidar],
'token': scan['frame_id'],
'sequence': scan['sequence']
})
return img_metas
def read_surround_imgs(self, img_paths):
if hfai:
imgs = self.img_loader.load(img_paths)
else:
imgs = []
for filename in img_paths:
imgs.append(
imread(filename, 'unchanged').astype(np.float32))
imgs = [img[:self.img_H, :self.img_W, :] for img in imgs]
return imgs
def __getitem__(self, index):
#### 1. get color, temporal_depth choice if necessary
#### 2. get self, prev, next infos for the stem, and also temp_depth info
scan = deepcopy(self.scans[index])
img_metas = self.prepare_img_metas(scan)
anchor_imgs = [] # list[list[array]]
anchor_depth_locs = [] # list[array]
anchor_depth_gts = [] # list[array]
anchor_depth_masks = [] # list[array]
temImg2lidars = [] # list[list[array]]
anchor_frame_dist = scan['next_dists'] # list[float]
for anchor in range(len(scan['next_frame_ids'])):
anchor_img_path = scan['next_img_paths'][anchor]
anchor_lid_path = scan['next_lid_paths'][anchor]
temImg2lidar = self.prepare_temImg2lidar(scan, anchor)
anchor_img = self.read_surround_imgs([anchor_img_path])
depth_loc, depth_gt, depth_mask = self.get_depth_from_lidar(
anchor_lid_path, img_metas['lidar2img'], [self.img_H, self.img_W])
anchor_imgs.append(anchor_img)
anchor_depth_locs.append(depth_loc)
anchor_depth_gts.append(depth_gt)
anchor_depth_masks.append(depth_mask)
temImg2lidars.append(temImg2lidar)
img_metas.update({
'depth_loc': anchor_depth_locs,
'depth_gt': anchor_depth_gts,
'depth_mask': anchor_depth_masks,
'temImg2lidars': temImg2lidars,
'frame_dists': anchor_frame_dist
})
# read 6 cams
input_imgs = self.read_surround_imgs(img_metas['input_imgs_path'])
data_tuple = (input_imgs, anchor_imgs, img_metas)
return data_tuple
# frame_id = scan['frame_id']
# sequence = scan['sequence']
# lidar_paths = scan['lidar_paths']
# rel_frame_ids = scan['rel_frame_ids']
# distances = scan['distances']
# infer_id = 0
# P = scan["P"]
# T_velo_2_cam = scan["T_velo_2_cam"]
# img_paths = scan["img_paths"]
# img_sources = []
# img_input_sources = []
# img_targets = []
# lidar_depths = []
# depths = []
# loc2d_with_depths = []
# T_source2infers = []
# T_source2targets = []
# source_distances = []
# source_frame_ids = []
# n_sources = min(len(distances) - 1, self.n_sources)
# for d_id in range(n_sources):
# if self.n_sources < len(distances):
# source_id = np.random.randint(1, len(distances))
# source_distance = distances[source_id]
# else:
# source_id = d_id + 1
# source_distance = distances[source_id]
# source_distances.append(source_distance)
# rel_frame_id = rel_frame_ids[source_id]
# source_frame_ids.append(rel_frame_id)
# target_id = source_id - 1
# img_input_source = self.to_tensor_normalized(read_rgb(img_paths[source_id]))
# img_input_sources.append(img_input_source)
# img_source = self.to_tensor(read_rgb(img_paths[source_id]))
# img_target = self.to_tensor(read_rgb(img_paths[target_id]))
# lidar_path = lidar_paths[source_id]
# loc2d_with_depth, lidar_depth, _ = self.get_depth_from_lidar(lidar_path, P, T_velo_2_cam,
# (self.img_W, self.img_H))
# if self.n_rays < lidar_depth.shape[0]:
# idx = np.random.choice(lidar_depth.shape[0], size=self.n_rays, replace=False)
# loc2d_with_depth = loc2d_with_depth[idx, :]
# lidar_depth = lidar_depth[idx]
# img_sources.append(img_source)
# img_targets.append(img_target)
# lidar_depths.append(torch.from_numpy(lidar_depth))
# loc2d_with_depths.append(torch.from_numpy(loc2d_with_depth))
# # Get transformation from source to target coord
# transform_dir = os.path.join(self.transform_preprocess_root,
# "{}_{}_all".format(sequence, self.frames_interval))
# os.makedirs(transform_dir, exist_ok=True)
# transform_path = os.path.join(transform_dir, "{}.pkl".format(frame_id))
# if os.path.exists(transform_path):
# try:
# with open(transform_path, "rb") as input_file:
# transform_data = pickle.load(input_file)
# except EOFError:
# transform_data = {}
# else:
# transform_data = {}
# if '{}'.format(source_id) in transform_data:
# T_out_dict = transform_data['{}'.format(source_id)]
# else:
# poses = scan["poses"]
# pose_source = poses[source_id]
# pose_infer = poses[infer_id]
# pose_target = poses[target_id]
# lidar_path_source = lidar_paths[source_id]
# lidar_path_target = lidar_paths[target_id]
# lidar_path_infer = lidar_paths[infer_id]
# T_out_dict = compute_transformation(
# lidar_path_source, lidar_path_infer, lidar_path_target,
# pose_source, pose_infer, pose_target,
# T_velo_2_cam, scan['T_cam0_2_cam2'])
# transform_data['{}'.format(source_id)] = T_out_dict
# with open(transform_path, "wb") as input_file:
# pickle.dump(transform_data, input_file)
# print("{}: saved {} to {}".format(frame_id, source_id, transform_path))
# T_source2infer = T_out_dict['T_source2infer']
# T_source2target = T_out_dict['T_source2target']
# T_source2infers.append(torch.from_numpy(T_source2infer).float())
# T_source2targets.append(torch.from_numpy(T_source2target).float())
# data = {
# "img_input_sources": img_input_sources,
# "source_distances": source_distances,
# "source_frame_ids": source_frame_ids,
# "img_sources": img_sources,
# "img_targets": img_targets,
# "lidar_depths": lidar_depths,
# "depths": depths,
# "loc2d_with_depths": loc2d_with_depths,
# "T_source2infers": T_source2infers,
# "T_source2targets": T_source2targets,
# "frame_id": frame_id,
# "sequence": sequence,
# "P": P,
# "T_velo_2_cam": T_velo_2_cam,
# "T_cam2_2_cam0": scan['T_cam2_2_cam0'],
# "T_cam0_2_cam2": scan['T_cam0_2_cam2'],
# }
# scale_3ds = [self.output_scale]
# data["scale_3ds"] = scale_3ds
# cam_K = P[0:3, 0:3]
# data["cam_K"] = cam_K
# for scale_3d in scale_3ds:
# # compute the 3D-2D mapping
# projected_pix, fov_mask, sensor_distance = vox2pix(
# T_velo_2_cam,
# cam_K,
# self.vox_origin,
# self.voxel_size * scale_3d,
# self.img_W,
# self.img_H,
# self.scene_size,
# )
# data["projected_pix_{}".format(scale_3d)] = projected_pix
# data["sensor_distance_{}".format(scale_3d)] = sensor_distance
# data["fov_mask_{}".format(scale_3d)] = fov_mask
# img_input = read_rgb(img_paths[infer_id])
# img_input = self.to_tensor_normalized(img_input)
# data["img_input"] = img_input
# label_path = os.path.join(
# self.root, "dataset", "sequences", sequence, "voxels", "{}.label".format(frame_id)
# )
# invalid_path = os.path.join(
# self.root, "dataset", "sequences", sequence, "voxels", "{}.invalid".format(frame_id)
# )
# # data['target_1_1'] = self.read_semKITTI_label(label_path, invalid_path)
# return data
# @staticmethod
# def read_semKITTI_label(label_path, invalid_path):
# remap_lut = SemanticKittiIO.get_remap_lut("./scenerf/data/semantic_kitti/semantic-kitti.yaml")
# LABEL = SemanticKittiIO._read_label_SemKITTI(label_path)
# INVALID = SemanticKittiIO._read_invalid_SemKITTI(invalid_path)
# LABEL = remap_lut[LABEL.astype(np.uint16)].astype(
# np.float32
# ) # Remap 20 classes semanticKITTI SSC
# LABEL[
# np.isclose(INVALID, 1)
# ] = 255 # Setting to unknown all voxels marked on invalid mask...
# LABEL = LABEL.reshape(256,256, 32)
# return LABEL
# def __len__(self):
# return len(self.scans)
if __name__ == "__main__":
    import os, sys
    print(sys.path)
    # BUGFIX: this module defines Kitti_Novel_View_Eval, not Kitti_Smart —
    # the original smoke test raised NameError when run as a script.
    dataset = Kitti_Novel_View_Eval(
        'train',
        root='data/kitti',
        preprocess_root='data/kitti/preprocess',
        crop_size=[352, 1216])
    batch = dataset[0]
    import pdb; pdb.set_trace()
================================================
FILE: dataset/kitti/kitti_dataset_one_frame.py
================================================
import glob, os, time, random
import numpy as np
import torch
# from torchvision import transforms
from dataset.kitti.helpers import dump_xyz, read_calib, read_poses, read_rgb
from dataset.kitti.params import val_error_frames
# from . import io_data as SemanticKittiIO
from copy import deepcopy
from mmcv.image.io import imread
from .. import OPENOCC_DATASET
if 'HFAI' in os.environ:
hfai = True
from dataset.loading import LoadMultiViewImageFromFilesHF, LoadPtsFromFilesHF
else:
hfai = False
@OPENOCC_DATASET.register_module()
class Kitti_One_Frame:
def __init__(
self,
split,
root, preprocess_root,
frames_interval=0.4,
sequence_distance=10,
n_sources=1,
eval_depth=80,
sequences=None,
selected_frames=None,
n_rays=1200,
cur_prob=1.0,
crop_size=[370, 1220],
strict=True,
return_depth=False,
prev_prob=0.5,
choose_nearest=False,
return_sem=False,
sem_path=None,
**kwargs,
):
self.root = root
self.preprocess_root = preprocess_root
self.depth_preprocess_root = os.path.join(preprocess_root, "depth")
self.transform_preprocess_root = os.path.join(preprocess_root, "transform")
self.n_classes = 20
self.n_sources = n_sources
self.eval_depth = eval_depth
self.n_rays = n_rays
self.cur_prob = cur_prob
self.return_depth = return_depth
self.prev_prob = prev_prob
self.choose_nearest = choose_nearest
self.return_sem = return_sem
assert (not return_sem) or os.path.exists(sem_path)
self.sem_path = sem_path
self.transxy = [
[0, -1., 0, 0],
[1., 0, 0, 0],
[0, 0, 1., 0],
[0, 0, 0, 1.]]
self.transxy = np.array(self.transxy)
splits = {
"train": ["00", "01", "02", "03", "04", "05", "06", "07", "09", "10"],
"val": ["08"],
"test": ["11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21"],
}
self.split = split
if sequences is not None:
self.sequences = sequences
else:
self.sequences = splits[split]
self.output_scale = 1
self.scene_size = (51.2, 51.2, 6.4)
self.vox_origin = np.array([0, -25.6, -2])
self.frames_interval = frames_interval
if not isinstance(sequence_distance, list):
sequence_distance = [sequence_distance] * 2
self.sequence_distance = sequence_distance
self.strict = strict
self.voxel_size = 0.2 # 0.2m
self.img_W = crop_size[1]
self.img_H = crop_size[0]
start_time = time.time()
self.scans = []
self.frame2scan = {}
for sequence in self.sequences:
pose_path = os.path.join(self.root, "dataset", "poses", sequence + ".txt")
gt_global_poses = read_poses(pose_path)
calib = read_calib(
os.path.join(self.root, "dataset", "sequences", sequence, "calib.txt")
)
P = calib["P2"]
T_cam0_2_cam2 = calib['T_cam0_2_cam2']
T_cam2_2_cam0 = np.linalg.inv(T_cam0_2_cam2)
T_velo_2_cam = T_cam0_2_cam2 @ calib["Tr"]
if split == "val":
glob_path = os.path.join(
self.root, "dataset", "sequences", sequence, "voxels", "*.bin"
)
else:
glob_path = os.path.join(
self.root, "dataset", "sequences", sequence, "image_2", "*.png"
)
seq_img_paths = glob.glob(glob_path)
max_length = 0
min_length = 50
paired_dists = {}
dist_step = 1 if split == 'train' else 5
for seq_img_path in seq_img_paths:
filename = os.path.basename(seq_img_path)
frame_id = os.path.splitext(filename)[0]
prev_frame_id = "{:06d}".format(int(frame_id) - dist_step)
img_path = os.path.join(
self.root, "dataset", "sequences", sequence, "image_2", prev_frame_id + ".png")
if not os.path.exists(img_path):
paired_dists[frame_id] = 0.
else:
curr_pose = gt_global_poses[int(frame_id)]
prev_pose = gt_global_poses[int(prev_frame_id)]
prev_xyz = dump_xyz(prev_pose)
curr_xyz = dump_xyz(curr_pose)
rel_distance = np.sqrt(
(prev_xyz[0] - curr_xyz[0]) ** 2 + (prev_xyz[2] - curr_xyz[2]) ** 2)
paired_dists[frame_id] = rel_distance
for seq_img_path in seq_img_paths:
filename = os.path.basename(seq_img_path)
frame_id = os.path.splitext(filename)[0]
current_img_path = os.path.join(
self.root, "dataset", "sequences", sequence, "image_2", frame_id + ".png")
current_lid_path = os.path.join(
self.root, "dataset", "sequences", sequence, "velodyne", frame_id + ".bin")
prev_frame_ids, next_frame_ids = [], []
prev_img_paths, next_img_paths = [], []
prev_lid_paths, next_lid_paths = [], []
prev_poses, next_poses = [], []
prev_dists, next_dists = [], []
if self.split == 'train':
pos_step = 1
neg_step = -1
else:
pos_step = 5
neg_step = -5
# deal with prev
step = -1
cnt = 0
dist = 0.
while True:
# cnt += step
cnt += neg_step
rel_frame_id = "{:06d}".format(int(frame_id) + cnt)
img_path = os.path.join(
self.root, "dataset", "sequences", sequence, "image_2", rel_frame_id + ".png")
if not os.path.exists(img_path):
break
dist += paired_dists["{:06d}".format(int(rel_frame_id) + pos_step)]
if dist < frames_interval:
continue
if dist > sequence_distance[0]:
break
if split == "val" and rel_frame_id in val_error_frames:
continue
prev_frame_ids.append(rel_frame_id)
prev_img_paths.append(img_path)
lidar_path = os.path.join(
self.root, "dataset", "sequences", sequence, "velodyne", rel_frame_id + ".bin")
prev_lid_paths.append(lidar_path)
prev_poses.append(gt_global_poses[int(rel_frame_id)])
prev_dists.append(dist)
# deal with next
step = 1
cnt = 0
dist = 0.
while True:
# cnt += step
cnt += pos_step
rel_frame_id = "{:06d}".format(int(frame_id) + cnt)
img_path = os.path.join(
self.root, "dataset", "sequences", sequence, "image_2", rel_frame_id + ".png")
if not os.path.exists(img_path):
break
dist += paired_dists[rel_frame_id]
if dist < frames_interval:
continue
if dist > sequence_distance[1]:
break
if split == "val" and rel_frame_id in val_error_frames:
continue
next_frame_ids.append(rel_frame_id)
next_img_paths.append(img_path)
lidar_path = os.path.join(
self.root, "dataset", "sequences", sequence, "velodyne", rel_frame_id + ".bin")
next_lid_paths.append(lidar_path)
next_poses.append(gt_global_poses[int(rel_frame_id)])
next_dists.append(dist)
# ignore error frame
if split == "val" and frame_id in val_error_frames:
continue
is_included = True if selected_frames is None else frame_id in selected_frames
if is_included:
prev_next_len = len(prev_poses) + len(next_poses)
if prev_next_len > max_length:
max_length = prev_next_len
if prev_next_len < min_length:
min_length = prev_next_len
self.frame2scan.update({str(sequence) + "_" + frame_id: len(self.scans)})
if not self.strict:
prev_img_paths.append(current_img_path)
prev_lid_paths.append(current_lid_path)
next_img_paths.append(current_img_path)
next_lid_paths.append(current_lid_path)
prev_poses.append(curr_pose)
next_poses.append(curr_pose)
prev_dists.append(0.)
next_dists.append(0.)
prev_frame_ids.append(frame_id)
next_frame_ids.append(frame_id)
self.scans.append(
{
"frame_id": frame_id,
"sequence": sequence,
"img_path": current_img_path,
"lid_path": current_lid_path,
"pose": gt_global_poses[int(frame_id)],
"prev_img_paths": prev_img_paths,
"prev_lid_paths": prev_lid_paths,
"next_img_paths": next_img_paths,
"next_lid_paths": next_lid_paths,
"T_velo_2_cam": T_velo_2_cam,
"P": P,
"T_cam0_2_cam2": T_cam0_2_cam2,
"T_cam2_2_cam0": T_cam2_2_cam0,
"prev_poses": prev_poses,
"next_poses": next_poses,
"prev_dists": prev_dists,
"next_dists": next_dists,
"prev_frame_ids": prev_frame_ids,
"next_frame_ids": next_frame_ids
}
)
print(sequence, min_length, max_length)
# self.to_tensor_normalized = transforms.Compose(
# [
# transforms.ToTensor(),
# transforms.Normalize(
# mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
# ),
# ]
# )
# self.to_tensor = transforms.Compose([
# transforms.ToTensor()
# ])
print("Preprocess time: --- %s seconds ---" % (time.time() - start_time))
def get_depth_from_lidar(self, lidar_path, lidar2img, image_size):
# lidar2img: N, 4, 4
scan = np.fromfile(lidar_path, dtype=np.float32)
scan = scan.reshape((-1, 4))
scan[:, 3] = 1.0
# points_hcoords = scan[scan[:, 0] > 0, :]
points_hcoords = np.expand_dims(self.transxy @ scan.T, 0) # 1, 4, n
img_points = np.transpose(lidar2img @ points_hcoords, (0, 2, 1)) # N, n, 4
depth = img_points[..., 2] # N, n
img_points = img_points[..., :2] # N, n, 2
mask = (depth < self.eval_depth) & (depth > 1e-3) # get points with depth < max_sample_depth
img_points = img_points / np.expand_dims(depth, axis=2) # scale 2D points
img_points[..., 0] = img_points[..., 0] / image_size[1]
img_points[..., 1] = img_points[..., 1] / image_size[0]
# img_points = np.round(img_points).astype(int)
mask = mask & (img_points[..., 0] > 0) & \
(img_points[..., 1] > 0) & \
(img_points[..., 0] < 1) & \
(img_points[..., 1] < 1)
return img_points, depth, mask
def load_2d_sem_label(self, scan):
sequence = scan['sequence']
filename = os.path.basename(scan['img_path'])
sem_path = os.path.join(self.sem_path, sequence, 'image_02', filename + '.npy')
sem = np.load(sem_path)[None, ...]
return sem
def __len__(self):
return len(self.scans)
def prepare_img_metas(self, scan, anchor_scan, anchor_prev, anchor_next):
img_metas = {}
img_metas.update({
'input_imgs_path': [scan['img_path']],
'curr_imgs_path': [anchor_scan['img_path']],
'prev_imgs_path': [anchor_scan['prev_img_paths'][anchor_prev]],
'next_imgs_path': [anchor_scan['next_img_paths'][anchor_next]]
})
intrinsic = np.eye(4)
intrinsic[:3, :3] = scan['P'][:3, :3]
lidar2img = intrinsic @ scan['T_velo_2_cam'] @ np.linalg.inv(self.transxy)
img2lidar = np.linalg.inv(lidar2img)
temImg2lidar = self.transxy @ np.linalg.inv(scan['T_velo_2_cam']) @ \
scan['T_cam0_2_cam2'] @ \
np.linalg.inv(scan['pose']) @ \
anchor_scan['pose'] @ \
anchor_scan['T_cam2_2_cam0'] @ \
np.linalg.inv(intrinsic)
img2prevImg = intrinsic @ \
anchor_scan['T_cam0_2_cam2'] @ \
np.linalg.inv(anchor_scan['prev_poses'][anchor_prev]) @ \
anchor_scan['pose'] @ \
anchor_scan['T_cam2_2_cam0'] @ \
np.linalg.inv(intrinsic)
img2nextImg = intrinsic @ \
anchor_scan['T_cam0_2_cam2'] @ \
np.linalg.inv(anchor_scan['next_poses'][anchor_next]) @ \
anchor_scan['pose'] @ \
anchor_scan['T_cam2_2_cam0'] @ \
np.linalg.inv(intrinsic)
img_metas.update({
'lidar2img': np.expand_dims(lidar2img, axis=0),
'img2lidar': [img2lidar],
'img2prevImg': [img2prevImg],
'img2nextImg': [img2nextImg],
'token': scan['frame_id'],
'sequence': scan['sequence'],
'temImg2lidar': [temImg2lidar]
})
return img_metas
def read_surround_imgs(self, img_paths):
if hfai:
imgs = self.img_loader.load(img_paths)
else:
imgs = []
for filename in img_paths:
imgs.append(
imread(filename, 'unchanged').astype(np.float32))
imgs = [img[:self.img_H, :self.img_W, :] for img in imgs]
return imgs
def __getitem__(self, index):
#### 1. get color, temporal_depth choice if necessary
if random.random() < self.cur_prob:
temporal_supervision = 'curr'
elif random.random() < self.prev_prob:
temporal_supervision = 'prev'
else:
temporal_supervision = 'next'
#### 2. get self, prev, next infos for the stem, and also temp_depth info
while True:
scan = deepcopy(self.scans[index])
sequence = scan['sequence']
# scan_id = scan['frame_id']
# day, seq = scan['sequence']
# is_right = scan['is_right']
# stereo_sign = 'r' if is_right else 'l'
if temporal_supervision == 'curr':
anchor_scan = deepcopy(scan)
elif temporal_supervision == 'prev':
if len(scan['prev_frame_ids']) == 0:
index = np.random.randint(len(self))
continue
anchor_scan_id = np.random.choice(scan['prev_frame_ids'])
anchor_scan = deepcopy(self.scans[self.frame2scan[str(sequence) + '_' + anchor_scan_id]])
else:
if len(scan['next_frame_ids']) == 0:
index = np.random.randint(len(self))
continue
anchor_scan_id = np.random.choice(scan['next_frame_ids'])
anchor_scan = deepcopy(self.scans[self.frame2scan[str(sequence) + '_' + anchor_scan_id]])
if len(anchor_scan['prev_frame_ids']) == 0 or len(anchor_scan['next_frame_ids']) == 0:
index = np.random.randint(len(self))
continue
anchor_prev = 0 if self.choose_nearest else np.random.randint(len(anchor_scan['prev_frame_ids']))
anchor_next = 0 if self.choose_near
gitextract_ciq3vmv7/ ├── .gitignore ├── LICENSE ├── README.md ├── assets/ │ └── .placeholder ├── config/ │ ├── _base_/ │ │ ├── dataset_v1.py │ │ ├── optimizer.py │ │ └── schedule.py │ ├── kitti/ │ │ ├── kitti_novel_depth.py │ │ └── kitti_occ.py │ ├── kitti_raw/ │ │ └── kitti_raw_depth.py │ ├── nuscenes/ │ │ ├── nuscenes_depth.py │ │ ├── nuscenes_novel_depth.py │ │ ├── nuscenes_occ.py │ │ └── nuscenes_occ_bev.py │ └── openseed/ │ └── openseed_swint_lang.yaml ├── dataset/ │ ├── __init__.py │ ├── dataset_one_frame_eval.py │ ├── dataset_one_frame_sweeps_dist.py │ ├── dataset_one_frame_sweeps_dist_vis.py │ ├── dataset_wrapper_temporal.py │ ├── dataset_wrapper_vis.py │ ├── kitti/ │ │ ├── helpers.py │ │ ├── io_data.py │ │ ├── kitti_dataset_eval.py │ │ ├── kitti_dataset_one_frame.py │ │ ├── params.py │ │ └── semantic-kitti.yaml │ ├── kitti_raw/ │ │ ├── kitti_raw_dataset.py │ │ ├── kitti_raw_dataset_stereo.py │ │ ├── orb-slam_poses/ │ │ │ ├── 2011_09_26/ │ │ │ │ ├── 2011_09_26_drive_0001_sync.txt │ │ │ │ ├── 2011_09_26_drive_0002_sync.txt │ │ │ │ ├── 2011_09_26_drive_0005_sync.txt │ │ │ │ ├── 2011_09_26_drive_0009_sync.txt │ │ │ │ ├── 2011_09_26_drive_0011_sync.txt │ │ │ │ ├── 2011_09_26_drive_0013_sync.txt │ │ │ │ ├── 2011_09_26_drive_0014_sync.txt │ │ │ │ ├── 2011_09_26_drive_0015_sync.txt │ │ │ │ ├── 2011_09_26_drive_0017_sync.txt │ │ │ │ ├── 2011_09_26_drive_0018_sync.txt │ │ │ │ ├── 2011_09_26_drive_0019_sync.txt │ │ │ │ ├── 2011_09_26_drive_0020_sync.txt │ │ │ │ ├── 2011_09_26_drive_0022_sync.txt │ │ │ │ ├── 2011_09_26_drive_0023_sync.txt │ │ │ │ ├── 2011_09_26_drive_0027_sync.txt │ │ │ │ ├── 2011_09_26_drive_0028_sync.txt │ │ │ │ ├── 2011_09_26_drive_0029_sync.txt │ │ │ │ ├── 2011_09_26_drive_0032_sync.txt │ │ │ │ ├── 2011_09_26_drive_0035_sync.txt │ │ │ │ ├── 2011_09_26_drive_0036_sync.txt │ │ │ │ ├── 2011_09_26_drive_0039_sync.txt │ │ │ │ ├── 2011_09_26_drive_0046_sync.txt │ │ │ │ ├── 2011_09_26_drive_0048_sync.txt │ │ │ │ ├── 2011_09_26_drive_0051_sync.txt │ │ │ │ 
├── 2011_09_26_drive_0052_sync.txt │ │ │ │ ├── 2011_09_26_drive_0056_sync.txt │ │ │ │ ├── 2011_09_26_drive_0057_sync.txt │ │ │ │ ├── 2011_09_26_drive_0059_sync.txt │ │ │ │ ├── 2011_09_26_drive_0060_sync.txt │ │ │ │ ├── 2011_09_26_drive_0061_sync.txt │ │ │ │ ├── 2011_09_26_drive_0064_sync.txt │ │ │ │ ├── 2011_09_26_drive_0070_sync.txt │ │ │ │ ├── 2011_09_26_drive_0079_sync.txt │ │ │ │ ├── 2011_09_26_drive_0084_sync.txt │ │ │ │ ├── 2011_09_26_drive_0086_sync.txt │ │ │ │ ├── 2011_09_26_drive_0087_sync.txt │ │ │ │ ├── 2011_09_26_drive_0091_sync.txt │ │ │ │ ├── 2011_09_26_drive_0093_sync.txt │ │ │ │ ├── 2011_09_26_drive_0095_sync.txt │ │ │ │ ├── 2011_09_26_drive_0096_sync.txt │ │ │ │ ├── 2011_09_26_drive_0101_sync.txt │ │ │ │ ├── 2011_09_26_drive_0104_sync.txt │ │ │ │ ├── 2011_09_26_drive_0106_sync.txt │ │ │ │ ├── 2011_09_26_drive_0113_sync.txt │ │ │ │ └── 2011_09_26_drive_0117_sync.txt │ │ │ ├── 2011_09_28/ │ │ │ │ ├── 2011_09_28_drive_0001_sync.txt │ │ │ │ └── 2011_09_28_drive_0002_sync.txt │ │ │ ├── 2011_09_29/ │ │ │ │ ├── 2011_09_29_drive_0004_sync.txt │ │ │ │ ├── 2011_09_29_drive_0026_sync.txt │ │ │ │ └── 2011_09_29_drive_0071_sync.txt │ │ │ ├── 2011_09_30/ │ │ │ │ ├── 2011_09_30_drive_0016_sync.txt │ │ │ │ ├── 2011_09_30_drive_0018_sync.txt │ │ │ │ ├── 2011_09_30_drive_0020_sync.txt │ │ │ │ ├── 2011_09_30_drive_0027_sync.txt │ │ │ │ ├── 2011_09_30_drive_0028_sync.txt │ │ │ │ ├── 2011_09_30_drive_0033_sync.txt │ │ │ │ └── 2011_09_30_drive_0034_sync.txt │ │ │ └── 2011_10_03/ │ │ │ ├── 2011_10_03_drive_0027_sync.txt │ │ │ ├── 2011_10_03_drive_0034_sync.txt │ │ │ ├── 2011_10_03_drive_0042_sync.txt │ │ │ └── 2011_10_03_drive_0047_sync.txt │ │ └── splits/ │ │ ├── eigen/ │ │ │ └── test_files.txt │ │ ├── eigen_zhou/ │ │ │ ├── test_files.txt │ │ │ ├── train_files.txt │ │ │ └── val_files.txt │ │ └── tulsiani/ │ │ ├── test_files.txt │ │ ├── train_files.txt │ │ └── val_files.txt │ ├── loading.py │ ├── sampler.py │ ├── transform_3d.py │ └── utils.py ├── docs/ │ ├── 
get_started.md │ ├── installation.md │ ├── prepare_data.md │ └── visualization.md ├── eval_depth.py ├── eval_iou.py ├── eval_iou_kitti.py ├── eval_novel_depth.py ├── eval_novel_depth_kitti.py ├── examine_sweeps.py ├── generate_videos.py ├── loss/ │ ├── __init__.py │ ├── base_loss.py │ ├── edge_loss_3d_ms.py │ ├── eikonal_loss.py │ ├── multi_loss.py │ ├── reproj_loss_mono_multi_new.py │ ├── reproj_loss_mono_multi_new_combine.py │ ├── rgb_loss_ms.py │ ├── second_grad_loss.py │ └── sparsity_loss.py ├── model/ │ ├── __init__.py │ ├── backbone/ │ │ ├── __init__.py │ │ └── unet2d.py │ ├── encoder/ │ │ ├── __init__.py │ │ ├── base_encoder.py │ │ ├── bevformer/ │ │ │ ├── __init__.py │ │ │ ├── attention/ │ │ │ │ ├── __init__.py │ │ │ │ └── image_cross_attention.py │ │ │ ├── bevformer_encoder.py │ │ │ ├── bevformer_encoder_layer.py │ │ │ ├── bevformer_pos_embed.py │ │ │ ├── mappings.py │ │ │ ├── mappings_old.py │ │ │ └── utils.py │ │ └── tpvformer/ │ │ ├── __init__.py │ │ ├── attention/ │ │ │ ├── __init__.py │ │ │ ├── cross_view_hybrid_attention.py │ │ │ └── image_cross_attention.py │ │ ├── modules/ │ │ │ ├── __init__.py │ │ │ ├── camera_se_net.py │ │ │ ├── split_fpn.py │ │ │ └── split_norm.py │ │ ├── tpvformer_encoder.py │ │ ├── tpvformer_encoder_layer.py │ │ ├── tpvformer_pos_embed.py │ │ └── utils.py │ ├── head/ │ │ ├── __init__.py │ │ ├── base_head.py │ │ ├── nerfacc_head/ │ │ │ ├── bev_nerf.py │ │ │ ├── estimator.py │ │ │ ├── img2lidar.py │ │ │ ├── nerfacc_head.py │ │ │ ├── ray_sampler.py │ │ │ └── rendering.py │ │ ├── neus_head/ │ │ │ └── neus_head.py │ │ └── utils/ │ │ └── sh_render.py │ ├── lifter/ │ │ ├── __init__.py │ │ ├── base_lifter.py │ │ ├── bev_query_lifter.py │ │ ├── tpv_pos_lifter.py │ │ └── tpv_query_lifter.py │ ├── neck/ │ │ ├── __init__.py │ │ └── identity_neck.py │ └── segmentor/ │ ├── __init__.py │ ├── base_segmentor.py │ └── tpv_segmentor.py ├── train.py ├── utils/ │ ├── __init__.py │ ├── config_tools.py │ ├── feat_tools.py │ ├── metric_util.py │ ├── 
misc.py │ ├── openseed_utils.py │ ├── scenerf_metric.py │ ├── tb_wrapper.py │ └── temporal_pkl.py ├── vis_3d.py ├── vis_3d_scene.py └── vis_pics.py
SYMBOL INDEX (448 symbols across 78 files)
FILE: dataset/__init__.py
function get_dataloader (line 20) | def get_dataloader(
FILE: dataset/dataset_one_frame_eval.py
class nuScenes_One_Frame_Eval (line 16) | class nuScenes_One_Frame_Eval:
method __init__ (line 17) | def __init__(
method __len__ (line 67) | def __len__(self):
method get_depth_from_lidar (line 71) | def get_depth_from_lidar(self, lidar_path, lidar2img, image_size):
method __getitem__ (line 95) | def __getitem__(self, index):
method read_surround_imgs (line 152) | def read_surround_imgs(self, img_paths):
method get_data_info_temporal (line 163) | def get_data_info_temporal(self, info, info_tem):
method get_data_info (line 209) | def get_data_info(self, info):
FILE: dataset/dataset_one_frame_sweeps_dist.py
function get_xyz (line 15) | def get_xyz(pose_dict):
function get_img2global (line 18) | def get_img2global(calib_dict, pose_dict):
function get_lidar2global (line 35) | def get_lidar2global(calib_dict, pose_dict):
class nuScenes_One_Frame_Sweeps_Dist (line 50) | class nuScenes_One_Frame_Sweeps_Dist:
method __init__ (line 51) | def __init__(
method __len__ (line 153) | def __len__(self):
method get_depth_from_lidar (line 158) | def get_depth_from_lidar(self, lidar_path, lidar2img, image_size):
method composite_dict (line 183) | def composite_dict(self, anchor_info):
method __getitem__ (line 199) | def __getitem__(self, index):
method read_surround_imgs (line 305) | def read_surround_imgs(self, img_paths, crop_size):
method get_data_info_temporal (line 316) | def get_data_info_temporal(self, info, info_tem):
method get_data_info_anchor (line 337) | def get_data_info_anchor(self, info, info_tem):
method get_data_info (line 358) | def get_data_info(self, info):
function list2tensor (line 458) | def list2tensor(imgs):
function get_diagonal (line 471) | def get_diagonal(trans):
function filter_invalid_pts (line 479) | def filter_invalid_pts(pts):
function get_pixel (line 507) | def get_pixel(uv, eps=1e-5):
function filter_invalid_points (line 528) | def filter_invalid_points(uv):
function saveimg (line 537) | def saveimg(i, imgs, uv, name):
FILE: dataset/dataset_one_frame_sweeps_dist_vis.py
function get_xyz (line 8) | def get_xyz(pose_dict):
function get_img2global (line 11) | def get_img2global(calib_dict, pose_dict):
function get_lidar2global (line 28) | def get_lidar2global(calib_dict, pose_dict):
class nuScenes_One_Frame_Sweeps_Dist_Vis (line 43) | class nuScenes_One_Frame_Sweeps_Dist_Vis:
method __init__ (line 44) | def __init__(
method __len__ (line 73) | def __len__(self):
method __getitem__ (line 77) | def __getitem__(self, index):
method read_surround_imgs (line 109) | def read_surround_imgs(self, img_paths):
method get_data_info (line 117) | def get_data_info(self, info):
FILE: dataset/dataset_wrapper_temporal.py
class tpvformer_dataset_nuscenes_temporal (line 19) | class tpvformer_dataset_nuscenes_temporal(data.Dataset):
method __init__ (line 20) | def __init__(
method __len__ (line 77) | def __len__(self):
method to_tensor (line 80) | def to_tensor(self, imgs):
method __getitem__ (line 86) | def __getitem__(self, index):
method deal_with_length3_dataset (line 93) | def deal_with_length3_dataset(self, data):
method deal_with_length2_dataset (line 119) | def deal_with_length2_dataset(self, data):
function custom_collate_fn_temporal (line 178) | def custom_collate_fn_temporal(data):
function forward_aug (line 191) | def forward_aug(imgs, metas, transforms):
FILE: dataset/dataset_wrapper_vis.py
class tpvformer_dataset_nuscenes_vis (line 14) | class tpvformer_dataset_nuscenes_vis(data.Dataset):
method __init__ (line 15) | def __init__(
method __len__ (line 35) | def __len__(self):
method to_tensor (line 38) | def to_tensor(self, imgs):
method __getitem__ (line 44) | def __getitem__(self, index):
method deal_with_length2_dataset (line 48) | def deal_with_length2_dataset(self, data):
function custom_collate_fn_temporal (line 68) | def custom_collate_fn_temporal(data):
function forward_aug (line 81) | def forward_aug(imgs, metas, transforms):
FILE: dataset/kitti/helpers.py
function apply_transform (line 17) | def apply_transform(pts, T):
function dump_xyz (line 29) | def dump_xyz(P):
function read_rgb (line 33) | def read_rgb(path):
function read_poses (line 43) | def read_poses(path):
function read_calib (line 56) | def read_calib(calib_path):
FILE: dataset/kitti/io_data.py
function unpack (line 10) | def unpack(compressed):
function img_normalize (line 25) | def img_normalize(img, mean, std):
function pack (line 33) | def pack(array):
function get_grid_coords (line 45) | def get_grid_coords(dims, resolution):
function _get_remap_lut (line 77) | def _get_remap_lut(config_path):
function get_inv_map (line 99) | def get_inv_map():
function _read_SemKITTI (line 113) | def _read_SemKITTI(path, dtype, do_unpack):
function _read_label_SemKITTI (line 120) | def _read_label_SemKITTI(path):
function _read_invalid_SemKITTI (line 125) | def _read_invalid_SemKITTI(path):
function _read_occluded_SemKITTI (line 130) | def _read_occluded_SemKITTI(path):
function _read_occupancy_SemKITTI (line 135) | def _read_occupancy_SemKITTI(path):
function _read_rgb_SemKITTI (line 140) | def _read_rgb_SemKITTI(path):
function _read_pointcloud_SemKITTI (line 145) | def _read_pointcloud_SemKITTI(path):
function _read_calib_SemKITTI (line 152) | def _read_calib_SemKITTI(calib_path):
function get_remap_lut (line 173) | def get_remap_lut(path):
function data_augmentation_3Dflips (line 196) | def data_augmentation_3Dflips(flip, data):
function get_cmap_semanticKITTI20 (line 216) | def get_cmap_semanticKITTI20():
FILE: dataset/kitti/kitti_dataset_eval.py
class Kitti_Novel_View_Eval (line 23) | class Kitti_Novel_View_Eval:
method __init__ (line 24) | def __init__(
method get_depth_from_lidar (line 287) | def get_depth_from_lidar(self, lidar_path, lidar2img, image_size):
method __len__ (line 311) | def __len__(self):
method prepare_temImg2lidar (line 314) | def prepare_temImg2lidar(self, scan, anchor):
method prepare_img_metas (line 327) | def prepare_img_metas(self, scan):
method read_surround_imgs (line 346) | def read_surround_imgs(self, img_paths):
method __getitem__ (line 357) | def __getitem__(self, index):
FILE: dataset/kitti/kitti_dataset_one_frame.py
class Kitti_One_Frame (line 21) | class Kitti_One_Frame:
method __init__ (line 22) | def __init__(
method get_depth_from_lidar (line 294) | def get_depth_from_lidar(self, lidar_path, lidar2img, image_size):
method load_2d_sem_label (line 318) | def load_2d_sem_label(self, scan):
method __len__ (line 325) | def __len__(self):
method prepare_img_metas (line 328) | def prepare_img_metas(self, scan, anchor_scan, anchor_prev, anchor_next):
method read_surround_imgs (line 375) | def read_surround_imgs(self, img_paths):
method __getitem__ (line 386) | def __getitem__(self, index):
FILE: dataset/kitti_raw/kitti_raw_dataset.py
class Kitti_Raw (line 20) | class Kitti_Raw:
method __init__ (line 21) | def __init__(
method _get_sequences (line 213) | def _get_sequences(data_path):
method _load_split (line 228) | def _load_split(split_path):
method _load_calibs (line 241) | def _load_calibs(data_path):
method _load_poses (line 299) | def _load_poses(pose_path, sequences):
method get_depth_from_lidar (line 325) | def get_depth_from_lidar(self, lidar_path, lidar2img, image_size):
method load_depth (line 355) | def load_depth(self, lidar_path, lidar2img, image_size):
method __len__ (line 404) | def __len__(self):
method prepare_img_metas (line 407) | def prepare_img_metas(self, scan, anchor_scan, anchor_prev, anchor_next):
method read_surround_imgs (line 471) | def read_surround_imgs(self, img_paths):
method to_scan_idx (line 479) | def to_scan_idx(self, index):
method __getitem__ (line 485) | def __getitem__(self, index):
FILE: dataset/kitti_raw/kitti_raw_dataset_stereo.py
class Kitti_Raw_Stereo (line 20) | class Kitti_Raw_Stereo:
method __init__ (line 24) | def __init__(
method _get_sequences (line 221) | def _get_sequences(data_path):
method _load_split (line 236) | def _load_split(split_path):
method _load_calibs (line 249) | def _load_calibs(data_path):
method _load_poses (line 307) | def _load_poses(pose_path, sequences):
method get_depth_from_lidar (line 333) | def get_depth_from_lidar(self, lidar_path, lidar2img, image_size):
method load_depth (line 363) | def load_depth(self, lidar_path, lidar2img, image_size):
method __len__ (line 412) | def __len__(self):
method prepare_img_metas (line 415) | def prepare_img_metas(self, scan, anchor_scan, anchor_prev, anchor_next):
method read_surround_imgs (line 480) | def read_surround_imgs(self, img_paths):
method to_scan_idx (line 488) | def to_scan_idx(self, index):
method append_self (line 494) | def append_self(self, flag, obj):
method __getitem__ (line 502) | def __getitem__(self, index):
FILE: dataset/loading.py
class LoadMultiViewImageFromFilesHF (line 11) | class LoadMultiViewImageFromFilesHF:
method __init__ (line 25) | def __init__(self,
method load (line 34) | def load(self, results):
method __call__ (line 69) | def __call__(self, results):
class OriLoadMultiViewImageFromFilesHF (line 108) | class OriLoadMultiViewImageFromFilesHF(object):
method __init__ (line 122) | def __init__(self,
method __call__ (line 131) | def __call__(self, results):
method __repr__ (line 177) | def __repr__(self):
class LoadPtsFromFilesHF (line 185) | class LoadPtsFromFilesHF:
method __init__ (line 187) | def __init__(self,
method load (line 194) | def load(self, filename):
class FFrecordClient (line 218) | class FFrecordClient(BaseStorageBackend):
method __init__ (line 222) | def __init__(self, fname, check_data=False, filename2idx='filename2idx...
method get (line 233) | def get(self, filepath: Union[str, Path]) -> bytes:
method get_text (line 258) | def get_text(self,
FILE: dataset/sampler.py
class CustomDistributedSampler (line 12) | class CustomDistributedSampler(Sampler[T_co]):
method __init__ (line 59) | def __init__(self, dataset: Dataset, num_replicas: Optional[int] = None,
method __iter__ (line 94) | def __iter__(self) -> Iterator[T_co]:
method __len__ (line 122) | def __len__(self) -> int:
method set_epoch (line 125) | def set_epoch(self, epoch: int) -> None:
method set_last_iter (line 136) | def set_last_iter(self, last_iter: int):
FILE: dataset/transform_3d.py
class PadMultiViewImage (line 8) | class PadMultiViewImage(object):
method __init__ (line 19) | def __init__(self, size=None, size_divisor=None, pad_val=0):
method _pad_img (line 29) | def _pad_img(self, results):
method __call__ (line 61) | def __call__(self, results):
method __repr__ (line 71) | def __repr__(self):
class NormalizeMultiviewImage (line 79) | class NormalizeMultiviewImage(object):
method __init__ (line 89) | def __init__(self, mean, std, to_rgb=True):
method __call__ (line 95) | def __call__(self, results):
method __repr__ (line 109) | def __repr__(self):
class RandomFlip (line 114) | class RandomFlip(object):
method __init__ (line 116) | def __init__(self, prob=0.5) -> None:
method __call__ (line 119) | def __call__(self, results):
class PhotoMetricDistortionMultiViewImage (line 126) | class PhotoMetricDistortionMultiViewImage:
method __init__ (line 151) | def __init__(self,
method __call__ (line 163) | def __call__(self, results):
method __repr__ (line 222) | def __repr__(self):
class RandomScaleImageMultiViewImage (line 323) | class RandomScaleImageMultiViewImage(object):
method __init__ (line 329) | def __init__(self, scales=[], ref_focal_len=None, random_scale=None, p...
method __call__ (line 341) | def __call__(self, results):
method __repr__ (line 380) | def __repr__(self):
function read_surround_imgs (line 399) | def read_surround_imgs(img_paths, crop_size=[768, 1600]):
FILE: dataset/utils.py
function get_rm (line 4) | def get_rm(angle, axis, deg=False):
FILE: eval_depth.py
function pass_print (line 19) | def pass_print(*args, **kwargs):
function main (line 22) | def main(local_rank, args):
FILE: eval_iou.py
function pass_print (line 23) | def pass_print(*args, **kwargs):
function read_occ3d_label (line 26) | def read_occ3d_label(metas, nusc):
function read_openoccupancy_label (line 34) | def read_openoccupancy_label(metas, nusc):
function main (line 43) | def main(local_rank, args):
FILE: eval_iou_kitti.py
function pass_print (line 21) | def pass_print(*args, **kwargs):
function read_semantic_kitti (line 24) | def read_semantic_kitti(metas):
function main (line 44) | def main(local_rank, args):
FILE: eval_novel_depth.py
function pass_print (line 20) | def pass_print(*args, **kwargs):
function main (line 23) | def main(local_rank, args):
function evaluate_depth (line 285) | def evaluate_depth(gt_depth, pred_depth):
function print_metrics (line 306) | def print_metrics(agg_depth_errors, n_frames, logger, num_cams=6):
FILE: eval_novel_depth_kitti.py
function pass_print (line 20) | def pass_print(*args, **kwargs):
function main (line 23) | def main(local_rank, args):
function evaluate_depth (line 264) | def evaluate_depth(gt_depth, pred_depth):
function print_metrics (line 281) | def print_metrics(agg_depth_errors, n_frames, logger):
FILE: examine_sweeps.py
function gather_sensor (line 15) | def gather_sensor(sample_data_token):
FILE: generate_videos.py
function cat_images (line 6) | def cat_images(dir, cam_img_size, pred_img_size, pred_img_size2, spacing...
function get_video (line 70) | def get_video(img_path, video_path, fps, size):
FILE: loss/base_loss.py
class BaseLoss (line 8) | class BaseLoss(nn.Module):
method __init__ (line 19) | def __init__(
method forward (line 34) | def forward(self, inputs):
FILE: loss/edge_loss_3d_ms.py
function get_smooth_loss (line 7) | def get_smooth_loss(disp, img):
class EdgeLoss3DMS (line 24) | class EdgeLoss3DMS(BaseLoss):
method __init__ (line 26) | def __init__(self, weight=1.0, input_dict=None, **kwargs):
method edge_loss (line 44) | def edge_loss(self, curr_imgs, ms_depths, ms_rays, ms_accs=None, max_d...
FILE: loss/eikonal_loss.py
class EikonalLoss (line 6) | class EikonalLoss(BaseLoss):
method __init__ (line 8) | def __init__(self, weight=1.0, input_dict=None, **kwargs):
method eikonal_loss (line 19) | def eikonal_loss(self, eik_grad):
FILE: loss/multi_loss.py
class MultiLoss (line 10) | class MultiLoss(nn.Module):
method __init__ (line 12) | def __init__(self, loss_cfgs):
method forward (line 24) | def forward(self, inputs):
FILE: loss/reproj_loss_mono_multi_new.py
class SSIM (line 7) | class SSIM(nn.Module):
method __init__ (line 10) | def __init__(self):
method forward (line 23) | def forward(self, x, y):
class ReprojLossMonoMultiNew (line 41) | class ReprojLossMonoMultiNew(BaseLoss):
method __init__ (line 43) | def __init__(self, weight=1.0, input_dict=None, **kwargs):
method reproj_loss (line 72) | def reproj_loss(
FILE: loss/reproj_loss_mono_multi_new_combine.py
class SSIM (line 7) | class SSIM(nn.Module):
method __init__ (line 10) | def __init__(self):
method forward (line 23) | def forward(self, x, y):
class ReprojLossMonoMultiNewCombine (line 41) | class ReprojLossMonoMultiNewCombine(BaseLoss):
method __init__ (line 43) | def __init__(self, weight=1.0, input_dict=None, **kwargs):
method reproj_loss (line 70) | def reproj_loss(
FILE: loss/rgb_loss_ms.py
class SSIM (line 7) | class SSIM(nn.Module):
method __init__ (line 10) | def __init__(self):
method forward (line 23) | def forward(self, x, y):
class RGBLossMS (line 41) | class RGBLossMS(BaseLoss):
method __init__ (line 43) | def __init__(
method rgb_loss (line 68) | def rgb_loss(self, ms_colors, ms_rays, gt_imgs):
class SemLossMS (line 103) | class SemLossMS(BaseLoss):
method __init__ (line 105) | def __init__(
method sem_loss (line 126) | def sem_loss(self, sem, metas, ms_rays):
class SemCELossMS (line 160) | class SemCELossMS(BaseLoss):
method __init__ (line 162) | def __init__(
method sem_loss (line 183) | def sem_loss(self, sem, metas, ms_rays):
FILE: loss/second_grad_loss.py
class SecondGradLoss (line 6) | class SecondGradLoss(BaseLoss):
method __init__ (line 8) | def __init__(self, weight=1.0, input_dict=None, **kwargs):
method second_grad_loss (line 19) | def second_grad_loss(self, second_grad):
FILE: loss/sparsity_loss.py
class SparsityLoss (line 7) | class SparsityLoss(BaseLoss):
method __init__ (line 12) | def __init__(self, weight=1.0, scale=1.0, input_dict=None, **kwargs):
method sparsity_loss (line 24) | def sparsity_loss(self, density):
class HardSparsityLoss (line 30) | class HardSparsityLoss(BaseLoss):
method __init__ (line 32) | def __init__(self, weight=1.0, scale=1.0, thresh=0.2, crop=[[0, 0], [0...
method hard_sigmoid (line 46) | def hard_sigmoid(self, density):
method hard_sparsity_loss (line 49) | def hard_sparsity_loss(self, density):
class SoftSparsityLoss (line 67) | class SoftSparsityLoss(BaseLoss):
method __init__ (line 69) | def __init__(self, weight=1.0, input_dict=None, **kwargs):
method soft_sparsity_loss (line 80) | def soft_sparsity_loss(self, density):
class AdaptiveSparsityLoss (line 85) | class AdaptiveSparsityLoss(BaseLoss):
method __init__ (line 87) | def __init__(self, weight=1, input_dict=None, slack=4.0, **kwargs):
method adaptive_sparsity_loss (line 102) | def adaptive_sparsity_loss(self, sdfs, ts, ms_depths):
FILE: model/backbone/unet2d.py
class UpSampleBN (line 11) | class UpSampleBN(nn.Module):
method __init__ (line 12) | def __init__(self, skip_input, output_features):
method forward (line 25) | def forward(self, x, concat_with):
class DecoderBN (line 36) | class DecoderBN(nn.Module):
method __init__ (line 37) | def __init__(
method forward (line 96) | def forward(self, features):
class Encoder (line 144) | class Encoder(nn.Module):
method __init__ (line 145) | def __init__(self, backend):
method forward (line 149) | def forward(self, x):
class UNet2D (line 161) | class UNet2D(nn.Module):
method __init__ (line 162) | def __init__(self, out_feature, use_decoder=True):
method forward (line 194) | def forward(self, x, **kwargs):
method get_encoder_params (line 199) | def get_encoder_params(self): # lr/10 learning rate
method get_decoder_params (line 202) | def get_decoder_params(self): # lr learning rate
FILE: model/encoder/base_encoder.py
class BaseEncoder (line 6) | class BaseEncoder(BaseModule):
method __init__ (line 11) | def __init__(self, init_cfg=None, **kwargs):
method forward (line 14) | def forward(
FILE: model/encoder/bevformer/attention/image_cross_attention.py
class BEVCrossAttention (line 12) | class BEVCrossAttention(BaseModule):
method __init__ (line 18) | def __init__(self,
method init_weight (line 40) | def init_weight(self):
method forward (line 46) | def forward(self,
class BEVDeformableAttention (line 149) | class BEVDeformableAttention(BaseModule):
method __init__ (line 178) | def __init__(self,
method init_weights (line 226) | def init_weights(self) -> None:
method forward (line 247) | def forward(self,
FILE: model/encoder/bevformer/bevformer_encoder.py
class BEVFormerEncoder (line 18) | class BEVFormerEncoder(BaseEncoder):
method __init__ (line 20) | def __init__(
method init_weights (line 119) | def init_weights(self):
method forward_layers (line 135) | def forward_layers(
method forward (line 171) | def forward(
FILE: model/encoder/bevformer/bevformer_encoder_layer.py
class BEVFormerLayer (line 11) | class BEVFormerLayer(BaseModule):
method __init__ (line 45) | def __init__(self,
method forward (line 121) | def forward(self,
FILE: model/encoder/bevformer/bevformer_pos_embed.py
class BEVPositionalEncoding (line 7) | class BEVPositionalEncoding(BaseModule):
method __init__ (line 9) | def __init__(self, num_freqs, embed_dims,
method forward (line 33) | def forward(self):
FILE: model/encoder/bevformer/mappings.py
class LinearMapping (line 4) | class LinearMapping:
method __init__ (line 6) | def __init__(
method grid2meter (line 39) | def grid2meter(self, grid):
method meter2grid (line 97) | def meter2grid(self, meter, normalize=False):
class GridMeterMapping (line 153) | class GridMeterMapping:
method __init__ (line 155) | def __init__(
class NonLinearMapping (line 199) | class NonLinearMapping:
method __init__ (line 201) | def __init__(
method grid2meter (line 227) | def grid2meter(self, grid):
method meter2grid (line 254) | def meter2grid(self, meter, normalize=False):
FILE: model/encoder/bevformer/mappings_old.py
class GridMeterMapping (line 5) | class GridMeterMapping:
method __init__ (line 7) | def __init__(
method grid2meter (line 37) | def grid2meter(self, grid):
method meter2grid (line 64) | def meter2grid(self, meter):
FILE: model/encoder/bevformer/utils.py
function get_cross_view_ref_points (line 5) | def get_cross_view_ref_points(tpv_h, tpv_w, tpv_z, num_points_in_pillar):
function get_reference_points (line 74) | def get_reference_points(H, W, Z=8, num_points_in_pillar=4, dim='3d', bs...
function point_sampling (line 116) | def point_sampling(reference_points, img_metas):
FILE: model/encoder/tpvformer/attention/cross_view_hybrid_attention.py
class CrossViewHybridAttention (line 12) | class CrossViewHybridAttention(MultiScaleDeformableAttention):
method forward (line 17) | def forward(self,
FILE: model/encoder/tpvformer/attention/image_cross_attention.py
class TPVCrossAttention (line 8) | class TPVCrossAttention(BaseModule):
method __init__ (line 10) | def __init__(
method forward (line 71) | def forward(self,
FILE: model/encoder/tpvformer/modules/camera_se_net.py
class Mlp (line 4) | class Mlp(nn.Module):
method __init__ (line 6) | def __init__(self,
method forward (line 21) | def forward(self, x):
class SELayer (line 44) | class SELayer(nn.Module):
method __init__ (line 46) | def __init__(self, channels, act_layer=nn.ReLU, gate_layer=nn.Sigmoid):
method forward (line 49) | def forward(self, x, x_se):
class CameraAwareSE (line 52) | class CameraAwareSE(nn.Module):
method __init__ (line 54) | def __init__(
method _init_layers (line 65) | def _init_layers(self):
method init_weight (line 87) | def init_weight(self):
method forward (line 93) | def forward(self, ms_img_feats, metas):
FILE: model/encoder/tpvformer/modules/split_fpn.py
class MultiPlaneFFN (line 8) | class MultiPlaneFFN(BaseModule):
method __init__ (line 10) | def __init__(
method _init_layers (line 29) | def _init_layers(self):
method forward (line 35) | def forward(self, tpv, identity=None):
FILE: model/encoder/tpvformer/modules/split_norm.py
class MultiPlaneNorm (line 8) | class MultiPlaneNorm(BaseModule):
method __init__ (line 10) | def __init__(
method _init_layers (line 22) | def _init_layers(self):
method forward (line 28) | def forward(self, tpv):
FILE: model/encoder/tpvformer/tpvformer_encoder.py
class TPVFormerEncoder (line 20) | class TPVFormerEncoder(BaseEncoder):
method __init__ (line 22) | def __init__(
method init_weights (line 174) | def init_weights(self):
method forward_layers (line 192) | def forward_layers(
method forward (line 234) | def forward(
FILE: model/encoder/tpvformer/tpvformer_encoder_layer.py
class TPVFormerLayer (line 11) | class TPVFormerLayer(BaseModule):
method __init__ (line 45) | def __init__(self,
method forward (line 123) | def forward(self,
FILE: model/encoder/tpvformer/tpvformer_pos_embed.py
function get_feat_from_meter (line 6) | def get_feat_from_meter(num_freqs, meter):
class TPVPositionalEncoding (line 17) | class TPVPositionalEncoding(BaseModule):
method __init__ (line 19) | def __init__(
method forward (line 54) | def forward(self):
FILE: model/encoder/tpvformer/utils.py
function get_cross_view_ref_points (line 5) | def get_cross_view_ref_points(tpv_h, tpv_w, tpv_z, num_points_in_pillar,...
FILE: model/head/base_head.py
class BaseTaskHead (line 5) | class BaseTaskHead(BaseModule):
method __init__ (line 11) | def __init__(self, init_cfg=None, **kwargs):
method forward (line 14) | def forward(
FILE: model/head/nerfacc_head/bev_nerf.py
class BEVNeRF (line 8) | class BEVNeRF(BaseModule):
method __init__ (line 10) | def __init__(
method _init_layers (line 62) | def _init_layers(self):
method pre_compute_density_color (line 74) | def pre_compute_density_color(self, bev):
method query_density (line 99) | def query_density(self, x):
method forward (line 120) | def forward(self, x, condition=None):
method forward_geo (line 155) | def forward_geo(self, x):
FILE: model/head/nerfacc_head/estimator.py
class CustomOccGridEstimator (line 8) | class CustomOccGridEstimator(OccGridEstimator):
method sampling (line 11) | def sampling(
FILE: model/head/nerfacc_head/img2lidar.py
class Img2LiDAR (line 6) | class Img2LiDAR(nn.Module):
method __init__ (line 8) | def __init__(
method forward (line 25) | def forward(self, metas, rays):
FILE: model/head/nerfacc_head/nerfacc_head.py
function namedtuple_map (line 13) | def namedtuple_map(fn, tup):
class NeRFAccHead (line 21) | class NeRFAccHead(BaseTaskHead):
method __init__ (line 23) | def __init__(
method prepare (line 144) | def prepare(
method render (line 153) | def render(
method render_image_with_occgrid (line 242) | def render_image_with_occgrid(self, rays):
method forward_occ (line 346) | def forward_occ(
method get_uniform_sdf (line 366) | def get_uniform_sdf(self, aabb, resolution, device, shift=False):
method forward (line 396) | def forward(
FILE: model/head/nerfacc_head/ray_sampler.py
class RaySampler (line 5) | class RaySampler(nn.Module):
method __init__ (line 7) | def __init__(
method forward (line 48) | def forward(self):
FILE: model/head/nerfacc_head/rendering.py
function custom_rendering (line 15) | def custom_rendering(
FILE: model/head/neus_head/neus_head.py
class NeuSHead (line 22) | class NeuSHead(BaseTaskHead):
method __init__ (line 24) | def __init__(
method forward_occ (line 237) | def forward_occ(
method get_uniform_sdf (line 265) | def get_uniform_sdf(self, aabb, resolution, device, shift=False):
method prepare (line 295) | def prepare(
method render (line 308) | def render(
method forward (line 473) | def forward(
function chunk_cams (line 716) | def chunk_cams(tensor, num_cams):
FILE: model/head/utils/sh_render.py
function eval_sh_bases (line 35) | def eval_sh_bases(deg, dirs):
function SHRender (line 84) | def SHRender(xyz_sampled, viewdirs, features, deg=2, act='relu'):
FILE: model/lifter/base_lifter.py
class BaseLifter (line 6) | class BaseLifter(BaseModule):
method __init__ (line 13) | def __init__(self, init_cfg=None, **kwargs) -> None:
method forward (line 16) | def forward(
FILE: model/lifter/bev_query_lifter.py
class BEVQueryLifter (line 7) | class BEVQueryLifter(BaseLifter):
method __init__ (line 9) | def __init__(
method forward (line 23) | def forward(self, ms_img_feats, *args, **kwargs):
FILE: model/lifter/tpv_pos_lifter.py
function get_feat_from_meter (line 6) | def get_feat_from_meter(num_freqs, meter):
class TPVPositionLifter (line 18) | class TPVPositionLifter(BaseLifter):
method __init__ (line 20) | def __init__(
method forward (line 81) | def forward(self, ms_img_feats, *args, **kwargs):
FILE: model/lifter/tpv_query_lifter.py
class TPVQueryLifter (line 7) | class TPVQueryLifter(BaseLifter):
method __init__ (line 9) | def __init__(
method forward (line 27) | def forward(self, ms_img_feats, *args, **kwargs):
FILE: model/neck/identity_neck.py
class IdentityNeck (line 6) | class IdentityNeck(BaseModule):
method __init__ (line 8) | def __init__(self, init_cfg = None):
method forward (line 11) | def forward(self, inputs):
FILE: model/segmentor/base_segmentor.py
class CustomBaseSegmentor (line 7) | class CustomBaseSegmentor(BaseModule):
method __init__ (line 9) | def __init__(
method extract_img_feat (line 34) | def extract_img_feat(self, imgs, **kwargs):
method forward (line 51) | def forward(
FILE: model/segmentor/tpv_segmentor.py
class TPVSegmentor (line 10) | class TPVSegmentor(CustomBaseSegmentor):
method __init__ (line 12) | def __init__(
method extract_img_feat (line 36) | def extract_img_feat(self, imgs, metas, **kwargs):
method forward_extra_img_backbone (line 71) | def forward_extra_img_backbone(self, imgs, **kwargs):
method forward (line 87) | def forward(self,
FILE: train.py
function pass_print (line 18) | def pass_print(*args, **kwargs):
function main (line 21) | def main(local_rank, args):
FILE: utils/config_tools.py
function modify_for_eval (line 10) | def modify_for_eval(cfg, dataset='nuscenes', novel_depth=False, args=None):
FILE: utils/feat_tools.py
function multi2single_scale (line 4) | def multi2single_scale(
FILE: utils/metric_util.py
function cityscapes2semantickitti (line 10) | def cityscapes2semantickitti(sem):
function openseed2nuscenes (line 37) | def openseed2nuscenes(sem):
class MeanIoU (line 66) | class MeanIoU:
method __init__ (line 68) | def __init__(self,
method reset (line 85) | def reset(self) -> None:
method _after_step (line 90) | def _after_step(self, outputs, targets, mask=None):
method _after_epoch (line 122) | def _after_epoch(self):
class IoU (line 168) | class IoU(nn.Module):
method __init__ (line 170) | def __init__(self, use_mask=False):
method reset (line 186) | def reset(self) -> None:
method _after_step (line 191) | def _after_step(self, outputs, targets, occ3d=False):
method _after_step_occ3d (line 203) | def _after_step_occ3d(self, outputs, targets):
method _after_epoch (line 221) | def _after_epoch(self):
function cal_depth_metric (line 247) | def cal_depth_metric(depth_pred, depth_gt):
class DepthMetric (line 282) | class DepthMetric(nn.Module):
method __init__ (line 284) | def __init__(self, camera_names=['front'], eval_types=['raw', 'median']):
method _reset (line 300) | def _reset(self):
method _after_step (line 311) | def _after_step(self, depth_loc, depth_gt, depth_mask, depth_pred):
method _after_epoch (line 351) | def _after_epoch(self):
function compute_depth_errors (line 400) | def compute_depth_errors(gt, pred, min_depth=1e-3, max_depth=80):
function compute_depth_errors_torch (line 424) | def compute_depth_errors_torch(gt, pred, min_depth=1e-3, max_depth=80):
FILE: utils/misc.py
function _load_model_from_metafile (line 12) | def _load_model_from_metafile(model: str, scope: str = 'mmseg'):
function _init_model (line 49) | def _init_model(
function generate_segmentation_map_2d (line 85) | def generate_segmentation_map_2d(model_2d, inputs):
function read_segmentation_map_2d (line 122) | def read_segmentation_map_2d(map_path, img_metas):
FILE: utils/openseed_utils.py
function build_openseed_model (line 12) | def build_openseed_model():
function forward_openseed_model (line 54) | def forward_openseed_model(model, img, size):
FILE: utils/scenerf_metric.py
function get_iou (line 7) | def get_iou(iou_sum, cnt_class):
function get_accuracy (line 17) | def get_accuracy(predict, target, weight=None): # 0.05s
class SSCMetrics (line 39) | class SSCMetrics:
method __init__ (line 40) | def __init__(self, n_classes):
method hist_info (line 44) | def hist_info(self, n_cl, pred, gt):
method compute_score (line 59) | def compute_score(hist, correct, labeled):
method add_batch (line 69) | def add_batch(self, y_pred, y_true, nonempty=None, nonsurface=None):
method get_stats (line 93) | def get_stats(self):
method reset (line 120) | def reset(self):
method get_score_completion (line 140) | def get_score_completion(self, predict, target, nonempty=None):
method get_score_semantic_and_completion (line 177) | def get_score_semantic_and_completion(self, predict, target, nonempty=...
FILE: utils/tb_wrapper.py
class WrappedTBWriter (line 4) | class WrappedTBWriter(SummaryWriter, ManagerMixin):
method __init__ (line 6) | def __init__(self, name, **kwargs):
FILE: utils/temporal_pkl.py
function arange_according_to_scene (line 4) | def arange_according_to_scene(infos, nusc):
FILE: vis_3d.py
function pass_print (line 37) | def pass_print(*args, **kwargs):
function read_semantic_kitti (line 42) | def read_semantic_kitti(metas, idx):
function read_occ3d_label (line 65) | def read_occ3d_label(metas, nusc, idx):
function get_grid_coords (line 75) | def get_grid_coords(dims, resolution):
function draw (line 97) | def draw(
function main (line 344) | def main(local_rank, args):
FILE: vis_3d_scene.py
function pass_print (line 38) | def pass_print(*args, **kwargs):
function read_semantic_kitti (line 43) | def read_semantic_kitti(metas, idx):
function read_occ3d_label (line 66) | def read_occ3d_label(metas, nusc, idx):
function draw (line 76) | def draw(
function main (line 228) | def main(local_rank, args):
FILE: vis_pics.py
function pass_print (line 26) | def pass_print(*args, **kwargs):
function create_label_colormap (line 29) | def create_label_colormap():
function save_depth_map (line 56) | def save_depth_map(disp_resized_np, save_path):
function main (line 65) | def main(local_rank, args):
Copy disabled (too large)
Download .json
Condensed preview — 181 files, each showing path, character count, and a content snippet. Download the .json file for the full structured content (10,061K chars).
[
{
"path": ".gitignore",
"chars": 3104,
"preview": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packagi"
},
{
"path": "LICENSE",
"chars": 11357,
"preview": " Apache License\n Version 2.0, January 2004\n "
},
{
"path": "README.md",
"chars": 5811,
"preview": "# SelfOcc: Self-Supervised Vision-Based 3D Occupancy Prediction\n### [Paper](https://arxiv.org/pdf/2311.12754) | [Projec"
},
{
"path": "assets/.placeholder",
"chars": 1,
"preview": "\n"
},
{
"path": "config/_base_/dataset_v1.py",
"chars": 542,
"preview": "data_path = 'data/nuscenes/'\n\ntrain_wrapper_config = dict(\n phase='train', \n scale_rate=0.5,\n photometric_aug=d"
},
{
"path": "config/_base_/optimizer.py",
"chars": 237,
"preview": "optimizer = dict(\n optimizer=dict(\n type='AdamW',\n lr=2e-5,\n weight_decay=0.0001\n ),\n para"
},
{
"path": "config/_base_/schedule.py",
"chars": 32,
"preview": "max_epochs = 12\nprint_freq = 50\n"
},
{
"path": "config/kitti/kitti_novel_depth.py",
"chars": 7764,
"preview": "_base_ = [\n '../_base_/dataset_v1.py',\n '../_base_/optimizer.py',\n '../_base_/schedule.py',\n]\n\nimg_size = [370,"
},
{
"path": "config/kitti/kitti_occ.py",
"chars": 7929,
"preview": "_base_ = [\n '../_base_/dataset_v1.py',\n '../_base_/optimizer.py',\n '../_base_/schedule.py',\n]\n\nimg_size = [352,"
},
{
"path": "config/kitti_raw/kitti_raw_depth.py",
"chars": 8453,
"preview": "_base_ = [\n '../_base_/dataset_v1.py',\n '../_base_/optimizer.py',\n '../_base_/schedule.py',\n]\n\nimg_size = [370,"
},
{
"path": "config/nuscenes/nuscenes_depth.py",
"chars": 8548,
"preview": "_base_ = [\n '../_base_/dataset_v1.py',\n '../_base_/optimizer.py',\n '../_base_/schedule.py',\n]\n\n# img_size = [76"
},
{
"path": "config/nuscenes/nuscenes_novel_depth.py",
"chars": 7957,
"preview": "_base_ = [\n '../_base_/dataset_v1.py',\n '../_base_/optimizer.py',\n '../_base_/schedule.py',\n]\n\nimg_size = [768,"
},
{
"path": "config/nuscenes/nuscenes_occ.py",
"chars": 8619,
"preview": "_base_ = [\n '../_base_/dataset_v1.py',\n '../_base_/optimizer.py',\n '../_base_/schedule.py',\n]\n\nimg_size = [768,"
},
{
"path": "config/nuscenes/nuscenes_occ_bev.py",
"chars": 8739,
"preview": "_base_ = [\n '../_base_/dataset_v1.py',\n '../_base_/optimizer.py',\n '../_base_/schedule.py',\n]\n\nimg_size = [768,"
},
{
"path": "config/openseed/openseed_swint_lang.yaml",
"chars": 9845,
"preview": "# --------------------------------------------------------\n# X-Decoder -- Generalized Decoding for Pixel, Image, and Lan"
},
{
"path": "dataset/__init__.py",
"chars": 3732,
"preview": "from mmengine.registry import Registry\nOPENOCC_DATASET = Registry('openocc_dataset')\nOPENOCC_DATAWRAPPER = Registry('ope"
},
{
"path": "dataset/dataset_one_frame_eval.py",
"chars": 9690,
"preview": "import os, numpy as np, pickle, random\nfrom mmcv.image.io import imread\nfrom pyquaternion import Quaternion\nfrom copy im"
},
{
"path": "dataset/dataset_one_frame_sweeps_dist.py",
"chars": 22271,
"preview": "import os, numpy as np, random, mmengine, math\nfrom mmcv.image.io import imread\nfrom pyquaternion import Quaternion\nfrom"
},
{
"path": "dataset/dataset_one_frame_sweeps_dist_vis.py",
"chars": 6247,
"preview": "import os, numpy as np, random, mmengine, math\nfrom mmcv.image.io import imread\nfrom pyquaternion import Quaternion\nfrom"
},
{
"path": "dataset/dataset_wrapper_temporal.py",
"chars": 8075,
"preview": "\nimport numpy as np, torch\nfrom torch.utils import data\nfrom dataset.transform_3d import PadMultiViewImage, NormalizeMul"
},
{
"path": "dataset/dataset_wrapper_vis.py",
"chars": 2968,
"preview": "\nimport numpy as np, torch\nfrom torch.utils import data\nfrom dataset.transform_3d import PadMultiViewImage, NormalizeMul"
},
{
"path": "dataset/kitti/helpers.py",
"chars": 6934,
"preview": "import numpy as np\n# import scenerf.data.utils.fusion as fusion\n# import open3d as o3d\nfrom PIL import Image\n\n\n# def mak"
},
{
"path": "dataset/kitti/io_data.py",
"chars": 7383,
"preview": "\"\"\"\nMost of the code in this file is taken from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/data/io_data.py\n\"\"\""
},
{
"path": "dataset/kitti/kitti_dataset_eval.py",
"chars": 23468,
"preview": "import glob\nimport os\nimport time\n\nimport numpy as np\nimport torch\n# from torchvision import transforms\nfrom dataset.kit"
},
{
"path": "dataset/kitti/kitti_dataset_one_frame.py",
"chars": 18292,
"preview": "import glob, os, time, random\n\nimport numpy as np\nimport torch\n# from torchvision import transforms\nfrom dataset.kitti.h"
},
{
"path": "dataset/kitti/params.py",
"chars": 1050,
"preview": "import numpy as np\n\nsemantic_kitti_class_frequencies = np.array(\n [\n 5.41773033e09,\n 1.57835390e07,\n "
},
{
"path": "dataset/kitti/semantic-kitti.yaml",
"chars": 5593,
"preview": "# This file is covered by the LICENSE file in the root of this project.\nnbr_classes: 20\ngrid_dims: [256, 32, 256] # (W,"
},
{
"path": "dataset/kitti_raw/kitti_raw_dataset.py",
"chars": 23341,
"preview": "import os, time, random\nfrom pathlib import Path\nfrom copy import deepcopy\nfrom collections import Counter\n\nimport numpy"
},
{
"path": "dataset/kitti_raw/kitti_raw_dataset_stereo.py",
"chars": 25715,
"preview": "import os, time, random\nfrom pathlib import Path\nfrom copy import deepcopy\nfrom collections import Counter\n\nimport numpy"
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0001_sync.txt",
"chars": 16118,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0002_sync.txt",
"chars": 11435,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0005_sync.txt",
"chars": 22961,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0009_sync.txt",
"chars": 67151,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0011_sync.txt",
"chars": 34953,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0013_sync.txt",
"chars": 21649,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0014_sync.txt",
"chars": 46989,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0015_sync.txt",
"chars": 44622,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0017_sync.txt",
"chars": 16929,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0018_sync.txt",
"chars": 40221,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0019_sync.txt",
"chars": 72478,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0020_sync.txt",
"chars": 12756,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0022_sync.txt",
"chars": 120073,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0023_sync.txt",
"chars": 71295,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0027_sync.txt",
"chars": 28107,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0028_sync.txt",
"chars": 64999,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0029_sync.txt",
"chars": 65060,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0032_sync.txt",
"chars": 58646,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0035_sync.txt",
"chars": 19411,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0036_sync.txt",
"chars": 121989,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0039_sync.txt",
"chars": 59671,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0046_sync.txt",
"chars": 18586,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0048_sync.txt",
"chars": 3258,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0051_sync.txt",
"chars": 65448,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0052_sync.txt",
"chars": 11543,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0056_sync.txt",
"chars": 44489,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0057_sync.txt",
"chars": 53463,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0059_sync.txt",
"chars": 55857,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0060_sync.txt",
"chars": 11553,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0061_sync.txt",
"chars": 105707,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0064_sync.txt",
"chars": 85569,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0070_sync.txt",
"chars": 63546,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0079_sync.txt",
"chars": 14866,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0084_sync.txt",
"chars": 57460,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0086_sync.txt",
"chars": 107196,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0087_sync.txt",
"chars": 108988,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0091_sync.txt",
"chars": 50939,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0093_sync.txt",
"chars": 64924,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0095_sync.txt",
"chars": 40187,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0096_sync.txt",
"chars": 71614,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0101_sync.txt",
"chars": 142239,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0104_sync.txt",
"chars": 46664,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0106_sync.txt",
"chars": 33801,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0113_sync.txt",
"chars": 12888,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0117_sync.txt",
"chars": 99432,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_28/2011_09_28_drive_0001_sync.txt",
"chars": 15814,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_28/2011_09_28_drive_0002_sync.txt",
"chars": 56023,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_29/2011_09_29_drive_0004_sync.txt",
"chars": 50994,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_29/2011_09_29_drive_0026_sync.txt",
"chars": 23519,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_29/2011_09_29_drive_0071_sync.txt",
"chars": 160106,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_30/2011_09_30_drive_0016_sync.txt",
"chars": 42043,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_30/2011_09_30_drive_0018_sync.txt",
"chars": 417932,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_30/2011_09_30_drive_0020_sync.txt",
"chars": 166624,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_30/2011_09_30_drive_0027_sync.txt",
"chars": 166524,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_30/2011_09_30_drive_0028_sync.txt",
"chars": 790379,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_30/2011_09_30_drive_0033_sync.txt",
"chars": 243089,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_30/2011_09_30_drive_0034_sync.txt",
"chars": 187450,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_10_03/2011_10_03_drive_0027_sync.txt",
"chars": 689623,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_10_03/2011_10_03_drive_0034_sync.txt",
"chars": 715366,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_10_03/2011_10_03_drive_0042_sync.txt",
"chars": 179843,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_10_03/2011_10_03_drive_0047_sync.txt",
"chars": 126027,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/splits/eigen/test_files.txt",
"chars": 35547,
"preview": "2011_09_26/2011_09_26_drive_0002_sync 0000000069 l\n2011_09_26/2011_09_26_drive_0002_sync 0000000054 l\n2011_09_26/2011_09"
},
{
"path": "dataset/kitti_raw/splits/eigen_zhou/test_files.txt",
"chars": 35547,
"preview": "2011_09_26/2011_09_26_drive_0002_sync 0000000069 l\n2011_09_26/2011_09_26_drive_0002_sync 0000000054 l\n2011_09_26/2011_09"
},
{
"path": "dataset/kitti_raw/splits/eigen_zhou/train_files.txt",
"chars": 1762076,
"preview": "2011_09_26/2011_09_26_drive_0022_sync 473 r\n2011_09_29/2011_09_29_drive_0026_sync 1 l\n2011_09_26/2011_09_26_drive_0087_s"
},
{
"path": "dataset/kitti_raw/splits/eigen_zhou/val_files.txt",
"chars": 195800,
"preview": "2011_09_26/2011_09_26_drive_0028_sync 82 r\n2011_09_26/2011_09_26_drive_0022_sync 247 r\n2011_10_03/2011_10_03_drive_0042_"
},
{
"path": "dataset/kitti_raw/splits/tulsiani/test_files.txt",
"chars": 55028,
"preview": "2011_09_26/2011_09_26_drive_0001_sync 0000000000 l\n2011_09_26/2011_09_26_drive_0001_sync 0000000001 l\n2011_09_26/2011_09"
},
{
"path": "dataset/kitti_raw/splits/tulsiani/train_files.txt",
"chars": 610877,
"preview": "2011_09_26/2011_09_26_drive_0005_sync 0000000000 l\n2011_09_26/2011_09_26_drive_0005_sync 0000000001 l\n2011_09_26/2011_09"
},
{
"path": "dataset/kitti_raw/splits/tulsiani/val_files.txt",
"chars": 63392,
"preview": "2011_09_26/2011_09_26_drive_0048_sync 0000000000 l\n2011_09_26/2011_09_26_drive_0048_sync 0000000001 l\n2011_09_26/2011_09"
},
{
"path": "dataset/loading.py",
"chars": 10842,
"preview": "import os.path\nimport mmcv\nimport numpy as np\nfrom mmengine import BaseStorageBackend, FileClient\nfrom typing import Uni"
},
{
"path": "dataset/sampler.py",
"chars": 5887,
"preview": "import math\nfrom typing import TypeVar, Optional, Iterator\n\nimport torch\nfrom torch.utils.data import Sampler, Dataset\ni"
},
{
"path": "dataset/transform_3d.py",
"chars": 16837,
"preview": "from typing import Any\nimport numpy as np\nfrom numpy import random\nimport mmcv\n# from mmcv.parallel import DataContainer"
},
{
"path": "dataset/utils.py",
"chars": 602,
"preview": "import numpy as np\n\n\ndef get_rm(angle, axis, deg=False):\n if deg:\n angle = np.deg2rad(angle)\n rm = np.eye(3"
},
{
"path": "docs/get_started.md",
"chars": 3441,
"preview": "<!-- template from bevformer -->\n\n# Prerequisites\n\n**Please ensure you have prepared the environment and datasets.**\n\n[2"
},
{
"path": "docs/installation.md",
"chars": 2648,
"preview": "<!-- template from bevformer -->\n# Step-by-step installation instructions\n\nThe following configuration of conda environm"
},
{
"path": "docs/prepare_data.md",
"chars": 4302,
"preview": "<!-- template from bevformer -->\n\n## NuScenes\n\n**a. Download nuScenes data.**\n\nDownload nuScenes V1.0 full dataset data "
},
{
"path": "docs/visualization.md",
"chars": 1527,
"preview": "<!-- TODO -->\n\n# Visualization Guide\n\n## Install packages\n\n```\npip install pyvirtualdisplay mayavi PyQt5\n```\n\n## 3D Occu"
},
{
"path": "eval_depth.py",
"chars": 11010,
"preview": "import time, argparse, os.path as osp, os, sys\nimport torch, numpy as np\nimport torch.distributed as dist\n\nimport mmcv\nf"
},
{
"path": "eval_iou.py",
"chars": 13909,
"preview": "import time, argparse, os.path as osp, os\nimport torch, numpy as np\nimport torch.distributed as dist\nimport torch.nn.fun"
},
{
"path": "eval_iou_kitti.py",
"chars": 9251,
"preview": "import time, argparse, os.path as osp, os\nimport torch, numpy as np\nimport torch.distributed as dist\n\nimport mmcv\nfrom m"
},
{
"path": "eval_novel_depth.py",
"chars": 15410,
"preview": "import time, argparse, os.path as osp\nimport os, math, pickle\nimport torch, numpy as np\nimport torch.distributed as dist"
},
{
"path": "eval_novel_depth_kitti.py",
"chars": 12811,
"preview": "import time, argparse, os.path as osp\nimport os, math, pickle\nimport torch, numpy as np\nimport torch.distributed as dist"
},
{
"path": "examine_sweeps.py",
"chars": 4710,
"preview": "#### check nuscenes sweeps\nfrom nuscenes import NuScenes\nimport mmengine\nfrom copy import deepcopy\nimport numpy as np\n\nn"
},
{
"path": "generate_videos.py",
"chars": 4751,
"preview": "import os, os.path as osp\nimport PIL.Image as Image\nimport cv2\nimport argparse\n\ndef cat_images(dir, cam_img_size, pred_i"
},
{
"path": "loss/__init__.py",
"chars": 537,
"preview": "from mmengine.registry import Registry\nOPENOCC_LOSS = Registry('openocc_loss')\n\nfrom .multi_loss import MultiLoss\nfrom ."
},
{
"path": "loss/base_loss.py",
"chars": 1278,
"preview": "import torch.nn as nn\nfrom utils.tb_wrapper import WrappedTBWriter\nif 'selfocc' in WrappedTBWriter._instance_dict:\n w"
},
{
"path": "loss/edge_loss_3d_ms.py",
"chars": 3022,
"preview": "import torch\nfrom .base_loss import BaseLoss\nfrom . import OPENOCC_LOSS\nimport torch.nn.functional as F\n\n\ndef get_smooth"
},
{
"path": "loss/eikonal_loss.py",
"chars": 605,
"preview": "from .base_loss import BaseLoss\nfrom . import OPENOCC_LOSS\n\n\n@OPENOCC_LOSS.register_module()\nclass EikonalLoss(BaseLoss)"
},
{
"path": "loss/multi_loss.py",
"chars": 1394,
"preview": "import torch.nn as nn\nfrom . import OPENOCC_LOSS\nfrom utils.tb_wrapper import WrappedTBWriter\nif 'selfocc' in WrappedTBW"
},
{
"path": "loss/reproj_loss_mono_multi_new.py",
"chars": 13361,
"preview": "import torch.nn as nn, torch\nfrom .base_loss import BaseLoss\nfrom . import OPENOCC_LOSS\nimport numpy as np\nimport torch."
},
{
"path": "loss/reproj_loss_mono_multi_new_combine.py",
"chars": 10409,
"preview": "import torch.nn as nn, torch\nfrom .base_loss import BaseLoss\nfrom . import OPENOCC_LOSS\nimport numpy as np\nimport torch."
},
{
"path": "loss/rgb_loss_ms.py",
"chars": 6788,
"preview": "import torch.nn as nn, torch\nfrom .base_loss import BaseLoss\nfrom . import OPENOCC_LOSS\nimport torch.nn.functional as F\n"
},
{
"path": "loss/second_grad_loss.py",
"chars": 537,
"preview": "from .base_loss import BaseLoss\nfrom . import OPENOCC_LOSS\n\n\n@OPENOCC_LOSS.register_module()\nclass SecondGradLoss(BaseLo"
},
{
"path": "loss/sparsity_loss.py",
"chars": 3493,
"preview": "from .base_loss import BaseLoss\nfrom . import OPENOCC_LOSS\nimport torch, numpy as np\n\n\n@OPENOCC_LOSS.register_module()\nc"
},
{
"path": "model/__init__.py",
"chars": 134,
"preview": "from .backbone import *\nfrom .neck import *\nfrom .lifter import *\nfrom .encoder import *\nfrom .head import *\nfrom .segme"
},
{
"path": "model/backbone/__init__.py",
"chars": 63,
"preview": "from mmseg.models.backbones import *\nfrom .unet2d import UNet2D"
},
{
"path": "model/backbone/unet2d.py",
"chars": 7355,
"preview": "\"\"\"\nCode adapted from https://github.com/shariqfarooq123/AdaBins/blob/main/models/unet_adaptive_bins.py\n\"\"\"\nimport torch"
},
{
"path": "model/encoder/__init__.py",
"chars": 49,
"preview": "from .bevformer import *\nfrom .tpvformer import *"
},
{
"path": "model/encoder/base_encoder.py",
"chars": 471,
"preview": "from mmseg.registry import MODELS\nfrom mmengine.model import BaseModule\n\n\n@MODELS.register_module()\nclass BaseEncoder(Ba"
},
{
"path": "model/encoder/bevformer/__init__.py",
"chars": 179,
"preview": "from .bevformer_encoder import BEVFormerEncoder\nfrom .bevformer_pos_embed import BEVPositionalEncoding\nfrom .bevformer_e"
},
{
"path": "model/encoder/bevformer/attention/__init__.py",
"chars": 76,
"preview": "from .image_cross_attention import BEVCrossAttention, BEVDeformableAttention"
},
{
"path": "model/encoder/bevformer/attention/image_cross_attention.py",
"chars": 15352,
"preview": "\r\nimport warnings\r\nimport torch, math\r\nimport torch.nn as nn\r\nfrom mmengine.model import xavier_init, constant_init, Bas"
},
{
"path": "model/encoder/bevformer/bevformer_encoder.py",
"chars": 8174,
"preview": "from mmseg.registry import MODELS\nfrom mmcv.cnn.bricks.transformer import build_positional_encoding, build_transformer_l"
},
{
"path": "model/encoder/bevformer/bevformer_encoder_layer.py",
"chars": 9243,
"preview": "\r\nimport copy, torch\r\nfrom mmengine import ConfigDict\r\nfrom mmengine.model import BaseModule, ModuleList\r\nfrom mmengine."
},
{
"path": "model/encoder/bevformer/bevformer_pos_embed.py",
"chars": 2463,
"preview": "import torch, torch.nn as nn\nfrom mmengine.model import BaseModule\nfrom mmengine.registry import MODELS\n\n\n@MODELS.regist"
},
{
"path": "model/encoder/bevformer/mappings.py",
"chars": 11591,
"preview": "import torch\nfrom typing_extensions import Literal\n\nclass LinearMapping:\n\n def __init__(\n self,\n "
},
{
"path": "model/encoder/bevformer/mappings_old.py",
"chars": 4864,
"preview": "import torch\nfrom typing_extensions import Literal\n\n\nclass GridMeterMapping:\n\n def __init__(\n self,\n be"
},
{
"path": "model/encoder/bevformer/utils.py",
"chars": 10131,
"preview": "import torch\nimport numpy as np\n\n\ndef get_cross_view_ref_points(tpv_h, tpv_w, tpv_z, num_points_in_pillar):\n # ref po"
},
{
"path": "model/encoder/tpvformer/__init__.py",
"chars": 202,
"preview": "from .tpvformer_encoder import TPVFormerEncoder\nfrom .tpvformer_pos_embed import TPVPositionalEncoding\nfrom .tpvformer_e"
},
{
"path": "model/encoder/tpvformer/attention/__init__.py",
"chars": 118,
"preview": "from .image_cross_attention import TPVCrossAttention\nfrom .cross_view_hybrid_attention import CrossViewHybridAttention"
},
{
"path": "model/encoder/tpvformer/attention/cross_view_hybrid_attention.py",
"chars": 5701,
"preview": "from typing import Optional, no_type_check\nfrom mmcv.ops.multi_scale_deform_attn import \\\n MultiScaleDeformableAttent"
},
{
"path": "model/encoder/tpvformer/attention/image_cross_attention.py",
"chars": 3234,
"preview": "\r\nfrom mmengine.model import BaseModule\r\nfrom mmengine.registry import MODELS\r\nfrom mmcv.cnn.bricks.transformer import b"
},
{
"path": "model/encoder/tpvformer/modules/__init__.py",
"chars": 116,
"preview": "from .split_fpn import MultiPlaneFFN\nfrom .split_norm import MultiPlaneNorm\nfrom .camera_se_net import CameraAwareSE"
},
{
"path": "model/encoder/tpvformer/modules/camera_se_net.py",
"chars": 4596,
"preview": "import torch, torch.nn as nn\nimport numpy as np\n\nclass Mlp(nn.Module):\n\n def __init__(self,\n in_featu"
},
{
"path": "model/encoder/tpvformer/modules/split_fpn.py",
"chars": 1198,
"preview": "import torch, torch.nn as nn\nfrom mmengine.model import BaseModule\nfrom mmengine.registry import MODELS\nfrom mmcv.cnn.br"
},
{
"path": "model/encoder/tpvformer/modules/split_norm.py",
"chars": 844,
"preview": "from mmengine.registry import MODELS\nfrom mmengine.model import BaseModule\nfrom mmcv.cnn import build_norm_layer\nimport "
},
{
"path": "model/encoder/tpvformer/tpvformer_encoder.py",
"chars": 11750,
"preview": "from mmseg.registry import MODELS\nfrom mmcv.cnn.bricks.transformer import build_positional_encoding, build_transformer_l"
},
{
"path": "model/encoder/tpvformer/tpvformer_encoder_layer.py",
"chars": 9859,
"preview": "\r\nimport copy, torch\r\nfrom mmengine import ConfigDict\r\nfrom mmengine.model import BaseModule, ModuleList\r\nfrom mmengine."
},
{
"path": "model/encoder/tpvformer/tpvformer_pos_embed.py",
"chars": 3554,
"preview": "import torch, torch.nn as nn\nfrom mmengine.model import BaseModule\nfrom mmengine.registry import MODELS\n\n\ndef get_feat_f"
},
{
"path": "model/encoder/tpvformer/utils.py",
"chars": 4298,
"preview": "import torch\nimport numpy as np\n\n\ndef get_cross_view_ref_points(tpv_h, tpv_w, tpv_z, num_points_in_pillar, offset=0):\n "
},
{
"path": "model/head/__init__.py",
"chars": 57,
"preview": "import nerfacc\nfrom .neus_head.neus_head import NeuSHead\n"
},
{
"path": "model/head/base_head.py",
"chars": 503,
"preview": "from mmengine.model import BaseModule\nfrom mmseg.models import HEADS\n\n@HEADS.register_module()\nclass BaseTaskHead(BaseMo"
},
{
"path": "model/head/nerfacc_head/bev_nerf.py",
"chars": 6512,
"preview": "import torch, torch.nn as nn\nfrom mmengine.model import BaseModule\nfrom ...encoder.bevformer.mappings import GridMeterMa"
},
{
"path": "model/head/nerfacc_head/estimator.py",
"chars": 6467,
"preview": "from nerfacc import OccGridEstimator\nimport torch\nfrom torch import Tensor\nfrom typing import Callable, Optional, Tuple\n"
},
{
"path": "model/head/nerfacc_head/img2lidar.py",
"chars": 2542,
"preview": "import torch, torch.nn as nn\nimport numpy as np, os\nfrom dataset.utils import get_rm\n\n\nclass Img2LiDAR(nn.Module):\n\n "
},
{
"path": "model/head/nerfacc_head/nerfacc_head.py",
"chars": 18413,
"preview": "import os\nfrom ..base_head import BaseTaskHead\nfrom .ray_sampler import RaySampler\nfrom .img2lidar import Img2LiDAR\nfrom"
},
{
"path": "model/head/nerfacc_head/ray_sampler.py",
"chars": 3212,
"preview": "import torch, torch.nn as nn\nimport numpy as np\n\n\nclass RaySampler(nn.Module):\n\n def __init__(\n self,\n "
},
{
"path": "model/head/nerfacc_head/rendering.py",
"chars": 6751,
"preview": "\"\"\"\nModified from NeRFAcc.\nCopyright (c) 2022 Ruilong Li, UC Berkeley.\n\"\"\"\n\nfrom typing import Callable, Dict, Optional,"
},
{
"path": "model/head/neus_head/neus_head.py",
"chars": 31708,
"preview": "import torch, torch.distributed as dist, torch.nn as nn, os\nfrom nerfstudio.models.neus_custom import NeuSCustomModelCon"
},
{
"path": "model/head/utils/sh_render.py",
"chars": 3199,
"preview": "import torch\n\n################## sh function ##################\nC0 = 0.28209479177387814\nC1 = 0.4886025119029199\nC2 = [\n"
},
{
"path": "model/lifter/__init__.py",
"chars": 135,
"preview": "from .bev_query_lifter import BEVQueryLifter\nfrom .tpv_query_lifter import TPVQueryLifter\nfrom .tpv_pos_lifter import TP"
},
{
"path": "model/lifter/base_lifter.py",
"chars": 522,
"preview": "from mmseg.registry import MODELS\nfrom mmengine.model import BaseModule\n\n\n@MODELS.register_module()\nclass BaseLifter(Bas"
},
{
"path": "model/lifter/bev_query_lifter.py",
"chars": 682,
"preview": "from .base_lifter import BaseLifter\nfrom mmseg.registry import MODELS\nimport torch, torch.nn as nn\n\n\n@MODELS.register_mo"
},
{
"path": "model/lifter/tpv_pos_lifter.py",
"chars": 3789,
"preview": "from .base_lifter import BaseLifter\nfrom mmseg.registry import MODELS\nimport torch, torch.nn as nn\nfrom model.encoder.be"
},
{
"path": "model/lifter/tpv_query_lifter.py",
"chars": 1176,
"preview": "from .base_lifter import BaseLifter\nfrom mmseg.registry import MODELS\nimport torch, torch.nn as nn\n\n\n@MODELS.register_mo"
},
{
"path": "model/neck/__init__.py",
"chars": 115,
"preview": "from mmseg.models.necks import *\nfrom mmdet3d.models.necks import SECONDFPN\nfrom .identity_neck import IdentityNeck"
},
{
"path": "model/neck/identity_neck.py",
"chars": 571,
"preview": "from mmseg.registry import MODELS\nfrom mmengine.model import BaseModule\n\n\n@MODELS.register_module()\nclass IdentityNeck(B"
},
{
"path": "model/segmentor/__init__.py",
"chars": 40,
"preview": "from .tpv_segmentor import TPVSegmentor\n"
},
{
"path": "model/segmentor/base_segmentor.py",
"chars": 1672,
"preview": "from mmseg.models import SEGMENTORS, builder\nfrom mmdet3d.registry import MODELS\nfrom mmengine.model import BaseModule\n\n"
},
{
"path": "model/segmentor/tpv_segmentor.py",
"chars": 4560,
"preview": "import torch, torch.nn as nn\nimport numpy as np\nfrom mmseg.models import SEGMENTORS\nfrom mmseg.models import build_backb"
},
{
"path": "train.py",
"chars": 16480,
"preview": "import time, argparse, os.path as osp, os\nimport torch, numpy as np\nimport torch.distributed as dist\nfrom copy import de"
},
{
"path": "utils/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "utils/config_tools.py",
"chars": 3535,
"preview": "NUM_RAYS = {\n # 'nuscenes': [96, 200],\n # 'nuscenes': [384, 800],\n 'nuscenes': [450, 800],\n 'kitti': [176, 6"
},
{
"path": "utils/feat_tools.py",
"chars": 1273,
"preview": "import torch\nimport torch.nn.functional as F\n\ndef multi2single_scale(\n ms_feats, imgs=None, \n max_shape=No"
},
{
"path": "utils/metric_util.py",
"chars": 16923,
"preview": "\nimport numpy as np\n# from mmseg.utils import get_root_logger\nfrom mmengine import MMLogger\nlogger = MMLogger.get_instan"
},
{
"path": "utils/misc.py",
"chars": 4917,
"preview": "import os.path as osp\nimport torch, numpy as np\nimport torch.nn.functional as F\nfrom mmengine import Config\nfrom mmengin"
},
{
"path": "utils/openseed_utils.py",
"chars": 2520,
"preview": "import os, numpy as np\n\nfrom openseed.BaseModel import BaseModel\nfrom openseed import build_model\nfrom openseed.utils.ar"
},
{
"path": "utils/scenerf_metric.py",
"chars": 8091,
"preview": "\"\"\"\nPart of the code is taken from https://github.com/waterljwant/SSC/blob/master/sscMetrics.py\n\"\"\"\nimport numpy as np\ni"
},
{
"path": "utils/tb_wrapper.py",
"chars": 273,
"preview": "from torch.utils.tensorboard import SummaryWriter\nfrom mmengine.utils import ManagerMixin\n\nclass WrappedTBWriter(Summary"
},
{
"path": "utils/temporal_pkl.py",
"chars": 1956,
"preview": "import pickle\nimport argparse\n\ndef arange_according_to_scene(infos, nusc):\n scenes = dict()\n\n for i, info in enume"
},
{
"path": "vis_3d.py",
"chars": 28116,
"preview": "import os\noffscreen = False\nif os.environ.get('DISP', 'f') == 'f':\n from pyvirtualdisplay import Display\n display "
},
{
"path": "vis_3d_scene.py",
"chars": 25768,
"preview": "import os\noffscreen = False\nif os.environ.get('DISP', 'f') == 'f':\n from pyvirtualdisplay import Display\n display "
},
{
"path": "vis_pics.py",
"chars": 13295,
"preview": "import os\nimport time, argparse, math, os.path as osp, numpy as np\nimport torch\nimport torch.distributed as dist\nimport "
}
]
About this extraction
This page contains the full source code of the huang-yh/SelfOcc GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 181 files (9.5 MB), approximately 2.5M tokens, and a symbol index with 448 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — a free GitHub-repo-to-text converter for AI. Built by Nikandr Surkov.