Showing preview only (9,959K chars total). Download the full file or copy to clipboard to get everything.
Repository: huang-yh/SelfOcc
Branch: main
Commit: 1658c2afd464
Files: 181
Total size: 9.5 MB
Directory structure:
gitextract_ciq3vmv7/
├── .gitignore
├── LICENSE
├── README.md
├── assets/
│ └── .placeholder
├── config/
│ ├── _base_/
│ │ ├── dataset_v1.py
│ │ ├── optimizer.py
│ │ └── schedule.py
│ ├── kitti/
│ │ ├── kitti_novel_depth.py
│ │ └── kitti_occ.py
│ ├── kitti_raw/
│ │ └── kitti_raw_depth.py
│ ├── nuscenes/
│ │ ├── nuscenes_depth.py
│ │ ├── nuscenes_novel_depth.py
│ │ ├── nuscenes_occ.py
│ │ └── nuscenes_occ_bev.py
│ └── openseed/
│ └── openseed_swint_lang.yaml
├── dataset/
│ ├── __init__.py
│ ├── dataset_one_frame_eval.py
│ ├── dataset_one_frame_sweeps_dist.py
│ ├── dataset_one_frame_sweeps_dist_vis.py
│ ├── dataset_wrapper_temporal.py
│ ├── dataset_wrapper_vis.py
│ ├── kitti/
│ │ ├── helpers.py
│ │ ├── io_data.py
│ │ ├── kitti_dataset_eval.py
│ │ ├── kitti_dataset_one_frame.py
│ │ ├── params.py
│ │ └── semantic-kitti.yaml
│ ├── kitti_raw/
│ │ ├── kitti_raw_dataset.py
│ │ ├── kitti_raw_dataset_stereo.py
│ │ ├── orb-slam_poses/
│ │ │ ├── 2011_09_26/
│ │ │ │ ├── 2011_09_26_drive_0001_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0002_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0005_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0009_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0011_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0013_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0014_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0015_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0017_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0018_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0019_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0020_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0022_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0023_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0027_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0028_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0029_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0032_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0035_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0036_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0039_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0046_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0048_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0051_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0052_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0056_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0057_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0059_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0060_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0061_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0064_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0070_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0079_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0084_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0086_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0087_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0091_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0093_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0095_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0096_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0101_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0104_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0106_sync.txt
│ │ │ │ ├── 2011_09_26_drive_0113_sync.txt
│ │ │ │ └── 2011_09_26_drive_0117_sync.txt
│ │ │ ├── 2011_09_28/
│ │ │ │ ├── 2011_09_28_drive_0001_sync.txt
│ │ │ │ └── 2011_09_28_drive_0002_sync.txt
│ │ │ ├── 2011_09_29/
│ │ │ │ ├── 2011_09_29_drive_0004_sync.txt
│ │ │ │ ├── 2011_09_29_drive_0026_sync.txt
│ │ │ │ └── 2011_09_29_drive_0071_sync.txt
│ │ │ ├── 2011_09_30/
│ │ │ │ ├── 2011_09_30_drive_0016_sync.txt
│ │ │ │ ├── 2011_09_30_drive_0018_sync.txt
│ │ │ │ ├── 2011_09_30_drive_0020_sync.txt
│ │ │ │ ├── 2011_09_30_drive_0027_sync.txt
│ │ │ │ ├── 2011_09_30_drive_0028_sync.txt
│ │ │ │ ├── 2011_09_30_drive_0033_sync.txt
│ │ │ │ └── 2011_09_30_drive_0034_sync.txt
│ │ │ └── 2011_10_03/
│ │ │ ├── 2011_10_03_drive_0027_sync.txt
│ │ │ ├── 2011_10_03_drive_0034_sync.txt
│ │ │ ├── 2011_10_03_drive_0042_sync.txt
│ │ │ └── 2011_10_03_drive_0047_sync.txt
│ │ └── splits/
│ │ ├── eigen/
│ │ │ └── test_files.txt
│ │ ├── eigen_zhou/
│ │ │ ├── test_files.txt
│ │ │ ├── train_files.txt
│ │ │ └── val_files.txt
│ │ └── tulsiani/
│ │ ├── test_files.txt
│ │ ├── train_files.txt
│ │ └── val_files.txt
│ ├── loading.py
│ ├── sampler.py
│ ├── transform_3d.py
│ └── utils.py
├── docs/
│ ├── get_started.md
│ ├── installation.md
│ ├── prepare_data.md
│ └── visualization.md
├── eval_depth.py
├── eval_iou.py
├── eval_iou_kitti.py
├── eval_novel_depth.py
├── eval_novel_depth_kitti.py
├── examine_sweeps.py
├── generate_videos.py
├── loss/
│ ├── __init__.py
│ ├── base_loss.py
│ ├── edge_loss_3d_ms.py
│ ├── eikonal_loss.py
│ ├── multi_loss.py
│ ├── reproj_loss_mono_multi_new.py
│ ├── reproj_loss_mono_multi_new_combine.py
│ ├── rgb_loss_ms.py
│ ├── second_grad_loss.py
│ └── sparsity_loss.py
├── model/
│ ├── __init__.py
│ ├── backbone/
│ │ ├── __init__.py
│ │ └── unet2d.py
│ ├── encoder/
│ │ ├── __init__.py
│ │ ├── base_encoder.py
│ │ ├── bevformer/
│ │ │ ├── __init__.py
│ │ │ ├── attention/
│ │ │ │ ├── __init__.py
│ │ │ │ └── image_cross_attention.py
│ │ │ ├── bevformer_encoder.py
│ │ │ ├── bevformer_encoder_layer.py
│ │ │ ├── bevformer_pos_embed.py
│ │ │ ├── mappings.py
│ │ │ ├── mappings_old.py
│ │ │ └── utils.py
│ │ └── tpvformer/
│ │ ├── __init__.py
│ │ ├── attention/
│ │ │ ├── __init__.py
│ │ │ ├── cross_view_hybrid_attention.py
│ │ │ └── image_cross_attention.py
│ │ ├── modules/
│ │ │ ├── __init__.py
│ │ │ ├── camera_se_net.py
│ │ │ ├── split_fpn.py
│ │ │ └── split_norm.py
│ │ ├── tpvformer_encoder.py
│ │ ├── tpvformer_encoder_layer.py
│ │ ├── tpvformer_pos_embed.py
│ │ └── utils.py
│ ├── head/
│ │ ├── __init__.py
│ │ ├── base_head.py
│ │ ├── nerfacc_head/
│ │ │ ├── bev_nerf.py
│ │ │ ├── estimator.py
│ │ │ ├── img2lidar.py
│ │ │ ├── nerfacc_head.py
│ │ │ ├── ray_sampler.py
│ │ │ └── rendering.py
│ │ ├── neus_head/
│ │ │ └── neus_head.py
│ │ └── utils/
│ │ └── sh_render.py
│ ├── lifter/
│ │ ├── __init__.py
│ │ ├── base_lifter.py
│ │ ├── bev_query_lifter.py
│ │ ├── tpv_pos_lifter.py
│ │ └── tpv_query_lifter.py
│ ├── neck/
│ │ ├── __init__.py
│ │ └── identity_neck.py
│ └── segmentor/
│ ├── __init__.py
│ ├── base_segmentor.py
│ └── tpv_segmentor.py
├── train.py
├── utils/
│ ├── __init__.py
│ ├── config_tools.py
│ ├── feat_tools.py
│ ├── metric_util.py
│ ├── misc.py
│ ├── openseed_utils.py
│ ├── scenerf_metric.py
│ ├── tb_wrapper.py
│ └── temporal_pkl.py
├── vis_3d.py
├── vis_3d_scene.py
└── vis_pics.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
ckpts
data
.DS_Store
out
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: README.md
================================================
# SelfOcc: Self-Supervised Vision-Based 3D Occupancy Prediction
### [Paper](https://arxiv.org/pdf/2311.12754) | [Project Page](https://huang-yh.github.io/SelfOcc/)
> SelfOcc: Self-Supervised Vision-Based 3D Occupancy Prediction, CVPR 2024
> [Yuanhui Huang](https://scholar.google.com/citations?hl=zh-CN&user=LKVgsk4AAAAJ)*, [Wenzhao Zheng](https://wzzheng.net/)\* $\dagger$, [Borui Zhang](https://boruizhang.site/), [Jie Zhou](https://scholar.google.com/citations?user=6a79aPwAAAAJ&hl=en&authuser=1), [Jiwen Lu](http://ivg.au.tsinghua.edu.cn/Jiwen_Lu/)$\ddagger$
\* Equal contribution $\dagger$ Project leader $\ddagger$ Corresponding author
SelfOcc empowers 3D autonomous driving world models (e.g., [OccWorld](https://github.com/wzzheng/OccWorld)) with scalable 3D representations, paving the way for **interpretable end-to-end large driving models**.
## News
- **[2024/9/5]** Check out our recent [GaussianFormer](https://github.com/huang-yh/GaussianFormer) for object-centric 3D occupancy prediction.
- **[2024/4/30]** Visualization code for 2D, 3D and video is released.
- **[2024/2/26]** SelfOcc is accepted to CVPR 2024!
- **[2023/12/16]** Training code release.
- **[2023/11/28]** Evaluation code release.
- **[2023/11/20]** Paper released on [arXiv](https://arxiv.org/abs/2311.12754).
- **[2023/11/20]** Demo release.
## Demo
### Trained using only video sequences and poses:

### Trained using an additional off-the-shelf 2D segmentor ([OpenSeeD](https://github.com/IDEA-Research/OpenSeeD)):


### More demo videos can be downloaded [here](https://cloud.tsinghua.edu.cn/d/640283b528f7436193a4/).
## Overview

- We first transform the images into the 3D space (e.g., bird's eye view, [tri-perspective view](https://github.com/wzzheng/TPVFormer)) to obtain 3D representation of the scene. We directly impose constraints on the 3D representations by treating them as signed distance fields. We can then render 2D images of previous and future frames as self-supervision signals to learn the 3D representations.
- Our SelfOcc outperforms the previous best method SceneRF by 58.7% using a single frame as input on SemanticKITTI and is the first self-supervised work that produces reasonable 3D occupancy for surround cameras on nuScenes.
- SelfOcc produces high-quality depth and achieves state-of-the-art results on **novel depth synthesis**, **monocular depth estimation**, and **surround-view depth estimation** on the SemanticKITTI, KITTI-2015, and nuScenes, respectively.
## Results
<img src=./assets/results.png height="480px" width="563px" />
## Getting Started
### Installation
Follow detailed instructions in [Installation](docs/installation.md).
### Preparing Dataset
Follow detailed instructions in [Prepare Dataset](docs/prepare_data.md).
We also provide our code for synchronizing sweep data according to keyframe samples.
### Run
[23/12/16 Update] Please update the timm package to 0.9.2 to run the training script.
#### 3D Occupancy Prediction
Download model weights [HERE](https://cloud.tsinghua.edu.cn/f/831c104c82a244e9878a/) and put it under out/nuscenes/occ/
```bash
# train
python train.py --py-config config/nuscenes/nuscenes_occ.py --work-dir out/nuscenes/occ_train --depth-metric
# eval
python eval_iou.py --py-config config/nuscenes/nuscenes_occ.py --work-dir out/nuscenes/occ --resume-from out/nuscenes/occ/model_state_dict.pth --occ3d --resolution 0.4 --sem --use-mask --scene-size 4
```
#### Novel Depth Synthesis
Download model weights [HERE](https://cloud.tsinghua.edu.cn/f/2d217cd298a34ed19039/) and put it under out/nuscenes/novel_depth/
```bash
# train
python train.py --py-config config/nuscenes/nuscenes_novel_depth.py --work-dir out/nuscenes/novel_depth_train --depth-metric
# eval
python eval_novel_depth.py --py-config config/nuscenes/nuscenes_novel_depth.py --work-dir out/nuscenes/novel_depth --resume-from out/nuscenes/novel_depth/model_state_dict.pth
```
#### Depth Estimation
Download model weights [HERE](https://cloud.tsinghua.edu.cn/f/1a722b9139234542ae1e/) and put it under out/nuscenes/depth/
```bash
# train
python train.py --py-config config/nuscenes/nuscenes_depth.py --work-dir out/nuscenes/depth_train --depth-metric
# eval
python eval_depth.py --py-config config/nuscenes/nuscenes_depth.py --work-dir out/nuscenes/depth --resume-from out/nuscenes/depth/model_state_dict.pth --depth-metric --batch 90000
```
Note that evaluating at a resolution (450\*800) of 1:2 against the raw image (900\*1600) takes about 90 min, because we batchify rays for rendering due to GPU memory limit. You can change the rendering resolution by the variable *NUM_RAYS* in utils/config_tools.py
More details on more datasets are detailed in [Run and Eval](docs/get_started.md).
### Visualization
Follow detailed instructions in [Visualization](docs/visualization.md).
## Related Projects
Our code is based on [TPVFormer](https://github.com/wzzheng/TPVFormer) and [PointOcc](https://github.com/wzzheng/PointOcc).
Also thanks to these excellent open-sourced repos:
[SurroundOcc](https://github.com/weiyithu/SurroundOcc)
[OccFormer](https://github.com/zhangyp15/OccFormer)
[BEVFormer](https://github.com/fundamentalvision/BEVFormer)
A pioneering work on object-centric 3D occupancy prediction: [GaussianFormer](https://github.com/huang-yh/GaussianFormer).
## Citation
If you find this project helpful, please consider citing the following paper:
```
@article{huang2023self,
title={SelfOcc: Self-Supervised Vision-Based 3D Occupancy Prediction},
author={Huang, Yuanhui and Zheng, Wenzhao and Zhang, Borui and Zhou, Jie and Lu, Jiwen},
journal={arXiv preprint arXiv:2311.12754},
year={2023}
}
```
================================================
FILE: assets/.placeholder
================================================
================================================
FILE: config/_base_/dataset_v1.py
================================================
# Base dataset configuration for nuScenes (v1.0-trainval).
data_path = 'data/nuscenes/'

# Dataset wrapper settings: images are downscaled to half resolution and
# channel-swap photometric augmentation is disabled.
train_wrapper_config = {
    'phase': 'train',
    'scale_rate': 0.5,
    'photometric_aug': {'use_swap_channel': False},
}

val_wrapper_config = {
    'phase': 'val',
    'scale_rate': 0.5,
    'photometric_aug': {'use_swap_channel': False},
}

# Arguments forwarded to the nuScenes devkit constructor.
nusc = {
    'version': 'v1.0-trainval',
    'dataroot': data_path,
}

# Dataloader settings: single-sample batches, one worker each.
train_loader = {
    'batch_size': 1,
    'shuffle': True,
    'num_workers': 1,
}

val_loader = {
    'batch_size': 1,
    'shuffle': False,
    'num_workers': 1,
}
================================================
FILE: config/_base_/optimizer.py
================================================
# Base optimizer configuration: AdamW, with the image backbone trained at a
# 0.1x learning-rate multiplier (via paramwise_cfg custom_keys).
optimizer = {
    'optimizer': {
        'type': 'AdamW',
        'lr': 2e-5,
        'weight_decay': 0.0001,
    },
    'paramwise_cfg': {
        'custom_keys': {'img_backbone': {'lr_mult': 0.1}},
    },
}

# Gradient-clipping threshold (max gradient norm).
grad_max_norm = 35
================================================
FILE: config/_base_/schedule.py
================================================
# Base training schedule.
print_freq = 50   # log every 50 iterations
max_epochs = 12   # total training epochs (configs may override this)
================================================
FILE: config/kitti/kitti_novel_depth.py
================================================
# SelfOcc: KITTI novel-depth-synthesis configuration (extends the _base_ files).
_base_ = [
    '../_base_/dataset_v1.py',
    '../_base_/optimizer.py',
    '../_base_/schedule.py',
]

# Input crop size (H, W) and the ray grid rendered per image.
img_size = [370, 1216]
num_rays = [55, 190]

amp = False          # mixed-precision training disabled
max_epochs = 24      # overrides the base schedule's 12 epochs
warmup_iters = 1000
num_cams = 1         # KITTI provides a single (monocular) camera

# Override the base optimizer: larger LR and weight decay; the image backbone
# still trains at a 0.1x learning-rate multiplier.
optimizer = {
    'optimizer': {
        'type': 'AdamW',
        'lr': 1e-3,
        'weight_decay': 0.01,
        # eps=1e-4 was tried and left disabled
    },
    'paramwise_cfg': {
        'custom_keys': {'img_backbone': {'lr_mult': 0.1}},
    },
}

data_path = 'data/kitti/'

# `_delete_` tells the config system to discard the inherited dataset config
# entirely instead of merging into it.
train_dataset_config = {
    '_delete_': True,
    'type': 'Kitti_One_Frame',
    'split': 'train',
    'root': data_path,
    'preprocess_root': data_path + 'preprocess',
    'cur_prob': 0.5,        # presumably the sampling probability of the current frame — confirm in Kitti_One_Frame
    'crop_size': img_size,
    'strict': True,
    'prev_prob': 0.0,
    'choose_nearest': True,
}

val_dataset_config = {
    '_delete_': True,
    'type': 'Kitti_One_Frame',
    'split': 'val',
    'root': data_path,
    'preprocess_root': data_path + 'preprocess',
    'cur_prob': 1.0,        # validation always uses the current frame
    'crop_size': img_size,
    'strict': False,
    'prev_prob': 0.5,
    'choose_nearest': True,
    'return_depth': True,   # validation additionally returns depth for metrics
}
# Dataset wrappers: no resizing (scale_rate=1.0); images are padded to
# 384x1216 (pad_scale_rate presumably tracks the padded/original height
# ratio, 384/370 ~= 1.038 — confirm in the wrapper). Normalization uses
# ImageNet-style statistics on RGB input.
train_wrapper_config = {
    'type': 'tpvformer_dataset_nuscenes_temporal',
    'phase': 'train',
    'scale_rate': 1.0,
    'pad_img_size': [384, 1216],
    'pad_scale_rate': [1.038, 1.0],
    'photometric_aug': {'use_swap_channel': False},
    'img_norm_cfg': {
        'mean': [124.16, 116.74, 103.94],
        'std': [58.624, 57.344, 57.6],
        'to_rgb': True,
    },
}

val_wrapper_config = {
    'type': 'tpvformer_dataset_nuscenes_temporal',
    'phase': 'val',
    'scale_rate': 1.0,
    'pad_img_size': [384, 1216],
    'pad_scale_rate': [1.038, 1.0],
    'photometric_aug': {'use_swap_channel': False},
    'img_norm_cfg': {
        'mean': [124.16, 116.74, 103.94],
        'std': [58.624, 57.344, 57.6],
        'to_rgb': True,
    },
}

# Single-sample loaders with one worker each; only training shuffles.
train_loader = {
    'batch_size': 1,
    'shuffle': True,
    'num_workers': 1,
}

val_loader = {
    'batch_size': 1,
    'shuffle': False,
    'num_workers': 1,
}
# Loss configuration: MultiLoss combines several weighted terms. Each
# sub-loss's input_dict maps the loss's argument names (keys) to keys of the
# model output dict (values); see loss_input_convertion below.
loss = dict(
type='MultiLoss',
loss_cfgs=[
# Photometric reprojection loss against temporally adjacent frames,
# evaluated on the num_rays grid. Presumably warps prev/next images into the
# current view via rendered depth — confirm in ReprojLossMonoMultiNew.
dict(
type='ReprojLossMonoMultiNew',
weight=1.0,
no_ssim=False,  # SSIM term enabled
img_size=img_size,
ray_resize=num_rays,
input_dict={
'curr_imgs': 'curr_imgs',
'prev_imgs': 'prev_imgs',
'next_imgs': 'next_imgs',
'ray_indices': 'ray_indices',
'weights': 'weights',
'ts': 'ts',
'metas': 'metas',
'ms_rays': 'ms_rays',
# 'deltas': 'deltas'
}),
# RGB reconstruction loss between rendered colors and ground-truth pixels.
dict(
type='RGBLossMS',
weight=0.1,
img_size=img_size,
no_ssim=False,
ray_resize=num_rays,
input_dict={
'ms_colors': 'ms_colors',
'ms_rays': 'ms_rays',
'gt_imgs': 'color_imgs'}),
# Eikonal regularizer on SDF gradients (consumes eik_grad below).
dict(
type='EikonalLoss',
weight=0.1,),
# Regularizer on second-order SDF gradients (consumes second_grad below).
dict(
type='SecondGradLoss',
weight=0.01),
# dict(
# type='SparsityLoss',
# weight=0.001,
# scale=0.1,
# input_dict={
# 'density': 'uniform_sdf'}),
])
# Maps loss-input names (left) to keys in the model's output dict (right);
# the training loop uses this table to assemble the loss arguments.
loss_input_convertion = dict(
ms_depths='ms_depths',
ms_rays='ms_rays',
# ms_accs='ms_accs',
ms_colors='ms_colors',
ray_indices='ray_indices',
weights='weights',
ts='ts',
eik_grad='eik_grad',
second_grad='second_grad',
# uniform_sdf='uniform_sdf',
# deltas='deltas'
)
load_from = ''   # no checkpoint to resume/initialize from

# Transformer widths.
_dim_ = 96
_ffn_dim_ = _dim_ * 2
num_heads = 6

# Grid-to-metric mapping of the TPV planes: linear spacing, sizes in voxels
# and ranges in meters ("half" presumably means the axis is mirrored around
# zero — confirm in the mappings module).
mapping_args = {
    'nonlinear_mode': 'linear',
    'h_size': [256, 0],
    'h_range': [51.2, 0],
    'h_half': True,
    'w_size': [128, 0],
    'w_range': [25.6, 0],
    'w_half': False,
    'd_size': [32, 0],
    'd_range': [-2.0, 4.4, 4.4],
}

# TPV plane resolutions: one extra row/column on top of the grid sizes above.
tpv_h = 256 + 1       # = 257
tpv_w = 2 * 128 + 1   # = 257
tpv_z = 32 + 1        # = 33

# Scene bounds [x_min, y_min, z_min, x_max, y_max, z_max] in meters.
point_cloud_range = [-25.6, 0.0, -2.0, 25.6, 51.2, 4.4]

# Deformable-attention sampling points: per-plane counts for image
# cross-attention, and the count for cross-view self-attention.
num_points_cross = [48, 48, 8]
num_points_self = 12
# One TPVFormer encoder layer: cross-view hybrid attention between the three
# TPV planes (self-attn slot), deformable cross-attention into multi-level
# image features, then an FFN; a norm follows each sub-module.
self_cross_layer = {
    'type': 'TPVFormerLayer',
    'attn_cfgs': [
        {
            'type': 'CrossViewHybridAttention',
            'embed_dims': _dim_,
            'num_heads': num_heads,
            'num_levels': 3,           # one level per TPV plane
            'num_points': num_points_self,
            'dropout': 0.1,
            'batch_first': True,
        },
        {
            'type': 'TPVCrossAttention',
            'embed_dims': _dim_,
            'num_cams': num_cams,
            'dropout': 0.1,
            'batch_first': True,
            'num_heads': num_heads,
            'num_levels': 4,           # four FPN feature levels
            'num_points': num_points_cross,
        },
    ],
    'feedforward_channels': _ffn_dim_,
    'ffn_dropout': 0.1,
    'operation_order': ('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm'),
}
# Model: TPV-based segmentor — ResNet-50 backbone + FPN neck lift images to
# multi-scale features, a query lifter provides TPV plane queries, the
# TPVFormer encoder aggregates image features into the planes, and a
# NeuS-style head treats the representation as an SDF and volume-renders it.
model = dict(
type='TPVSegmentor',
img_backbone_out_indices=[0, 1, 2, 3],
# ResNet-50 backbone; all four stage outputs are exported, nothing frozen.
img_backbone=dict(
type='ResNet',
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=0,
norm_eval=False,
style='pytorch',
pretrained='./ckpts/resnet50-0676ba61.pth'),
# FPN neck projects every backbone level to the common width _dim_.
img_neck=dict(
type='FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=_dim_,
start_level=0,
add_extra_convs='on_output',
num_outs=4,
relu_before_extra_convs=True),
# Learnable query embeddings, one set per TPV plane.
lifter=dict(
type='TPVQueryLifter',
tpv_h=tpv_h,
tpv_w=tpv_w,
tpv_z=tpv_z,
dim=_dim_),
# 4-layer TPVFormer encoder; each layer is the self_cross_layer spec above.
encoder=dict(
type='TPVFormerEncoder',
# bev_inner=bev_inner,
# bev_outer=bev_outer,
# range_inner=range_inner,
# range_outer=range_outer,
# nonlinear_mode=nonlinear_mode,
# z_inner=z_inner,
# z_outer=z_outer,
# z_ranges=z_ranges,
mapping_args=mapping_args,
embed_dims=_dim_,
num_cams=num_cams,
num_feature_levels=4,
positional_encoding=dict(
type='TPVPositionalEncoding',
num_freqs=[12] * 3,
embed_dims=_dim_,
tot_range=point_cloud_range),
num_points_cross=num_points_cross,
num_points_self=[num_points_self] * 3,
transformerlayers=[
self_cross_layer,
self_cross_layer,
self_cross_layer,
self_cross_layer],
num_layers=4),
# NeuS-style SDF rendering head; uniform sampling only (no importance
# resampling: num_samples_importance=0, num_up_sample_steps=0).
head=dict(
type='NeuSHead',
roi_aabb=point_cloud_range,
resolution=0.4,
near_plane=0.0,
far_plane=1e10,
num_samples=256,
num_samples_importance=0,
num_up_sample_steps=0,
base_variance=4,
beta_init=0.1,
beta_max=0.195,
total_iters=3516*11,  # presumably iters/epoch * epochs for the beta schedule — confirm
beta_hand_tune=False,
use_numerical_gradients=False,
sample_gradient=True,
return_uniform_sdf=False,
return_second_grad=True,  # feeds SecondGradLoss via loss_input_convertion
# rays args
ray_sample_mode='cellular', # fixed, cellular
ray_number=num_rays, # 192 * 400
ray_img_size=img_size,
ray_upper_crop=0,
# img2lidar args
trans_kw='temImg2lidar',
novel_view=None,
# render args
render_bkgd='random',
# bev nerf
# bev_inner=bev_inner,
# bev_outer=bev_outer,
# range_inner=range_inner,
# range_outer=range_outer,
# nonlinear_mode=nonlinear_mode,
# z_inner=z_inner,
# z_outer=z_outer,
# z_ranges=z_ranges,
mapping_args=mapping_args,
# mlp decoder
embed_dims=_dim_,
color_dims=3,
density_layers=2,
sh_deg=0,
sh_act='relu',
two_split=False,
tpv=True))
================================================
FILE: config/kitti/kitti_occ.py
================================================
# SelfOcc self-supervised occupancy config for KITTI (mmcv-style config;
# these values are merged on top of the _base_ files).
_base_ = [
    '../_base_/dataset_v1.py',
    '../_base_/optimizer.py',
    '../_base_/schedule.py',
]
img_size = [352, 1216]   # input image size (H, W)
num_rays = [55, 190]     # rendered ray grid (rows, cols)
amp = False              # no mixed-precision training
max_epochs = 24
warmup_iters = 1000
num_cams = 1             # KITTI is monocular
optimizer = dict(
    optimizer=dict(
        type='AdamW',
        lr=1e-3,
        weight_decay=0.01,
        # eps=1e-4
    ),
    # Image backbone trains with a 10x smaller learning rate.
    paramwise_cfg=dict(
        custom_keys={
            'img_backbone': dict(lr_mult=0.1),}
    ),
)
data_path = 'data/kitti/'
# Temporal one-frame KITTI dataset; _delete_ replaces (rather than merges with)
# the _base_ dataset config.
train_dataset_config = dict(
    _delete_=True,
    type='Kitti_One_Frame',
    split = 'train',
    root = data_path,
    preprocess_root = data_path + 'preprocess',
    frames_interval=0.4,
    sequence_distance=[10, 40],
    cur_prob = 0.333,
    crop_size = img_size,
    strict = True,
    prev_prob = 0.2,
    choose_nearest = True,
)
val_dataset_config = dict(
    _delete_=True,
    type='Kitti_One_Frame',
    split = 'val',
    root = data_path,
    preprocess_root = data_path + 'preprocess',
    frames_interval=0.4,
    sequence_distance=[10, 40],
    cur_prob = 1.0,
    crop_size = img_size,
    strict = False,
    prev_prob = 0.2,
    choose_nearest = True,
    return_depth = True   # validation also returns depth for evaluation
)
train_wrapper_config = dict(
    type='tpvformer_dataset_nuscenes_temporal',
    phase='train',
    scale_rate=1.0,
    photometric_aug=dict(
        use_swap_channel=False,
    ),
    img_norm_cfg=dict(
        mean=[124.16, 116.74, 103.94],
        std=[58.624, 57.344, 57.6], to_rgb=True)
)
val_wrapper_config = dict(
    type='tpvformer_dataset_nuscenes_temporal',
    phase='val',
    scale_rate=1.0,
    photometric_aug=dict(
        use_swap_channel=False,
    ),
    img_norm_cfg=dict(
        mean=[124.16, 116.74, 103.94],
        std=[58.624, 57.344, 57.6], to_rgb=True)
)
train_loader = dict(
    batch_size = 1,
    shuffle = True,
    num_workers = 1,
)
val_loader = dict(
    batch_size = 1,
    shuffle = False,
    num_workers = 1,
)
# Self-supervision: photometric reprojection + rendered RGB + SDF regularizers
# (eikonal, second-order gradient, soft sparsity on the uniformly sampled SDF).
loss = dict(
    type='MultiLoss',
    loss_cfgs=[
        dict(
            type='ReprojLossMonoMultiNew',
            weight=1.0,
            no_ssim=False,
            img_size=img_size,
            ray_resize=num_rays,
            input_dict={
                'curr_imgs': 'curr_imgs',
                'prev_imgs': 'prev_imgs',
                'next_imgs': 'next_imgs',
                'ray_indices': 'ray_indices',
                'weights': 'weights',
                'ts': 'ts',
                'metas': 'metas',
                'ms_rays': 'ms_rays',
                # 'deltas': 'deltas'
            }),
        dict(
            type='RGBLossMS',
            weight=0.1,
            img_size=img_size,
            no_ssim=False,
            ray_resize=num_rays,
            input_dict={
                'ms_colors': 'ms_colors',
                'ms_rays': 'ms_rays',
                'gt_imgs': 'color_imgs'}),
        dict(
            type='EikonalLoss',
            weight=0.1,),
        dict(
            type='SecondGradLoss',
            weight=0.1),
        dict(
            type='SoftSparsityLoss',
            weight=0.005,
            input_dict={
                'density': 'uniform_sdf'})
        # dict(
        #     type='SparsityLoss',
        #     weight=0.001,
        #     scale=0.1,
        #     input_dict={
        #         'density': 'uniform_sdf'}),
    ])
# Maps model output keys to the loss-input keys (identity mapping here).
loss_input_convertion = dict(
    ms_depths='ms_depths',
    ms_rays='ms_rays',
    ms_accs='ms_accs',
    ms_colors='ms_colors',
    ray_indices='ray_indices',
    weights='weights',
    ts='ts',
    eik_grad='eik_grad',
    second_grad='second_grad',
    uniform_sdf='uniform_sdf',
    # deltas='deltas'
)
load_from = ''
_dim_ = 96               # shared embedding width
_ffn_dim_ = 2 * _dim_
num_heads = 6
# TPV grid <-> metric-space mapping; [inner, outer] size/range pairs whose exact
# semantics are defined by the encoder's mapping module -- confirm there.
mapping_args = dict(
    nonlinear_mode='linear',
    h_size=[256, 0],
    h_range=[51.2, 0],
    h_half=True,
    w_size=[128, 0],
    w_range=[25.6, 0],
    w_half=False,
    d_size=[32, 0],
    d_range=[-2.0, 4.4, 4.4]
)
# Legacy non-uniform grid parameters, superseded by mapping_args.
# bev_inner = 160
# bev_outer = 1
# range_inner = 80.0
# range_outer = 1.0
# nonlinear_mode = 'linear_upscale'
# z_inner = 20
# z_outer = 10
# z_ranges = [-4.0, 4.0, 12.0]
# TPV plane resolutions (presumably mapping_args sizes plus one boundary vertex).
tpv_h = 1 + 256
tpv_w = 1 + 2 * 128
tpv_z = 1 + 32 + 0
# Scene bounds in meters; used as positional-encoding tot_range and head roi_aabb.
point_cloud_range = [-25.6, 0.0, -2.0, 25.6, 51.2, 4.4]
num_points_cross = [48, 48, 8]
num_points_self = 12
# One encoder layer: cross-view self-attention over the three TPV planes,
# then deformable cross-attention into 4 levels of image features.
self_cross_layer = dict(
    type='TPVFormerLayer',
    attn_cfgs=[
        dict(
            type='CrossViewHybridAttention',
            embed_dims=_dim_,
            num_heads=num_heads,
            num_levels=3,
            num_points=num_points_self,
            dropout=0.1,
            batch_first=True),
        dict(
            type='TPVCrossAttention',
            embed_dims=_dim_,
            num_cams=num_cams,
            dropout=0.1,
            batch_first=True,
            num_heads=num_heads,
            num_levels=4,
            num_points=num_points_cross)
    ],
    feedforward_channels=_ffn_dim_,
    ffn_dropout=0.1,
    operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm')
)
model = dict(
    type='TPVSegmentor',
    img_backbone_out_indices=[0, 1, 2, 3],
    # ResNet-50 image backbone, initialized from a local torchvision checkpoint.
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=0,
        norm_eval=False,
        style='pytorch',
        pretrained='./ckpts/resnet50-0676ba61.pth'),
    img_neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=4,
        relu_before_extra_convs=True),
    # Learnable query embeddings for the three TPV planes.
    lifter=dict(
        type='TPVQueryLifter',
        tpv_h=tpv_h,
        tpv_w=tpv_w,
        tpv_z=tpv_z,
        dim=_dim_),
    encoder=dict(
        type='TPVFormerEncoder',
        # bev_inner=bev_inner,
        # bev_outer=bev_outer,
        # range_inner=range_inner,
        # range_outer=range_outer,
        # nonlinear_mode=nonlinear_mode,
        # z_inner=z_inner,
        # z_outer=z_outer,
        # z_ranges=z_ranges,
        mapping_args=mapping_args,
        embed_dims=_dim_,
        num_cams=num_cams,
        num_feature_levels=4,
        positional_encoding=dict(
            type='TPVPositionalEncoding',
            num_freqs=[12] * 3,
            embed_dims=_dim_,
            tot_range=point_cloud_range),
        num_points_cross=num_points_cross,
        num_points_self=[num_points_self] * 3,
        transformerlayers=[
            self_cross_layer,
            self_cross_layer,
            self_cross_layer,
            self_cross_layer],
        num_layers=4),
    # NeuS-style SDF rendering head; this variant also returns the uniformly
    # sampled SDF (for SoftSparsityLoss) and second-order gradients.
    head=dict(
        type='NeuSHead',
        roi_aabb=point_cloud_range,
        resolution=0.4,
        near_plane=0.0,
        far_plane=1e10,
        num_samples=256,
        num_samples_importance=0,
        num_up_sample_steps=0,
        base_variance=4,
        beta_init=0.1,
        beta_max=0.195,
        # NOTE(review): 3516 matches the nuScenes per-epoch iteration count used
        # in the nuScenes schedules -- confirm this beta schedule is intended here.
        total_iters=3516*11,
        beta_hand_tune=False,
        use_numerical_gradients=False,
        sample_gradient=True,
        use_compact_2nd_grad=True,
        return_uniform_sdf=True,
        return_second_grad=True,
        # rays args
        ray_sample_mode='cellular', # fixed, cellular
        ray_number=num_rays, # 192 * 400
        ray_img_size=img_size,
        ray_upper_crop=0,
        # img2lidar args
        trans_kw='temImg2lidar',
        novel_view=None,
        # render args
        render_bkgd='random',
        # bev nerf
        # bev_inner=bev_inner,
        # bev_outer=bev_outer,
        # range_inner=range_inner,
        # range_outer=range_outer,
        # nonlinear_mode=nonlinear_mode,
        # z_inner=z_inner,
        # z_outer=z_outer,
        # z_ranges=z_ranges,
        mapping_args=mapping_args,
        # mlp decoder
        embed_dims=_dim_,
        color_dims=3,
        density_layers=2,
        sh_deg=0,
        sh_act='relu',
        two_split=False,
        tpv=True))
================================================
FILE: config/kitti_raw/kitti_raw_depth.py
================================================
# SelfOcc self-supervised depth config for KITTI raw (eigen_zhou split, stereo
# pairs included); mmcv-style config merged on top of the _base_ files.
_base_ = [
    '../_base_/dataset_v1.py',
    '../_base_/optimizer.py',
    '../_base_/schedule.py',
]
img_size = [370, 1216]   # input image size (H, W)
num_rays = [44, 152]     # rendered ray grid (rows, cols)
amp = False
max_epochs = 24
warmup_iters = 1000
num_cams = 1             # monocular input
multisteplr = True
# Step LR decay at iteration 4976 * 18 (t_in_epochs=False => counted in iterations).
multisteplr_config = dict(
    decay_t = [4976 * 18],
    decay_rate = 0.1,
    warmup_t = warmup_iters,
    warmup_lr_init = 1e-6,
    t_in_epochs = False
)
optimizer = dict(
    optimizer=dict(
        type='AdamW',
        lr=1e-4,
        weight_decay=0.01,
        # eps=1e-4
    ),
    # Image backbone trains with a 10x smaller learning rate.
    paramwise_cfg=dict(
        custom_keys={
            'img_backbone': dict(lr_mult=0.1),}
    ),
)
data_path = 'data/kitti_raw/'
# KITTI raw stereo dataset with ORB-SLAM poses; _delete_ replaces the _base_ dataset.
train_dataset_config = dict(
    _delete_=True,
    type = 'Kitti_Raw_Stereo',
    root = data_path,
    pose_path = 'dataset/kitti_raw/orb-slam_poses',
    split_path = 'dataset/kitti_raw/splits/eigen_zhou/train_files.txt',
    frames_interval=1.0,
    sequence_distance=5.0,
    cur_prob=1.0,
    crop_size=img_size,
    strict=True,
    return_depth=False,
    prev_prob=0,
    choose_nearest = True,
    eval_depth = 80,
    include_stereo = True
)
val_dataset_config = dict(
    _delete_=True,
    type = 'Kitti_Raw_Stereo',
    root = data_path,
    pose_path = 'dataset/kitti_raw/orb-slam_poses',
    split_path = 'dataset/kitti_raw/splits/eigen_zhou/test_files.txt',
    frames_interval=1.0,
    sequence_distance=5.0,
    cur_prob=1.0,
    crop_size=img_size,
    strict=False,
    return_depth=True,   # evaluation needs ground-truth depth
    prev_prob=0.5,
    choose_nearest = True,
    eval_depth = 80,     # depth evaluation capped at 80 m
    include_stereo = True
)
# Images scaled by 0.84 then padded to [320, 1024] (pad_scale_rate gives the
# per-axis effective scale after padding).
train_wrapper_config = dict(
    type='tpvformer_dataset_nuscenes_temporal',
    phase='train',
    scale_rate=0.84,
    pad_img_size=[320, 1024],
    pad_scale_rate=[0.8649, 0.8421],
    photometric_aug=dict(
        use_swap_channel=False,
    ),
    img_norm_cfg=dict(
        mean=[124.16, 116.74, 103.94],
        std=[58.624, 57.344, 57.6], to_rgb=True)
)
val_wrapper_config = dict(
    type='tpvformer_dataset_nuscenes_temporal',
    phase='val',
    scale_rate=0.84,
    pad_img_size=[320, 1024],
    pad_scale_rate=[0.8649, 0.8421],
    photometric_aug=dict(
        use_swap_channel=False,
    ),
    img_norm_cfg=dict(
        mean=[124.16, 116.74, 103.94],
        std=[58.624, 57.344, 57.6], to_rgb=True)
)
train_loader = dict(
    batch_size = 1,
    shuffle = True,
    num_workers = 1,
)
val_loader = dict(
    batch_size = 1,
    shuffle = False,
    num_workers = 1,
)
# Depth-only variant: photometric reprojection + eikonal + 3D edge smoothness
# (RGB / second-grad / sparsity terms disabled below).
loss = dict(
    type='MultiLoss',
    loss_cfgs=[
        dict(
            type='ReprojLossMonoMultiNew',
            weight=1.0,
            no_ssim=False,
            img_size=img_size,
            ray_resize=num_rays,
            input_dict={
                'curr_imgs': 'curr_imgs',
                'prev_imgs': 'prev_imgs',
                'next_imgs': 'next_imgs',
                'ray_indices': 'ray_indices',
                'weights': 'weights',
                'ts': 'ts',
                'metas': 'metas',
                'ms_rays': 'ms_rays',
                # 'deltas': 'deltas'
            }),
        # dict(
        #     type='RGBLossMS',
        #     weight=0.1,
        #     img_size=img_size,
        #     no_ssim=False,
        #     ray_resize=num_rays,
        #     input_dict={
        #         'ms_colors': 'ms_colors',
        #         'ms_rays': 'ms_rays',
        #         'gt_imgs': 'color_imgs'}),
        dict(
            type='EikonalLoss',
            weight=0.1,),
        dict(
            type='EdgeLoss3DMS',
            weight=0.01,
            img_size=img_size,
            ray_resize=num_rays,
        )
        # dict(
        #     type='SecondGradLoss',
        #     weight=0.01),
        # dict(
        #     type='SparsityLoss',
        #     weight=0.001,
        #     scale=0.1,
        #     input_dict={
        #         'density': 'uniform_sdf'}),
    ])
# Maps model output keys to the loss-input keys (identity mapping here);
# commented entries correspond to the disabled losses above.
loss_input_convertion = dict(
    ms_depths='ms_depths',
    ms_rays='ms_rays',
    # ms_accs='ms_accs',
    # ms_colors='ms_colors',
    ray_indices='ray_indices',
    weights='weights',
    ts='ts',
    eik_grad='eik_grad',
    # second_grad='second_grad',
    # uniform_sdf='uniform_sdf',
    # deltas='deltas'
)
load_from = ''
_dim_ = 96               # shared embedding width
_ffn_dim_ = 2 * _dim_
num_heads = 6
# TPV grid <-> metric-space mapping; [inner, outer] size/range pairs whose exact
# semantics are defined by the encoder's mapping module -- confirm there.
mapping_args = dict(
    nonlinear_mode='linear',
    h_size=[256, 0],
    h_range=[51.2, 0],
    h_half=True,
    w_size=[128, 0],
    w_range=[25.6, 0],
    w_half=False,
    d_size=[32, 0],
    d_range=[-2.0, 4.4, 4.4]
)
# Legacy non-uniform grid parameters, superseded by mapping_args.
# bev_inner = 160
# bev_outer = 1
# range_inner = 80.0
# range_outer = 1.0
# nonlinear_mode = 'linear_upscale'
# z_inner = 20
# z_outer = 10
# z_ranges = [-4.0, 4.0, 12.0]
# TPV plane resolutions (presumably mapping_args sizes plus one boundary vertex).
tpv_h = 1 + 256
tpv_w = 1 + 2 * 128
tpv_z = 1 + 32 + 0
# Scene bounds in meters; used as positional-encoding tot_range and head roi_aabb.
point_cloud_range = [-25.6, 0.0, -2.0, 25.6, 51.2, 4.4]
num_points_cross = [48, 48, 8]
num_points_self = 12
# One encoder layer: cross-view self-attention over the three TPV planes,
# then deformable cross-attention into 4 levels of image features.
self_cross_layer = dict(
    type='TPVFormerLayer',
    attn_cfgs=[
        dict(
            type='CrossViewHybridAttention',
            embed_dims=_dim_,
            num_heads=num_heads,
            num_levels=3,
            num_points=num_points_self,
            dropout=0.1,
            batch_first=True),
        dict(
            type='TPVCrossAttention',
            embed_dims=_dim_,
            num_cams=num_cams,
            dropout=0.1,
            batch_first=True,
            num_heads=num_heads,
            num_levels=4,
            num_points=num_points_cross)
    ],
    feedforward_channels=_ffn_dim_,
    ffn_dropout=0.1,
    operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm')
)
model = dict(
    type='TPVSegmentor',
    img_backbone_out_indices=[0, 1, 2, 3],
    # ResNet-50 image backbone, initialized from a local torchvision checkpoint.
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=0,
        norm_eval=False,
        style='pytorch',
        pretrained='./ckpts/resnet50-0676ba61.pth'),
    img_neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=4,
        relu_before_extra_convs=True),
    # Learnable query embeddings for the three TPV planes.
    lifter=dict(
        type='TPVQueryLifter',
        tpv_h=tpv_h,
        tpv_w=tpv_w,
        tpv_z=tpv_z,
        dim=_dim_),
    encoder=dict(
        type='TPVFormerEncoder',
        # bev_inner=bev_inner,
        # bev_outer=bev_outer,
        # range_inner=range_inner,
        # range_outer=range_outer,
        # nonlinear_mode=nonlinear_mode,
        # z_inner=z_inner,
        # z_outer=z_outer,
        # z_ranges=z_ranges,
        mapping_args=mapping_args,
        embed_dims=_dim_,
        num_cams=num_cams,
        num_feature_levels=4,
        positional_encoding=dict(
            type='TPVPositionalEncoding',
            num_freqs=[12] * 3,
            embed_dims=_dim_,
            tot_range=point_cloud_range),
        num_points_cross=num_points_cross,
        num_points_self=[num_points_self] * 3,
        transformerlayers=[
            self_cross_layer,
            self_cross_layer,
            self_cross_layer,
            self_cross_layer],
        num_layers=4),
    # NeuS-style SDF rendering head; depth-only (no uniform-SDF / second-grad
    # outputs, no color branch: color_dims=0).
    head=dict(
        type='NeuSHead',
        roi_aabb=point_cloud_range,
        resolution=0.4,
        near_plane=0.0,
        far_plane=1e10,
        num_samples=256,
        num_samples_importance=0,
        num_up_sample_steps=0,
        base_variance=4,
        beta_init=0.1,
        beta_max=0.195,
        # NOTE(review): 3516 matches the nuScenes per-epoch iteration count while
        # this file's LR schedule uses 4976 iters/epoch -- confirm intended.
        total_iters=3516*11,
        beta_hand_tune=False,
        use_numerical_gradients=False,
        sample_gradient=True,
        return_uniform_sdf=False,
        return_second_grad=False,
        # rays args
        ray_sample_mode='cellular', # fixed, cellular
        ray_number=num_rays, # 192 * 400
        ray_img_size=img_size,
        ray_upper_crop=0,
        # img2lidar args
        trans_kw='temImg2lidar',
        novel_view=None,
        # render args
        render_bkgd='random',
        # bev nerf
        # bev_inner=bev_inner,
        # bev_outer=bev_outer,
        # range_inner=range_inner,
        # range_outer=range_outer,
        # nonlinear_mode=nonlinear_mode,
        # z_inner=z_inner,
        # z_outer=z_outer,
        # z_ranges=z_ranges,
        mapping_args=mapping_args,
        # mlp decoder
        embed_dims=_dim_,
        color_dims=0,
        density_layers=2,
        sh_deg=0,
        sh_act='relu',
        two_split=False,
        tpv=True))
================================================
FILE: config/nuscenes/nuscenes_depth.py
================================================
# SelfOcc self-supervised depth config for nuScenes (6 surround cameras,
# sweep-based temporal supervision); mmcv-style config merged over _base_.
_base_ = [
    '../_base_/dataset_v1.py',
    '../_base_/optimizer.py',
    '../_base_/schedule.py',
]
# img_size = [768, 1600]
img_size = [896, 1600]   # network input crop (H, W)
crop_size = [900, 1600]  # full image crop used for ray/loss computation
num_rays = [48, 100]     # rendered ray grid (rows, cols)
amp = False
max_epochs = 24
warmup_iters = 1000
multisteplr = True
# Step LR decay at iteration 3516 * 18 (t_in_epochs=False => counted in iterations).
multisteplr_config = dict(
    decay_t = [3516 * 18],
    decay_rate = 0.1,
    warmup_t = warmup_iters,
    warmup_lr_init = 1e-6,
    t_in_epochs = False
)
optimizer = dict(
    optimizer=dict(
        type='AdamW',
        lr=1e-4,
        weight_decay=0.01,
        # eps=1e-4
    ),
    # Image backbone trains with a 10x smaller learning rate.
    paramwise_cfg=dict(
        custom_keys={
            'img_backbone': dict(lr_mult=0.1),}
    ),
)
data_path = 'data/nuscenes/'
# Sweep-distance-sampled one-frame dataset; _delete_ replaces the _base_ dataset.
train_dataset_config = dict(
    _delete_=True,
    type='nuScenes_One_Frame_Sweeps_Dist',
    data_path = data_path,
    imageset = 'data/nuscenes_infos_train_sweeps.pkl',
    crop_size = crop_size,
    input_img_crop_size = img_size,
    min_dist = 0.4,
    max_dist = 6.0,
    strict = True,
    return_depth = False,
    eval_depth = 80,
    cur_prob = 1.0,
    prev_prob = 0.5,
    choose_nearest = True,
    ref_sensor = 'CAM_FRONT',
    composite_prev_next=True,
    sensor_mus=[0.5, 0.5],
    sensor_sigma=0.5,
)
val_dataset_config = dict(
    _delete_=True,
    type='nuScenes_One_Frame_Sweeps_Dist',
    data_path = data_path,
    imageset = 'data/nuscenes_infos_val_sweeps.pkl',
    crop_size = crop_size,
    input_img_crop_size = img_size,
    min_dist = 0.4,
    max_dist = 6.0,
    strict = False,
    return_depth = True,   # evaluation needs ground-truth depth
    eval_depth = 80,       # depth evaluation capped at 80 m
    cur_prob = 1,
    prev_prob = 0.5,
    choose_nearest = True,
    ref_sensor = 'CAM_FRONT',
    composite_prev_next=True,
    sensor_mus=[0.5, 0.5],
    sensor_sigma=0.5,
)
train_wrapper_config = dict(
    type='tpvformer_dataset_nuscenes_temporal',
    phase='train',
    scale_rate=0.5,
    photometric_aug=dict(
        use_swap_channel=False,
    ),
    # pad_img_size=[416, 864],
    # random_scale=[0.92, 1.08],
    # pad_scale_rate=[0.5417, 0.54],
    # use_flip=True
)
val_wrapper_config = dict(
    type='tpvformer_dataset_nuscenes_temporal',
    phase='val',
    scale_rate=0.5,
    photometric_aug=dict(
        use_swap_channel=False,
    ),
    # pad_img_size=[416, 864],
    # pad_scale_rate=[0.5417, 0.54],
)
train_loader = dict(
    batch_size = 1,
    shuffle = True,
    num_workers = 1,
)
val_loader = dict(
    batch_size = 1,
    shuffle = False,
    num_workers = 1,
)
# Depth-only variant: combined photometric reprojection + eikonal + 3D edge
# smoothness; note reprojection operates at crop_size, not the network img_size.
loss = dict(
    type='MultiLoss',
    loss_cfgs=[
        dict(
            type='ReprojLossMonoMultiNewCombine',
            weight=1.0,
            no_ssim=False,
            img_size=crop_size,
            ray_resize=num_rays,
            no_automask=False,
            input_dict={
                'curr_imgs': 'curr_imgs',
                'prev_imgs': 'prev_imgs',
                'next_imgs': 'next_imgs',
                'ray_indices': 'ray_indices',
                'weights': 'weights',
                'ts': 'ts',
                'metas': 'metas',
                'ms_rays': 'ms_rays',
                # 'deltas': 'deltas'
            }),
        # dict(
        #     type='RGBLossMS',
        #     weight=0.1,
        #     img_size=img_size,
        #     no_ssim=False,
        #     ray_resize=num_rays,
        #     input_dict={
        #         'ms_colors': 'ms_colors',
        #         'ms_rays': 'ms_rays',
        #         'gt_imgs': 'color_imgs'}),
        dict(
            type='EikonalLoss',
            weight=0.1,),
        dict(
            type='EdgeLoss3DMS',
            weight=0.01,
            img_size=crop_size,
            ray_resize=num_rays,
        ),
        # dict(
        #     type='SecondGradLoss',
        #     weight=0.01),
        # dict(
        #     type='SparsityLoss',
        #     weight=0.001,
        #     scale=0.1,
        #     input_dict={
        #         'density': 'uniform_sdf'}),
    ])
# Maps model output keys to the loss-input keys (identity mapping here);
# commented entries correspond to the disabled losses above.
loss_input_convertion = dict(
    ms_depths='ms_depths',
    ms_rays='ms_rays',
    # ms_accs='ms_accs',
    # ms_colors='ms_colors',
    ray_indices='ray_indices',
    weights='weights',
    ts='ts',
    eik_grad='eik_grad',
    # second_grad='second_grad',
    # uniform_sdf='uniform_sdf',
    # deltas='deltas'
)
load_from = ''
_dim_ = 96               # shared embedding width
_ffn_dim_ = 2 * _dim_
num_heads = 6
# TPV grid <-> metric-space mapping; [inner, outer] size/range pairs whose exact
# semantics are defined by the encoder's mapping module -- confirm there.
mapping_args = dict(
    nonlinear_mode='linear',
    h_size=[128, 0],
    h_range=[51.2, 0],
    h_half=False,
    w_size=[128, 0],
    w_range=[51.2, 0],
    w_half=False,
    d_size=[30, 0],
    d_range=[-4.0, 5.0, 5.0]
)
# Legacy non-uniform grid parameters, superseded by mapping_args.
# bev_inner = 160
# bev_outer = 1
# range_inner = 80.0
# range_outer = 1.0
# nonlinear_mode = 'linear_upscale'
# z_inner = 20
# z_outer = 10
# z_ranges = [-4.0, 4.0, 12.0]
# TPV plane resolutions (presumably mapping_args sizes plus one boundary vertex).
tpv_h = 1 + 2 * 128
tpv_w = 1 + 2 * 128
tpv_z = 1 + 30
# Scene bounds in meters; used as positional-encoding tot_range and head roi_aabb.
point_cloud_range = [-51.2, -51.2, -4.0, 51.2, 51.2, 5.0]
num_points_cross = [48, 48, 8]
num_points_self = 12
# One encoder layer: cross-view self-attention over the three TPV planes,
# then deformable cross-attention into 4 levels of features from 6 cameras.
self_cross_layer = dict(
    type='TPVFormerLayer',
    attn_cfgs=[
        dict(
            type='CrossViewHybridAttention',
            embed_dims=_dim_,
            num_heads=num_heads,
            num_levels=3,
            num_points=num_points_self,
            dropout=0.1,
            batch_first=True),
        dict(
            type='TPVCrossAttention',
            embed_dims=_dim_,
            num_cams=6,
            dropout=0.1,
            batch_first=True,
            num_heads=num_heads,
            num_levels=4,
            num_points=num_points_cross)
    ],
    feedforward_channels=_ffn_dim_,
    ffn_dropout=0.1,
    operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm')
)
model = dict(
    type='TPVSegmentor',
    img_backbone_out_indices=[0, 1, 2, 3],
    # ResNet-50 image backbone, initialized from a local torchvision checkpoint.
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=0,
        norm_eval=False,
        style='pytorch',
        pretrained='./ckpts/resnet50-0676ba61.pth'),
    img_neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=4,
        relu_before_extra_convs=True),
    # Learnable query embeddings for the three TPV planes.
    lifter=dict(
        type='TPVQueryLifter',
        tpv_h=tpv_h,
        tpv_w=tpv_w,
        tpv_z=tpv_z,
        dim=_dim_),
    encoder=dict(
        type='TPVFormerEncoder',
        # bev_inner=bev_inner,
        # bev_outer=bev_outer,
        # range_inner=range_inner,
        # range_outer=range_outer,
        # nonlinear_mode=nonlinear_mode,
        # z_inner=z_inner,
        # z_outer=z_outer,
        # z_ranges=z_ranges,
        mapping_args=mapping_args,
        # camera_aware=True,
        # camera_aware_mid_channels=96,
        embed_dims=_dim_,
        num_cams=6,
        num_feature_levels=4,
        positional_encoding=dict(
            type='TPVPositionalEncoding',
            num_freqs=[12] * 3,
            embed_dims=_dim_,
            tot_range=point_cloud_range),
        num_points_cross=num_points_cross,
        num_points_self=[num_points_self] * 3,
        transformerlayers=[
            self_cross_layer,
            self_cross_layer,
            self_cross_layer,
            self_cross_layer],
        num_layers=4),
    # NeuS-style SDF rendering head; depth-only (no color branch: color_dims=0).
    # Rays are generated at crop_size rather than the network input size.
    head=dict(
        type='NeuSHead',
        roi_aabb=point_cloud_range,
        resolution=0.4,
        near_plane=0.0,
        far_plane=1e10,
        num_samples=256,
        num_samples_importance=0,
        num_up_sample_steps=0,
        base_variance=4,
        beta_init=0.1,
        beta_max=0.195,
        total_iters=3516*11,   # beta annealing horizon in iterations (3516 per epoch)
        beta_hand_tune=False,
        use_numerical_gradients=False,
        sample_gradient=True,
        return_uniform_sdf=False,
        return_second_grad=False,
        # rays args
        ray_sample_mode='cellular', # fixed, cellular
        ray_number=num_rays, # 192 * 400
        ray_img_size=crop_size,
        ray_upper_crop=0,
        # img2lidar args
        trans_kw='temImg2lidar',
        novel_view=None,
        # render args
        render_bkgd='random',
        # bev nerf
        # bev_inner=bev_inner,
        # bev_outer=bev_outer,
        # range_inner=range_inner,
        # range_outer=range_outer,
        # nonlinear_mode=nonlinear_mode,
        # z_inner=z_inner,
        # z_outer=z_outer,
        # z_ranges=z_ranges,
        mapping_args=mapping_args,
        # mlp decoder
        embed_dims=_dim_,
        color_dims=0,
        density_layers=2,
        sh_deg=0,
        sh_act='relu',
        two_split=False,
        tpv=True))
================================================
FILE: config/nuscenes/nuscenes_novel_depth.py
================================================
# SelfOcc novel-view depth synthesis config for nuScenes (6 surround cameras);
# mmcv-style config merged on top of the _base_ files. Differs from the plain
# depth config mainly in cur_prob=0.333 (rendering from non-current frames),
# a larger max_dist, and enabled RGB / second-gradient losses.
_base_ = [
    '../_base_/dataset_v1.py',
    '../_base_/optimizer.py',
    '../_base_/schedule.py',
]
img_size = [768, 1600]   # input image size (H, W)
num_rays = [48, 100]     # rendered ray grid (rows, cols)
amp = False
max_epochs = 24
warmup_iters = 1000
multisteplr = True
# Step LR decay at iteration 3516 * 18 (t_in_epochs=False => counted in iterations).
multisteplr_config = dict(
    decay_t = [3516 * 18],
    decay_rate = 0.1,
    warmup_t = warmup_iters,
    warmup_lr_init = 1e-6,
    t_in_epochs = False
)
optimizer = dict(
    optimizer=dict(
        type='AdamW',
        lr=1e-4,
        weight_decay=0.01,
        # eps=1e-4
    ),
    # Image backbone trains with a 10x smaller learning rate.
    paramwise_cfg=dict(
        custom_keys={
            'img_backbone': dict(lr_mult=0.1),}
    ),
)
data_path = 'data/nuscenes/'
# Sweep-distance-sampled one-frame dataset; _delete_ replaces the _base_ dataset.
train_dataset_config = dict(
    _delete_=True,
    type='nuScenes_One_Frame_Sweeps_Dist',
    data_path = data_path,
    imageset = 'data/nuscenes_infos_train_sweeps.pkl',
    crop_size = img_size,
    min_dist = 0.4,
    max_dist = 10.0,
    strict = True,
    return_depth = False,
    eval_depth = 80,
    cur_prob = 0.333,   # only 1/3 of samples supervise from the current frame
    prev_prob = 0.5,
    choose_nearest = True,
    ref_sensor = 'CAM_FRONT',
    composite_prev_next=True,
    sensor_mus=[0.5, 0.5],
    sensor_sigma=0.5,
)
val_dataset_config = dict(
    _delete_=True,
    type='nuScenes_One_Frame_Sweeps_Dist',
    data_path = data_path,
    imageset = 'data/nuscenes_infos_val_sweeps.pkl',
    crop_size = img_size,
    min_dist = 0.4,
    max_dist = 10.0,
    strict = False,
    return_depth = True,   # evaluation needs ground-truth depth
    eval_depth = 80,
    cur_prob = 1,
    prev_prob = 0.5,
    choose_nearest = True,
    ref_sensor = 'CAM_FRONT',
    composite_prev_next=True,
    sensor_mus=[0.5, 0.5],
    sensor_sigma=0.5,
)
train_wrapper_config = dict(
    type='tpvformer_dataset_nuscenes_temporal',
    phase='train',
    scale_rate=0.5,
    photometric_aug=dict(
        use_swap_channel=False,
    )
)
val_wrapper_config = dict(
    type='tpvformer_dataset_nuscenes_temporal',
    phase='val',
    scale_rate=0.5,
    photometric_aug=dict(
        use_swap_channel=False,
    )
)
train_loader = dict(
    batch_size = 1,
    shuffle = True,
    num_workers = 1,
)
val_loader = dict(
    batch_size = 1,
    shuffle = False,
    num_workers = 1,
)
# Novel-view variant: combined photometric reprojection + rendered RGB +
# eikonal + second-order SDF gradient regularization.
loss = dict(
    type='MultiLoss',
    loss_cfgs=[
        dict(
            type='ReprojLossMonoMultiNewCombine',
            weight=1.0,
            no_ssim=False,
            img_size=img_size,
            ray_resize=num_rays,
            input_dict={
                'curr_imgs': 'curr_imgs',
                'prev_imgs': 'prev_imgs',
                'next_imgs': 'next_imgs',
                'ray_indices': 'ray_indices',
                'weights': 'weights',
                'ts': 'ts',
                'metas': 'metas',
                'ms_rays': 'ms_rays',
                # 'deltas': 'deltas'
            }),
        dict(
            type='RGBLossMS',
            weight=0.1,
            img_size=img_size,
            no_ssim=False,
            ray_resize=num_rays,
            input_dict={
                'ms_colors': 'ms_colors',
                'ms_rays': 'ms_rays',
                'gt_imgs': 'color_imgs'}),
        dict(
            type='EikonalLoss',
            weight=0.1,),
        dict(
            type='SecondGradLoss',
            weight=0.01),
        # dict(
        #     type='SparsityLoss',
        #     weight=0.001,
        #     scale=0.1,
        #     input_dict={
        #         'density': 'uniform_sdf'}),
    ])
# Maps model output keys to the loss-input keys (identity mapping here).
loss_input_convertion = dict(
    ms_depths='ms_depths',
    ms_rays='ms_rays',
    # ms_accs='ms_accs',
    ms_colors='ms_colors',
    ray_indices='ray_indices',
    weights='weights',
    ts='ts',
    eik_grad='eik_grad',
    second_grad='second_grad',
    # uniform_sdf='uniform_sdf',
    # deltas='deltas'
)
load_from = ''
_dim_ = 96               # shared embedding width
_ffn_dim_ = 2 * _dim_
num_heads = 6
# TPV grid <-> metric-space mapping; [inner, outer] size/range pairs whose exact
# semantics are defined by the encoder's mapping module -- confirm there.
mapping_args = dict(
    nonlinear_mode='linear',
    h_size=[128, 0],
    h_range=[51.2, 0],
    h_half=False,
    w_size=[128, 0],
    w_range=[51.2, 0],
    w_half=False,
    d_size=[30, 0],
    d_range=[-4.0, 5.0, 5.0]
)
# Legacy non-uniform grid parameters, superseded by mapping_args.
# bev_inner = 160
# bev_outer = 1
# range_inner = 80.0
# range_outer = 1.0
# nonlinear_mode = 'linear_upscale'
# z_inner = 20
# z_outer = 10
# z_ranges = [-4.0, 4.0, 12.0]
# TPV plane resolutions (presumably mapping_args sizes plus one boundary vertex).
tpv_h = 1 + 2 * 128
tpv_w = 1 + 2 * 128
tpv_z = 1 + 30
# Scene bounds in meters; used as positional-encoding tot_range and head roi_aabb.
point_cloud_range = [-51.2, -51.2, -4.0, 51.2, 51.2, 5.0]
num_points_cross = [48, 48, 8]
num_points_self = 12
# One encoder layer: cross-view self-attention over the three TPV planes,
# then deformable cross-attention into 4 levels of features from 6 cameras.
self_cross_layer = dict(
    type='TPVFormerLayer',
    attn_cfgs=[
        dict(
            type='CrossViewHybridAttention',
            embed_dims=_dim_,
            num_heads=num_heads,
            num_levels=3,
            num_points=num_points_self,
            dropout=0.1,
            batch_first=True),
        dict(
            type='TPVCrossAttention',
            embed_dims=_dim_,
            num_cams=6,
            dropout=0.1,
            batch_first=True,
            num_heads=num_heads,
            num_levels=4,
            num_points=num_points_cross)
    ],
    feedforward_channels=_ffn_dim_,
    ffn_dropout=0.1,
    operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm')
)
model = dict(
    type='TPVSegmentor',
    img_backbone_out_indices=[0, 1, 2, 3],
    # ResNet-50 image backbone, initialized from a local torchvision checkpoint.
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=0,
        norm_eval=False,
        style='pytorch',
        pretrained='./ckpts/resnet50-0676ba61.pth'),
    img_neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=4,
        relu_before_extra_convs=True),
    # Learnable query embeddings for the three TPV planes.
    lifter=dict(
        type='TPVQueryLifter',
        tpv_h=tpv_h,
        tpv_w=tpv_w,
        tpv_z=tpv_z,
        dim=_dim_),
    encoder=dict(
        type='TPVFormerEncoder',
        # bev_inner=bev_inner,
        # bev_outer=bev_outer,
        # range_inner=range_inner,
        # range_outer=range_outer,
        # nonlinear_mode=nonlinear_mode,
        # z_inner=z_inner,
        # z_outer=z_outer,
        # z_ranges=z_ranges,
        mapping_args=mapping_args,
        embed_dims=_dim_,
        num_cams=6,
        num_feature_levels=4,
        positional_encoding=dict(
            type='TPVPositionalEncoding',
            num_freqs=[12] * 3,
            embed_dims=_dim_,
            tot_range=point_cloud_range),
        num_points_cross=num_points_cross,
        num_points_self=[num_points_self] * 3,
        transformerlayers=[
            self_cross_layer,
            self_cross_layer,
            self_cross_layer,
            self_cross_layer],
        num_layers=4),
    # NeuS-style SDF rendering head with an RGB branch (color_dims=3) and
    # second-order gradient output for SecondGradLoss.
    head=dict(
        type='NeuSHead',
        roi_aabb=point_cloud_range,
        resolution=0.4,
        near_plane=0.0,
        far_plane=1e10,
        num_samples=256,
        num_samples_importance=0,
        num_up_sample_steps=0,
        base_variance=4,
        beta_init=0.1,
        beta_max=0.195,
        total_iters=3516*11,   # beta annealing horizon in iterations (3516 per epoch)
        beta_hand_tune=False,
        use_numerical_gradients=False,
        sample_gradient=True,
        return_uniform_sdf=False,
        return_second_grad=True,
        # rays args
        ray_sample_mode='cellular', # fixed, cellular
        ray_number=num_rays, # 192 * 400
        ray_img_size=img_size,
        ray_upper_crop=0,
        # img2lidar args
        trans_kw='temImg2lidar',
        novel_view=None,
        # render args
        render_bkgd='random',
        # bev nerf
        # bev_inner=bev_inner,
        # bev_outer=bev_outer,
        # range_inner=range_inner,
        # range_outer=range_outer,
        # nonlinear_mode=nonlinear_mode,
        # z_inner=z_inner,
        # z_outer=z_outer,
        # z_ranges=z_ranges,
        mapping_args=mapping_args,
        # mlp decoder
        embed_dims=_dim_,
        color_dims=3,
        density_layers=2,
        sh_deg=0,
        sh_act='relu',
        two_split=False,
        tpv=True))
================================================
FILE: config/nuscenes/nuscenes_occ.py
================================================
# SelfOcc semantic occupancy config for nuScenes (6 surround cameras,
# sem=True enables the semantic branch); mmcv-style config merged over _base_.
_base_ = [
    '../_base_/dataset_v1.py',
    '../_base_/optimizer.py',
    '../_base_/schedule.py',
]
img_size = [768, 1600]   # input image size (H, W)
num_rays = [48, 100]     # rendered ray grid (rows, cols)
amp = False
max_epochs = 12
warmup_iters = 1000
sem = True               # train with semantic supervision (SemCELossMS below)
multisteplr = True
# Step LR decay at iteration 3516 * 9 (t_in_epochs=False => counted in iterations).
multisteplr_config = dict(
    decay_t = [3516 * 9],
    decay_rate = 0.1,
    warmup_t = warmup_iters,
    warmup_lr_init = 1e-6,
    t_in_epochs = False
)
optimizer = dict(
    optimizer=dict(
        type='AdamW',
        lr=1e-4,
        weight_decay=0.01,
        # eps=1e-4
    ),
    # Image backbone trains with a 10x smaller learning rate.
    paramwise_cfg=dict(
        custom_keys={
            'img_backbone': dict(lr_mult=0.1),}
    ),
)
data_path = 'data/nuscenes/'
# Sweep-distance-sampled one-frame dataset in ego-centric coordinates;
# _delete_ replaces the _base_ dataset.
train_dataset_config = dict(
    _delete_=True,
    type='nuScenes_One_Frame_Sweeps_Dist',
    data_path = data_path,
    imageset = 'data/nuscenes_infos_train_sweeps.pkl',
    crop_size = img_size,
    min_dist = 0.4,
    max_dist = 30.0,
    strict = True,
    return_depth = False,
    eval_depth = 80,
    cur_prob = 0.333,   # only 1/3 of samples supervise from the current frame
    prev_prob = 0.5,
    choose_nearest = True,
    ref_sensor = 'CAM_FRONT',
    composite_prev_next=True,
    sensor_mus=[0.5, 0.5],
    sensor_sigma=0.5,
    ego_centric=True,
)
val_dataset_config = dict(
    _delete_=True,
    type='nuScenes_One_Frame_Sweeps_Dist',
    data_path = data_path,
    imageset = 'data/nuscenes_infos_val_sweeps.pkl',
    crop_size = img_size,
    min_dist = 0.4,
    max_dist = 30.0,
    strict = False,
    return_depth = True,   # evaluation needs ground-truth depth
    eval_depth = 80,
    cur_prob = 1,
    prev_prob = 0.5,
    choose_nearest = True,
    ref_sensor = 'CAM_FRONT',
    composite_prev_next=True,
    sensor_mus=[0.5, 0.5],
    sensor_sigma=0.5,
    ego_centric=True
)
train_wrapper_config = dict(
    type='tpvformer_dataset_nuscenes_temporal',
    phase='train',
    scale_rate=0.5,
    photometric_aug=dict(
        use_swap_channel=False,
    )
)
val_wrapper_config = dict(
    type='tpvformer_dataset_nuscenes_temporal',
    phase='val',
    scale_rate=0.5,
    photometric_aug=dict(
        use_swap_channel=False,
    )
)
train_loader = dict(
    batch_size = 1,
    shuffle = True,
    num_workers = 1,
)
val_loader = dict(
    batch_size = 1,
    shuffle = False,
    num_workers = 1,
)
# Occupancy variant: photometric reprojection + rendered RGB + eikonal +
# second-order gradient + semantic cross-entropy on rendered semantics.
loss = dict(
    type='MultiLoss',
    loss_cfgs=[
        dict(
            type='ReprojLossMonoMultiNewCombine',
            weight=1.0,
            no_ssim=False,
            img_size=img_size,
            ray_resize=num_rays,
            input_dict={
                'curr_imgs': 'curr_imgs',
                'prev_imgs': 'prev_imgs',
                'next_imgs': 'next_imgs',
                'ray_indices': 'ray_indices',
                'weights': 'weights',
                'ts': 'ts',
                'metas': 'metas',
                'ms_rays': 'ms_rays',
                # 'deltas': 'deltas'
            }),
        dict(
            type='RGBLossMS',
            weight=0.1,
            img_size=img_size,
            no_ssim=False,
            ray_resize=num_rays,
            input_dict={
                'ms_colors': 'ms_colors',
                'ms_rays': 'ms_rays',
                'gt_imgs': 'color_imgs'}),
        dict(
            type='EikonalLoss',
            weight=0.1,),
        dict(
            type='SecondGradLoss',
            weight=0.01),
        dict(
            type='SemCELossMS',
            weight=0.1,
            img_size=img_size,
            ray_resize=num_rays,
            input_dict={
                'sem': 'sem',
                'metas': 'metas',
                'ms_rays': 'ms_rays'}),
        # dict(
        #     type='AdaptiveSparsityLoss',
        #     weight=0.001,
        #     slack=4.0,
        #     input_dict={
        #         'sdfs': 'sdfs',
        #         'ts': 'ts',
        #         'ms_depths': 'ms_depths'})
        # dict(
        #     type='SparsityLoss',
        #     weight=0.001,
        #     scale=0.1,
        #     input_dict={
        #         'density': 'uniform_sdf'}),
    ])
# Maps model output keys to the loss-input keys (identity mapping here).
loss_input_convertion = dict(
    ms_depths='ms_depths',
    ms_rays='ms_rays',
    # ms_accs='ms_accs',
    ms_colors='ms_colors',
    ray_indices='ray_indices',
    weights='weights',
    ts='ts',
    eik_grad='eik_grad',
    second_grad='second_grad',
    sem='sem',
    # sdfs='sample_sdf'
    # uniform_sdf='uniform_sdf',
    # deltas='deltas'
)
load_from = ''
_dim_ = 96               # shared embedding width
_ffn_dim_ = 2 * _dim_
num_heads = 6
# TPV grid <-> metric-space mapping; 80 m x 80 m x 6.4 m occupancy volume.
# [inner, outer] size/range semantics are defined by the encoder's mapping
# module -- confirm there.
mapping_args = dict(
    nonlinear_mode='linear',
    h_size=[128, 0],
    h_range=[40.0, 0],
    h_half=False,
    w_size=[128, 0],
    w_range=[40.0, 0],
    w_half=False,
    d_size=[24, 0],
    d_range=[-1.0, 5.4, 5.4]
)
# Legacy non-uniform grid parameters, superseded by mapping_args.
# bev_inner = 160
# bev_outer = 1
# range_inner = 80.0
# range_outer = 1.0
# nonlinear_mode = 'linear_upscale'
# z_inner = 20
# z_outer = 10
# z_ranges = [-4.0, 4.0, 12.0]
# TPV plane resolutions (presumably mapping_args sizes plus one boundary vertex).
tpv_h = 1 + 2 * 128
tpv_w = 1 + 2 * 128
tpv_z = 1 + 24
# Scene bounds in meters; used as positional-encoding tot_range and head roi_aabb.
point_cloud_range = [-40.0, -40.0, -1.0, 40.0, 40.0, 5.4]
num_points_cross = [48, 48, 8]
num_points_self = 12
# One encoder layer: cross-view self-attention over the three TPV planes,
# then deformable cross-attention into 4 levels of features from 6 cameras.
self_cross_layer = dict(
    type='TPVFormerLayer',
    attn_cfgs=[
        dict(
            type='CrossViewHybridAttention',
            embed_dims=_dim_,
            num_heads=num_heads,
            num_levels=3,
            num_points=num_points_self,
            dropout=0.1,
            batch_first=True),
        dict(
            type='TPVCrossAttention',
            embed_dims=_dim_,
            num_cams=6,
            dropout=0.1,
            batch_first=True,
            num_heads=num_heads,
            num_levels=4,
            num_points=num_points_cross)
    ],
    feedforward_channels=_ffn_dim_,
    ffn_dropout=0.1,
    operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm')
)
model = dict(
    type='TPVSegmentor',
    img_backbone_out_indices=[0, 1, 2, 3],
    # ResNet-50 image backbone, initialized from a local torchvision checkpoint.
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=0,
        norm_eval=False,
        style='pytorch',
        pretrained='./ckpts/resnet50-0676ba61.pth'),
    img_neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=4,
        relu_before_extra_convs=True),
    # Learnable query embeddings for the three TPV planes.
    lifter=dict(
        type='TPVQueryLifter',
        tpv_h=tpv_h,
        tpv_w=tpv_w,
        tpv_z=tpv_z,
        dim=_dim_),
    encoder=dict(
        type='TPVFormerEncoder',
        # bev_inner=bev_inner,
        # bev_outer=bev_outer,
        # range_inner=range_inner,
        # range_outer=range_outer,
        # nonlinear_mode=nonlinear_mode,
        # z_inner=z_inner,
        # z_outer=z_outer,
        # z_ranges=z_ranges,
        mapping_args=mapping_args,
        embed_dims=_dim_,
        num_cams=6,
        num_feature_levels=4,
        positional_encoding=dict(
            type='TPVPositionalEncoding',
            num_freqs=[12] * 3,
            embed_dims=_dim_,
            tot_range=point_cloud_range),
        num_points_cross=num_points_cross,
        num_points_self=[num_points_self] * 3,
        transformerlayers=[
            self_cross_layer,
            self_cross_layer,
            self_cross_layer,
            self_cross_layer],
        num_layers=4),
    # NeuS-style SDF rendering head with a semantic branch (return_sem=True).
    head=dict(
        type='NeuSHead',
        roi_aabb=point_cloud_range,
        resolution=0.4,
        near_plane=0.0,
        far_plane=1e10,
        num_samples=256,
        num_samples_importance=0,
        num_up_sample_steps=0,
        base_variance=4,
        beta_init=0.1,
        beta_max=0.195,
        total_iters=3516*11,   # beta annealing horizon in iterations (3516 per epoch)
        beta_hand_tune=False,
        use_numerical_gradients=False,
        sample_gradient=True,
        return_uniform_sdf=False,
        return_second_grad=True,
        return_sem=True,
        return_sample_sdf=False,
        # rays args
        ray_sample_mode='cellular', # fixed, cellular
        ray_number=num_rays, # 192 * 400
        ray_img_size=img_size,
        ray_upper_crop=0,
        # img2lidar args
        trans_kw='temImg2lidar',
        novel_view=None,
        # render args
        render_bkgd='random',
        # bev nerf
        # bev_inner=bev_inner,
        # bev_outer=bev_outer,
        # range_inner=range_inner,
        # range_outer=range_outer,
        # nonlinear_mode=nonlinear_mode,
        # z_inner=z_inner,
        # z_outer=z_outer,
        # z_ranges=z_ranges,
        mapping_args=mapping_args,
        # mlp decoder
        # 24 decoder output channels -- presumably the per-class semantic logits
        # given sem=True; confirm against the NeuSHead decoder.
        embed_dims=_dim_,
        color_dims=24,
        density_layers=2,
        sh_deg=0,
        sh_act='relu',
        two_split=False,
        tpv=True))
================================================
FILE: config/nuscenes/nuscenes_occ_bev.py
================================================
# SelfOcc BEV variant: same self-supervised NeuS rendering pipeline as the TPV
# config, but with a single BEV plane (BEVFormer-style encoder) instead of
# three TPV planes.
_base_ = [
    '../_base_/dataset_v1.py',
    '../_base_/optimizer.py',
    '../_base_/schedule.py',
]

# ---- input / schedule -------------------------------------------------------
img_size = [768, 1600]   # (H, W) crop of the raw nuScenes images
num_rays = [48, 100]     # rays rendered per image, as an (H, W) grid
amp = False
max_epochs = 12
warmup_iters = 1000
sem = True               # enable semantic supervision branch

multisteplr = True
multisteplr_config = dict(
    decay_t = [3516 * 9],     # presumably 3516 iters per epoch, decay after epoch 9 -- TODO confirm
    decay_rate = 0.1,
    warmup_t = warmup_iters,
    warmup_lr_init = 1e-6,
    t_in_epochs = False       # decay_t / warmup_t are in iterations, not epochs
)

optimizer = dict(
    optimizer=dict(
        type='AdamW',
        lr=1e-4,
        weight_decay=0.01,
        # eps=1e-4
    ),
    paramwise_cfg=dict(
        custom_keys={
            'img_backbone': dict(lr_mult=0.1),}   # pretrained backbone trains at 10% lr
    ),
)

# ---- datasets ---------------------------------------------------------------
data_path = 'data/nuscenes/'

train_dataset_config = dict(
    _delete_=True,
    type='nuScenes_One_Frame_Sweeps_Dist',
    data_path = data_path,
    imageset = 'data/nuscenes_infos_train_sweeps.pkl',
    crop_size = img_size,
    min_dist = 0.4,            # temporal neighbors must be >0.4m away
    max_dist = 30.0,           # ... and within 30m
    strict = True,             # train: do not fall back to the sample itself
    return_depth = False,
    eval_depth = 80,
    cur_prob = 0.333,          # probability of supervising with the current frame
    prev_prob = 0.5,           # else: prev vs next split
    choose_nearest = True,
    ref_sensor = 'CAM_FRONT',
    composite_prev_next=True,
    sensor_mus=[0.5, 0.5],     # [front/back mu, side-camera mu] for neighbor-distance sampling
    sensor_sigma=0.5,
    ego_centric=True,
)

val_dataset_config = dict(
    _delete_=True,
    type='nuScenes_One_Frame_Sweeps_Dist',
    data_path = data_path,
    imageset = 'data/nuscenes_infos_val_sweeps.pkl',
    crop_size = img_size,
    min_dist = 0.4,
    max_dist = 30.0,
    strict = False,            # val: the sample itself is an allowed neighbor
    return_depth = True,       # val also returns LiDAR depth for evaluation
    eval_depth = 80,
    cur_prob = 1,              # val always supervises with the current frame
    prev_prob = 0.5,
    choose_nearest = True,
    ref_sensor = 'CAM_FRONT',
    composite_prev_next=True,
    sensor_mus=[0.5, 0.5],
    sensor_sigma=0.5,
    ego_centric=True
)

train_wrapper_config = dict(
    type='tpvformer_dataset_nuscenes_temporal',
    phase='train',
    scale_rate=0.5,            # images downscaled by 2 before the network
    photometric_aug=dict(
        use_swap_channel=False,
    )
)

val_wrapper_config = dict(
    type='tpvformer_dataset_nuscenes_temporal',
    phase='val',
    scale_rate=0.5,
    photometric_aug=dict(
        use_swap_channel=False,
    )
)

train_loader = dict(
    batch_size = 1,
    shuffle = True,
    num_workers = 1,
)

val_loader = dict(
    batch_size = 1,
    shuffle = False,
    num_workers = 1,
)

# ---- losses -----------------------------------------------------------------
loss = dict(
    type='MultiLoss',
    loss_cfgs=[
        dict(
            # photometric reprojection loss against temporally adjacent frames
            type='ReprojLossMonoMultiNewCombine',
            weight=1.0,
            no_ssim=False,
            img_size=img_size,
            ray_resize=num_rays,
            input_dict={
                'curr_imgs': 'curr_imgs',
                'prev_imgs': 'prev_imgs',
                'next_imgs': 'next_imgs',
                'ray_indices': 'ray_indices',
                'weights': 'weights',
                'ts': 'ts',
                'metas': 'metas',
                'ms_rays': 'ms_rays',
                # 'deltas': 'deltas'
            }),
        dict(
            # rendered-color RGB loss
            type='RGBLossMS',
            weight=0.1,
            img_size=img_size,
            no_ssim=False,
            ray_resize=num_rays,
            input_dict={
                'ms_colors': 'ms_colors',
                'ms_rays': 'ms_rays',
                'gt_imgs': 'color_imgs'}),
        dict(
            # unit-norm SDF gradient regularizer
            type='EikonalLoss',
            weight=0.1,),
        dict(
            type='SecondGradLoss',
            weight=0.01),
        dict(
            # 2D semantic segmentation supervision on rendered semantics
            type='SemLossMS',
            weight=0.1,
            img_size=img_size,
            ray_resize=num_rays,
            input_dict={
                'sem': 'sem',
                'metas': 'metas',
                'ms_rays': 'ms_rays'}),
        dict(
            # sparsity prior on uniformly sampled SDF values
            type='SoftSparsityLoss',
            weight=0.001,
            input_dict={
                'density': 'uniform_sdf'})
        # dict(
        #     type='SparsityLoss',
        #     weight=0.001,
        #     scale=0.1,
        #     input_dict={
        #         'density': 'uniform_sdf'}),
    ])

# maps head outputs to the names the losses consume
loss_input_convertion = dict(
    ms_depths='ms_depths',
    ms_rays='ms_rays',
    # ms_accs='ms_accs',
    ms_colors='ms_colors',
    ray_indices='ray_indices',
    weights='weights',
    ts='ts',
    eik_grad='eik_grad',
    second_grad='second_grad',
    sem='sem',
    # sdfs='sample_sdf'
    uniform_sdf='uniform_sdf',
    # deltas='deltas'
)

# ---- model hyperparameters --------------------------------------------------
load_from = ''
_dim_ = 96
_ffn_dim_ = 2 * _dim_
num_heads = 6

# nonlinear grid-to-metric mapping of the BEV volume
mapping_args = dict(
    nonlinear_mode='linear',
    h_size=[128, 0],
    h_range=[40.0, 0],
    h_half=False,
    w_size=[128, 0],
    w_range=[40.0, 0],
    w_half=False,
    d_size=[24, 0],
    d_range=[-1.0, 5.4, 5.4]
)
# bev_inner = 160
# bev_outer = 1
# range_inner = 80.0
# range_outer = 1.0
# nonlinear_mode = 'linear_upscale'
# z_inner = 20
# z_outer = 10
# z_ranges = [-4.0, 4.0, 12.0]

tpv_h = 1 + 2 * 128   # 257: center cell + 128 cells each side
tpv_w = 1 + 2 * 128
tpv_z = 1 + 24
point_cloud_range = [-40.0, -40.0, -1.0, 40.0, 40.0, 5.4]
num_points_cross = 8
num_points_self = 12

# one encoder layer = deformable self-attention + image cross-attention + FFN
self_cross_layer = dict(
    type='BEVFormerLayer',
    attn_cfgs=[
        dict(
            type='MultiScaleDeformableAttention',
            embed_dims=_dim_,
            num_heads=num_heads,
            num_levels=1,
            num_points=num_points_self,
            dropout=0.1,
            batch_first=True),
        dict(
            type='BEVCrossAttention',
            embed_dims=_dim_,
            num_cams=6,
            dropout=0.1,
            batch_first=True,
            deformable_attention=dict(
                type='BEVDeformableAttention',
                embed_dims=_dim_,
                num_heads=num_heads,
                num_levels=4,
                num_points=num_points_cross,
                dropout=0.1,
                batch_first=True))
    ],
    feedforward_channels=_ffn_dim_,
    ffn_dropout=0.1,
    operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm')
)

# ---- full model -------------------------------------------------------------
model = dict(
    type='TPVSegmentor',
    img_backbone_out_indices=[0, 1, 2, 3],
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=0,
        norm_eval=False,
        style='pytorch',
        pretrained='./ckpts/resnet50-0676ba61.pth'),
    img_neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=4,
        relu_before_extra_convs=True),
    lifter=dict(
        # learnable queries for the single BEV plane
        type='BEVQueryLifter',
        bev_h=tpv_h,
        bev_w=tpv_w,
        dim=_dim_),
    encoder=dict(
        type='BEVFormerEncoder',
        # bev_inner=bev_inner,
        # bev_outer=bev_outer,
        # range_inner=range_inner,
        # range_outer=range_outer,
        # nonlinear_mode=nonlinear_mode,
        # z_inner=z_inner,
        # z_outer=z_outer,
        # z_ranges=z_ranges,
        mapping_args=mapping_args,
        # camera_aware=True,
        # camera_aware_mid_channels=96,
        embed_dims=_dim_,
        num_cams=6,
        num_feature_levels=4,
        positional_encoding=dict(
            type='BEVPositionalEncoding',
            num_freqs=12,
            embed_dims=_dim_,
            tot_range=point_cloud_range),
        num_points_cross=num_points_cross,
        num_points_self=num_points_self,
        transformerlayers=[
            self_cross_layer,
            self_cross_layer,
            self_cross_layer,
            self_cross_layer],
        num_layers=4),
    head=dict(
        type='NeuSHead',
        roi_aabb=point_cloud_range,
        resolution=0.4,
        near_plane=0.0,
        far_plane=1e10,
        num_samples=256,
        num_samples_importance=0,
        num_up_sample_steps=0,
        base_variance=4,
        beta_init=0.1,
        beta_max=0.195,
        total_iters=3516*11,       # presumably iters-per-epoch * 11 -- TODO confirm
        beta_hand_tune=False,
        use_numerical_gradients=False,
        sample_gradient=True,
        return_uniform_sdf=True,   # needed by SoftSparsityLoss above
        return_second_grad=True,
        return_sem=True,
        return_sample_sdf=False,
        # rays args
        ray_sample_mode='cellular', # fixed, cellular
        ray_number=num_rays, # 192 * 400
        ray_img_size=img_size,
        ray_upper_crop=0,
        # img2lidar args
        trans_kw='temImg2lidar',
        novel_view=None,
        # render args
        render_bkgd='random',
        # bev nerf
        # bev_inner=bev_inner,
        # bev_outer=bev_outer,
        # range_inner=range_inner,
        # range_outer=range_outer,
        # nonlinear_mode=nonlinear_mode,
        # z_inner=z_inner,
        # z_outer=z_outer,
        # z_ranges=z_ranges,
        mapping_args=mapping_args,
        # mlp decoder
        embed_dims=_dim_,
        color_dims=24,
        density_layers=2,
        sh_deg=0,
        sh_act='relu',
        two_split=False,
        tpv=False))                # single BEV plane, not TPV
================================================
FILE: config/openseed/openseed_swint_lang.yaml
================================================
# --------------------------------------------------------
# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
# ------------------------------------------------------------------------
# OpenSeeD
# Copyright (c) 2023 IDEA. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# Modified from MaskDINO https://github.com/IDEA-Research/MaskDINO by Feng Li and Hao Zhang.
# ------------------------------------------------------------------------
##################
# Task settings
##################
WEIGHT: ''
PORT: 53711
VERBOSE: true
OUTPUT_DIR: '../../data/output/test'
# misc
LOADER:
JOINT: True
KEY_DATASET: 'coco'
# model
MODEL:
NAME: openseed_model
HEAD: openseed_head
MASK_ON: false
KEYPOINT_ON: false
LOAD_PROPOSALS: false
DIM_PROJ: 512
BACKBONE_DIM: 768
BACKGROUND: False
WEIGHTS: ''
TEXT:
ARCH: encoder
NAME: transformer
TOKENIZER: clip
CONTEXT_LENGTH: 18 # 18
WIDTH: 512
HEADS: 8
LAYERS: 12
    AUTOGRESSIVE: True  # (sic) upstream OpenSeeD key name -- presumably "autoregressive"; the consuming code reads this exact key, do not rename
BACKBONE:
NAME: swin
PRETRAINED: 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth'
LOAD_PRETRAINED: true
SWIN:
PRETRAIN_IMG_SIZE: 224
PATCH_SIZE: 4
EMBED_DIM: 96
DEPTHS: [ 2, 2, 6, 2 ]
NUM_HEADS: [ 3, 6, 12, 24 ]
WINDOW_SIZE: 7
MLP_RATIO: 4.0
QKV_BIAS: true
QK_SCALE: ~
DROP_RATE: 0.0
ATTN_DROP_RATE: 0.0
DROP_PATH_RATE: 0.3
APE: false
PATCH_NORM: true
USE_CHECKPOINT: false
OUT_FEATURES: [ 'res2', 'res3', 'res4', 'res5' ]
ENCODER:
NAME: encoder_deform
IGNORE_VALUE: 255
NUM_CLASSES: 133
LOSS_WEIGHT: 1.0
CONVS_DIM: 256
MASK_DIM: 256
NORM: "GN"
IN_FEATURES: [ "res2", "res3", "res4", "res5" ]
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: [ "res3", "res4", "res5" ]
COMMON_STRIDE: 4
TRANSFORMER_ENC_LAYERS: 6
TOTAL_NUM_FEATURE_LEVELS: 4
NUM_FEATURE_LEVELS: 3
FEATURE_ORDER: "low2high"
DECODER:
NAME: openseed_decoder
TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
MASK: True
BOX: True
GROUNDING:
ENABLED: False
MAX_LEN: 5
TEXT_WEIGHT: 2.0
CLASS_WEIGHT: 0.5
CAPTION:
ENABLED: False
PHRASE_PROB: 0.0
SIM_THRES: 0.95
CAPTIONING:
ENABLED: False
STEP: 50
RETRIEVAL:
ENABLED: False
DIM_IMG: 768
ENSEMBLE: True
OPENIMAGE:
ENABLED: False
NEGATIVE_SAMPLES: 5
GROUNDING:
ENABLED: False
MAX_LEN: 5
DEEP_SUPERVISION: True
NO_OBJECT_WEIGHT: 0.1
CLASS_WEIGHT: 4.0
MASK_WEIGHT: 5.0
DICE_WEIGHT: 5.0
BOX_WEIGHT: 5.0
GIOU_WEIGHT: 2.0
COST_CLASS_WEIGHT: 4.0
COST_DICE_WEIGHT: 5.0
COST_MASK_WEIGHT: 5.0
COST_BOX_WEIGHT: 5.0
COST_GIOU_WEIGHT: 2.0
HIDDEN_DIM: 256
NUM_OBJECT_QUERIES: 300
NHEADS: 8
DROPOUT: 0.0
DIM_FEEDFORWARD: 2048
ENC_LAYERS: 0
PRE_NORM: False
ENFORCE_INPUT_PROJ: False
SIZE_DIVISIBILITY: 32
DEC_LAYERS: 9 # 9 decoder layers, add one for the loss on learnable query
TRAIN_NUM_POINTS: 12544
OVERSAMPLE_RATIO: 3.0
IMPORTANCE_SAMPLE_RATIO: 0.75
TWO_STAGE: True
INITIALIZE_BOX_TYPE: 'no'
DN: seg
DN_NOISE_SCALE: 0.4
DN_NUM: 100
INITIAL_PRED: True
LEARN_TGT: False
TOTAL_NUM_FEATURE_LEVELS: 4
SEMANTIC_CE_LOSS: False
PANO_BOX_LOSS: False
COCO: True
O365: False
TEST:
SEMANTIC_ON: True
INSTANCE_ON: True
PANOPTIC_ON: True
OVERLAP_THRESHOLD: 0.8
OBJECT_MASK_THRESHOLD: 0.25
SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: false
    TEST_FOUCUS_ON_BOX: False  # (sic) upstream OpenSeeD key name -- presumably "focus"; keep the spelling as consumed
PANO_TRANSFORM_EVAL: True
PANO_TEMPERATURE: 0.06
TEST:
EVAL_PERIOD: 5000
PRECISE_BN:
NUM_ITER: 1
ENABLED: False
AUG:
ENABLED: False
COCO:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 0.1
MAX_SCALE: 2.0
DATASET_MAPPER_NAME: "coco_panoptic_lsj"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
DATASET:
DATASET: 'coco'
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 8
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 16
BATCH_SIZE_PER_GPU: 2
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 2
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
VLP:
INPUT:
IMAGE_SIZE: 224
DATASET_MAPPER_NAME: "vlpretrain"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
TRAIN:
BATCH_SIZE_TOTAL: 2
BATCH_SIZE_PER_GPU: 2
TEST:
BATCH_SIZE_TOTAL: 256
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 16
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
DATASETS:
TRAIN: ["coco_2017_train_panoptic"]
TEST: ["coco_2017_val_panoptic_with_sem_seg"]
CLASS_CONCAT: false
SIZE_DIVISIBILITY: 32
PROPOSAL_FILES_TRAIN: []
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 16
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
# Detectron2 training config for optimizer and lr scheduler
SOLVER:
BASE_LR_END: 0.0
MOMENTUM: 0.9
NESTEROV: False
CHECKPOINT_PERIOD: 5000
IMS_PER_BATCH: 16
REFERENCE_WORLD_SIZE: 0
BIAS_LR_FACTOR: 1.0
WEIGHT_DECAY_BIAS: None
# original
BASE_LR: 0.0001
STEPS: [327778, 355092]
MAX_ITER: 368750
GAMMA: 0.1
WARMUP_FACTOR: 1.0
WARMUP_ITERS: 10
WARMUP_METHOD: "linear"
WEIGHT_DECAY: 0.05
OPTIMIZER: "ADAMW"
LR_SCHEDULER_NAME: "WarmupMultiStepLR"
LR_MULTIPLIER:
backbone: 0.1
lang_encoder: 0.1
WEIGHT_DECAY_NORM: 0.0
WEIGHT_DECAY_EMBED: 0.0
CLIP_GRADIENTS:
ENABLED: True
CLIP_TYPE: "full_model"
CLIP_VALUE: 0.01
NORM_TYPE: 2.0
AMP:
ENABLED: True
# Evaluation Dataset
ADE20K:
INPUT:
MIN_SIZE_TRAIN: [320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 1088, 1152, 1216, 1280]
MIN_SIZE_TRAIN_SAMPLING: "choice"
MIN_SIZE_TEST: 640
MAX_SIZE_TRAIN: 2560
MAX_SIZE_TEST: 2560
MASK_FORMAT: "polygon"
CROP:
ENABLED: True
TYPE: "absolute"
SIZE: [640, 640]
SINGLE_CATEGORY_MAX_AREA: 1.0
IGNORE_VALUE: 255
COLOR_AUG_SSD: True
SIZE_DIVISIBILITY: 640 # used in dataset mapper
DATASET_MAPPER_NAME: "mask_former_panoptic"
FORMAT: "RGB"
DATASET:
DATASET: 'ade'
TRAIN:
ASPECT_RATIO_GROUPING: true
BATCH_SIZE_TOTAL: 16
BATCH_SIZE_PER_GPU: 2
SHUFFLE: true
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 8
MODEL_FILE: ''
AUG:
ENABLED: False
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 8
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
REF:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 512
MAX_SIZE_TEST: 1024
FORMAT: "RGB"
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
SUN:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 512
MAX_SIZE_TEST: 1024
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
SCAN:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 512
MAX_SIZE_TEST: 1024
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
BDD:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
CITY:
INPUT:
MIN_SIZE_TRAIN: [ 512, 614, 716, 819, 921, 1024, 1126, 1228, 1331, 1433, 1536, 1638, 1740, 1843, 1945, 2048 ]
MIN_SIZE_TRAIN_SAMPLING: "choice"
MIN_SIZE_TEST: 1024
MAX_SIZE_TRAIN: 4096
MAX_SIZE_TEST: 2048
CROP:
ENABLED: True
TYPE: "absolute"
SIZE: [ 512, 1024 ]
SINGLE_CATEGORY_MAX_AREA: 1.0
IGNORE_VALUE: 255
COLOR_AUG_SSD: True
SIZE_DIVISIBILITY: -1
FORMAT: "RGB"
DATASET_MAPPER_NAME: "mask_former_panoptic"
MASK_FORMAT: "polygon"
TEST:
EVAL_PERIOD: 5000
BATCH_SIZE_TOTAL: 1
AUG:
ENABLED: False
MIN_SIZES: [ 512, 768, 1024, 1280, 1536, 1792 ]
MAX_SIZE: 4096
FLIP: True
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: True
NUM_WORKERS: 2
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
TRAIN:
ASPECT_RATIO_GROUPING: true
BATCH_SIZE_TOTAL: 2
BATCH_SIZE_PER_GPU: 2
SHUFFLE: true
================================================
FILE: dataset/__init__.py
================================================
from mmengine.registry import Registry
OPENOCC_DATASET = Registry('openocc_dataset')
OPENOCC_DATAWRAPPER = Registry('openocc_datawrapper')
from .dataset_one_frame_sweeps_dist import nuScenes_One_Frame_Sweeps_Dist
from .dataset_one_frame_sweeps_dist_vis import nuScenes_One_Frame_Sweeps_Dist_Vis
from .dataset_wrapper_vis import tpvformer_dataset_nuscenes_vis
from .dataset_one_frame_eval import nuScenes_One_Frame_Eval
from .dataset_wrapper_temporal import tpvformer_dataset_nuscenes_temporal, custom_collate_fn_temporal
from .sampler import CustomDistributedSampler
from .kitti.kitti_dataset_eval import Kitti_Novel_View_Eval
from .kitti_raw.kitti_raw_dataset import Kitti_Raw
from .kitti_raw.kitti_raw_dataset_stereo import Kitti_Raw_Stereo
from .kitti.kitti_dataset_one_frame import Kitti_One_Frame
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data.dataloader import DataLoader
def get_dataloader(
    train_dataset_config,
    val_dataset_config,
    train_wrapper_config,
    val_wrapper_config,
    train_loader,
    val_loader,
    nusc=dict(
        version='v1.0-trainval',
        dataroot='data/nuscenes'),
    dist=False,
    iter_resume=False,
    train_sampler_config=dict(
        shuffle=True,
        drop_last=True),
    val_sampler_config=dict(
        shuffle=False,
        drop_last=False),
    val_only=False,
):
    """Build the train and val DataLoaders from registry configs.

    Args:
        train_dataset_config / val_dataset_config: configs built via OPENOCC_DATASET.
        train_wrapper_config / val_wrapper_config: configs built via OPENOCC_DATAWRAPPER,
            each wrapping the corresponding dataset.
        train_loader / val_loader: dicts with 'batch_size', 'shuffle', 'num_workers'.
        nusc: forwarded verbatim to the dataset builders (treated as read-only here);
            the default dict is never mutated.
        dist: if True, use DistributedSampler and force shuffle=False on the loaders.
        iter_resume: if True (and dist), use CustomDistributedSampler for training
            so iteration state can be resumed mid-epoch.
        train_sampler_config / val_sampler_config: kwargs for the samplers.
        val_only: if True, skip building the train pipeline entirely.

    Returns:
        (train_dataset_loader, val_dataset_loader); the first is None when val_only.
    """
    # NOTE: kept for reference -- the raw `nusc` dict is passed through as-is.
    # if nusc is not None:
    #     from nuscenes import NuScenes
    #     nusc = NuScenes(**nusc)

    def _build_wrapper(dataset_config, wrapper_config):
        # Build the raw dataset, then wrap it with the augmentation/format wrapper.
        dataset = OPENOCC_DATASET.build(
            dataset_config,
            default_args={'nusc': nusc})
        return OPENOCC_DATAWRAPPER.build(
            wrapper_config,
            default_args={'in_dataset': dataset})

    # The val pipeline is needed in both modes; build it exactly once
    # (previously this block was duplicated in the val_only branch).
    val_wrapper = _build_wrapper(val_dataset_config, val_wrapper_config)
    val_sampler = DistributedSampler(val_wrapper, **val_sampler_config) if dist else None
    val_dataset_loader = DataLoader(
        dataset=val_wrapper,
        batch_size=val_loader["batch_size"],
        collate_fn=custom_collate_fn_temporal,
        shuffle=False,
        sampler=val_sampler,
        num_workers=val_loader["num_workers"],
        pin_memory=True)

    if val_only:
        return None, val_dataset_loader

    train_wrapper = _build_wrapper(train_dataset_config, train_wrapper_config)
    train_sampler = None
    if dist:
        # CustomDistributedSampler supports resuming from a mid-epoch iteration.
        sampler_cls = CustomDistributedSampler if iter_resume else DistributedSampler
        train_sampler = sampler_cls(train_wrapper, **train_sampler_config)
    train_dataset_loader = DataLoader(
        dataset=train_wrapper,
        batch_size=train_loader["batch_size"],
        collate_fn=custom_collate_fn_temporal,
        # a sampler and shuffle=True are mutually exclusive in DataLoader
        shuffle=False if dist else train_loader["shuffle"],
        sampler=train_sampler,
        num_workers=train_loader["num_workers"],
        pin_memory=True)
    return train_dataset_loader, val_dataset_loader
================================================
FILE: dataset/dataset_one_frame_eval.py
================================================
import os, numpy as np, pickle, random
from mmcv.image.io import imread
from pyquaternion import Quaternion
from copy import deepcopy
from . import OPENOCC_DATASET
if 'HFAI' in os.environ:
hfai = True
from dataset.loading import LoadMultiViewImageFromFilesHF, \
LoadPtsFromFilesHF
else:
hfai = False
@OPENOCC_DATASET.register_module()
class nuScenes_One_Frame_Eval:
    """nuScenes evaluation dataset: one keyframe plus its temporal neighbors.

    Each item returns (input_imgs, anchor_imgs, img_metas), where
    `anchor_imgs` are surround images of the precomputed 'nice_neighbor'
    prev/next frames and `img_metas` carries the projection matrices and the
    LiDAR depth ground truth projected into each anchor view.
    """

    def __init__(
        self,
        data_path,
        imageset='train',                  # path to a pickle with 'infos' (+ 'scene_info')
        nusc=None,
        crop_size=[768, 1600],             # (H, W); mutable default, but only read
        cam_types=None,
        eval_depth=80,                     # max depth (m) kept for evaluation
        scene_name=None,                   # restrict to one scene, e.g. 'scene-0001'
        **kwargs):
        with open(imageset, 'rb') as f:
            data = pickle.load(f)
        self.nusc_infos = data['infos']
        if scene_name is not None:
            assert isinstance(scene_name, str) and scene_name.startswith('scene-')
            # scene_info maps scene name -> inclusive (first, last) index range
            selected_indices = data['scene_info'][scene_name]
            self.selected_indices = list(range(selected_indices[0], selected_indices[1] + 1))
        else:
            self.selected_indices = list(range(len(self.nusc_infos)))
        self.data_path = data_path
        self.nusc = nusc
        self.crop_size = crop_size
        self.cam_types = [
            'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_FRONT_LEFT',
            'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_BACK_RIGHT'
        ] if cam_types is None else cam_types
        self.eval_depth = eval_depth
        if hfai:
            # HFAI cluster: images/points are packed into ffrecord archives.
            self.img_loader = LoadMultiViewImageFromFilesHF(
                to_float32=True,
                file_client_args=dict(
                    backend='ffrecord',
                    fname=data_path+'CAM',
                    filename2idx='img_fname2idx.pkl'))
            self.pts_loader = LoadPtsFromFilesHF(
                to_float32=True,
                file_client_args=dict(
                    backend='ffrecord',
                    fname=data_path+'LIDAR',
                    filename2idx='pts_fname2idx.pkl'))
        else:
            self.img_loader = self.pts_loader = None
        self.lidar2cam_rect = np.eye(4)

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.selected_indices)

    def get_depth_from_lidar(self, lidar_path, lidar2img, image_size):
        """Project a LiDAR sweep into all cameras.

        Returns (img_points, depth, mask): normalized [0,1] image coords
        (N, n, 2), per-point depth (N, n), and a validity mask keeping points
        in front of the camera, closer than eval_depth, and inside the image.
        """
        # lidar2img: N, 4, 4
        scan = np.fromfile(lidar_path, dtype=np.float32)
        scan = scan.reshape((-1, 5))[:, :4]   # x, y, z, (intensity -> overwritten)
        scan[:, 3] = 1.0                      # homogeneous coordinates
        # points_hcoords = scan[scan[:, 0] > 0, :]
        points_hcoords = np.expand_dims(scan.T, 0) # 1, 4, n
        img_points = np.transpose(lidar2img @ points_hcoords, (0, 2, 1)) # N, n, 4
        depth = img_points[..., 2] # N, n
        img_points = img_points[..., :2] # N, n, 2
        mask = (depth < self.eval_depth) & (depth > 1e-3) # get points with depth < max_sample_depth
        img_points = img_points / np.expand_dims(depth, axis=2) # scale 2D points
        # normalize to [0, 1]; image_size is (H, W)
        img_points[..., 0] = img_points[..., 0] / image_size[1]
        img_points[..., 1] = img_points[..., 1] / image_size[0]
        # img_points = np.round(img_points).astype(int)
        mask = mask & (img_points[..., 0] > 0) & \
            (img_points[..., 1] > 0) & \
            (img_points[..., 0] < 1) & \
            (img_points[..., 1] < 1)
        return img_points, depth, mask

    def __getitem__(self, index):
        #### 2. get self, prev, next infos for the stem, and also temp_depth info
        while True:
            index = self.selected_indices[index]
            info = deepcopy(self.nusc_infos[index])
            # resample if this keyframe has no usable temporal neighbors
            if len(info['nice_neighbor_prev']) == 0 and len(info['nice_neighbor_next']) ==0:
                index = np.random.randint(len(self))
                continue
            break
        #### 3. prepare img_metas
        imgs_info = self.get_data_info(info)
        img_metas = {
            'input_imgs_path': imgs_info['img_filename'],
            'lidar2img': imgs_info['lidar2img'],
            'img2lidar': imgs_info['img2lidar'],
            'ego2lidar': imgs_info['ego2lidar'],
            'token': info['token'],
            'timestamp': info['timestamp'],
            'intrinsic': imgs_info['cam_intrinsic']}
        anchor_imgs = [] # list[list[array]]
        anchor_depth_locs = [] # list[array]
        anchor_depth_gts = [] # list[array]
        anchor_depth_masks = [] # list[array]
        temImg2lidars = [] # list[list[array]]
        anchor_frame_dist = \
            info['prev_dists'] + info['next_dists'] # list[float]
        # collect images, img->lidar transforms and projected LiDAR depth
        # for every prev/next anchor frame
        for anchor in info['nice_neighbor_prev'] + info['nice_neighbor_next']:
            anchor_info = deepcopy(self.nusc_infos[anchor])
            anchor_imgs_info = self.get_data_info_temporal(info, anchor_info)
            anchor_img_path = anchor_imgs_info['image_paths']
            temImg2lidar = anchor_imgs_info['temImg2lidar']
            anchor_img = self.read_surround_imgs(anchor_img_path)
            # NOTE(review): depth is projected with the *current* frame's
            # lidar2img but the *anchor* frame's sweep -- presumably intended
            # for temporal depth evaluation; confirm against the evaluator.
            depth_loc, depth_gt, depth_mask = self.get_depth_from_lidar(
                anchor_info['lidar_path'], img_metas['lidar2img'], self.crop_size)
            anchor_imgs.append(anchor_img)
            anchor_depth_locs.append(depth_loc)
            anchor_depth_gts.append(depth_gt)
            anchor_depth_masks.append(depth_mask)
            temImg2lidars.append(temImg2lidar)
        img_metas.update({
            'depth_loc': anchor_depth_locs,
            'depth_gt': anchor_depth_gts,
            'depth_mask': anchor_depth_masks,
            'temImg2lidars': temImg2lidars,
            'frame_dists': anchor_frame_dist
        })
        #### 4. read imgs
        input_imgs = self.read_surround_imgs(img_metas['input_imgs_path'])
        data_tuple = (input_imgs, anchor_imgs, img_metas)
        return data_tuple

    def read_surround_imgs(self, img_paths):
        """Load the six surround images and crop each to crop_size (H, W)."""
        if hfai:
            imgs = self.img_loader.load(img_paths)
        else:
            imgs = []
            for filename in img_paths:
                imgs.append(
                    imread(filename, 'unchanged').astype(np.float32))
        imgs = [img[:self.crop_size[0], :self.crop_size[1], :] for img in imgs]
        return imgs

    def get_data_info_temporal(self, info, info_tem):
        """Compute per-camera image->lidar transforms from a temporal frame
        (info_tem) into the current frame's (info) lidar coordinates."""
        image_paths = []
        temImg2lidars = []
        # current frame: lidar -> ego -> global, then inverted
        lidar2ego_r = Quaternion(info['lidar2ego_rotation']).rotation_matrix
        lidar2ego = np.eye(4)
        lidar2ego[:3, :3] = lidar2ego_r
        lidar2ego[:3, 3] = np.array(info['lidar2ego_translation']).T
        ego2global_r = Quaternion(info['ego2global_rotation']).rotation_matrix
        ego2global = np.eye(4)
        ego2global[:3, :3] = ego2global_r
        ego2global[:3, 3] = np.array(info['ego2global_translation']).T
        lidar2global = ego2global @ lidar2ego
        global2lidar = np.linalg.inv(lidar2global)
        for cam_type_tem, cam_info_tem in info_tem['cams'].items():
            image_paths.append(cam_info_tem['data_path'])
            # temporal frame: image -> camera (inverse padded intrinsics)
            temIntrinsic = cam_info_tem['cam_intrinsic']
            temImg2temSensor = np.eye(4)
            temImg2temSensor[:3, :3] = temIntrinsic
            temImg2temSensor = np.linalg.inv(temImg2temSensor)
            # temporal frame: camera -> ego -> global
            temSensor2temEgo_r = Quaternion(cam_info_tem['sensor2ego_rotation']).rotation_matrix
            temSensor2temEgo = np.eye(4)
            temSensor2temEgo[:3, :3] = temSensor2temEgo_r
            temSensor2temEgo[:3, 3] = np.array(cam_info_tem['sensor2ego_translation']).T
            temEgo2global_r = Quaternion(cam_info_tem['ego2global_rotation']).rotation_matrix
            temEgo2global = np.eye(4)
            temEgo2global[:3, :3] = temEgo2global_r
            temEgo2global[:3, 3] = np.array(cam_info_tem['ego2global_translation']).T
            temImg2global = temEgo2global @ temSensor2temEgo @ temImg2temSensor
            # chain through global into the current frame's lidar coordinates
            temImg2lidar = global2lidar @ temImg2global
            temImg2lidars.append(temImg2lidar)
        out_dict = dict(
            image_paths=image_paths,
            temImg2lidar=np.asarray(temImg2lidars)
        )
        return out_dict

    def get_data_info(self, info):
        """Collect per-camera lidar2img/img2lidar/intrinsics and ego2lidar."""
        image_paths = []
        lidar2img_rts = []
        img2lidar_rts = []
        cam_intrinsics = []
        lidar2ego_r = Quaternion(info['lidar2ego_rotation']).rotation_matrix
        lidar2ego = np.eye(4)
        lidar2ego[:3, :3] = lidar2ego_r
        lidar2ego[:3, 3] = np.array(info['lidar2ego_translation']).T
        ego2lidar = np.linalg.inv(lidar2ego)
        for cam_type, cam_info in info['cams'].items():
            image_paths.append(cam_info['data_path'])
            # lidar2cam_rt is assembled transposed (translation in the last
            # row); it is used via .T below, so lidar2img = K_pad @ [R|t]
            lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation'])
            lidar2cam_t = cam_info['sensor2lidar_translation'] @ lidar2cam_r.T
            lidar2cam_rt = np.eye(4)
            lidar2cam_rt[:3, :3] = lidar2cam_r.T
            lidar2cam_rt[3, :3] = -lidar2cam_t
            intrinsic = cam_info['cam_intrinsic']
            viewpad = np.eye(4)   # 4x4 zero-padded intrinsic matrix
            viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic
            lidar2img_rt = (viewpad @ lidar2cam_rt.T)
            img2lidar_rt = np.linalg.inv(lidar2img_rt)
            cam_intrinsics.append(viewpad)
            lidar2img_rts.append(lidar2img_rt)
            img2lidar_rts.append(img2lidar_rt)
        input_dict =dict(
            img_filename=image_paths,
            lidar2img=np.asarray(lidar2img_rts),
            img2lidar=np.asarray(img2lidar_rts),
            cam_intrinsic=np.asarray(cam_intrinsics),
            ego2lidar=ego2lidar)
        return input_dict
================================================
FILE: dataset/dataset_one_frame_sweeps_dist.py
================================================
import os, numpy as np, random, mmengine, math
from mmcv.image.io import imread
from pyquaternion import Quaternion
from copy import deepcopy
from . import OPENOCC_DATASET
if 'HFAI' in os.environ:
hfai = True
from dataset.loading import LoadMultiViewImageFromFilesHF, \
LoadPtsFromFilesHF
else:
hfai = False
def get_xyz(pose_dict):
    """Return the translation component of a pose record as a NumPy vector."""
    translation = pose_dict['translation']
    return np.array(translation)
def get_img2global(calib_dict, pose_dict):
    """Homogeneous 4x4 transform from image-plane to global coordinates.

    Composes image->camera (inverse padded intrinsics), camera->ego
    (sensor calibration) and ego->global (ego pose).
    """
    def _se3(record):
        # 4x4 rigid transform from a {'rotation': quat, 'translation': xyz} record
        mat = np.eye(4)
        mat[:3, :3] = Quaternion(record['rotation']).rotation_matrix
        mat[:3, 3] = np.asarray(record['translation']).T
        return mat

    cam2img = np.eye(4)
    cam2img[:3, :3] = np.asarray(calib_dict['camera_intrinsic'])
    img2cam = np.linalg.inv(cam2img)
    cam2ego = _se3(calib_dict)
    ego2global = _se3(pose_dict)
    return ego2global @ cam2ego @ img2cam
def get_lidar2global(calib_dict, pose_dict):
    """Homogeneous 4x4 transform from lidar to global coordinates.

    Composes lidar->ego (sensor calibration) with ego->global (ego pose).
    """
    def _se3(record):
        # 4x4 rigid transform from a {'rotation': quat, 'translation': xyz} record
        mat = np.eye(4)
        mat[:3, :3] = Quaternion(record['rotation']).rotation_matrix
        mat[:3, 3] = np.asarray(record['translation']).T
        return mat

    return _se3(pose_dict) @ _se3(calib_dict)
@OPENOCC_DATASET.register_module()
class nuScenes_One_Frame_Sweeps_Dist:
    def __init__(
        self,
        data_path,
        imageset='train',                  # path to a pickle with 'infos' and 'metadata'
        crop_size=[768, 1600],             # (H, W) supervision-image crop; read-only
        input_img_crop_size=None,          # input crop; defaults to crop_size
        min_dist=0.4,                      # neighbor must be farther than this (m)
        max_dist=10.0,                     # ... and scanning stops past this (m)
        strict=True,                       # if False, the sample itself (dist 0) is a neighbor
        return_depth=False,
        eval_depth=80,
        cur_prob=1.0,                      # P(supervise with current frame)
        prev_prob=0.5,                     # else: P(prev) vs next
        choose_nearest=False,
        ref_sensor='CAM_FRONT',            # sensor whose pose defines frame distance
        composite_prev_next=False,
        sensor_mus=[3.0, 0.5],             # [front/back mu, side mu] for composite sampling
        sensor_sigma=0.5,
        ego_centric=False,
        **kwargs):
        """Precompute, per sample, its temporal neighbors within (min_dist, max_dist]."""
        data = mmengine.load(imageset)
        self.scene_infos = data['infos']       # scene_token -> list of samples
        self.keyframes = data['metadata']      # list of (scene_token, index)
        self.data_path = data_path
        self.crop_size = crop_size
        self.input_img_crop_size = crop_size if input_img_crop_size is None else input_img_crop_size
        self.strict = strict
        self.return_depth = return_depth
        self.eval_depth = eval_depth
        self.cur_prob = cur_prob
        self.prev_prob = prev_prob
        self.choose_nearest = choose_nearest
        self.composite_prev_next = composite_prev_next
        # per-camera mean distance for composite neighbor sampling:
        # front/back cameras share sensor_mus[0], side cameras sensor_mus[1]
        self.sensor_mus = {
            'CAM_FRONT': sensor_mus[0], 'CAM_FRONT_RIGHT': sensor_mus[1], 'CAM_FRONT_LEFT': sensor_mus[1],
            'CAM_BACK': sensor_mus[0], 'CAM_BACK_LEFT': sensor_mus[1], 'CAM_BACK_RIGHT': sensor_mus[1]
        }
        self.sensor_sigma = sensor_sigma
        self.sensor_types = ['CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_FRONT_LEFT',
            'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_BACK_RIGHT']
        self.ego_centric = ego_centric
        if hfai:
            # HFAI cluster: images/points are packed into ffrecord archives.
            self.img_loader = LoadMultiViewImageFromFilesHF(
                to_float32=True,
                file_client_args=dict(
                    backend='ffrecord',
                    fname=data_path+'CAM',
                    filename2idx='img_fname2idx.pkl'))
            self.pts_loader = LoadPtsFromFilesHF(
                to_float32=True,
                file_client_args=dict(
                    backend='ffrecord',
                    fname=data_path+'LIDAR',
                    filename2idx='pts_fname2idx.pkl'))
        else:
            self.img_loader = self.pts_loader = None
        self.lidar2cam_rect = np.eye(4)
        #### collect temporal information
        for scene_idx, (scene_token, scene_samples) in enumerate(self.scene_infos.items()):
            if (scene_idx + 1) % 50 == 0:
                print(f'One_Frame_Sweeps: processed {scene_idx + 1} scenes.')
            length = len(scene_samples)
            for i, sample in enumerate(scene_samples):
                curr_xyz = get_xyz(sample['data'][ref_sensor]['pose'])
                prev_samples, next_samples = [], []
                prev_dists, next_dists = [], []
                # scan backwards; stop at the first sample beyond max_dist,
                # so the neighbor list is temporally contiguous
                for j in range(i - 1, -1, -1):
                    temp_xyz = get_xyz(scene_samples[j]['data'][ref_sensor]['pose'])
                    temp_dist = np.linalg.norm(curr_xyz - temp_xyz)
                    if temp_dist > max_dist:
                        break
                    if temp_dist > min_dist:
                        prev_samples.append((scene_token, j))
                        prev_dists.append(temp_dist)
                # same scan, forwards
                for j in range(i + 1, length, 1):
                    temp_xyz = get_xyz(scene_samples[j]['data'][ref_sensor]['pose'])
                    temp_dist = np.linalg.norm(curr_xyz - temp_xyz)
                    if temp_dist > max_dist:
                        break
                    if temp_dist > min_dist:
                        next_samples.append((scene_token, j))
                        next_dists.append(temp_dist)
                if not strict:
                    # non-strict: allow supervising with the sample itself
                    prev_samples.append((scene_token, i))
                    prev_dists.append(0.)
                    next_samples.append((scene_token, i))
                    next_dists.append(0.)
                sample.update({
                    'prev_samples': prev_samples,
                    'prev_dists': prev_dists,
                    'next_samples': next_samples,
                    'next_dists': next_dists,})
def __len__(self):
'Denotes the total number of samples'
# return len(self.nusc_infos)
return len(self.keyframes)
    def get_depth_from_lidar(self, lidar_path, lidar2img, image_size):
        """Project a LiDAR sweep (relative path under data_path) into all cameras.

        Returns (img_points, depth, mask): normalized [0,1] image coords
        (N, n, 2), per-point depth (N, n), and a validity mask keeping points
        with 1.0 < depth < eval_depth that land inside the image.
        """
        # lidar2img: N, 4, 4
        scan = np.fromfile(os.path.join(self.data_path, lidar_path), dtype=np.float32)
        scan = scan.reshape((-1, 5))[:, :4]   # x, y, z, (intensity -> overwritten)
        scan[:, 3] = 1.0                      # homogeneous coordinates
        # points_hcoords = scan[scan[:, 0] > 0, :]
        points_hcoords = np.expand_dims(scan.T, 0) # 1, 4, n
        img_points = np.transpose(lidar2img @ points_hcoords, (0, 2, 1)) # N, n, 4
        depth = img_points[..., 2] # N, n
        img_points = img_points[..., :2] # N, n, 2
        # mask = (depth < self.eval_depth) & (depth > 1e-3) # get points with depth < max_sample_depth
        # NOTE: stricter 1.0 m near cutoff here, unlike the eval dataset's 1e-3
        mask = (depth < self.eval_depth) & (depth > 1.0) # get points with depth < max_sample_depth
        img_points = img_points / np.expand_dims(depth, axis=2) # scale 2D points
        # normalize to [0, 1]; image_size is (H, W)
        img_points[..., 0] = img_points[..., 0] / image_size[1]
        img_points[..., 1] = img_points[..., 1] / image_size[0]
        # img_points = np.round(img_points).astype(int)
        mask = mask & (img_points[..., 0] > 0) & \
            (img_points[..., 1] > 0) & \
            (img_points[..., 0] < 1) & \
            (img_points[..., 1] < 1)
        return img_points, depth, mask
def composite_dict(self, anchor_info):
    """Draw one candidate frame per sensor for each temporal direction.

    For 'prev_' and 'next_' candidates of the anchor, each sensor samples a
    frame with probability proportional to a Gaussian over the frame's
    distance to the anchor (mean self.sensor_mus[sensor], std
    self.sensor_sigma), and the chosen per-sensor entries are merged into a
    composite sample.

    Returns:
        ({'data': prev_composite}, {'data': next_composite})
    """
    composites = []
    for prefix in ['prev_', 'next_']:
        dists = np.asarray(anchor_info[prefix + 'dists'])
        candidates = anchor_info[prefix + 'samples']
        merged = {}
        for sensor_type in self.sensor_types:
            mu = self.sensor_mus[sensor_type]
            sigma = self.sensor_sigma
            # Gaussian pdf over candidate distances, renormalized to sum to 1
            weights = 1 / math.sqrt(2 * math.pi) / sigma * np.exp(-1 / (2 * sigma * sigma) * ((dists - mu) ** 2))
            weights = weights / np.sum(weights)
            pick = np.random.choice(len(dists), p=weights)
            scene_token, sample_idx = candidates[pick]
            merged[sensor_type] = self.scene_infos[scene_token][sample_idx]['data'][sensor_type]
        composites.append(merged)
    return {'data': composites[0]}, {'data': composites[1]}
def __getitem__(self, index):
    """Return ([input_imgs, curr_imgs, prev_imgs, next_imgs], img_metas).

    Randomly picks which frame ('curr'/'prev'/'next' relative to the
    keyframe) supplies the photometric supervision, then gathers that
    anchor's own prev/next neighbors and all the relative projection
    matrices needed for reprojection losses.
    """
    #### 1. get color, temporal_depth choice if necessary
    # NOTE(review): two independent draws, so P('prev') is effectively
    # (1 - cur_prob) * prev_prob — confirm that is the intended weighting.
    # cur_prob / prev_prob are set in __init__ (outside this view).
    if random.random() < self.cur_prob:
        temporal_supervision = 'curr'
    elif random.random() < self.prev_prob:
        temporal_supervision = 'prev'
    else:
        temporal_supervision = 'next'
    #### 2. get self, prev, next infos for the stem, and also temp_depth info
    # Loop until a sample with usable temporal neighbors is found; on
    # failure a new random keyframe index is drawn and the loop retries.
    while True:
        scene_token, index = self.keyframes[index]
        info = deepcopy(self.scene_infos[scene_token][index])
        if temporal_supervision == 'curr':
            anchor_info = deepcopy(info)
        elif temporal_supervision == 'prev':
            if len(info['prev_samples']) == 0:
                index = np.random.randint(len(self))
                continue
            anchor_scene_token, anchor_info_id = info['prev_samples'][np.random.randint(len(info['prev_samples']))]
            # anchor_scene_token, anchor_info_id = np.random.choice(info['prev_samples'])
            assert anchor_scene_token == scene_token and anchor_info_id <= index
            anchor_info = deepcopy(self.scene_infos[scene_token][anchor_info_id])
        else:
            if len(info['next_samples']) == 0:
                index = np.random.randint(len(self))
                continue
            anchor_scene_token, anchor_info_id = info['next_samples'][np.random.randint(len(info['next_samples']))]
            # anchor_scene_token, anchor_info_id = np.random.choice(info['next_samples'])
            assert anchor_scene_token == scene_token and anchor_info_id >= index
            anchor_info = deepcopy(self.scene_infos[scene_token][anchor_info_id])
        # The anchor itself must have neighbors on both sides to supervise.
        if len(anchor_info['prev_samples']) == 0 or \
                len(anchor_info['next_samples']) == 0:
            index = np.random.randint(len(self))
            continue
        if self.composite_prev_next:
            # per-sensor Gaussian-weighted mixing of neighbor frames
            anchor_prev, anchor_next = self.composite_dict(anchor_info)
        else:
            if self.choose_nearest:
                # candidates were appended nearest-first during preparation
                anchor_prev_scene_token, anchor_prev_idx = anchor_info['prev_samples'][0]
                anchor_next_scene_token, anchor_next_idx = anchor_info['next_samples'][0]
            else:
                anchor_prev_scene_token, anchor_prev_idx = anchor_info['prev_samples'][np.random.randint(len(anchor_info['prev_samples']))]
                anchor_next_scene_token, anchor_next_idx = anchor_info['next_samples'][np.random.randint(len(anchor_info['next_samples']))]
            # anchor_prev_scene_token, anchor_prev_idx = np.random.choice(anchor_info['prev_samples'])
            # anchor_next_scene_token, anchor_next_idx = np.random.choice(anchor_info['next_samples'])
            assert anchor_prev_scene_token == scene_token and \
                anchor_next_scene_token == scene_token
            anchor_prev = deepcopy(self.scene_infos[scene_token][anchor_prev_idx])
            anchor_next = deepcopy(self.scene_infos[scene_token][anchor_next_idx])
        break
    #### 3. prepare img_metas
    imgs_info = self.get_data_info(info)
    anchor_dict = self.get_data_info_anchor(info, anchor_info)
    prev_dict = self.get_data_info_temporal(anchor_info, anchor_prev)
    next_dict = self.get_data_info_temporal(anchor_info, anchor_next)
    img_metas = {
        'input_imgs_path': imgs_info['img_filename'],
        'curr_imgs_path': anchor_dict['image_paths'],
        'prev_imgs_path': prev_dict['image_paths'],
        'next_imgs_path': next_dict['image_paths'],
        'lidar2img': imgs_info['lidar2img'],
        'img2lidar': imgs_info['img2lidar'],
        'intrinsic': imgs_info['cam_intrinsic'],
        'cam2ego': imgs_info['cam2ego'],
        'temImg2lidar': anchor_dict['temImg2lidar'],
        'ego2lidar': imgs_info['ego2lidar'],
        'token': info['token'],
        'timestamp': info['timestamp'],
        'img2prevImg': prev_dict['img2temImg'],
        'img2nextImg': next_dict['img2temImg'],}
    if self.return_depth:
        # sparse lidar depth supervision at self.crop_size resolution
        depth_loc, depth_gt, depth_mask = self.get_depth_from_lidar(
            info['data']['LIDAR_TOP']['filename'], img_metas['lidar2img'], self.crop_size)
        img_metas.update({
            'depth_loc': depth_loc,
            'depth_gt': depth_gt,
            'depth_mask': depth_mask})
    if self.ego_centric:
        # re-express all projection matrices in the ego frame; ego2lidar
        # then becomes the identity by construction
        ego2lidar = img_metas['ego2lidar']
        lidar2ego = np.linalg.inv(ego2lidar)
        ego2img = img_metas['lidar2img'] @ ego2lidar[None, ...]
        img2ego = lidar2ego[None, ...] @ img_metas['img2lidar']
        temImg2ego = lidar2ego[None, ...] @ img_metas['temImg2lidar']
        img_metas.update({
            'lidar2img': ego2img,
            'img2lidar': img2ego,
            'temImg2lidar': temImg2ego,
            'ego2lidar': np.eye(4)})
    #### 4. read imgs
    input_imgs = self.read_surround_imgs(img_metas['input_imgs_path'], self.input_img_crop_size)
    curr_imgs = self.read_surround_imgs(img_metas['curr_imgs_path'], self.crop_size)
    prev_imgs = self.read_surround_imgs(img_metas['prev_imgs_path'], self.crop_size)
    next_imgs = self.read_surround_imgs(img_metas['next_imgs_path'], self.crop_size)
    data_tuple = (
        [input_imgs, curr_imgs, prev_imgs, next_imgs], img_metas)
    return data_tuple
def read_surround_imgs(self, img_paths, crop_size):
    """Load the surround-view images and crop each to crop_size (H, W).

    Uses the hfai bulk loader when available, otherwise reads each file
    with mmcv's imread ('unchanged' flag) as float32.
    """
    if hfai:
        loaded = self.img_loader.load(img_paths)
    else:
        loaded = [imread(path, 'unchanged').astype(np.float32)
                  for path in img_paths]
    return [im[:crop_size[0], :crop_size[1], :] for im in loaded]
def get_data_info_temporal(self, info, info_tem):
    """Relate each camera of `info` to the same camera in frame `info_tem`.

    Returns:
        dict with 'image_paths' (temporal frame image files) and
        'img2temImg' (N, 4, 4): current-image -> temporal-image transforms
        composed through the global frame.
    """
    image_paths = []
    rel_transforms = []
    for cam_type in self.sensor_types:
        tem_cam = info_tem['data'][cam_type]
        cur_cam = info['data'][cam_type]
        image_paths.append(os.path.join(self.data_path, tem_cam['filename']))
        tem2global = get_img2global(tem_cam['calib'], tem_cam['pose'])
        cur2global = get_img2global(cur_cam['calib'], cur_cam['pose'])
        rel_transforms.append(np.linalg.inv(tem2global) @ cur2global)
    return dict(
        image_paths=image_paths,
        img2temImg=np.asarray(rel_transforms))
def get_data_info_anchor(self, info, info_tem):
    """Map each anchor-frame camera into the keyframe's lidar frame.

    Returns:
        dict with 'image_paths' (anchor image files) and 'temImg2lidar'
        (N, 4, 4): anchor-image -> keyframe-lidar transforms.
    """
    lidar2global = get_lidar2global(info['data']['LIDAR_TOP']['calib'], info['data']['LIDAR_TOP']['pose'])
    image_paths = []
    transforms = []
    for cam_type in self.sensor_types:
        tem_cam = info_tem['data'][cam_type]
        image_paths.append(os.path.join(self.data_path, tem_cam['filename']))
        tem2global = get_img2global(tem_cam['calib'], tem_cam['pose'])
        transforms.append(np.linalg.inv(lidar2global) @ tem2global)
    return dict(
        image_paths=image_paths,
        temImg2lidar=np.asarray(transforms))
def get_data_info(self, info):
    """Collect per-camera projection data for one keyframe.

    Returns:
        dict with image file paths, lidar2img / img2lidar (N, 4, 4),
        4x4-padded intrinsics, cam2ego extrinsics, and ego2lidar.
    """
    lidar_calib = info['data']['LIDAR_TOP']['calib']
    lidar2ego = np.eye(4)
    lidar2ego[:3, :3] = Quaternion(lidar_calib['rotation']).rotation_matrix
    lidar2ego[:3, 3] = np.array(lidar_calib['translation']).T
    ego2lidar = np.linalg.inv(lidar2ego)
    lidar2global = get_lidar2global(lidar_calib, info['data']['LIDAR_TOP']['pose'])

    image_paths = []
    lidar2imgs, img2lidars = [], []
    intrinsics, cam2egos = [], []
    for cam_type in self.sensor_types:
        cam_data = info['data'][cam_type]
        image_paths.append(os.path.join(self.data_path, cam_data['filename']))
        # compose projections through the global frame so per-camera ego
        # poses (captured at slightly different timestamps) are respected
        img2global = get_img2global(cam_data['calib'], cam_data['pose'])
        lidar2imgs.append(np.linalg.inv(img2global) @ lidar2global)
        img2lidars.append(np.linalg.inv(lidar2global) @ img2global)
        cam2ego = np.eye(4)
        cam2ego[:3, :3] = Quaternion(cam_data['calib']['rotation']).rotation_matrix
        cam2ego[:3, 3] = np.array(cam_data['calib']['translation']).T
        cam2egos.append(cam2ego)
        viewpad = np.eye(4)  # intrinsics padded to homogeneous 4x4
        viewpad[:3, :3] = cam_data['calib']['camera_intrinsic']
        intrinsics.append(viewpad)
    return dict(
        img_filename=image_paths,
        lidar2img=np.asarray(lidar2imgs),
        img2lidar=np.asarray(img2lidars),
        cam_intrinsic=np.asarray(intrinsics),
        ego2lidar=ego2lidar,
        cam2ego=np.asarray(cam2egos))
# def get_data_info(self, info):
# image_paths = []
# lidar2img_rts = []
# img2lidar_rts = []
# cam_intrinsics = []
# cam2ego_rts = []
# lidar2ego_r = Quaternion(info['data']['LIDAR_TOP']['calib']['rotation']).rotation_matrix
# lidar2ego = np.eye(4)
# lidar2ego[:3, :3] = lidar2ego_r
# lidar2ego[:3, 3] = np.array(info['data']['LIDAR_TOP']['calib']['translation']).T
# ego2lidar = np.linalg.inv(lidar2ego)
# for cam_type in self.sensor_types:
# image_paths.append(os.path.join(self.data_path, info['data'][cam_type]['filename']))
# cam2ego_r = Quaternion(info['data'][cam_type]['calib']['rotation']).rotation_matrix
# cam2ego = np.eye(4)
# cam2ego[:3, :3] = cam2ego_r
# cam2ego[:3, 3] = np.array(info['data'][cam_type]['calib']['translation']).T
# ego2cam = np.linalg.inv(cam2ego)
# lidar2cam = ego2cam @ lidar2ego
# intrinsic = info['data'][cam_type]['calib']['camera_intrinsic']
# viewpad = np.eye(4)
# viewpad[:3, :3] = intrinsic
# lidar2img = viewpad @ lidar2cam
# img2lidar = np.linalg.inv(lidar2img)
# lidar2img_rts.append(lidar2img)
# img2lidar_rts.append(img2lidar)
# cam_intrinsics.append(viewpad)
# cam2ego_rts.append(cam2ego)
# input_dict =dict(
# img_filename=image_paths,
# lidar2img=np.asarray(lidar2img_rts),
# img2lidar=np.asarray(img2lidar_rts),
# cam_intrinsic=np.asarray(cam_intrinsics),
# ego2lidar=ego2lidar,
# cam2ego=np.asarray(cam2ego_rts))
# return input_dict
if __name__ == '__main__':
    # Smoke test / visualization script: loads one sample, reprojects the
    # lidar points into the current/prev/next camera images and saves
    # scatter overlays to verify the relative transforms.
    import torch
    dataset = nuScenes_One_Frame_Sweeps(
        'data/nuscenes',
        'data/nuscenes_infos_val_temporal.pkl',
    )
    batch = dataset[300]
    # NOTE(review): __getitem__ above returns a 2-tuple (imgs, img_metas)
    # and a 4-element imgs list — this 3-way unpack (with `points`) looks
    # stale relative to the current dataset; confirm before running.
    imgs, img_metas, points = batch
    curr_imgs, prev_imgs, next_imgs = imgs
    def list2tensor(imgs):
        # list of arrays -> single torch tensor
        imgs = np.asarray(imgs)
        imgs = torch.from_numpy(imgs)
        return imgs
    curr_imgs = list2tensor(curr_imgs)
    prev_imgs = list2tensor(prev_imgs)
    next_imgs = list2tensor(next_imgs)
    print('img metas contains keys: ', list(img_metas.keys()))
    l2i = list2tensor(img_metas['lidar2img'])
    img2prevImg = list2tensor(img_metas['img2prevImg'])
    img2nextImg = list2tensor(img_metas['img2nextImg'])
    def get_diagonal(trans):
        # keep only same-camera pairs from a (36, 4, 4) cross-camera matrix
        # stack, yielding 6 per-camera 4x4 transforms
        trans = trans.reshape(6, 6, 4, 4)
        trans = torch.diagonal(trans, 0, 0, 1)
        trans = trans.permute(2, 0, 1)
        return trans
    img2prevImg = get_diagonal(img2prevImg)
    img2nextImg = get_diagonal(img2nextImg)
    def filter_invalid_pts(pts):
        # drop points within 2 m of the sensor origin
        mask = torch.norm(pts, dim=-1)
        mask = mask > 2
        pts = pts[mask]
        return pts
    points = torch.from_numpy(points)
    points = filter_invalid_pts(points)
    # homogeneous coordinates for 4x4 matrix multiplication
    points = torch.cat([
        points,
        torch.ones_like(points[:, :1])
    ], dim=1)
    curr_pts_uv = torch.matmul(
        l2i.unsqueeze(1).to(points.dtype),
        points.reshape(1, -1, 4, 1))
    prev_pts_uv = torch.matmul(
        img2prevImg.unsqueeze(1).to(points.dtype),
        curr_pts_uv)
    next_pts_uv = torch.matmul(
        img2nextImg.unsqueeze(1).to(points.dtype),
        curr_pts_uv)
    curr_pts_uv = curr_pts_uv.squeeze(-1)
    prev_pts_uv = prev_pts_uv.squeeze(-1)
    next_pts_uv = next_pts_uv.squeeze(-1)
    def get_pixel(uv, eps=1e-5):
        # perspective divide with clamped denominator
        # NOTE(review): ones_like(uv[..., :1] * eps) equals
        # ones_like(uv[..., :1]) — eps has no effect, so the depth is
        # clamped at 1 rather than eps; likely meant
        # torch.ones_like(uv[..., :1]) * eps.
        uv = uv[..., :2] / torch.maximum(
            uv[..., 2:3],
            torch.ones_like(uv[..., :1] * eps))
        return uv
    curr_pts_uv = get_pixel(curr_pts_uv)
    prev_pts_uv = get_pixel(prev_pts_uv)
    next_pts_uv = get_pixel(next_pts_uv)
    # keep only points falling into a hand-picked pixel window of camera 0
    mask1 = torch.logical_and(
        curr_pts_uv[0, :, 0] > 600,
        curr_pts_uv[0, :, 0] < 1000)
    mask2 = torch.logical_and(
        curr_pts_uv[0, :, 1] > 400,
        curr_pts_uv[0, :, 1] < 500)
    mask = torch.logical_and(mask1, mask2)
    curr_pts_uv = curr_pts_uv[:, mask, :]
    prev_pts_uv = prev_pts_uv[:, mask, :]
    next_pts_uv = next_pts_uv[:, mask, :]
    def filter_invalid_points(uv):
        # clamp to the nuScenes image extent (1600 x 900)
        uv[..., 0] = torch.clamp(uv[..., 0], 0, 1600)
        uv[..., 1] = torch.clamp(uv[..., 1], 0, 900)
        return uv
    curr_pts_uv = filter_invalid_points(curr_pts_uv)
    prev_pts_uv = filter_invalid_points(prev_pts_uv)
    next_pts_uv = filter_invalid_points(next_pts_uv)
    import matplotlib.pyplot as plt
    def saveimg(i, imgs, uv, name):
        # overlay projected points on camera i and save to disk
        plt.imshow(imgs[i] / 256)
        plt.scatter(uv[i][:, 0], uv[i][:, 1], s=2)
        plt.savefig(f'{name}_{i}.jpg')
        plt.cla()
    for i in range(6):
        saveimg(i, curr_imgs, curr_pts_uv, 'curr')
        saveimg(i, prev_imgs, prev_pts_uv, 'prev')
        saveimg(i, next_imgs, next_pts_uv, 'next')
    pass
================================================
FILE: dataset/dataset_one_frame_sweeps_dist_vis.py
================================================
import os, numpy as np, random, mmengine, math
from mmcv.image.io import imread
from pyquaternion import Quaternion
from copy import deepcopy
from . import OPENOCC_DATASET
def get_xyz(pose_dict):
    """Extract the translation vector of a nuScenes pose record as ndarray."""
    translation = pose_dict['translation']
    return np.array(translation)
def get_img2global(calib_dict, pose_dict):
    """Compose the 4x4 image-plane -> global transform.

    Chains inverse intrinsics (image -> camera), the camera -> ego
    extrinsics from `calib_dict`, and the ego -> global pose from
    `pose_dict`.
    """
    def _rigid(record):
        # quaternion + translation record -> homogeneous 4x4
        mat = np.eye(4)
        mat[:3, :3] = Quaternion(record['rotation']).rotation_matrix
        mat[:3, 3] = np.asarray(record['translation']).T
        return mat

    cam2img = np.eye(4)
    cam2img[:3, :3] = np.asarray(calib_dict['camera_intrinsic'])
    return _rigid(pose_dict) @ _rigid(calib_dict) @ np.linalg.inv(cam2img)
def get_lidar2global(calib_dict, pose_dict):
    """Compose the 4x4 lidar -> global transform (lidar -> ego -> global)."""
    def _rigid(record):
        # quaternion + translation record -> homogeneous 4x4
        mat = np.eye(4)
        mat[:3, :3] = Quaternion(record['rotation']).rotation_matrix
        mat[:3, 3] = np.asarray(record['translation']).T
        return mat

    return _rigid(pose_dict) @ _rigid(calib_dict)
@OPENOCC_DATASET.register_module()
class nuScenes_One_Frame_Sweeps_Dist_Vis:
    """Single-scene nuScenes dataset for visualization.

    Each item is the six surround-view images of one frame of the selected
    scene plus the camera/lidar projection metadata needed to render them.
    """
    def __init__(
        self,
        data_path,
        imageset='train',
        crop_size=[768, 1600],
        return_depth=False,
        eval_depth=80,
        ego_centric=False,
        scene_token=None,
        **kwargs):
        # scene_token selects the single scene to visualize; it must be a
        # key of the 'infos' dict inside the pickle loaded from `imageset`.
        assert scene_token is not None
        data = mmengine.load(imageset)
        self.scene_infos = data['infos']
        assert scene_token in self.scene_infos
        self.infos = self.scene_infos[scene_token]
        self.data_path = data_path
        self.crop_size = crop_size          # (H, W) crop applied after loading
        self.return_depth = return_depth
        self.eval_depth = eval_depth
        self.sensor_types = ['CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_FRONT_LEFT',
                             'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_BACK_RIGHT']
        self.ego_centric = ego_centric
        # NOTE(review): img_loader/pts_loader and lidar2cam_rect are never
        # used in this class; presumably kept for interface parity with the
        # training datasets — confirm before removing.
        self.img_loader = self.pts_loader = None
        self.lidar2cam_rect = np.eye(4)
    def __len__(self):
        'Denotes the total number of samples'
        return len(self.infos)
    def __getitem__(self, index):
        """Return (input_imgs, img_metas) for frame `index` of the scene."""
        info = deepcopy(self.infos[index])
        imgs_info = self.get_data_info(info)
        img_metas = {
            'input_imgs_path': imgs_info['img_filename'],
            'lidar2img': imgs_info['lidar2img'],
            'img2lidar': imgs_info['img2lidar'],
            'intrinsic': imgs_info['cam_intrinsic'],
            'cam2ego': imgs_info['cam2ego'],
            'ego2lidar': imgs_info['ego2lidar'],
            # 'token': info['token'],
            'timestamp': info['timestamp'],
            'cam_positions': imgs_info['cam_positions'],
            'focal_positions': imgs_info['focal_positions']
        }
        if self.ego_centric:
            # re-express the projection matrices in the ego frame instead
            # of the lidar frame
            ego2lidar = img_metas['ego2lidar']
            lidar2ego = np.linalg.inv(ego2lidar)
            ego2img = img_metas['lidar2img'] @ ego2lidar[None, ...]
            img2ego = lidar2ego[None, ...] @ img_metas['img2lidar']
            img_metas.update({
                'lidar2img': ego2img,
                'img2lidar': img2ego})
        #### 4. read imgs
        input_imgs = self.read_surround_imgs(img_metas['input_imgs_path'])
        data_tuple = (input_imgs, img_metas)
        return data_tuple
    def read_surround_imgs(self, img_paths):
        """Load each surround image as float32 and crop to self.crop_size."""
        imgs = []
        for filename in img_paths:
            imgs.append(
                imread(filename, 'unchanged').astype(np.float32))
        imgs = [img[:self.crop_size[0], :self.crop_size[1], :] for img in imgs]
        return imgs
    def get_data_info(self, info):
        """Collect per-camera projection data plus camera/focal positions
        (in the lidar frame) used for view rendering."""
        # offset along the optical axis used to place a second point just in
        # front of each camera center — presumably a focal-length proxy for
        # the visualizer; confirm units against the renderer.
        f = 0.0055
        image_paths = []
        lidar2img_rts = []
        img2lidar_rts = []
        cam_intrinsics = []
        cam2ego_rts = []
        cam_positions = []
        focal_positions = []
        lidar2ego_r = Quaternion(info['data']['LIDAR_TOP']['calib']['rotation']).rotation_matrix
        lidar2ego = np.eye(4)
        lidar2ego[:3, :3] = lidar2ego_r
        lidar2ego[:3, 3] = np.array(info['data']['LIDAR_TOP']['calib']['translation']).T
        ego2lidar = np.linalg.inv(lidar2ego)
        lidar2global = get_lidar2global(info['data']['LIDAR_TOP']['calib'], info['data']['LIDAR_TOP']['pose'])
        for cam_type in self.sensor_types:
            image_paths.append(os.path.join(self.data_path, info['data'][cam_type]['filename']))
            # compose projections through the global frame so per-camera ego
            # poses are respected
            img2global = get_img2global(info['data'][cam_type]['calib'], info['data'][cam_type]['pose'])
            lidar2img = np.linalg.inv(img2global) @ lidar2global
            img2lidar = np.linalg.inv(lidar2global) @ img2global
            cam2ego_r = Quaternion(info['data'][cam_type]['calib']['rotation']).rotation_matrix
            cam2ego = np.eye(4)
            cam2ego[:3, :3] = cam2ego_r
            cam2ego[:3, 3] = np.array(info['data'][cam_type]['calib']['translation']).T
            intrinsic = info['data'][cam_type]['calib']['camera_intrinsic']
            viewpad = np.eye(4)  # intrinsics padded to homogeneous 4x4
            viewpad[:3, :3] = intrinsic
            # camera optical center in the lidar frame (viewpad maps the
            # homogeneous origin to itself)
            cam_position = img2lidar @ viewpad @ np.array([0., 0., 0., 1.]).reshape([4, 1])
            cam_positions.append(cam_position.flatten()[:3])
            # a point at depth f in front of the camera, also in lidar frame
            focal_position = img2lidar @ viewpad @ np.array([0., 0., f, 1.]).reshape([4, 1])
            focal_positions.append(focal_position.flatten()[:3])
            lidar2img_rts.append(lidar2img)
            img2lidar_rts.append(img2lidar)
            cam_intrinsics.append(viewpad)
            cam2ego_rts.append(cam2ego)
        input_dict =dict(
            img_filename=image_paths,
            lidar2img=np.asarray(lidar2img_rts),
            img2lidar=np.asarray(img2lidar_rts),
            cam_intrinsic=np.asarray(cam_intrinsics),
            ego2lidar=ego2lidar,
            cam2ego=np.asarray(cam2ego_rts),
            cam_positions=cam_positions,
            focal_positions=focal_positions)
        return input_dict
================================================
FILE: dataset/dataset_wrapper_temporal.py
================================================
import numpy as np, torch
from torch.utils import data
from dataset.transform_3d import PadMultiViewImage, NormalizeMultiviewImage, \
PhotoMetricDistortionMultiViewImage, RandomScaleImageMultiViewImage, \
RandomFlip
import torch.nn.functional as F
from copy import deepcopy
from mmengine import MMLogger
logger = MMLogger.get_instance('selfocc')
from . import OPENOCC_DATAWRAPPER
# img_norm_cfg = dict(
# mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
@OPENOCC_DATAWRAPPER.register_module()
class tpvformer_dataset_nuscenes_temporal(data.Dataset):
    """Augmentation/collation wrapper around a temporal nuScenes dataset.

    Wraps ``in_dataset``, whose items are either ``(imgs, img_metas)`` with
    ``imgs`` of length 3/4/5, or ``(input_imgs, anchor_imgs, img_metas)``.
    The network-input images go through photometric distortion (train only),
    normalization, optional rescaling and padding; the temporal supervision
    images are optionally normalized as well (``use_temporal_aug``).
    """
    def __init__(
        self,
        in_dataset,
        phase='train',
        scale_rate=1,
        photometric_aug=dict(
            use_swap_channel=False,
        ),
        use_temporal_aug=False,
        temporal_aug_list=[],
        img_norm_cfg=img_norm_cfg,
        supervision_img_size=None,
        supervision_scale_rate=None,
        use_flip=False,
        ref_focal_len=None,
        pad_img_size=None,
        random_scale=None,
        pad_scale_rate=None,
    ):
        'Initialization'
        self.point_cloud_dataset = in_dataset
        self.scale_rate = scale_rate
        self.use_temporal_aug = use_temporal_aug
        if use_temporal_aug:
            assert len(temporal_aug_list) > 0
        # BUGFIX: temporal_aug_list and supervision_img_size are read
        # unconditionally in deal_with_length2_dataset but were previously
        # only assigned when use_temporal_aug was True, raising
        # AttributeError in the default configuration. Define them always.
        self.temporal_aug_list = temporal_aug_list if use_temporal_aug else []
        self.supervision_img_size = supervision_img_size
        photometric = PhotoMetricDistortionMultiViewImage(**photometric_aug)
        logger.info('using photometric augmentation: ' + str(photometric_aug))
        train_transforms = [
            photometric,
            NormalizeMultiviewImage(**img_norm_cfg),
            PadMultiViewImage(size_divisor=32, size=pad_img_size)
        ]
        val_transforms = [
            NormalizeMultiviewImage(**img_norm_cfg),
            PadMultiViewImage(size_divisor=32, size=pad_img_size)
        ]
        if scale_rate != 1 or ref_focal_len is not None or random_scale is not None or pad_scale_rate is not None:
            # insert before padding so the pad operates on the rescaled images
            train_transforms.insert(2, RandomScaleImageMultiViewImage([scale_rate], ref_focal_len, random_scale, pad_scale_rate))
            val_transforms.insert(1, RandomScaleImageMultiViewImage([scale_rate], ref_focal_len, pad_scale_rate=pad_scale_rate))
        if use_flip:
            train_transforms.append(RandomFlip(0.5))
        self.transforms = train_transforms if phase == 'train' else val_transforms
        if use_temporal_aug:
            self.temporal_transforms = [
                NormalizeMultiviewImage(**img_norm_cfg),
                PadMultiViewImage(size_divisor=4)]
            # BUGFIX: guard against the default supervision_scale_rate=None;
            # the bare `!= 1` check would insert a scaler with scale None.
            if supervision_scale_rate is not None and supervision_scale_rate != 1:
                self.temporal_transforms.insert(1, RandomScaleImageMultiViewImage([supervision_scale_rate]))

    def __len__(self):
        """Number of samples in the wrapped dataset."""
        return len(self.point_cloud_dataset)

    def to_tensor(self, imgs):
        """Stack a list of HWC float images into an NCHW float32 tensor."""
        imgs = np.stack(imgs).astype(np.float32)
        imgs = torch.from_numpy(imgs)
        imgs = imgs.permute(0, 3, 1, 2)
        return imgs

    def __getitem__(self, index):
        """Fetch one sample and dispatch on its tuple arity."""
        data = self.point_cloud_dataset[index]
        if len(data) == 2:
            return self.deal_with_length2_dataset(data)
        elif len(data) == 3:
            return self.deal_with_length3_dataset(data)

    def deal_with_length3_dataset(self, data):
        """(input_imgs, anchor_imgs, img_metas) -> augmented tensors.

        Returns (input_imgs NCHW, anchor_imgs in [0, 1], img_metas).
        """
        input_imgs, anchor_imgs, img_metas = data
        img_metas['img_shape'] = input_imgs[0].shape[:2]
        # deal with img augmentations
        input_imgs, imgs_dict = forward_aug(input_imgs, img_metas, self.transforms)
        input_imgs = self.to_tensor(input_imgs)
        img_metas['scale_rate'] = self.scale_rate
        # propagate per-camera focal rescale factors if the scaler set them
        if 'focal_ratios_x' in imgs_dict:
            img_metas['focal_ratios_x'] = imgs_dict['focal_ratios_x']
        if 'focal_ratios_y' in imgs_dict:
            img_metas['focal_ratios_y'] = imgs_dict['focal_ratios_y']
        img_metas['flip'] = imgs_dict.get('flip', False)
        anchor_imgs = np.asarray(anchor_imgs).astype(np.float32)
        anchor_imgs = torch.from_numpy(anchor_imgs)
        anchor_imgs = anchor_imgs.permute(0, 1, 4, 2, 3)
        data_tuple = (input_imgs,
                      anchor_imgs / 256.,
                      img_metas)
        return data_tuple

    def deal_with_length2_dataset(self, data):
        """(imgs, img_metas) with imgs of length 3/4/5 -> augmented tensors.

        Returns (input_imgs, curr, prev, next, color in [0, 1], img_metas,
        curr_aug, prev_aug, next_aug); the *_aug entries are None unless the
        corresponding key is in self.temporal_aug_list.
        """
        imgs, img_metas = data
        if len(imgs) == 4:
            input_imgs, curr_imgs, prev_imgs, next_imgs = imgs
            color_imgs = deepcopy(curr_imgs)
        elif len(imgs) == 3:
            curr_imgs, prev_imgs, next_imgs = imgs
            input_imgs = deepcopy(curr_imgs)
            color_imgs = deepcopy(curr_imgs)
        elif len(imgs) == 5:
            input_imgs, curr_imgs, prev_imgs, next_imgs, color_imgs = imgs
        else:
            # previously fell through to an UnboundLocalError
            raise ValueError(f'expected 3, 4 or 5 image groups, got {len(imgs)}')
        img_metas['img_shape'] = input_imgs[0].shape[:2]
        # deal with img augmentations
        input_imgs, imgs_dict = forward_aug(input_imgs, img_metas, self.transforms)
        input_imgs = self.to_tensor(input_imgs)
        curr_aug = prev_aug = next_aug = None
        if 'curr_imgs' in self.temporal_aug_list:
            curr_aug, _ = forward_aug(curr_imgs, {}, self.temporal_transforms)
            curr_aug = self.to_tensor(curr_aug)
        if 'prev_imgs' in self.temporal_aug_list:
            prev_aug, _ = forward_aug(prev_imgs, {}, self.temporal_transforms)
            prev_aug = self.to_tensor(prev_aug)
        if 'next_imgs' in self.temporal_aug_list:
            next_aug, _ = forward_aug(next_imgs, {}, self.temporal_transforms)
            next_aug = self.to_tensor(next_aug)
        curr_imgs = self.to_tensor(curr_imgs)
        prev_imgs = self.to_tensor(prev_imgs)
        next_imgs = self.to_tensor(next_imgs)
        color_imgs = self.to_tensor(color_imgs)
        if self.supervision_img_size is not None:
            curr_imgs = F.interpolate(curr_imgs, size=self.supervision_img_size, mode='bilinear', align_corners=True)
            prev_imgs = F.interpolate(prev_imgs, size=self.supervision_img_size, mode='bilinear', align_corners=True)
            next_imgs = F.interpolate(next_imgs, size=self.supervision_img_size, mode='bilinear', align_corners=True)
        img_metas['scale_rate'] = self.scale_rate
        if 'focal_ratios_x' in imgs_dict:
            img_metas['focal_ratios_x'] = imgs_dict['focal_ratios_x']
        if 'focal_ratios_y' in imgs_dict:
            img_metas['focal_ratios_y'] = imgs_dict['focal_ratios_y']
        img_metas['flip'] = imgs_dict.get('flip', False)
        data_tuple = (input_imgs,
                      curr_imgs / 256.,
                      prev_imgs / 256.,
                      next_imgs / 256.,
                      color_imgs / 256.,
                      img_metas,
                      curr_aug,
                      prev_aug,
                      next_aug)
        return data_tuple
def custom_collate_fn_temporal(data):
    """Collate a batch of sample tuples, column by column.

    Tensors are stacked along a new batch dimension; dicts and strings are
    gathered into lists; None columns stay None; anything else raises.
    """
    batched = []
    for i, first in enumerate(data[0]):
        if isinstance(first, torch.Tensor):
            batched.append(torch.stack([sample[i] for sample in data]))
        elif isinstance(first, (dict, str)):
            batched.append([sample[i] for sample in data])
        elif first is None:
            batched.append(None)
        else:
            raise NotImplementedError
    return batched
def forward_aug(imgs, metas, transforms):
    """Thread imgs and metas through a pipeline of transform callables.

    Each transform receives and returns the running dict with keys 'img'
    and 'metas'; returns (final images, final dict).
    """
    state = {
        'img': imgs,
        'metas': metas,
    }
    for transform in transforms:
        state = transform(state)
    return state['img'], state
================================================
FILE: dataset/dataset_wrapper_vis.py
================================================
import numpy as np, torch
from torch.utils import data
from dataset.transform_3d import PadMultiViewImage, NormalizeMultiviewImage, \
RandomScaleImageMultiViewImage
from mmengine import MMLogger
logger = MMLogger.get_instance('selfocc')
from . import OPENOCC_DATAWRAPPER
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
@OPENOCC_DATAWRAPPER.register_module()
class tpvformer_dataset_nuscenes_vis(data.Dataset):
    """Visualization-time wrapper: normalize, optionally rescale, and pad
    the input multi-view images; no training-time augmentation."""

    def __init__(
        self,
        in_dataset,
        scale_rate=1,
        img_norm_cfg=img_norm_cfg,
        pad_img_size=None,
        pad_scale_rate=None,
        **kwargs
    ):
        'Initialization'
        self.point_cloud_dataset = in_dataset
        self.scale_rate = scale_rate
        transforms = [
            NormalizeMultiviewImage(**img_norm_cfg),
            PadMultiViewImage(size_divisor=32, size=pad_img_size)]
        if scale_rate != 1 or pad_scale_rate is not None:
            # rescale before padding so the pad matches the scaled images
            transforms.insert(1, RandomScaleImageMultiViewImage([scale_rate], pad_scale_rate=pad_scale_rate))
        self.transforms = transforms

    def __len__(self):
        """Number of samples in the wrapped dataset."""
        return len(self.point_cloud_dataset)

    def to_tensor(self, imgs):
        """Stack a list of HWC float images into an NCHW float32 tensor."""
        stacked = np.stack(imgs).astype(np.float32)
        return torch.from_numpy(stacked).permute(0, 3, 1, 2)

    def __getitem__(self, index):
        """Fetch one sample and run it through the transform pipeline."""
        return self.deal_with_length2_dataset(self.point_cloud_dataset[index])

    def deal_with_length2_dataset(self, data):
        """Apply transforms to (input_imgs, img_metas) and tensorize."""
        input_imgs, img_metas = data
        img_metas['img_shape'] = input_imgs[0].shape[:2]
        # deal with img augmentations
        input_imgs, imgs_dict = forward_aug(input_imgs, img_metas, self.transforms)
        input_imgs = self.to_tensor(input_imgs)
        img_metas['scale_rate'] = self.scale_rate
        # propagate per-camera focal rescale factors if the scaler set them
        for key in ('focal_ratios_x', 'focal_ratios_y'):
            if key in imgs_dict:
                img_metas[key] = imgs_dict[key]
        img_metas['flip'] = imgs_dict.get('flip', False)
        return (input_imgs,
                img_metas,)
def custom_collate_fn_temporal(data):
    """Collate a batch of sample tuples, column by column.

    Tensors are stacked along a new batch dimension; dicts and strings are
    gathered into lists; None columns stay None; anything else raises.
    """
    batched = []
    for i, first in enumerate(data[0]):
        if isinstance(first, torch.Tensor):
            batched.append(torch.stack([sample[i] for sample in data]))
        elif isinstance(first, (dict, str)):
            batched.append([sample[i] for sample in data])
        elif first is None:
            batched.append(None)
        else:
            raise NotImplementedError
    return batched
def forward_aug(imgs, metas, transforms):
    """Thread imgs and metas through a pipeline of transform callables.

    Each transform receives and returns the running dict with keys 'img'
    and 'metas'; returns (final images, final dict).
    """
    state = {
        'img': imgs,
        'metas': metas,
    }
    for transform in transforms:
        state = transform(state)
    return state['img'], state
================================================
FILE: dataset/kitti/helpers.py
================================================
import numpy as np
# import scenerf.data.utils.fusion as fusion
# import open3d as o3d
from PIL import Image
# def make_open3d_point_cloud(xyz, color=None):
# pcd = o3d.geometry.PointCloud()
# pcd.points = o3d.utility.Vector3dVector(xyz)
# if color is not None:
# if len(color) != len(xyz):
# color = np.tile(color, (len(xyz), 1))
# pcd.colors = o3d.utility.Vector3dVector(color)
# return pcd
def apply_transform(pts, T):
    """Apply a 4x4 homogeneous transform to an (N, 3) point array.

    Points are lifted to homogeneous coordinates, multiplied by T, and
    returned as an (N, 3) array.
    """
    n_pts = pts.shape[0]
    homo_pts = np.concatenate([pts, np.ones((n_pts, 1))], axis=1)
    moved = (T @ homo_pts.T).T
    return moved[:, :3]
def dump_xyz(P):
    """Return the translation column (x, y, z) of a 4x4 pose matrix."""
    return P[:3, 3]
def read_rgb(path):
    """Load an image as float32 RGB in [0, 1], cropped to 370x1220 (KITTI)."""
    rgb = Image.open(path).convert("RGB")
    arr = np.array(rgb, dtype=np.float32, copy=False) / 255.0
    return arr[:370, :1220, :]  # crop image
def read_poses(path):
    """Parse a KITTI pose file (one row-major 3x4 matrix per line).

    Returns a list of 4x4 homogeneous pose matrices, each padded with the
    [0, 0, 0, 1] bottom row.
    """
    poses = []
    with open(path, 'r') as f:
        for line in f.readlines():
            mat = np.fromstring(line, dtype=float, sep=' ').reshape(3, 4)
            poses.append(np.vstack((mat, [0, 0, 0, 1])))
    return poses
def read_calib(calib_path):
    """Parse a KITTI odometry calibration file.

    Modified from
    https://github.com/utiasSTARS/pykitti/blob/d3e1bb81676e831886726cc5ed79ce1f049aef2c/pykitti/utils.py#L68

    :param calib_path: Path to a calibration text file.
    :return: dict with 'P2' (3x4 left-color projection), 'Tr' (4x4
        velodyne -> camera transform) and 'T_cam0_2_cam2' (4x4 baseline
        shift derived from P2).
    """
    raw = {}
    with open(calib_path, "r") as f:
        for line in f.readlines():
            # a blank line terminates the key: value section
            if line == "\n":
                break
            key, value = line.split(":", 1)
            raw[key] = np.array([float(x) for x in value.split()])
    calib_out = {}
    calib_out["P2"] = raw["P2"].reshape(3, 4)
    tr = np.identity(4)  # 4x4 matrix
    tr[:3, :4] = raw["Tr"].reshape(3, 4)
    calib_out["Tr"] = tr
    # horizontal baseline between cam0 and cam2, recovered from P2
    T2 = np.eye(4)
    T2[0, 3] = calib_out["P2"][0, 3] / calib_out["P2"][0, 0]
    calib_out["T_cam0_2_cam2"] = T2
    return calib_out
# def compute_transformation(
# lidar_path_source, lidar_path_infer, lidar_path_target,
# pose_source, pose_infer, pose_target,
# T_velo_2_cam2, T_cam0_2_cam2):
# pts_velo_source = np.fromfile(lidar_path_source, dtype=np.float32).reshape((-1, 4))[:, :3]
# pts_velo_infer = np.fromfile(lidar_path_infer, dtype=np.float32).reshape((-1, 4))[:, :3]
# pts_velo_target = np.fromfile(lidar_path_target, dtype=np.float32).reshape((-1, 4))[:, :3]
# pts_cam2_source = apply_transform(pts_velo_source, T_velo_2_cam2)
# pts_cam2_infer = apply_transform(pts_velo_infer, T_velo_2_cam2)
# pts_cam2_target = apply_transform(pts_velo_target, T_velo_2_cam2)
# T_cam2_2_cam0 = np.linalg.inv(T_cam0_2_cam2)
# T_source2infer = T_cam0_2_cam2 @ np.linalg.inv(pose_infer) @ pose_source @ T_cam2_2_cam0
# T_source2target = T_cam0_2_cam2 @ np.linalg.inv(pose_target) @ pose_source @ T_cam2_2_cam0
# pts_cam2_source2infer = apply_transform(pts_cam2_source, T_source2infer)
# pts_cam2_source2target = apply_transform(pts_cam2_source, T_source2target)
# pcd_source2infer = make_open3d_point_cloud(pts_cam2_source2infer).voxel_down_sample(voxel_size=0.05)
# pcd_source2target = make_open3d_point_cloud(pts_cam2_source2target).voxel_down_sample(voxel_size=0.05)
# pcd_infer = make_open3d_point_cloud(pts_cam2_infer).voxel_down_sample(voxel_size=0.05)
# pcd_target = make_open3d_point_cloud(pts_cam2_target).voxel_down_sample(voxel_size=0.05)
# reg_source2infer = o3d.pipelines.registration.registration_icp(
# pcd_source2infer, pcd_infer, 0.2, np.eye(4),
# o3d.pipelines.registration.TransformationEstimationPointToPoint(),
# o3d.pipelines.registration.ICPConvergenceCriteria(max_iteration=200))
# reg_source2target = o3d.pipelines.registration.registration_icp(
# pcd_source2target, pcd_target, 0.2, np.eye(4),
# o3d.pipelines.registration.TransformationEstimationPointToPoint(),
# o3d.pipelines.registration.ICPConvergenceCriteria(max_iteration=200))
# return {
# "T_source2infer": T_source2infer @ reg_source2infer.transformation,
# "T_source2target": T_source2target @ reg_source2target.transformation
# }
# def vox2pix(cam_E, cam_K,
# vox_origin, voxel_size,
# img_W, img_H,
# scene_size):
# """
# compute the 2D projection of voxels centroids
# Parameters:
# ----------
# cam_E: 4x4
# =camera pose in case of NYUv2 dataset
# =Transformation from camera to lidar coordinate in case of SemKITTI
# cam_K: 3x3
# camera intrinsics
# vox_origin: (3,)
# world(NYU)/lidar(SemKITTI) cooridnates of the voxel at index (0, 0, 0)
# img_W: int
# image width
# img_H: int
# image height
# scene_size: (3,)
# scene size in meter: (51.2, 51.2, 6.4) for SemKITTI and (4.8, 4.8, 2.88) for NYUv2
# Returns
# -------
# projected_pix: (N, 2)
# Projected 2D positions of voxels
# fov_mask: (N,)
# Voxels mask indice voxels inside image's FOV
# sensor_distance: (N,)
# Voxels'distance to the sensor in meter
# """
# # Compute the x, y, z bounding of the scene in meter
# vol_bnds = np.zeros((3,2))
# vol_bnds[:,0] = vox_origin
# vol_bnds[:,1] = vox_origin + np.array(scene_size)
# # Compute the voxels centroids in lidar cooridnates
# vol_dim = np.ceil((vol_bnds[:,1]- vol_bnds[:,0])/ voxel_size).copy(order='C').astype(int)
# xv, yv, zv = np.meshgrid(
# range(vol_dim[0]),
# range(vol_dim[1]),
# range(vol_dim[2]),
# indexing='ij'
# )
# vox_coords = np.concatenate([
# xv.reshape(1,-1),
# yv.reshape(1,-1),
# zv.reshape(1,-1)
# ], axis=0).astype(int).T
# # Project voxels'centroid from lidar coordinates to camera coordinates
# cam_pts = fusion.TSDFVolume.vox2world(vox_origin, vox_coords, voxel_size)
# cam_pts = fusion.rigid_transform(cam_pts, cam_E)
# # Project camera coordinates to pixel positions
# projected_pix = fusion.TSDFVolume.cam2pix(cam_pts, cam_K)
# pix_x, pix_y = projected_pix[:, 0], projected_pix[:, 1]
# # Eliminate pixels outside view frustum
# sensor_distance = cam_pts[:, 2]
# fov_mask = np.logical_and(pix_x >= 0,
# np.logical_and(pix_x < img_W,
# np.logical_and(pix_y >= 0,
# np.logical_and(pix_y < img_H,
# sensor_distance > 0))))
# return projected_pix, fov_mask, sensor_distance
================================================
FILE: dataset/kitti/io_data.py
================================================
"""
Most of the code in this file is taken from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/data/io_data.py
"""
import numpy as np
import yaml
import imageio
def unpack(compressed):
    """Expand a bit-packed voxel grid into one uint8 value per bit.

    Each input byte yields 8 output values (most-significant bit first),
    so the result has ``8 * len(compressed)`` entries of 0/1.

    :param compressed: 1-D uint8 array of bit-packed voxel occupancy.
    :return: 1-D uint8 array of 0/1 flags.
    """
    # np.unpackbits performs exactly the original shift-and-mask loop
    # (MSB first), but in a single C-level pass.
    return np.unpackbits(compressed)
def img_normalize(img, mean, std):
    """Scale an 8-bit image to [0, 1] and standardize with mean/std."""
    scaled = img.astype(np.float32) / 255.0
    return (scaled - mean) / std
def pack(array):
    """Convert a boolean/binary array into a bit-packed uint8 array.

    Bits are packed MSB-first, the inverse of :func:`unpack`.

    Generalizes the original shift-and-or implementation: inputs whose
    flattened length is not a multiple of 8 are zero-padded instead of
    failing with a broadcasting error.

    :param array: array of 0/1 (any shape); flattened before packing.
    :return: 1-D uint8 array of packed bytes.
    """
    # astype(bool) makes the boolean contract explicit; np.packbits packs
    # MSB-first, matching the original manual shifts.
    return np.packbits(array.reshape(-1).astype(bool))
def get_grid_coords(dims, resolution):
    '''
    :param dims: the dimensions of the grid [x, y, z] (i.e. [256, 256, 32])
    :param resolution: voxel edge length in meters
    :return coords_grid: center coords of voxels in the grid (x/y swapped),
        plus the bin edges g_xx, g_yy, g_zz (one extra entry each so they
        can be fed to np.histogramdd).
    '''
    # The sensor is centered in X (we go to dims/2 + 1 for the histogramdd)
    g_xx = np.arange(-dims[0] / 2, dims[0] / 2 + 1)
    # The sensor is in Y=0 (we go to dims + 1 for the histogramdd)
    g_yy = np.arange(0, dims[1] + 1)
    # The sensor is at Z=1.73; the ground sits two voxel levels above the grid
    # bottom, so with the sensor at (0, 0, 0) the grid spans z in [-10, 22)
    # (we go to 22 + 1 for the histogramdd).
    # ATTENTION: 11 for old grids, 10 for new grids (v1.1)
    # (https://github.com/PRBonn/semantic-kitti-api/issues/49)
    sensor_pose = 10
    g_zz = np.arange(0 - sensor_pose, dims[2] - sensor_pose + 1)
    # Voxel centers: drop the trailing bin edge on each axis.
    xx, yy, zz = np.meshgrid(g_xx[:-1], g_yy[:-1], g_zz[:-1])
    coords_grid = np.array([xx.flatten(), yy.flatten(), zz.flatten()]).T
    # BUGFIX: np.float was removed in NumPy 1.24; the builtin float (float64)
    # is the documented replacement.
    coords_grid = coords_grid.astype(float)
    coords_grid = (coords_grid * resolution) + resolution / 2
    # Swap the x and y columns (advanced indexing copies the RHS first).
    coords_grid[:, [0, 1]] = coords_grid[:, [1, 0]]
    return coords_grid, g_xx, g_yy, g_zz
def _get_remap_lut(config_path):
    '''
    remap_lut to remap classes of semantic kitti for training...

    This function was a byte-identical duplicate of :func:`get_remap_lut`
    below; it is kept as a thin alias for backward compatibility and now
    delegates to avoid maintaining the logic twice.

    :param config_path: path to the semantic-kitti.yaml config file.
    :return: int32 lookup table mapping raw label ids to training ids.
    '''
    return get_remap_lut(config_path)
def get_inv_map(config_path="./semantic-kitti.yaml"):
    '''
    Build the inverse LUT mapping the 20 training classes back to raw
    SemanticKITTI label ids.

    :param config_path: yaml config with a 'learning_map_inv' section.
        The default preserves the previously hard-coded relative path,
        so existing zero-argument callers behave identically.
    :return: int32 array of length 20.
    '''
    dataset_config = yaml.safe_load(open(config_path, 'r'))
    # make lookup table for mapping
    inv_map = np.zeros(20, dtype=np.int32)
    inv_map[list(dataset_config['learning_map_inv'].keys())] = \
        list(dataset_config['learning_map_inv'].values())
    return inv_map
def _read_SemKITTI(path, dtype, do_unpack):
bin = np.fromfile(path, dtype=dtype) # Flattened array
if do_unpack:
bin = unpack(bin)
return bin
def _read_label_SemKITTI(path):
    """Read per-voxel semantic labels (uint16 on disk) as float32."""
    return _read_SemKITTI(path, dtype=np.uint16, do_unpack=False).astype(np.float32)
def _read_invalid_SemKITTI(path):
    """Read the bit-packed invalid-voxel mask as a flat 0/1 uint8 array."""
    return _read_SemKITTI(path, dtype=np.uint8, do_unpack=True)
def _read_occluded_SemKITTI(path):
    """Read the bit-packed occluded-voxel mask as a flat 0/1 uint8 array."""
    return _read_SemKITTI(path, dtype=np.uint8, do_unpack=True)
def _read_occupancy_SemKITTI(path):
    """Read the bit-packed occupancy grid as a flat float32 0/1 array."""
    bits = _read_SemKITTI(path, dtype=np.uint8, do_unpack=True)
    return bits.astype(np.float32)
def _read_rgb_SemKITTI(path):
    """Load an RGB image file as a numpy array."""
    return np.asarray(imageio.imread(path))
def _read_pointcloud_SemKITTI(path):
    """Return a SemanticKITTI point cloud with remissions as an (N, 4)
    float32 array of (x, y, z, intensity)."""
    raw = _read_SemKITTI(path, dtype=np.float32, do_unpack=False)
    return raw.reshape((-1, 4))
def _read_calib_SemKITTI(calib_path):
"""
:param calib_path: Path to a calibration text file.
:return: dict with calibration matrices.
"""
calib_all = {}
with open(calib_path, 'r') as f:
for line in f.readlines():
if line == '\n':
break
key, value = line.split(':', 1)
calib_all[key] = np.array([float(x) for x in value.split()])
# reshape matrices
calib_out = {}
calib_out['P2'] = calib_all['P2'].reshape(3, 4) # 3x4 projection matrix for left camera
calib_out['Tr'] = np.identity(4) # 4x4 matrix
calib_out['Tr'][:3, :4] = calib_all['Tr'].reshape(3, 4)
return calib_out
def get_remap_lut(path):
    '''
    Build a LUT remapping raw SemanticKITTI label ids to the 20 training
    classes: 0 stays 'empty', anything mapping to 0 becomes 255 ('invalid').
    '''
    cfg = yaml.safe_load(open(path, 'r'))
    learning_map = cfg['learning_map']
    # +100 hack making lut bigger just in case there are unknown labels
    lut = np.zeros((max(learning_map.keys()) + 100), dtype=np.int32)
    lut[list(learning_map.keys())] = list(learning_map.values())
    # In completion we have to distinguish empty and invalid voxels.
    # Important: for voxels 0 corresponds to "empty" and not "unlabeled".
    lut[lut == 0] = 255  # map 0 to 'invalid'
    lut[0] = 0           # only 'empty' stays 'empty'
    return lut
def data_augmentation_3Dflips(flip, data):
    """Apply one of three axis flips to a 3D grid (dims -> {XZY}).

    flip ~= 1: flip around X; flip ~= 2: flip around Y; flip ~= 3: both;
    any other value returns the input unchanged.

    The .copy() avoids the negative strides np.flip produces, which
    torch.from_numpy cannot handle (see pytorch discuss thread #3663).
    """
    if np.isclose(flip, 1):
        return np.flip(data, axis=0).copy()
    if np.isclose(flip, 2):
        return np.flip(data, 2).copy()
    if np.isclose(flip, 3):
        return np.flip(np.flip(data, axis=0), axis=2).copy()
    return data
def get_cmap_semanticKITTI20():
    """Return the SemanticKITTI RGBA palette as a (19, 4) uint8 array.

    One row per training class; the black 'unlabeled' entry is omitted,
    matching the original table.
    """
    palette = [
        (100, 150, 245, 255),
        (100, 230, 245, 255),
        (30, 60, 150, 255),
        (80, 30, 180, 255),
        (100, 80, 250, 255),
        (255, 30, 30, 255),
        (255, 40, 200, 255),
        (150, 30, 90, 255),
        (255, 0, 255, 255),
        (255, 150, 255, 255),
        (75, 0, 75, 255),
        (175, 0, 75, 255),
        (255, 200, 0, 255),
        (255, 120, 50, 255),
        (0, 175, 0, 255),
        (135, 60, 0, 255),
        (150, 240, 80, 255),
        (255, 240, 150, 255),
        (255, 0, 0, 255),
    ]
    return np.array(palette, dtype=np.uint8)
================================================
FILE: dataset/kitti/kitti_dataset_eval.py
================================================
import glob
import os
import time
import numpy as np
import torch
# from torchvision import transforms
from dataset.kitti.helpers import dump_xyz, read_calib, read_poses, read_rgb
from dataset.kitti.params import val_error_frames
# from . import io_data as SemanticKittiIO
from copy import deepcopy
from mmcv.image.io import imread
from .. import OPENOCC_DATASET
if 'HFAI' in os.environ:
hfai = True
from dataset.loading import LoadMultiViewImageFromFilesHF, LoadPtsFromFilesHF
else:
hfai = False
@OPENOCC_DATASET.register_module()
class Kitti_Novel_View_Eval:
def __init__(
self,
split,
root,
preprocess_root,
frames_interval=0.4,
sequence_distance=10,
n_sources=1,
eval_depth=80,
sequences=None,
selected_frames=None,
crop_size=[370, 1220],
**kwargs,
):
self.root = root
self.preprocess_root = preprocess_root
self.depth_preprocess_root = os.path.join(preprocess_root, "depth")
self.transform_preprocess_root = os.path.join(preprocess_root, "transform")
self.n_classes = 20
self.n_sources = n_sources
self.eval_depth = eval_depth
self.transxy = [
[0, -1., 0, 0],
[1., 0, 0, 0],
[0, 0, 1., 0],
[0, 0, 0, 1.]]
self.transxy = np.array(self.transxy)
splits = {
"train": ["00", "01", "02", "03", "04", "05", "06", "07", "09", "10"],
"val": ["08"],
"test": ["11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21"],
}
self.split = split
if sequences is not None:
self.sequences = sequences
else:
self.sequences = splits[split]
self.output_scale = 1
self.scene_size = (51.2, 51.2, 6.4)
self.vox_origin = np.array([0, -25.6, -2])
self.frames_interval = frames_interval
self.sequence_distance = sequence_distance
self.voxel_size = 0.2 # 0.2m
self.img_W = crop_size[1]
self.img_H = crop_size[0]
start_time = time.time()
self.scans = []
self.frame2scan = {}
for sequence in self.sequences:
pose_path = os.path.join(self.root, "dataset", "poses", sequence + ".txt")
gt_global_poses = read_poses(pose_path)
calib = read_calib(
os.path.join(self.root, "dataset", "sequences", sequence, "calib.txt")
)
P = calib["P2"]
T_cam0_2_cam2 = calib['T_cam0_2_cam2']
T_cam2_2_cam0 = np.linalg.inv(T_cam0_2_cam2)
T_velo_2_cam = T_cam0_2_cam2 @ calib["Tr"]
if split == "val":
glob_path = os.path.join(
self.root, "dataset", "sequences", sequence, "voxels", "*.bin"
)
else:
glob_path = os.path.join(
self.root, "dataset", "sequences", sequence, "image_2", "*.png"
)
dists_glob_path = os.path.join(
self.root, "dataset", "sequences", sequence, "image_2", "*.png"
)
seq_img_paths = glob.glob(glob_path)
dists_seq_img_paths = glob.glob(dists_glob_path)
max_length = 0
min_length = 50
paired_dists = {}
# dist_step = 1 if split == 'train' else 5
dist_step = 1
for seq_img_path in dists_seq_img_paths:
filename = os.path.basename(seq_img_path)
frame_id = os.path.splitext(filename)[0]
prev_frame_id = "{:06d}".format(int(frame_id) - dist_step)
img_path = os.path.join(
self.root, "dataset", "sequences", sequence, "image_2", prev_frame_id + ".png")
if not os.path.exists(img_path):
paired_dists[frame_id] = 0.
else:
curr_pose = gt_global_poses[int(frame_id)]
prev_pose = gt_global_poses[int(prev_frame_id)]
prev_xyz = dump_xyz(prev_pose)
curr_xyz = dump_xyz(curr_pose)
rel_distance = np.sqrt(
(prev_xyz[0] - curr_xyz[0]) ** 2 + (prev_xyz[2] - curr_xyz[2]) ** 2)
paired_dists[frame_id] = rel_distance
for seq_img_path in seq_img_paths:
filename = os.path.basename(seq_img_path)
frame_id = os.path.splitext(filename)[0]
current_img_path = os.path.join(
self.root, "dataset", "sequences", sequence, "image_2", frame_id + ".png")
current_lid_path = os.path.join(
self.root, "dataset", "sequences", sequence, "velodyne", frame_id + ".bin")
prev_frame_ids, next_frame_ids = [], []
prev_img_paths, next_img_paths = [], []
prev_lid_paths, next_lid_paths = [], []
prev_poses, next_poses = [gt_global_poses[int(frame_id)]], [gt_global_poses[int(frame_id)]]
prev_dists, next_dists = [], []
# if self.split == 'train':
# pos_step = 1
# neg_step = -1
# else:
# pos_step = 5
# neg_step = -5
pos_step = 1
neg_step = -1
# deal with prev
step = -1
cnt = 0
dist = 0.
while True:
# cnt += step
cnt += neg_step
rel_frame_id = "{:06d}".format(int(frame_id) + cnt)
img_path = os.path.join(
self.root, "dataset", "sequences", sequence, "image_2", rel_frame_id + ".png")
if not os.path.exists(img_path):
break
prev_pose = prev_poses[-1]
prev_xyz = dump_xyz(prev_pose)
current_xyz = dump_xyz(gt_global_poses[int(rel_frame_id)])
tmp_dist = np.sqrt(
(prev_xyz[0] - current_xyz[0]) ** 2 + (prev_xyz[2] - current_xyz[2]) ** 2)
# tmp_dist = paired_dists["{:06d}".format(int(rel_frame_id) + pos_step)]
dist += tmp_dist
# if dist < frames_interval:
if tmp_dist < frames_interval:
continue
if dist > sequence_distance:
break
if split == "val" and rel_frame_id in val_error_frames:
continue
prev_frame_ids.append(rel_frame_id)
prev_img_paths.append(img_path)
lidar_path = os.path.join(
self.root, "dataset", "sequences", sequence, "velodyne", rel_frame_id + ".bin")
prev_lid_paths.append(lidar_path)
prev_poses.append(gt_global_poses[int(rel_frame_id)])
prev_dists.append(dist)
# deal with next
step = 1
cnt = 0
dist = 0.
while True:
# cnt += step
cnt += pos_step
rel_frame_id = "{:06d}".format(int(frame_id) + cnt)
img_path = os.path.join(
self.root, "dataset", "sequences", sequence, "image_2", rel_frame_id + ".png")
if not os.path.exists(img_path):
break
prev_pose = next_poses[-1]
prev_xyz = dump_xyz(prev_pose)
current_xyz = dump_xyz(gt_global_poses[int(rel_frame_id)])
tmp_dist = np.sqrt(
(prev_xyz[0] - current_xyz[0]) ** 2 + (prev_xyz[2] - current_xyz[2]) ** 2)
# tmp_dist = paired_dists[rel_frame_id]
dist += tmp_dist
# if dist < frames_interval:
if tmp_dist < frames_interval:
continue
if dist > sequence_distance:
break
if split == "val" and rel_frame_id in val_error_frames:
continue
next_frame_ids.append(rel_frame_id)
next_img_paths.append(img_path)
lidar_path = os.path.join(
self.root, "dataset", "sequences", sequence, "velodyne", rel_frame_id + ".bin")
next_lid_paths.append(lidar_path)
next_poses.append(gt_global_poses[int(rel_frame_id)])
next_dists.append(dist)
# ignore error frame
if split == "val" and frame_id in val_error_frames:
continue
is_included = len(next_frame_ids) > 0 if selected_frames is None else frame_id in selected_frames
if is_included:
# prev_next_len = len(prev_poses) + len(next_poses)
prev_next_len = len(next_frame_ids)
if prev_next_len > max_length:
max_length = prev_next_len
if prev_next_len < min_length:
min_length = prev_next_len
self.frame2scan.update({str(sequence) + "_" + frame_id: len(self.scans)})
self.scans.append(
{
"frame_id": frame_id,
"sequence": sequence,
"img_path": current_img_path,
"lid_path": current_lid_path,
"pose": gt_global_poses[int(frame_id)],
"prev_img_paths": prev_img_paths,
"prev_lid_paths": prev_lid_paths,
"next_img_paths": next_img_paths,
"next_lid_paths": next_lid_paths,
"T_velo_2_cam": T_velo_2_cam,
"P": P,
"T_cam0_2_cam2": T_cam0_2_cam2,
"T_cam2_2_cam0": T_cam2_2_cam0,
"prev_poses": prev_poses[1:],
"next_poses": next_poses[1:],
"prev_dists": prev_dists,
"next_dists": next_dists,
"prev_frame_ids": prev_frame_ids,
"next_frame_ids": next_frame_ids
}
)
print(sequence, min_length, max_length)
# self.to_tensor_normalized = transforms.Compose(
# [
# transforms.ToTensor(),
# transforms.Normalize(
# mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
# ),
# ]
# )
# self.to_tensor = transforms.Compose([
# transforms.ToTensor()
# ])
print("Preprocess time: --- %s seconds ---" % (time.time() - start_time))
def get_depth_from_lidar(self, lidar_path, lidar2img, image_size):
# lidar2img: N, 4, 4
scan = np.fromfile(lidar_path, dtype=np.float32)
scan = scan.reshape((-1, 4))
scan[:, 3] = 1.0
# points_hcoords = scan[scan[:, 0] > 0, :]
points_hcoords = np.expand_dims(self.transxy @ scan.T, 0) # 1, 4, n
img_points = np.transpose(lidar2img @ points_hcoords, (0, 2, 1)) # N, n, 4
depth = img_points[..., 2] # N, n
img_points = img_points[..., :2] # N, n, 2
mask = (depth < self.eval_depth) & (depth > 1e-3) # get points with depth < max_sample_depth
img_points = img_points / np.expand_dims(depth, axis=2) # scale 2D points
img_points[..., 0] = img_points[..., 0] / image_size[1]
img_points[..., 1] = img_points[..., 1] / image_size[0]
# img_points = np.round(img_points).astype(int)
mask = mask & (img_points[..., 0] > 0) & \
(img_points[..., 1] > 0) & \
(img_points[..., 0] < 1) & \
(img_points[..., 1] < 1)
return img_points, depth, mask
    def __len__(self):
        # Dataset length == number of indexed reference scans.
        return len(self.scans)
def prepare_temImg2lidar(self, scan, anchor):
intrinsic = np.eye(4)
intrinsic[:3, :3] = scan['P'][:3, :3]
temImg2lidar = self.transxy @ np.linalg.inv(scan['T_velo_2_cam']) @ \
scan['T_cam0_2_cam2'] @ \
np.linalg.inv(scan['pose']) @ \
scan['next_poses'][anchor] @ \
np.linalg.inv(scan['T_cam0_2_cam2']) @ \
np.linalg.inv(intrinsic)
return [temImg2lidar]
def prepare_img_metas(self, scan):
img_metas = {}
img_metas.update({
'input_imgs_path': [scan['img_path']]})
intrinsic = np.eye(4)
intrinsic[:3, :3] = scan['P'][:3, :3]
lidar2img = intrinsic @ scan['T_velo_2_cam'] @ np.linalg.inv(self.transxy)
img2lidar = np.linalg.inv(lidar2img)
img_metas.update({
'lidar2img': np.expand_dims(lidar2img, axis=0),
'img2lidar': [img2lidar],
'token': scan['frame_id'],
'sequence': scan['sequence']
})
return img_metas
def read_surround_imgs(self, img_paths):
if hfai:
imgs = self.img_loader.load(img_paths)
else:
imgs = []
for filename in img_paths:
imgs.append(
imread(filename, 'unchanged').astype(np.float32))
imgs = [img[:self.img_H, :self.img_W, :] for img in imgs]
return imgs
def __getitem__(self, index):
#### 1. get color, temporal_depth choice if necessary
#### 2. get self, prev, next infos for the stem, and also temp_depth info
scan = deepcopy(self.scans[index])
img_metas = self.prepare_img_metas(scan)
anchor_imgs = [] # list[list[array]]
anchor_depth_locs = [] # list[array]
anchor_depth_gts = [] # list[array]
anchor_depth_masks = [] # list[array]
temImg2lidars = [] # list[list[array]]
anchor_frame_dist = scan['next_dists'] # list[float]
for anchor in range(len(scan['next_frame_ids'])):
anchor_img_path = scan['next_img_paths'][anchor]
anchor_lid_path = scan['next_lid_paths'][anchor]
temImg2lidar = self.prepare_temImg2lidar(scan, anchor)
anchor_img = self.read_surround_imgs([anchor_img_path])
depth_loc, depth_gt, depth_mask = self.get_depth_from_lidar(
anchor_lid_path, img_metas['lidar2img'], [self.img_H, self.img_W])
anchor_imgs.append(anchor_img)
anchor_depth_locs.append(depth_loc)
anchor_depth_gts.append(depth_gt)
anchor_depth_masks.append(depth_mask)
temImg2lidars.append(temImg2lidar)
img_metas.update({
'depth_loc': anchor_depth_locs,
'depth_gt': anchor_depth_gts,
'depth_mask': anchor_depth_masks,
'temImg2lidars': temImg2lidars,
'frame_dists': anchor_frame_dist
})
# read 6 cams
input_imgs = self.read_surround_imgs(img_metas['input_imgs_path'])
data_tuple = (input_imgs, anchor_imgs, img_metas)
return data_tuple
# frame_id = scan['frame_id']
# sequence = scan['sequence']
# lidar_paths = scan['lidar_paths']
# rel_frame_ids = scan['rel_frame_ids']
# distances = scan['distances']
# infer_id = 0
# P = scan["P"]
# T_velo_2_cam = scan["T_velo_2_cam"]
# img_paths = scan["img_paths"]
# img_sources = []
# img_input_sources = []
# img_targets = []
# lidar_depths = []
# depths = []
# loc2d_with_depths = []
# T_source2infers = []
# T_source2targets = []
# source_distances = []
# source_frame_ids = []
# n_sources = min(len(distances) - 1, self.n_sources)
# for d_id in range(n_sources):
# if self.n_sources < len(distances):
# source_id = np.random.randint(1, len(distances))
# source_distance = distances[source_id]
# else:
# source_id = d_id + 1
# source_distance = distances[source_id]
# source_distances.append(source_distance)
# rel_frame_id = rel_frame_ids[source_id]
# source_frame_ids.append(rel_frame_id)
# target_id = source_id - 1
# img_input_source = self.to_tensor_normalized(read_rgb(img_paths[source_id]))
# img_input_sources.append(img_input_source)
# img_source = self.to_tensor(read_rgb(img_paths[source_id]))
# img_target = self.to_tensor(read_rgb(img_paths[target_id]))
# lidar_path = lidar_paths[source_id]
# loc2d_with_depth, lidar_depth, _ = self.get_depth_from_lidar(lidar_path, P, T_velo_2_cam,
# (self.img_W, self.img_H))
# if self.n_rays < lidar_depth.shape[0]:
# idx = np.random.choice(lidar_depth.shape[0], size=self.n_rays, replace=False)
# loc2d_with_depth = loc2d_with_depth[idx, :]
# lidar_depth = lidar_depth[idx]
# img_sources.append(img_source)
# img_targets.append(img_target)
# lidar_depths.append(torch.from_numpy(lidar_depth))
# loc2d_with_depths.append(torch.from_numpy(loc2d_with_depth))
# # Get transformation from source to target coord
# transform_dir = os.path.join(self.transform_preprocess_root,
# "{}_{}_all".format(sequence, self.frames_interval))
# os.makedirs(transform_dir, exist_ok=True)
# transform_path = os.path.join(transform_dir, "{}.pkl".format(frame_id))
# if os.path.exists(transform_path):
# try:
# with open(transform_path, "rb") as input_file:
# transform_data = pickle.load(input_file)
# except EOFError:
# transform_data = {}
# else:
# transform_data = {}
# if '{}'.format(source_id) in transform_data:
# T_out_dict = transform_data['{}'.format(source_id)]
# else:
# poses = scan["poses"]
# pose_source = poses[source_id]
# pose_infer = poses[infer_id]
# pose_target = poses[target_id]
# lidar_path_source = lidar_paths[source_id]
# lidar_path_target = lidar_paths[target_id]
# lidar_path_infer = lidar_paths[infer_id]
# T_out_dict = compute_transformation(
# lidar_path_source, lidar_path_infer, lidar_path_target,
# pose_source, pose_infer, pose_target,
# T_velo_2_cam, scan['T_cam0_2_cam2'])
# transform_data['{}'.format(source_id)] = T_out_dict
# with open(transform_path, "wb") as input_file:
# pickle.dump(transform_data, input_file)
# print("{}: saved {} to {}".format(frame_id, source_id, transform_path))
# T_source2infer = T_out_dict['T_source2infer']
# T_source2target = T_out_dict['T_source2target']
# T_source2infers.append(torch.from_numpy(T_source2infer).float())
# T_source2targets.append(torch.from_numpy(T_source2target).float())
# data = {
# "img_input_sources": img_input_sources,
# "source_distances": source_distances,
# "source_frame_ids": source_frame_ids,
# "img_sources": img_sources,
# "img_targets": img_targets,
# "lidar_depths": lidar_depths,
# "depths": depths,
# "loc2d_with_depths": loc2d_with_depths,
# "T_source2infers": T_source2infers,
# "T_source2targets": T_source2targets,
# "frame_id": frame_id,
# "sequence": sequence,
# "P": P,
# "T_velo_2_cam": T_velo_2_cam,
# "T_cam2_2_cam0": scan['T_cam2_2_cam0'],
# "T_cam0_2_cam2": scan['T_cam0_2_cam2'],
# }
# scale_3ds = [self.output_scale]
# data["scale_3ds"] = scale_3ds
# cam_K = P[0:3, 0:3]
# data["cam_K"] = cam_K
# for scale_3d in scale_3ds:
# # compute the 3D-2D mapping
# projected_pix, fov_mask, sensor_distance = vox2pix(
# T_velo_2_cam,
# cam_K,
# self.vox_origin,
# self.voxel_size * scale_3d,
# self.img_W,
# self.img_H,
# self.scene_size,
# )
# data["projected_pix_{}".format(scale_3d)] = projected_pix
# data["sensor_distance_{}".format(scale_3d)] = sensor_distance
# data["fov_mask_{}".format(scale_3d)] = fov_mask
# img_input = read_rgb(img_paths[infer_id])
# img_input = self.to_tensor_normalized(img_input)
# data["img_input"] = img_input
# label_path = os.path.join(
# self.root, "dataset", "sequences", sequence, "voxels", "{}.label".format(frame_id)
# )
# invalid_path = os.path.join(
# self.root, "dataset", "sequences", sequence, "voxels", "{}.invalid".format(frame_id)
# )
# # data['target_1_1'] = self.read_semKITTI_label(label_path, invalid_path)
# return data
# @staticmethod
# def read_semKITTI_label(label_path, invalid_path):
# remap_lut = SemanticKittiIO.get_remap_lut("./scenerf/data/semantic_kitti/semantic-kitti.yaml")
# LABEL = SemanticKittiIO._read_label_SemKITTI(label_path)
# INVALID = SemanticKittiIO._read_invalid_SemKITTI(invalid_path)
# LABEL = remap_lut[LABEL.astype(np.uint16)].astype(
# np.float32
# ) # Remap 20 classes semanticKITTI SSC
# LABEL[
# np.isclose(INVALID, 1)
# ] = 255 # Setting to unknown all voxels marked on invalid mask...
# LABEL = LABEL.reshape(256,256, 32)
# return LABEL
# def __len__(self):
# return len(self.scans)
if __name__ == "__main__":
    import os, sys
    print(sys.path)
    # BUGFIX: this module defines Kitti_Novel_View_Eval, not Kitti_Smart —
    # the original smoke test raised NameError when run as a script.
    dataset = Kitti_Novel_View_Eval(
        'train',
        root='data/kitti',
        preprocess_root='data/kitti/preprocess',
        crop_size=[352, 1216])
    batch = dataset[0]
    import pdb; pdb.set_trace()
================================================
FILE: dataset/kitti/kitti_dataset_one_frame.py
================================================
import glob, os, time, random
import numpy as np
import torch
# from torchvision import transforms
from dataset.kitti.helpers import dump_xyz, read_calib, read_poses, read_rgb
from dataset.kitti.params import val_error_frames
# from . import io_data as SemanticKittiIO
from copy import deepcopy
from mmcv.image.io import imread
from .. import OPENOCC_DATASET
if 'HFAI' in os.environ:
hfai = True
from dataset.loading import LoadMultiViewImageFromFilesHF, LoadPtsFromFilesHF
else:
hfai = False
@OPENOCC_DATASET.register_module()
class Kitti_One_Frame:
def __init__(
self,
split,
root, preprocess_root,
frames_interval=0.4,
sequence_distance=10,
n_sources=1,
eval_depth=80,
sequences=None,
selected_frames=None,
n_rays=1200,
cur_prob=1.0,
crop_size=[370, 1220],
strict=True,
return_depth=False,
prev_prob=0.5,
choose_nearest=False,
return_sem=False,
sem_path=None,
**kwargs,
):
self.root = root
self.preprocess_root = preprocess_root
self.depth_preprocess_root = os.path.join(preprocess_root, "depth")
self.transform_preprocess_root = os.path.join(preprocess_root, "transform")
self.n_classes = 20
self.n_sources = n_sources
self.eval_depth = eval_depth
self.n_rays = n_rays
self.cur_prob = cur_prob
self.return_depth = return_depth
self.prev_prob = prev_prob
self.choose_nearest = choose_nearest
self.return_sem = return_sem
assert (not return_sem) or os.path.exists(sem_path)
self.sem_path = sem_path
self.transxy = [
[0, -1., 0, 0],
[1., 0, 0, 0],
[0, 0, 1., 0],
[0, 0, 0, 1.]]
self.transxy = np.array(self.transxy)
splits = {
"train": ["00", "01", "02", "03", "04", "05", "06", "07", "09", "10"],
"val": ["08"],
"test": ["11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21"],
}
self.split = split
if sequences is not None:
self.sequences = sequences
else:
self.sequences = splits[split]
self.output_scale = 1
self.scene_size = (51.2, 51.2, 6.4)
self.vox_origin = np.array([0, -25.6, -2])
self.frames_interval = frames_interval
if not isinstance(sequence_distance, list):
sequence_distance = [sequence_distance] * 2
self.sequence_distance = sequence_distance
self.strict = strict
self.voxel_size = 0.2 # 0.2m
self.img_W = crop_size[1]
self.img_H = crop_size[0]
start_time = time.time()
self.scans = []
self.frame2scan = {}
for sequence in self.sequences:
pose_path = os.path.join(self.root, "dataset", "poses", sequence + ".txt")
gt_global_poses = read_poses(pose_path)
calib = read_calib(
os.path.join(self.root, "dataset", "sequences", sequence, "calib.txt")
)
P = calib["P2"]
T_cam0_2_cam2 = calib['T_cam0_2_cam2']
T_cam2_2_cam0 = np.linalg.inv(T_cam0_2_cam2)
T_velo_2_cam = T_cam0_2_cam2 @ calib["Tr"]
if split == "val":
glob_path = os.path.join(
self.root, "dataset", "sequences", sequence, "voxels", "*.bin"
)
else:
glob_path = os.path.join(
self.root, "dataset", "sequences", sequence, "image_2", "*.png"
)
seq_img_paths = glob.glob(glob_path)
max_length = 0
min_length = 50
paired_dists = {}
dist_step = 1 if split == 'train' else 5
for seq_img_path in seq_img_paths:
filename = os.path.basename(seq_img_path)
frame_id = os.path.splitext(filename)[0]
prev_frame_id = "{:06d}".format(int(frame_id) - dist_step)
img_path = os.path.join(
self.root, "dataset", "sequences", sequence, "image_2", prev_frame_id + ".png")
if not os.path.exists(img_path):
paired_dists[frame_id] = 0.
else:
curr_pose = gt_global_poses[int(frame_id)]
prev_pose = gt_global_poses[int(prev_frame_id)]
prev_xyz = dump_xyz(prev_pose)
curr_xyz = dump_xyz(curr_pose)
rel_distance = np.sqrt(
(prev_xyz[0] - curr_xyz[0]) ** 2 + (prev_xyz[2] - curr_xyz[2]) ** 2)
paired_dists[frame_id] = rel_distance
for seq_img_path in seq_img_paths:
filename = os.path.basename(seq_img_path)
frame_id = os.path.splitext(filename)[0]
current_img_path = os.path.join(
self.root, "dataset", "sequences", sequence, "image_2", frame_id + ".png")
current_lid_path = os.path.join(
self.root, "dataset", "sequences", sequence, "velodyne", frame_id + ".bin")
prev_frame_ids, next_frame_ids = [], []
prev_img_paths, next_img_paths = [], []
prev_lid_paths, next_lid_paths = [], []
prev_poses, next_poses = [], []
prev_dists, next_dists = [], []
if self.split == 'train':
pos_step = 1
neg_step = -1
else:
pos_step = 5
neg_step = -5
# deal with prev
step = -1
cnt = 0
dist = 0.
while True:
# cnt += step
cnt += neg_step
rel_frame_id = "{:06d}".format(int(frame_id) + cnt)
img_path = os.path.join(
self.root, "dataset", "sequences", sequence, "image_2", rel_frame_id + ".png")
if not os.path.exists(img_path):
break
dist += paired_dists["{:06d}".format(int(rel_frame_id) + pos_step)]
if dist < frames_interval:
continue
if dist > sequence_distance[0]:
break
if split == "val" and rel_frame_id in val_error_frames:
continue
prev_frame_ids.append(rel_frame_id)
prev_img_paths.append(img_path)
lidar_path = os.path.join(
self.root, "dataset", "sequences", sequence, "velodyne", rel_frame_id + ".bin")
prev_lid_paths.append(lidar_path)
prev_poses.append(gt_global_poses[int(rel_frame_id)])
prev_dists.append(dist)
# deal with next
step = 1
cnt = 0
dist = 0.
while True:
# cnt += step
cnt += pos_step
rel_frame_id = "{:06d}".format(int(frame_id) + cnt)
img_path = os.path.join(
self.root, "dataset", "sequences", sequence, "image_2", rel_frame_id + ".png")
if not os.path.exists(img_path):
break
dist += paired_dists[rel_frame_id]
if dist < frames_interval:
continue
if dist > sequence_distance[1]:
break
if split == "val" and rel_frame_id in val_error_frames:
continue
next_frame_ids.append(rel_frame_id)
next_img_paths.append(img_path)
lidar_path = os.path.join(
self.root, "dataset", "sequences", sequence, "velodyne", rel_frame_id + ".bin")
next_lid_paths.append(lidar_path)
next_poses.append(gt_global_poses[int(rel_frame_id)])
next_dists.append(dist)
# ignore error frame
if split == "val" and frame_id in val_error_frames:
continue
is_included = True if selected_frames is None else frame_id in selected_frames
if is_included:
prev_next_len = len(prev_poses) + len(next_poses)
if prev_next_len > max_length:
max_length = prev_next_len
if prev_next_len < min_length:
min_length = prev_next_len
self.frame2scan.update({str(sequence) + "_" + frame_id: len(self.scans)})
if not self.strict:
prev_img_paths.append(current_img_path)
prev_lid_paths.append(current_lid_path)
next_img_paths.append(current_img_path)
next_lid_paths.append(current_lid_path)
prev_poses.append(curr_pose)
next_poses.append(curr_pose)
prev_dists.append(0.)
next_dists.append(0.)
prev_frame_ids.append(frame_id)
next_frame_ids.append(frame_id)
self.scans.append(
{
"frame_id": frame_id,
"sequence": sequence,
"img_path": current_img_path,
"lid_path": current_lid_path,
"pose": gt_global_poses[int(frame_id)],
"prev_img_paths": prev_img_paths,
"prev_lid_paths": prev_lid_paths,
"next_img_paths": next_img_paths,
"next_lid_paths": next_lid_paths,
"T_velo_2_cam": T_velo_2_cam,
"P": P,
"T_cam0_2_cam2": T_cam0_2_cam2,
"T_cam2_2_cam0": T_cam2_2_cam0,
"prev_poses": prev_poses,
"next_poses": next_poses,
"prev_dists": prev_dists,
"next_dists": next_dists,
"prev_frame_ids": prev_frame_ids,
"next_frame_ids": next_frame_ids
}
)
print(sequence, min_length, max_length)
# self.to_tensor_normalized = transforms.Compose(
# [
# transforms.ToTensor(),
# transforms.Normalize(
# mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
# ),
# ]
# )
# self.to_tensor = transforms.Compose([
# transforms.ToTensor()
# ])
print("Preprocess time: --- %s seconds ---" % (time.time() - start_time))
def get_depth_from_lidar(self, lidar_path, lidar2img, image_size):
# lidar2img: N, 4, 4
scan = np.fromfile(lidar_path, dtype=np.float32)
scan = scan.reshape((-1, 4))
scan[:, 3] = 1.0
# points_hcoords = scan[scan[:, 0] > 0, :]
points_hcoords = np.expand_dims(self.transxy @ scan.T, 0) # 1, 4, n
img_points = np.transpose(lidar2img @ points_hcoords, (0, 2, 1)) # N, n, 4
depth = img_points[..., 2] # N, n
img_points = img_points[..., :2] # N, n, 2
mask = (depth < self.eval_depth) & (depth > 1e-3) # get points with depth < max_sample_depth
img_points = img_points / np.expand_dims(depth, axis=2) # scale 2D points
img_points[..., 0] = img_points[..., 0] / image_size[1]
img_points[..., 1] = img_points[..., 1] / image_size[0]
# img_points = np.round(img_points).astype(int)
mask = mask & (img_points[..., 0] > 0) & \
(img_points[..., 1] > 0) & \
(img_points[..., 0] < 1) & \
(img_points[..., 1] < 1)
return img_points, depth, mask
def load_2d_sem_label(self, scan):
sequence = scan['sequence']
filename = os.path.basename(scan['img_path'])
sem_path = os.path.join(self.sem_path, sequence, 'image_02', filename + '.npy')
sem = np.load(sem_path)[None, ...]
return sem
def __len__(self):
return len(self.scans)
def prepare_img_metas(self, scan, anchor_scan, anchor_prev, anchor_next):
img_metas = {}
img_metas.update({
'input_imgs_path': [scan['img_path']],
'curr_imgs_path': [anchor_scan['img_path']],
'prev_imgs_path': [anchor_scan['prev_img_paths'][anchor_prev]],
'next_imgs_path': [anchor_scan['next_img_paths'][anchor_next]]
})
intrinsic = np.eye(4)
intrinsic[:3, :3] = scan['P'][:3, :3]
lidar2img = intrinsic @ scan['T_velo_2_cam'] @ np.linalg.inv(self.transxy)
img2lidar = np.linalg.inv(lidar2img)
temImg2lidar = self.transxy @ np.linalg.inv(scan['T_velo_2_cam']) @ \
scan['T_cam0_2_cam2'] @ \
np.linalg.inv(scan['pose']) @ \
anchor_scan['pose'] @ \
anchor_scan['T_cam2_2_cam0'] @ \
np.linalg.inv(intrinsic)
img2prevImg = intrinsic @ \
anchor_scan['T_cam0_2_cam2'] @ \
np.linalg.inv(anchor_scan['prev_poses'][anchor_prev]) @ \
anchor_scan['pose'] @ \
anchor_scan['T_cam2_2_cam0'] @ \
np.linalg.inv(intrinsic)
img2nextImg = intrinsic @ \
anchor_scan['T_cam0_2_cam2'] @ \
np.linalg.inv(anchor_scan['next_poses'][anchor_next]) @ \
anchor_scan['pose'] @ \
anchor_scan['T_cam2_2_cam0'] @ \
np.linalg.inv(intrinsic)
img_metas.update({
'lidar2img': np.expand_dims(lidar2img, axis=0),
'img2lidar': [img2lidar],
'img2prevImg': [img2prevImg],
'img2nextImg': [img2nextImg],
'token': scan['frame_id'],
'sequence': scan['sequence'],
'temImg2lidar': [temImg2lidar]
})
return img_metas
def read_surround_imgs(self, img_paths):
if hfai:
imgs = self.img_loader.load(img_paths)
else:
imgs = []
for filename in img_paths:
imgs.append(
imread(filename, 'unchanged').astype(np.float32))
imgs = [img[:self.img_H, :self.img_W, :] for img in imgs]
return imgs
def __getitem__(self, index):
#### 1. get color, temporal_depth choice if necessary
if random.random() < self.cur_prob:
temporal_supervision = 'curr'
elif random.random() < self.prev_prob:
temporal_supervision = 'prev'
else:
temporal_supervision = 'next'
#### 2. get self, prev, next infos for the stem, and also temp_depth info
while True:
scan = deepcopy(self.scans[index])
sequence = scan['sequence']
# scan_id = scan['frame_id']
# day, seq = scan['sequence']
# is_right = scan['is_right']
# stereo_sign = 'r' if is_right else 'l'
if temporal_supervision == 'curr':
anchor_scan = deepcopy(scan)
elif temporal_supervision == 'prev':
if len(scan['prev_frame_ids']) == 0:
index = np.random.randint(len(self))
continue
anchor_scan_id = np.random.choice(scan['prev_frame_ids'])
anchor_scan = deepcopy(self.scans[self.frame2scan[str(sequence) + '_' + anchor_scan_id]])
else:
if len(scan['next_frame_ids']) == 0:
index = np.random.randint(len(self))
continue
anchor_scan_id = np.random.choice(scan['next_frame_ids'])
anchor_scan = deepcopy(self.scans[self.frame2scan[str(sequence) + '_' + anchor_scan_id]])
if len(anchor_scan['prev_frame_ids']) == 0 or len(anchor_scan['next_frame_ids']) == 0:
index = np.random.randint(len(self))
continue
anchor_prev = 0 if self.choose_nearest else np.random.randint(len(anchor_scan['prev_frame_ids']))
anchor_next = 0 if self.choose_near
gitextract_ciq3vmv7/ ├── .gitignore ├── LICENSE ├── README.md ├── assets/ │ └── .placeholder ├── config/ │ ├── _base_/ │ │ ├── dataset_v1.py │ │ ├── optimizer.py │ │ └── schedule.py │ ├── kitti/ │ │ ├── kitti_novel_depth.py │ │ └── kitti_occ.py │ ├── kitti_raw/ │ │ └── kitti_raw_depth.py │ ├── nuscenes/ │ │ ├── nuscenes_depth.py │ │ ├── nuscenes_novel_depth.py │ │ ├── nuscenes_occ.py │ │ └── nuscenes_occ_bev.py │ └── openseed/ │ └── openseed_swint_lang.yaml ├── dataset/ │ ├── __init__.py │ ├── dataset_one_frame_eval.py │ ├── dataset_one_frame_sweeps_dist.py │ ├── dataset_one_frame_sweeps_dist_vis.py │ ├── dataset_wrapper_temporal.py │ ├── dataset_wrapper_vis.py │ ├── kitti/ │ │ ├── helpers.py │ │ ├── io_data.py │ │ ├── kitti_dataset_eval.py │ │ ├── kitti_dataset_one_frame.py │ │ ├── params.py │ │ └── semantic-kitti.yaml │ ├── kitti_raw/ │ │ ├── kitti_raw_dataset.py │ │ ├── kitti_raw_dataset_stereo.py │ │ ├── orb-slam_poses/ │ │ │ ├── 2011_09_26/ │ │ │ │ ├── 2011_09_26_drive_0001_sync.txt │ │ │ │ ├── 2011_09_26_drive_0002_sync.txt │ │ │ │ ├── 2011_09_26_drive_0005_sync.txt │ │ │ │ ├── 2011_09_26_drive_0009_sync.txt │ │ │ │ ├── 2011_09_26_drive_0011_sync.txt │ │ │ │ ├── 2011_09_26_drive_0013_sync.txt │ │ │ │ ├── 2011_09_26_drive_0014_sync.txt │ │ │ │ ├── 2011_09_26_drive_0015_sync.txt │ │ │ │ ├── 2011_09_26_drive_0017_sync.txt │ │ │ │ ├── 2011_09_26_drive_0018_sync.txt │ │ │ │ ├── 2011_09_26_drive_0019_sync.txt │ │ │ │ ├── 2011_09_26_drive_0020_sync.txt │ │ │ │ ├── 2011_09_26_drive_0022_sync.txt │ │ │ │ ├── 2011_09_26_drive_0023_sync.txt │ │ │ │ ├── 2011_09_26_drive_0027_sync.txt │ │ │ │ ├── 2011_09_26_drive_0028_sync.txt │ │ │ │ ├── 2011_09_26_drive_0029_sync.txt │ │ │ │ ├── 2011_09_26_drive_0032_sync.txt │ │ │ │ ├── 2011_09_26_drive_0035_sync.txt │ │ │ │ ├── 2011_09_26_drive_0036_sync.txt │ │ │ │ ├── 2011_09_26_drive_0039_sync.txt │ │ │ │ ├── 2011_09_26_drive_0046_sync.txt │ │ │ │ ├── 2011_09_26_drive_0048_sync.txt │ │ │ │ ├── 2011_09_26_drive_0051_sync.txt │ │ │ │ 
├── 2011_09_26_drive_0052_sync.txt │ │ │ │ ├── 2011_09_26_drive_0056_sync.txt │ │ │ │ ├── 2011_09_26_drive_0057_sync.txt │ │ │ │ ├── 2011_09_26_drive_0059_sync.txt │ │ │ │ ├── 2011_09_26_drive_0060_sync.txt │ │ │ │ ├── 2011_09_26_drive_0061_sync.txt │ │ │ │ ├── 2011_09_26_drive_0064_sync.txt │ │ │ │ ├── 2011_09_26_drive_0070_sync.txt │ │ │ │ ├── 2011_09_26_drive_0079_sync.txt │ │ │ │ ├── 2011_09_26_drive_0084_sync.txt │ │ │ │ ├── 2011_09_26_drive_0086_sync.txt │ │ │ │ ├── 2011_09_26_drive_0087_sync.txt │ │ │ │ ├── 2011_09_26_drive_0091_sync.txt │ │ │ │ ├── 2011_09_26_drive_0093_sync.txt │ │ │ │ ├── 2011_09_26_drive_0095_sync.txt │ │ │ │ ├── 2011_09_26_drive_0096_sync.txt │ │ │ │ ├── 2011_09_26_drive_0101_sync.txt │ │ │ │ ├── 2011_09_26_drive_0104_sync.txt │ │ │ │ ├── 2011_09_26_drive_0106_sync.txt │ │ │ │ ├── 2011_09_26_drive_0113_sync.txt │ │ │ │ └── 2011_09_26_drive_0117_sync.txt │ │ │ ├── 2011_09_28/ │ │ │ │ ├── 2011_09_28_drive_0001_sync.txt │ │ │ │ └── 2011_09_28_drive_0002_sync.txt │ │ │ ├── 2011_09_29/ │ │ │ │ ├── 2011_09_29_drive_0004_sync.txt │ │ │ │ ├── 2011_09_29_drive_0026_sync.txt │ │ │ │ └── 2011_09_29_drive_0071_sync.txt │ │ │ ├── 2011_09_30/ │ │ │ │ ├── 2011_09_30_drive_0016_sync.txt │ │ │ │ ├── 2011_09_30_drive_0018_sync.txt │ │ │ │ ├── 2011_09_30_drive_0020_sync.txt │ │ │ │ ├── 2011_09_30_drive_0027_sync.txt │ │ │ │ ├── 2011_09_30_drive_0028_sync.txt │ │ │ │ ├── 2011_09_30_drive_0033_sync.txt │ │ │ │ └── 2011_09_30_drive_0034_sync.txt │ │ │ └── 2011_10_03/ │ │ │ ├── 2011_10_03_drive_0027_sync.txt │ │ │ ├── 2011_10_03_drive_0034_sync.txt │ │ │ ├── 2011_10_03_drive_0042_sync.txt │ │ │ └── 2011_10_03_drive_0047_sync.txt │ │ └── splits/ │ │ ├── eigen/ │ │ │ └── test_files.txt │ │ ├── eigen_zhou/ │ │ │ ├── test_files.txt │ │ │ ├── train_files.txt │ │ │ └── val_files.txt │ │ └── tulsiani/ │ │ ├── test_files.txt │ │ ├── train_files.txt │ │ └── val_files.txt │ ├── loading.py │ ├── sampler.py │ ├── transform_3d.py │ └── utils.py ├── docs/ │ ├── 
get_started.md │ ├── installation.md │ ├── prepare_data.md │ └── visualization.md ├── eval_depth.py ├── eval_iou.py ├── eval_iou_kitti.py ├── eval_novel_depth.py ├── eval_novel_depth_kitti.py ├── examine_sweeps.py ├── generate_videos.py ├── loss/ │ ├── __init__.py │ ├── base_loss.py │ ├── edge_loss_3d_ms.py │ ├── eikonal_loss.py │ ├── multi_loss.py │ ├── reproj_loss_mono_multi_new.py │ ├── reproj_loss_mono_multi_new_combine.py │ ├── rgb_loss_ms.py │ ├── second_grad_loss.py │ └── sparsity_loss.py ├── model/ │ ├── __init__.py │ ├── backbone/ │ │ ├── __init__.py │ │ └── unet2d.py │ ├── encoder/ │ │ ├── __init__.py │ │ ├── base_encoder.py │ │ ├── bevformer/ │ │ │ ├── __init__.py │ │ │ ├── attention/ │ │ │ │ ├── __init__.py │ │ │ │ └── image_cross_attention.py │ │ │ ├── bevformer_encoder.py │ │ │ ├── bevformer_encoder_layer.py │ │ │ ├── bevformer_pos_embed.py │ │ │ ├── mappings.py │ │ │ ├── mappings_old.py │ │ │ └── utils.py │ │ └── tpvformer/ │ │ ├── __init__.py │ │ ├── attention/ │ │ │ ├── __init__.py │ │ │ ├── cross_view_hybrid_attention.py │ │ │ └── image_cross_attention.py │ │ ├── modules/ │ │ │ ├── __init__.py │ │ │ ├── camera_se_net.py │ │ │ ├── split_fpn.py │ │ │ └── split_norm.py │ │ ├── tpvformer_encoder.py │ │ ├── tpvformer_encoder_layer.py │ │ ├── tpvformer_pos_embed.py │ │ └── utils.py │ ├── head/ │ │ ├── __init__.py │ │ ├── base_head.py │ │ ├── nerfacc_head/ │ │ │ ├── bev_nerf.py │ │ │ ├── estimator.py │ │ │ ├── img2lidar.py │ │ │ ├── nerfacc_head.py │ │ │ ├── ray_sampler.py │ │ │ └── rendering.py │ │ ├── neus_head/ │ │ │ └── neus_head.py │ │ └── utils/ │ │ └── sh_render.py │ ├── lifter/ │ │ ├── __init__.py │ │ ├── base_lifter.py │ │ ├── bev_query_lifter.py │ │ ├── tpv_pos_lifter.py │ │ └── tpv_query_lifter.py │ ├── neck/ │ │ ├── __init__.py │ │ └── identity_neck.py │ └── segmentor/ │ ├── __init__.py │ ├── base_segmentor.py │ └── tpv_segmentor.py ├── train.py ├── utils/ │ ├── __init__.py │ ├── config_tools.py │ ├── feat_tools.py │ ├── metric_util.py │ ├── 
misc.py │ ├── openseed_utils.py │ ├── scenerf_metric.py │ ├── tb_wrapper.py │ └── temporal_pkl.py ├── vis_3d.py ├── vis_3d_scene.py └── vis_pics.py
SYMBOL INDEX (448 symbols across 78 files)
FILE: dataset/__init__.py
function get_dataloader (line 20) | def get_dataloader(
FILE: dataset/dataset_one_frame_eval.py
class nuScenes_One_Frame_Eval (line 16) | class nuScenes_One_Frame_Eval:
method __init__ (line 17) | def __init__(
method __len__ (line 67) | def __len__(self):
method get_depth_from_lidar (line 71) | def get_depth_from_lidar(self, lidar_path, lidar2img, image_size):
method __getitem__ (line 95) | def __getitem__(self, index):
method read_surround_imgs (line 152) | def read_surround_imgs(self, img_paths):
method get_data_info_temporal (line 163) | def get_data_info_temporal(self, info, info_tem):
method get_data_info (line 209) | def get_data_info(self, info):
FILE: dataset/dataset_one_frame_sweeps_dist.py
function get_xyz (line 15) | def get_xyz(pose_dict):
function get_img2global (line 18) | def get_img2global(calib_dict, pose_dict):
function get_lidar2global (line 35) | def get_lidar2global(calib_dict, pose_dict):
class nuScenes_One_Frame_Sweeps_Dist (line 50) | class nuScenes_One_Frame_Sweeps_Dist:
method __init__ (line 51) | def __init__(
method __len__ (line 153) | def __len__(self):
method get_depth_from_lidar (line 158) | def get_depth_from_lidar(self, lidar_path, lidar2img, image_size):
method composite_dict (line 183) | def composite_dict(self, anchor_info):
method __getitem__ (line 199) | def __getitem__(self, index):
method read_surround_imgs (line 305) | def read_surround_imgs(self, img_paths, crop_size):
method get_data_info_temporal (line 316) | def get_data_info_temporal(self, info, info_tem):
method get_data_info_anchor (line 337) | def get_data_info_anchor(self, info, info_tem):
method get_data_info (line 358) | def get_data_info(self, info):
function list2tensor (line 458) | def list2tensor(imgs):
function get_diagonal (line 471) | def get_diagonal(trans):
function filter_invalid_pts (line 479) | def filter_invalid_pts(pts):
function get_pixel (line 507) | def get_pixel(uv, eps=1e-5):
function filter_invalid_points (line 528) | def filter_invalid_points(uv):
function saveimg (line 537) | def saveimg(i, imgs, uv, name):
FILE: dataset/dataset_one_frame_sweeps_dist_vis.py
function get_xyz (line 8) | def get_xyz(pose_dict):
function get_img2global (line 11) | def get_img2global(calib_dict, pose_dict):
function get_lidar2global (line 28) | def get_lidar2global(calib_dict, pose_dict):
class nuScenes_One_Frame_Sweeps_Dist_Vis (line 43) | class nuScenes_One_Frame_Sweeps_Dist_Vis:
method __init__ (line 44) | def __init__(
method __len__ (line 73) | def __len__(self):
method __getitem__ (line 77) | def __getitem__(self, index):
method read_surround_imgs (line 109) | def read_surround_imgs(self, img_paths):
method get_data_info (line 117) | def get_data_info(self, info):
FILE: dataset/dataset_wrapper_temporal.py
class tpvformer_dataset_nuscenes_temporal (line 19) | class tpvformer_dataset_nuscenes_temporal(data.Dataset):
method __init__ (line 20) | def __init__(
method __len__ (line 77) | def __len__(self):
method to_tensor (line 80) | def to_tensor(self, imgs):
method __getitem__ (line 86) | def __getitem__(self, index):
method deal_with_length3_dataset (line 93) | def deal_with_length3_dataset(self, data):
method deal_with_length2_dataset (line 119) | def deal_with_length2_dataset(self, data):
function custom_collate_fn_temporal (line 178) | def custom_collate_fn_temporal(data):
function forward_aug (line 191) | def forward_aug(imgs, metas, transforms):
FILE: dataset/dataset_wrapper_vis.py
class tpvformer_dataset_nuscenes_vis (line 14) | class tpvformer_dataset_nuscenes_vis(data.Dataset):
method __init__ (line 15) | def __init__(
method __len__ (line 35) | def __len__(self):
method to_tensor (line 38) | def to_tensor(self, imgs):
method __getitem__ (line 44) | def __getitem__(self, index):
method deal_with_length2_dataset (line 48) | def deal_with_length2_dataset(self, data):
function custom_collate_fn_temporal (line 68) | def custom_collate_fn_temporal(data):
function forward_aug (line 81) | def forward_aug(imgs, metas, transforms):
FILE: dataset/kitti/helpers.py
function apply_transform (line 17) | def apply_transform(pts, T):
function dump_xyz (line 29) | def dump_xyz(P):
function read_rgb (line 33) | def read_rgb(path):
function read_poses (line 43) | def read_poses(path):
function read_calib (line 56) | def read_calib(calib_path):
FILE: dataset/kitti/io_data.py
function unpack (line 10) | def unpack(compressed):
function img_normalize (line 25) | def img_normalize(img, mean, std):
function pack (line 33) | def pack(array):
function get_grid_coords (line 45) | def get_grid_coords(dims, resolution):
function _get_remap_lut (line 77) | def _get_remap_lut(config_path):
function get_inv_map (line 99) | def get_inv_map():
function _read_SemKITTI (line 113) | def _read_SemKITTI(path, dtype, do_unpack):
function _read_label_SemKITTI (line 120) | def _read_label_SemKITTI(path):
function _read_invalid_SemKITTI (line 125) | def _read_invalid_SemKITTI(path):
function _read_occluded_SemKITTI (line 130) | def _read_occluded_SemKITTI(path):
function _read_occupancy_SemKITTI (line 135) | def _read_occupancy_SemKITTI(path):
function _read_rgb_SemKITTI (line 140) | def _read_rgb_SemKITTI(path):
function _read_pointcloud_SemKITTI (line 145) | def _read_pointcloud_SemKITTI(path):
function _read_calib_SemKITTI (line 152) | def _read_calib_SemKITTI(calib_path):
function get_remap_lut (line 173) | def get_remap_lut(path):
function data_augmentation_3Dflips (line 196) | def data_augmentation_3Dflips(flip, data):
function get_cmap_semanticKITTI20 (line 216) | def get_cmap_semanticKITTI20():
FILE: dataset/kitti/kitti_dataset_eval.py
class Kitti_Novel_View_Eval (line 23) | class Kitti_Novel_View_Eval:
method __init__ (line 24) | def __init__(
method get_depth_from_lidar (line 287) | def get_depth_from_lidar(self, lidar_path, lidar2img, image_size):
method __len__ (line 311) | def __len__(self):
method prepare_temImg2lidar (line 314) | def prepare_temImg2lidar(self, scan, anchor):
method prepare_img_metas (line 327) | def prepare_img_metas(self, scan):
method read_surround_imgs (line 346) | def read_surround_imgs(self, img_paths):
method __getitem__ (line 357) | def __getitem__(self, index):
FILE: dataset/kitti/kitti_dataset_one_frame.py
class Kitti_One_Frame (line 21) | class Kitti_One_Frame:
method __init__ (line 22) | def __init__(
method get_depth_from_lidar (line 294) | def get_depth_from_lidar(self, lidar_path, lidar2img, image_size):
method load_2d_sem_label (line 318) | def load_2d_sem_label(self, scan):
method __len__ (line 325) | def __len__(self):
method prepare_img_metas (line 328) | def prepare_img_metas(self, scan, anchor_scan, anchor_prev, anchor_next):
method read_surround_imgs (line 375) | def read_surround_imgs(self, img_paths):
method __getitem__ (line 386) | def __getitem__(self, index):
FILE: dataset/kitti_raw/kitti_raw_dataset.py
class Kitti_Raw (line 20) | class Kitti_Raw:
method __init__ (line 21) | def __init__(
method _get_sequences (line 213) | def _get_sequences(data_path):
method _load_split (line 228) | def _load_split(split_path):
method _load_calibs (line 241) | def _load_calibs(data_path):
method _load_poses (line 299) | def _load_poses(pose_path, sequences):
method get_depth_from_lidar (line 325) | def get_depth_from_lidar(self, lidar_path, lidar2img, image_size):
method load_depth (line 355) | def load_depth(self, lidar_path, lidar2img, image_size):
method __len__ (line 404) | def __len__(self):
method prepare_img_metas (line 407) | def prepare_img_metas(self, scan, anchor_scan, anchor_prev, anchor_next):
method read_surround_imgs (line 471) | def read_surround_imgs(self, img_paths):
method to_scan_idx (line 479) | def to_scan_idx(self, index):
method __getitem__ (line 485) | def __getitem__(self, index):
FILE: dataset/kitti_raw/kitti_raw_dataset_stereo.py
class Kitti_Raw_Stereo (line 20) | class Kitti_Raw_Stereo:
method __init__ (line 24) | def __init__(
method _get_sequences (line 221) | def _get_sequences(data_path):
method _load_split (line 236) | def _load_split(split_path):
method _load_calibs (line 249) | def _load_calibs(data_path):
method _load_poses (line 307) | def _load_poses(pose_path, sequences):
method get_depth_from_lidar (line 333) | def get_depth_from_lidar(self, lidar_path, lidar2img, image_size):
method load_depth (line 363) | def load_depth(self, lidar_path, lidar2img, image_size):
method __len__ (line 412) | def __len__(self):
method prepare_img_metas (line 415) | def prepare_img_metas(self, scan, anchor_scan, anchor_prev, anchor_next):
method read_surround_imgs (line 480) | def read_surround_imgs(self, img_paths):
method to_scan_idx (line 488) | def to_scan_idx(self, index):
method append_self (line 494) | def append_self(self, flag, obj):
method __getitem__ (line 502) | def __getitem__(self, index):
FILE: dataset/loading.py
class LoadMultiViewImageFromFilesHF (line 11) | class LoadMultiViewImageFromFilesHF:
method __init__ (line 25) | def __init__(self,
method load (line 34) | def load(self, results):
method __call__ (line 69) | def __call__(self, results):
class OriLoadMultiViewImageFromFilesHF (line 108) | class OriLoadMultiViewImageFromFilesHF(object):
method __init__ (line 122) | def __init__(self,
method __call__ (line 131) | def __call__(self, results):
method __repr__ (line 177) | def __repr__(self):
class LoadPtsFromFilesHF (line 185) | class LoadPtsFromFilesHF:
method __init__ (line 187) | def __init__(self,
method load (line 194) | def load(self, filename):
class FFrecordClient (line 218) | class FFrecordClient(BaseStorageBackend):
method __init__ (line 222) | def __init__(self, fname, check_data=False, filename2idx='filename2idx...
method get (line 233) | def get(self, filepath: Union[str, Path]) -> bytes:
method get_text (line 258) | def get_text(self,
FILE: dataset/sampler.py
class CustomDistributedSampler (line 12) | class CustomDistributedSampler(Sampler[T_co]):
method __init__ (line 59) | def __init__(self, dataset: Dataset, num_replicas: Optional[int] = None,
method __iter__ (line 94) | def __iter__(self) -> Iterator[T_co]:
method __len__ (line 122) | def __len__(self) -> int:
method set_epoch (line 125) | def set_epoch(self, epoch: int) -> None:
method set_last_iter (line 136) | def set_last_iter(self, last_iter: int):
FILE: dataset/transform_3d.py
class PadMultiViewImage (line 8) | class PadMultiViewImage(object):
method __init__ (line 19) | def __init__(self, size=None, size_divisor=None, pad_val=0):
method _pad_img (line 29) | def _pad_img(self, results):
method __call__ (line 61) | def __call__(self, results):
method __repr__ (line 71) | def __repr__(self):
class NormalizeMultiviewImage (line 79) | class NormalizeMultiviewImage(object):
method __init__ (line 89) | def __init__(self, mean, std, to_rgb=True):
method __call__ (line 95) | def __call__(self, results):
method __repr__ (line 109) | def __repr__(self):
class RandomFlip (line 114) | class RandomFlip(object):
method __init__ (line 116) | def __init__(self, prob=0.5) -> None:
method __call__ (line 119) | def __call__(self, results):
class PhotoMetricDistortionMultiViewImage (line 126) | class PhotoMetricDistortionMultiViewImage:
method __init__ (line 151) | def __init__(self,
method __call__ (line 163) | def __call__(self, results):
method __repr__ (line 222) | def __repr__(self):
class RandomScaleImageMultiViewImage (line 323) | class RandomScaleImageMultiViewImage(object):
method __init__ (line 329) | def __init__(self, scales=[], ref_focal_len=None, random_scale=None, p...
method __call__ (line 341) | def __call__(self, results):
method __repr__ (line 380) | def __repr__(self):
function read_surround_imgs (line 399) | def read_surround_imgs(img_paths, crop_size=[768, 1600]):
FILE: dataset/utils.py
function get_rm (line 4) | def get_rm(angle, axis, deg=False):
FILE: eval_depth.py
function pass_print (line 19) | def pass_print(*args, **kwargs):
function main (line 22) | def main(local_rank, args):
FILE: eval_iou.py
function pass_print (line 23) | def pass_print(*args, **kwargs):
function read_occ3d_label (line 26) | def read_occ3d_label(metas, nusc):
function read_openoccupancy_label (line 34) | def read_openoccupancy_label(metas, nusc):
function main (line 43) | def main(local_rank, args):
FILE: eval_iou_kitti.py
function pass_print (line 21) | def pass_print(*args, **kwargs):
function read_semantic_kitti (line 24) | def read_semantic_kitti(metas):
function main (line 44) | def main(local_rank, args):
FILE: eval_novel_depth.py
function pass_print (line 20) | def pass_print(*args, **kwargs):
function main (line 23) | def main(local_rank, args):
function evaluate_depth (line 285) | def evaluate_depth(gt_depth, pred_depth):
function print_metrics (line 306) | def print_metrics(agg_depth_errors, n_frames, logger, num_cams=6):
FILE: eval_novel_depth_kitti.py
function pass_print (line 20) | def pass_print(*args, **kwargs):
function main (line 23) | def main(local_rank, args):
function evaluate_depth (line 264) | def evaluate_depth(gt_depth, pred_depth):
function print_metrics (line 281) | def print_metrics(agg_depth_errors, n_frames, logger):
FILE: examine_sweeps.py
function gather_sensor (line 15) | def gather_sensor(sample_data_token):
FILE: generate_videos.py
function cat_images (line 6) | def cat_images(dir, cam_img_size, pred_img_size, pred_img_size2, spacing...
function get_video (line 70) | def get_video(img_path, video_path, fps, size):
FILE: loss/base_loss.py
class BaseLoss (line 8) | class BaseLoss(nn.Module):
method __init__ (line 19) | def __init__(
method forward (line 34) | def forward(self, inputs):
FILE: loss/edge_loss_3d_ms.py
function get_smooth_loss (line 7) | def get_smooth_loss(disp, img):
class EdgeLoss3DMS (line 24) | class EdgeLoss3DMS(BaseLoss):
method __init__ (line 26) | def __init__(self, weight=1.0, input_dict=None, **kwargs):
method edge_loss (line 44) | def edge_loss(self, curr_imgs, ms_depths, ms_rays, ms_accs=None, max_d...
FILE: loss/eikonal_loss.py
class EikonalLoss (line 6) | class EikonalLoss(BaseLoss):
method __init__ (line 8) | def __init__(self, weight=1.0, input_dict=None, **kwargs):
method eikonal_loss (line 19) | def eikonal_loss(self, eik_grad):
FILE: loss/multi_loss.py
class MultiLoss (line 10) | class MultiLoss(nn.Module):
method __init__ (line 12) | def __init__(self, loss_cfgs):
method forward (line 24) | def forward(self, inputs):
FILE: loss/reproj_loss_mono_multi_new.py
class SSIM (line 7) | class SSIM(nn.Module):
method __init__ (line 10) | def __init__(self):
method forward (line 23) | def forward(self, x, y):
class ReprojLossMonoMultiNew (line 41) | class ReprojLossMonoMultiNew(BaseLoss):
method __init__ (line 43) | def __init__(self, weight=1.0, input_dict=None, **kwargs):
method reproj_loss (line 72) | def reproj_loss(
FILE: loss/reproj_loss_mono_multi_new_combine.py
class SSIM (line 7) | class SSIM(nn.Module):
method __init__ (line 10) | def __init__(self):
method forward (line 23) | def forward(self, x, y):
class ReprojLossMonoMultiNewCombine (line 41) | class ReprojLossMonoMultiNewCombine(BaseLoss):
method __init__ (line 43) | def __init__(self, weight=1.0, input_dict=None, **kwargs):
method reproj_loss (line 70) | def reproj_loss(
FILE: loss/rgb_loss_ms.py
class SSIM (line 7) | class SSIM(nn.Module):
method __init__ (line 10) | def __init__(self):
method forward (line 23) | def forward(self, x, y):
class RGBLossMS (line 41) | class RGBLossMS(BaseLoss):
method __init__ (line 43) | def __init__(
method rgb_loss (line 68) | def rgb_loss(self, ms_colors, ms_rays, gt_imgs):
class SemLossMS (line 103) | class SemLossMS(BaseLoss):
method __init__ (line 105) | def __init__(
method sem_loss (line 126) | def sem_loss(self, sem, metas, ms_rays):
class SemCELossMS (line 160) | class SemCELossMS(BaseLoss):
method __init__ (line 162) | def __init__(
method sem_loss (line 183) | def sem_loss(self, sem, metas, ms_rays):
FILE: loss/second_grad_loss.py
class SecondGradLoss (line 6) | class SecondGradLoss(BaseLoss):
method __init__ (line 8) | def __init__(self, weight=1.0, input_dict=None, **kwargs):
method second_grad_loss (line 19) | def second_grad_loss(self, second_grad):
FILE: loss/sparsity_loss.py
class SparsityLoss (line 7) | class SparsityLoss(BaseLoss):
method __init__ (line 12) | def __init__(self, weight=1.0, scale=1.0, input_dict=None, **kwargs):
method sparsity_loss (line 24) | def sparsity_loss(self, density):
class HardSparsityLoss (line 30) | class HardSparsityLoss(BaseLoss):
method __init__ (line 32) | def __init__(self, weight=1.0, scale=1.0, thresh=0.2, crop=[[0, 0], [0...
method hard_sigmoid (line 46) | def hard_sigmoid(self, density):
method hard_sparsity_loss (line 49) | def hard_sparsity_loss(self, density):
class SoftSparsityLoss (line 67) | class SoftSparsityLoss(BaseLoss):
method __init__ (line 69) | def __init__(self, weight=1.0, input_dict=None, **kwargs):
method soft_sparsity_loss (line 80) | def soft_sparsity_loss(self, density):
class AdaptiveSparsityLoss (line 85) | class AdaptiveSparsityLoss(BaseLoss):
method __init__ (line 87) | def __init__(self, weight=1, input_dict=None, slack=4.0, **kwargs):
method adaptive_sparsity_loss (line 102) | def adaptive_sparsity_loss(self, sdfs, ts, ms_depths):
FILE: model/backbone/unet2d.py
class UpSampleBN (line 11) | class UpSampleBN(nn.Module):
method __init__ (line 12) | def __init__(self, skip_input, output_features):
method forward (line 25) | def forward(self, x, concat_with):
class DecoderBN (line 36) | class DecoderBN(nn.Module):
method __init__ (line 37) | def __init__(
method forward (line 96) | def forward(self, features):
class Encoder (line 144) | class Encoder(nn.Module):
method __init__ (line 145) | def __init__(self, backend):
method forward (line 149) | def forward(self, x):
class UNet2D (line 161) | class UNet2D(nn.Module):
method __init__ (line 162) | def __init__(self, out_feature, use_decoder=True):
method forward (line 194) | def forward(self, x, **kwargs):
method get_encoder_params (line 199) | def get_encoder_params(self): # lr/10 learning rate
method get_decoder_params (line 202) | def get_decoder_params(self): # lr learning rate
FILE: model/encoder/base_encoder.py
class BaseEncoder (line 6) | class BaseEncoder(BaseModule):
method __init__ (line 11) | def __init__(self, init_cfg=None, **kwargs):
method forward (line 14) | def forward(
FILE: model/encoder/bevformer/attention/image_cross_attention.py
class BEVCrossAttention (line 12) | class BEVCrossAttention(BaseModule):
method __init__ (line 18) | def __init__(self,
method init_weight (line 40) | def init_weight(self):
method forward (line 46) | def forward(self,
class BEVDeformableAttention (line 149) | class BEVDeformableAttention(BaseModule):
method __init__ (line 178) | def __init__(self,
method init_weights (line 226) | def init_weights(self) -> None:
method forward (line 247) | def forward(self,
FILE: model/encoder/bevformer/bevformer_encoder.py
class BEVFormerEncoder (line 18) | class BEVFormerEncoder(BaseEncoder):
method __init__ (line 20) | def __init__(
method init_weights (line 119) | def init_weights(self):
method forward_layers (line 135) | def forward_layers(
method forward (line 171) | def forward(
FILE: model/encoder/bevformer/bevformer_encoder_layer.py
class BEVFormerLayer (line 11) | class BEVFormerLayer(BaseModule):
method __init__ (line 45) | def __init__(self,
method forward (line 121) | def forward(self,
FILE: model/encoder/bevformer/bevformer_pos_embed.py
class BEVPositionalEncoding (line 7) | class BEVPositionalEncoding(BaseModule):
method __init__ (line 9) | def __init__(self, num_freqs, embed_dims,
method forward (line 33) | def forward(self):
FILE: model/encoder/bevformer/mappings.py
class LinearMapping (line 4) | class LinearMapping:
method __init__ (line 6) | def __init__(
method grid2meter (line 39) | def grid2meter(self, grid):
method meter2grid (line 97) | def meter2grid(self, meter, normalize=False):
class GridMeterMapping (line 153) | class GridMeterMapping:
method __init__ (line 155) | def __init__(
class NonLinearMapping (line 199) | class NonLinearMapping:
method __init__ (line 201) | def __init__(
method grid2meter (line 227) | def grid2meter(self, grid):
method meter2grid (line 254) | def meter2grid(self, meter, normalize=False):
FILE: model/encoder/bevformer/mappings_old.py
class GridMeterMapping (line 5) | class GridMeterMapping:
method __init__ (line 7) | def __init__(
method grid2meter (line 37) | def grid2meter(self, grid):
method meter2grid (line 64) | def meter2grid(self, meter):
FILE: model/encoder/bevformer/utils.py
function get_cross_view_ref_points (line 5) | def get_cross_view_ref_points(tpv_h, tpv_w, tpv_z, num_points_in_pillar):
function get_reference_points (line 74) | def get_reference_points(H, W, Z=8, num_points_in_pillar=4, dim='3d', bs...
function point_sampling (line 116) | def point_sampling(reference_points, img_metas):
FILE: model/encoder/tpvformer/attention/cross_view_hybrid_attention.py
class CrossViewHybridAttention (line 12) | class CrossViewHybridAttention(MultiScaleDeformableAttention):
method forward (line 17) | def forward(self,
FILE: model/encoder/tpvformer/attention/image_cross_attention.py
class TPVCrossAttention (line 8) | class TPVCrossAttention(BaseModule):
method __init__ (line 10) | def __init__(
method forward (line 71) | def forward(self,
FILE: model/encoder/tpvformer/modules/camera_se_net.py
class Mlp (line 4) | class Mlp(nn.Module):
method __init__ (line 6) | def __init__(self,
method forward (line 21) | def forward(self, x):
class SELayer (line 44) | class SELayer(nn.Module):
method __init__ (line 46) | def __init__(self, channels, act_layer=nn.ReLU, gate_layer=nn.Sigmoid):
method forward (line 49) | def forward(self, x, x_se):
class CameraAwareSE (line 52) | class CameraAwareSE(nn.Module):
method __init__ (line 54) | def __init__(
method _init_layers (line 65) | def _init_layers(self):
method init_weight (line 87) | def init_weight(self):
method forward (line 93) | def forward(self, ms_img_feats, metas):
FILE: model/encoder/tpvformer/modules/split_fpn.py
class MultiPlaneFFN (line 8) | class MultiPlaneFFN(BaseModule):
method __init__ (line 10) | def __init__(
method _init_layers (line 29) | def _init_layers(self):
method forward (line 35) | def forward(self, tpv, identity=None):
FILE: model/encoder/tpvformer/modules/split_norm.py
class MultiPlaneNorm (line 8) | class MultiPlaneNorm(BaseModule):
method __init__ (line 10) | def __init__(
method _init_layers (line 22) | def _init_layers(self):
method forward (line 28) | def forward(self, tpv):
FILE: model/encoder/tpvformer/tpvformer_encoder.py
class TPVFormerEncoder (line 20) | class TPVFormerEncoder(BaseEncoder):
method __init__ (line 22) | def __init__(
method init_weights (line 174) | def init_weights(self):
method forward_layers (line 192) | def forward_layers(
method forward (line 234) | def forward(
FILE: model/encoder/tpvformer/tpvformer_encoder_layer.py
class TPVFormerLayer (line 11) | class TPVFormerLayer(BaseModule):
method __init__ (line 45) | def __init__(self,
method forward (line 123) | def forward(self,
FILE: model/encoder/tpvformer/tpvformer_pos_embed.py
function get_feat_from_meter (line 6) | def get_feat_from_meter(num_freqs, meter):
class TPVPositionalEncoding (line 17) | class TPVPositionalEncoding(BaseModule):
method __init__ (line 19) | def __init__(
method forward (line 54) | def forward(self):
FILE: model/encoder/tpvformer/utils.py
function get_cross_view_ref_points (line 5) | def get_cross_view_ref_points(tpv_h, tpv_w, tpv_z, num_points_in_pillar,...
FILE: model/head/base_head.py
class BaseTaskHead (line 5) | class BaseTaskHead(BaseModule):
method __init__ (line 11) | def __init__(self, init_cfg=None, **kwargs):
method forward (line 14) | def forward(
FILE: model/head/nerfacc_head/bev_nerf.py
class BEVNeRF (line 8) | class BEVNeRF(BaseModule):
method __init__ (line 10) | def __init__(
method _init_layers (line 62) | def _init_layers(self):
method pre_compute_density_color (line 74) | def pre_compute_density_color(self, bev):
method query_density (line 99) | def query_density(self, x):
method forward (line 120) | def forward(self, x, condition=None):
method forward_geo (line 155) | def forward_geo(self, x):
FILE: model/head/nerfacc_head/estimator.py
class CustomOccGridEstimator (line 8) | class CustomOccGridEstimator(OccGridEstimator):
method sampling (line 11) | def sampling(
FILE: model/head/nerfacc_head/img2lidar.py
class Img2LiDAR (line 6) | class Img2LiDAR(nn.Module):
method __init__ (line 8) | def __init__(
method forward (line 25) | def forward(self, metas, rays):
FILE: model/head/nerfacc_head/nerfacc_head.py
function namedtuple_map (line 13) | def namedtuple_map(fn, tup):
class NeRFAccHead (line 21) | class NeRFAccHead(BaseTaskHead):
method __init__ (line 23) | def __init__(
method prepare (line 144) | def prepare(
method render (line 153) | def render(
method render_image_with_occgrid (line 242) | def render_image_with_occgrid(self, rays):
method forward_occ (line 346) | def forward_occ(
method get_uniform_sdf (line 366) | def get_uniform_sdf(self, aabb, resolution, device, shift=False):
method forward (line 396) | def forward(
FILE: model/head/nerfacc_head/ray_sampler.py
class RaySampler (line 5) | class RaySampler(nn.Module):
method __init__ (line 7) | def __init__(
method forward (line 48) | def forward(self):
FILE: model/head/nerfacc_head/rendering.py
function custom_rendering (line 15) | def custom_rendering(
FILE: model/head/neus_head/neus_head.py
class NeuSHead (line 22) | class NeuSHead(BaseTaskHead):
method __init__ (line 24) | def __init__(
method forward_occ (line 237) | def forward_occ(
method get_uniform_sdf (line 265) | def get_uniform_sdf(self, aabb, resolution, device, shift=False):
method prepare (line 295) | def prepare(
method render (line 308) | def render(
method forward (line 473) | def forward(
function chunk_cams (line 716) | def chunk_cams(tensor, num_cams):
FILE: model/head/utils/sh_render.py
function eval_sh_bases (line 35) | def eval_sh_bases(deg, dirs):
function SHRender (line 84) | def SHRender(xyz_sampled, viewdirs, features, deg=2, act='relu'):
FILE: model/lifter/base_lifter.py
class BaseLifter (line 6) | class BaseLifter(BaseModule):
method __init__ (line 13) | def __init__(self, init_cfg=None, **kwargs) -> None:
method forward (line 16) | def forward(
FILE: model/lifter/bev_query_lifter.py
class BEVQueryLifter (line 7) | class BEVQueryLifter(BaseLifter):
method __init__ (line 9) | def __init__(
method forward (line 23) | def forward(self, ms_img_feats, *args, **kwargs):
FILE: model/lifter/tpv_pos_lifter.py
function get_feat_from_meter (line 6) | def get_feat_from_meter(num_freqs, meter):
class TPVPositionLifter (line 18) | class TPVPositionLifter(BaseLifter):
method __init__ (line 20) | def __init__(
method forward (line 81) | def forward(self, ms_img_feats, *args, **kwargs):
FILE: model/lifter/tpv_query_lifter.py
class TPVQueryLifter (line 7) | class TPVQueryLifter(BaseLifter):
method __init__ (line 9) | def __init__(
method forward (line 27) | def forward(self, ms_img_feats, *args, **kwargs):
FILE: model/neck/identity_neck.py
class IdentityNeck (line 6) | class IdentityNeck(BaseModule):
method __init__ (line 8) | def __init__(self, init_cfg = None):
method forward (line 11) | def forward(self, inputs):
FILE: model/segmentor/base_segmentor.py
class CustomBaseSegmentor (line 7) | class CustomBaseSegmentor(BaseModule):
method __init__ (line 9) | def __init__(
method extract_img_feat (line 34) | def extract_img_feat(self, imgs, **kwargs):
method forward (line 51) | def forward(
FILE: model/segmentor/tpv_segmentor.py
class TPVSegmentor (line 10) | class TPVSegmentor(CustomBaseSegmentor):
method __init__ (line 12) | def __init__(
method extract_img_feat (line 36) | def extract_img_feat(self, imgs, metas, **kwargs):
method forward_extra_img_backbone (line 71) | def forward_extra_img_backbone(self, imgs, **kwargs):
method forward (line 87) | def forward(self,
FILE: train.py
function pass_print (line 18) | def pass_print(*args, **kwargs):
function main (line 21) | def main(local_rank, args):
FILE: utils/config_tools.py
function modify_for_eval (line 10) | def modify_for_eval(cfg, dataset='nuscenes', novel_depth=False, args=None):
FILE: utils/feat_tools.py
function multi2single_scale (line 4) | def multi2single_scale(
FILE: utils/metric_util.py
function cityscapes2semantickitti (line 10) | def cityscapes2semantickitti(sem):
function openseed2nuscenes (line 37) | def openseed2nuscenes(sem):
class MeanIoU (line 66) | class MeanIoU:
method __init__ (line 68) | def __init__(self,
method reset (line 85) | def reset(self) -> None:
method _after_step (line 90) | def _after_step(self, outputs, targets, mask=None):
method _after_epoch (line 122) | def _after_epoch(self):
class IoU (line 168) | class IoU(nn.Module):
method __init__ (line 170) | def __init__(self, use_mask=False):
method reset (line 186) | def reset(self) -> None:
method _after_step (line 191) | def _after_step(self, outputs, targets, occ3d=False):
method _after_step_occ3d (line 203) | def _after_step_occ3d(self, outputs, targets):
method _after_epoch (line 221) | def _after_epoch(self):
function cal_depth_metric (line 247) | def cal_depth_metric(depth_pred, depth_gt):
class DepthMetric (line 282) | class DepthMetric(nn.Module):
method __init__ (line 284) | def __init__(self, camera_names=['front'], eval_types=['raw', 'median']):
method _reset (line 300) | def _reset(self):
method _after_step (line 311) | def _after_step(self, depth_loc, depth_gt, depth_mask, depth_pred):
method _after_epoch (line 351) | def _after_epoch(self):
function compute_depth_errors (line 400) | def compute_depth_errors(gt, pred, min_depth=1e-3, max_depth=80):
function compute_depth_errors_torch (line 424) | def compute_depth_errors_torch(gt, pred, min_depth=1e-3, max_depth=80):
FILE: utils/misc.py
function _load_model_from_metafile (line 12) | def _load_model_from_metafile(model: str, scope: str = 'mmseg'):
function _init_model (line 49) | def _init_model(
function generate_segmentation_map_2d (line 85) | def generate_segmentation_map_2d(model_2d, inputs):
function read_segmentation_map_2d (line 122) | def read_segmentation_map_2d(map_path, img_metas):
FILE: utils/openseed_utils.py
function build_openseed_model (line 12) | def build_openseed_model():
function forward_openseed_model (line 54) | def forward_openseed_model(model, img, size):
FILE: utils/scenerf_metric.py
function get_iou (line 7) | def get_iou(iou_sum, cnt_class):
function get_accuracy (line 17) | def get_accuracy(predict, target, weight=None): # 0.05s
class SSCMetrics (line 39) | class SSCMetrics:
method __init__ (line 40) | def __init__(self, n_classes):
method hist_info (line 44) | def hist_info(self, n_cl, pred, gt):
method compute_score (line 59) | def compute_score(hist, correct, labeled):
method add_batch (line 69) | def add_batch(self, y_pred, y_true, nonempty=None, nonsurface=None):
method get_stats (line 93) | def get_stats(self):
method reset (line 120) | def reset(self):
method get_score_completion (line 140) | def get_score_completion(self, predict, target, nonempty=None):
method get_score_semantic_and_completion (line 177) | def get_score_semantic_and_completion(self, predict, target, nonempty=...
FILE: utils/tb_wrapper.py
class WrappedTBWriter (line 4) | class WrappedTBWriter(SummaryWriter, ManagerMixin):
method __init__ (line 6) | def __init__(self, name, **kwargs):
FILE: utils/temporal_pkl.py
function arange_according_to_scene (line 4) | def arange_according_to_scene(infos, nusc):
FILE: vis_3d.py
function pass_print (line 37) | def pass_print(*args, **kwargs):
function read_semantic_kitti (line 42) | def read_semantic_kitti(metas, idx):
function read_occ3d_label (line 65) | def read_occ3d_label(metas, nusc, idx):
function get_grid_coords (line 75) | def get_grid_coords(dims, resolution):
function draw (line 97) | def draw(
function main (line 344) | def main(local_rank, args):
FILE: vis_3d_scene.py
function pass_print (line 38) | def pass_print(*args, **kwargs):
function read_semantic_kitti (line 43) | def read_semantic_kitti(metas, idx):
function read_occ3d_label (line 66) | def read_occ3d_label(metas, nusc, idx):
function draw (line 76) | def draw(
function main (line 228) | def main(local_rank, args):
FILE: vis_pics.py
function pass_print (line 26) | def pass_print(*args, **kwargs):
function create_label_colormap (line 29) | def create_label_colormap():
function save_depth_map (line 56) | def save_depth_map(disp_resized_np, save_path):
function main (line 65) | def main(local_rank, args):
Copy disabled (too large)
Download .json
Condensed preview — 181 files, each showing path, character count, and a content snippet. Download the .json file for the full structured content (10,061K chars).
[
{
"path": ".gitignore",
"chars": 3104,
"preview": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packagi"
},
{
"path": "LICENSE",
"chars": 11357,
"preview": " Apache License\n Version 2.0, January 2004\n "
},
{
"path": "README.md",
"chars": 5811,
"preview": "# SelfOcc: Self-Supervised Vision-Based 3D Occupancy Prediction\n### [Paper](https://arxiv.org/pdf/2311.12754) | [Projec"
},
{
"path": "assets/.placeholder",
"chars": 1,
"preview": "\n"
},
{
"path": "config/_base_/dataset_v1.py",
"chars": 542,
"preview": "data_path = 'data/nuscenes/'\n\ntrain_wrapper_config = dict(\n phase='train', \n scale_rate=0.5,\n photometric_aug=d"
},
{
"path": "config/_base_/optimizer.py",
"chars": 237,
"preview": "optimizer = dict(\n optimizer=dict(\n type='AdamW',\n lr=2e-5,\n weight_decay=0.0001\n ),\n para"
},
{
"path": "config/_base_/schedule.py",
"chars": 32,
"preview": "max_epochs = 12\nprint_freq = 50\n"
},
{
"path": "config/kitti/kitti_novel_depth.py",
"chars": 7764,
"preview": "_base_ = [\n '../_base_/dataset_v1.py',\n '../_base_/optimizer.py',\n '../_base_/schedule.py',\n]\n\nimg_size = [370,"
},
{
"path": "config/kitti/kitti_occ.py",
"chars": 7929,
"preview": "_base_ = [\n '../_base_/dataset_v1.py',\n '../_base_/optimizer.py',\n '../_base_/schedule.py',\n]\n\nimg_size = [352,"
},
{
"path": "config/kitti_raw/kitti_raw_depth.py",
"chars": 8453,
"preview": "_base_ = [\n '../_base_/dataset_v1.py',\n '../_base_/optimizer.py',\n '../_base_/schedule.py',\n]\n\nimg_size = [370,"
},
{
"path": "config/nuscenes/nuscenes_depth.py",
"chars": 8548,
"preview": "_base_ = [\n '../_base_/dataset_v1.py',\n '../_base_/optimizer.py',\n '../_base_/schedule.py',\n]\n\n# img_size = [76"
},
{
"path": "config/nuscenes/nuscenes_novel_depth.py",
"chars": 7957,
"preview": "_base_ = [\n '../_base_/dataset_v1.py',\n '../_base_/optimizer.py',\n '../_base_/schedule.py',\n]\n\nimg_size = [768,"
},
{
"path": "config/nuscenes/nuscenes_occ.py",
"chars": 8619,
"preview": "_base_ = [\n '../_base_/dataset_v1.py',\n '../_base_/optimizer.py',\n '../_base_/schedule.py',\n]\n\nimg_size = [768,"
},
{
"path": "config/nuscenes/nuscenes_occ_bev.py",
"chars": 8739,
"preview": "_base_ = [\n '../_base_/dataset_v1.py',\n '../_base_/optimizer.py',\n '../_base_/schedule.py',\n]\n\nimg_size = [768,"
},
{
"path": "config/openseed/openseed_swint_lang.yaml",
"chars": 9845,
"preview": "# --------------------------------------------------------\n# X-Decoder -- Generalized Decoding for Pixel, Image, and Lan"
},
{
"path": "dataset/__init__.py",
"chars": 3732,
"preview": "from mmengine.registry import Registry\nOPENOCC_DATASET = Registry('openocc_dataset')\nOPENOCC_DATAWRAPPER = Registry('ope"
},
{
"path": "dataset/dataset_one_frame_eval.py",
"chars": 9690,
"preview": "import os, numpy as np, pickle, random\nfrom mmcv.image.io import imread\nfrom pyquaternion import Quaternion\nfrom copy im"
},
{
"path": "dataset/dataset_one_frame_sweeps_dist.py",
"chars": 22271,
"preview": "import os, numpy as np, random, mmengine, math\nfrom mmcv.image.io import imread\nfrom pyquaternion import Quaternion\nfrom"
},
{
"path": "dataset/dataset_one_frame_sweeps_dist_vis.py",
"chars": 6247,
"preview": "import os, numpy as np, random, mmengine, math\nfrom mmcv.image.io import imread\nfrom pyquaternion import Quaternion\nfrom"
},
{
"path": "dataset/dataset_wrapper_temporal.py",
"chars": 8075,
"preview": "\nimport numpy as np, torch\nfrom torch.utils import data\nfrom dataset.transform_3d import PadMultiViewImage, NormalizeMul"
},
{
"path": "dataset/dataset_wrapper_vis.py",
"chars": 2968,
"preview": "\nimport numpy as np, torch\nfrom torch.utils import data\nfrom dataset.transform_3d import PadMultiViewImage, NormalizeMul"
},
{
"path": "dataset/kitti/helpers.py",
"chars": 6934,
"preview": "import numpy as np\n# import scenerf.data.utils.fusion as fusion\n# import open3d as o3d\nfrom PIL import Image\n\n\n# def mak"
},
{
"path": "dataset/kitti/io_data.py",
"chars": 7383,
"preview": "\"\"\"\nMost of the code in this file is taken from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/data/io_data.py\n\"\"\""
},
{
"path": "dataset/kitti/kitti_dataset_eval.py",
"chars": 23468,
"preview": "import glob\nimport os\nimport time\n\nimport numpy as np\nimport torch\n# from torchvision import transforms\nfrom dataset.kit"
},
{
"path": "dataset/kitti/kitti_dataset_one_frame.py",
"chars": 18292,
"preview": "import glob, os, time, random\n\nimport numpy as np\nimport torch\n# from torchvision import transforms\nfrom dataset.kitti.h"
},
{
"path": "dataset/kitti/params.py",
"chars": 1050,
"preview": "import numpy as np\n\nsemantic_kitti_class_frequencies = np.array(\n [\n 5.41773033e09,\n 1.57835390e07,\n "
},
{
"path": "dataset/kitti/semantic-kitti.yaml",
"chars": 5593,
"preview": "# This file is covered by the LICENSE file in the root of this project.\nnbr_classes: 20\ngrid_dims: [256, 32, 256] # (W,"
},
{
"path": "dataset/kitti_raw/kitti_raw_dataset.py",
"chars": 23341,
"preview": "import os, time, random\nfrom pathlib import Path\nfrom copy import deepcopy\nfrom collections import Counter\n\nimport numpy"
},
{
"path": "dataset/kitti_raw/kitti_raw_dataset_stereo.py",
"chars": 25715,
"preview": "import os, time, random\nfrom pathlib import Path\nfrom copy import deepcopy\nfrom collections import Counter\n\nimport numpy"
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0001_sync.txt",
"chars": 16118,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0002_sync.txt",
"chars": 11435,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0005_sync.txt",
"chars": 22961,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0009_sync.txt",
"chars": 67151,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0011_sync.txt",
"chars": 34953,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0013_sync.txt",
"chars": 21649,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0014_sync.txt",
"chars": 46989,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0015_sync.txt",
"chars": 44622,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0017_sync.txt",
"chars": 16929,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0018_sync.txt",
"chars": 40221,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0019_sync.txt",
"chars": 72478,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0020_sync.txt",
"chars": 12756,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0022_sync.txt",
"chars": 120073,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0023_sync.txt",
"chars": 71295,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0027_sync.txt",
"chars": 28107,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0028_sync.txt",
"chars": 64999,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0029_sync.txt",
"chars": 65060,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0032_sync.txt",
"chars": 58646,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0035_sync.txt",
"chars": 19411,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0036_sync.txt",
"chars": 121989,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0039_sync.txt",
"chars": 59671,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0046_sync.txt",
"chars": 18586,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0048_sync.txt",
"chars": 3258,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0051_sync.txt",
"chars": 65448,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0052_sync.txt",
"chars": 11543,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0056_sync.txt",
"chars": 44489,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0057_sync.txt",
"chars": 53463,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0059_sync.txt",
"chars": 55857,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0060_sync.txt",
"chars": 11553,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0061_sync.txt",
"chars": 105707,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0064_sync.txt",
"chars": 85569,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0070_sync.txt",
"chars": 63546,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0079_sync.txt",
"chars": 14866,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0084_sync.txt",
"chars": 57460,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0086_sync.txt",
"chars": 107196,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0087_sync.txt",
"chars": 108988,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0091_sync.txt",
"chars": 50939,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0093_sync.txt",
"chars": 64924,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0095_sync.txt",
"chars": 40187,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0096_sync.txt",
"chars": 71614,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0101_sync.txt",
"chars": 142239,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0104_sync.txt",
"chars": 46664,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0106_sync.txt",
"chars": 33801,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0113_sync.txt",
"chars": 12888,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_26/2011_09_26_drive_0117_sync.txt",
"chars": 99432,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_28/2011_09_28_drive_0001_sync.txt",
"chars": 15814,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_28/2011_09_28_drive_0002_sync.txt",
"chars": 56023,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_29/2011_09_29_drive_0004_sync.txt",
"chars": 50994,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_29/2011_09_29_drive_0026_sync.txt",
"chars": 23519,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_29/2011_09_29_drive_0071_sync.txt",
"chars": 160106,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_30/2011_09_30_drive_0016_sync.txt",
"chars": 42043,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_30/2011_09_30_drive_0018_sync.txt",
"chars": 417932,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_30/2011_09_30_drive_0020_sync.txt",
"chars": 166624,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_30/2011_09_30_drive_0027_sync.txt",
"chars": 166524,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_30/2011_09_30_drive_0028_sync.txt",
"chars": 790379,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_30/2011_09_30_drive_0033_sync.txt",
"chars": 243089,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_09_30/2011_09_30_drive_0034_sync.txt",
"chars": 187450,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_10_03/2011_10_03_drive_0027_sync.txt",
"chars": 689623,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_10_03/2011_10_03_drive_0034_sync.txt",
"chars": 715366,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_10_03/2011_10_03_drive_0042_sync.txt",
"chars": 179843,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/orb-slam_poses/2011_10_03/2011_10_03_drive_0047_sync.txt",
"chars": 126027,
"preview": "1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000 0.000000000 0.000000000 0.000000000 0.000000000 "
},
{
"path": "dataset/kitti_raw/splits/eigen/test_files.txt",
"chars": 35547,
"preview": "2011_09_26/2011_09_26_drive_0002_sync 0000000069 l\n2011_09_26/2011_09_26_drive_0002_sync 0000000054 l\n2011_09_26/2011_09"
},
{
"path": "dataset/kitti_raw/splits/eigen_zhou/test_files.txt",
"chars": 35547,
"preview": "2011_09_26/2011_09_26_drive_0002_sync 0000000069 l\n2011_09_26/2011_09_26_drive_0002_sync 0000000054 l\n2011_09_26/2011_09"
},
{
"path": "dataset/kitti_raw/splits/eigen_zhou/train_files.txt",
"chars": 1762076,
"preview": "2011_09_26/2011_09_26_drive_0022_sync 473 r\n2011_09_29/2011_09_29_drive_0026_sync 1 l\n2011_09_26/2011_09_26_drive_0087_s"
},
{
"path": "dataset/kitti_raw/splits/eigen_zhou/val_files.txt",
"chars": 195800,
"preview": "2011_09_26/2011_09_26_drive_0028_sync 82 r\n2011_09_26/2011_09_26_drive_0022_sync 247 r\n2011_10_03/2011_10_03_drive_0042_"
},
{
"path": "dataset/kitti_raw/splits/tulsiani/test_files.txt",
"chars": 55028,
"preview": "2011_09_26/2011_09_26_drive_0001_sync 0000000000 l\n2011_09_26/2011_09_26_drive_0001_sync 0000000001 l\n2011_09_26/2011_09"
},
{
"path": "dataset/kitti_raw/splits/tulsiani/train_files.txt",
"chars": 610877,
"preview": "2011_09_26/2011_09_26_drive_0005_sync 0000000000 l\n2011_09_26/2011_09_26_drive_0005_sync 0000000001 l\n2011_09_26/2011_09"
},
{
"path": "dataset/kitti_raw/splits/tulsiani/val_files.txt",
"chars": 63392,
"preview": "2011_09_26/2011_09_26_drive_0048_sync 0000000000 l\n2011_09_26/2011_09_26_drive_0048_sync 0000000001 l\n2011_09_26/2011_09"
},
{
"path": "dataset/loading.py",
"chars": 10842,
"preview": "import os.path\nimport mmcv\nimport numpy as np\nfrom mmengine import BaseStorageBackend, FileClient\nfrom typing import Uni"
},
{
"path": "dataset/sampler.py",
"chars": 5887,
"preview": "import math\nfrom typing import TypeVar, Optional, Iterator\n\nimport torch\nfrom torch.utils.data import Sampler, Dataset\ni"
},
{
"path": "dataset/transform_3d.py",
"chars": 16837,
"preview": "from typing import Any\nimport numpy as np\nfrom numpy import random\nimport mmcv\n# from mmcv.parallel import DataContainer"
},
{
"path": "dataset/utils.py",
"chars": 602,
"preview": "import numpy as np\n\n\ndef get_rm(angle, axis, deg=False):\n if deg:\n angle = np.deg2rad(angle)\n rm = np.eye(3"
},
{
"path": "docs/get_started.md",
"chars": 3441,
"preview": "<!-- template from bevformer -->\n\n# Prerequisites\n\n**Please ensure you have prepared the environment and datasets.**\n\n[2"
},
{
"path": "docs/installation.md",
"chars": 2648,
"preview": "<!-- template from bevformer -->\n# Step-by-step installation instructions\n\nThe following configuration of conda environm"
},
{
"path": "docs/prepare_data.md",
"chars": 4302,
"preview": "<!-- template from bevformer -->\n\n## NuScenes\n\n**a. Download nuScenes data.**\n\nDownload nuScenes V1.0 full dataset data "
},
{
"path": "docs/visualization.md",
"chars": 1527,
"preview": "<!-- TODO -->\n\n# Visualization Guide\n\n## Install packages\n\n```\npip install pyvirtualdisplay mayavi PyQt5\n```\n\n## 3D Occu"
},
{
"path": "eval_depth.py",
"chars": 11010,
"preview": "import time, argparse, os.path as osp, os, sys\nimport torch, numpy as np\nimport torch.distributed as dist\n\nimport mmcv\nf"
},
{
"path": "eval_iou.py",
"chars": 13909,
"preview": "import time, argparse, os.path as osp, os\nimport torch, numpy as np\nimport torch.distributed as dist\nimport torch.nn.fun"
},
{
"path": "eval_iou_kitti.py",
"chars": 9251,
"preview": "import time, argparse, os.path as osp, os\nimport torch, numpy as np\nimport torch.distributed as dist\n\nimport mmcv\nfrom m"
},
{
"path": "eval_novel_depth.py",
"chars": 15410,
"preview": "import time, argparse, os.path as osp\nimport os, math, pickle\nimport torch, numpy as np\nimport torch.distributed as dist"
},
{
"path": "eval_novel_depth_kitti.py",
"chars": 12811,
"preview": "import time, argparse, os.path as osp\nimport os, math, pickle\nimport torch, numpy as np\nimport torch.distributed as dist"
},
{
"path": "examine_sweeps.py",
"chars": 4710,
"preview": "#### check nuscenes sweeps\nfrom nuscenes import NuScenes\nimport mmengine\nfrom copy import deepcopy\nimport numpy as np\n\nn"
},
{
"path": "generate_videos.py",
"chars": 4751,
"preview": "import os, os.path as osp\nimport PIL.Image as Image\nimport cv2\nimport argparse\n\ndef cat_images(dir, cam_img_size, pred_i"
},
{
"path": "loss/__init__.py",
"chars": 537,
"preview": "from mmengine.registry import Registry\nOPENOCC_LOSS = Registry('openocc_loss')\n\nfrom .multi_loss import MultiLoss\nfrom ."
},
{
"path": "loss/base_loss.py",
"chars": 1278,
"preview": "import torch.nn as nn\nfrom utils.tb_wrapper import WrappedTBWriter\nif 'selfocc' in WrappedTBWriter._instance_dict:\n w"
},
{
"path": "loss/edge_loss_3d_ms.py",
"chars": 3022,
"preview": "import torch\nfrom .base_loss import BaseLoss\nfrom . import OPENOCC_LOSS\nimport torch.nn.functional as F\n\n\ndef get_smooth"
},
{
"path": "loss/eikonal_loss.py",
"chars": 605,
"preview": "from .base_loss import BaseLoss\nfrom . import OPENOCC_LOSS\n\n\n@OPENOCC_LOSS.register_module()\nclass EikonalLoss(BaseLoss)"
},
{
"path": "loss/multi_loss.py",
"chars": 1394,
"preview": "import torch.nn as nn\nfrom . import OPENOCC_LOSS\nfrom utils.tb_wrapper import WrappedTBWriter\nif 'selfocc' in WrappedTBW"
},
{
"path": "loss/reproj_loss_mono_multi_new.py",
"chars": 13361,
"preview": "import torch.nn as nn, torch\nfrom .base_loss import BaseLoss\nfrom . import OPENOCC_LOSS\nimport numpy as np\nimport torch."
},
{
"path": "loss/reproj_loss_mono_multi_new_combine.py",
"chars": 10409,
"preview": "import torch.nn as nn, torch\nfrom .base_loss import BaseLoss\nfrom . import OPENOCC_LOSS\nimport numpy as np\nimport torch."
},
{
"path": "loss/rgb_loss_ms.py",
"chars": 6788,
"preview": "import torch.nn as nn, torch\nfrom .base_loss import BaseLoss\nfrom . import OPENOCC_LOSS\nimport torch.nn.functional as F\n"
},
{
"path": "loss/second_grad_loss.py",
"chars": 537,
"preview": "from .base_loss import BaseLoss\nfrom . import OPENOCC_LOSS\n\n\n@OPENOCC_LOSS.register_module()\nclass SecondGradLoss(BaseLo"
},
{
"path": "loss/sparsity_loss.py",
"chars": 3493,
"preview": "from .base_loss import BaseLoss\nfrom . import OPENOCC_LOSS\nimport torch, numpy as np\n\n\n@OPENOCC_LOSS.register_module()\nc"
},
{
"path": "model/__init__.py",
"chars": 134,
"preview": "from .backbone import *\nfrom .neck import *\nfrom .lifter import *\nfrom .encoder import *\nfrom .head import *\nfrom .segme"
},
{
"path": "model/backbone/__init__.py",
"chars": 63,
"preview": "from mmseg.models.backbones import *\nfrom .unet2d import UNet2D"
},
{
"path": "model/backbone/unet2d.py",
"chars": 7355,
"preview": "\"\"\"\nCode adapted from https://github.com/shariqfarooq123/AdaBins/blob/main/models/unet_adaptive_bins.py\n\"\"\"\nimport torch"
},
{
"path": "model/encoder/__init__.py",
"chars": 49,
"preview": "from .bevformer import *\nfrom .tpvformer import *"
},
{
"path": "model/encoder/base_encoder.py",
"chars": 471,
"preview": "from mmseg.registry import MODELS\nfrom mmengine.model import BaseModule\n\n\n@MODELS.register_module()\nclass BaseEncoder(Ba"
},
{
"path": "model/encoder/bevformer/__init__.py",
"chars": 179,
"preview": "from .bevformer_encoder import BEVFormerEncoder\nfrom .bevformer_pos_embed import BEVPositionalEncoding\nfrom .bevformer_e"
},
{
"path": "model/encoder/bevformer/attention/__init__.py",
"chars": 76,
"preview": "from .image_cross_attention import BEVCrossAttention, BEVDeformableAttention"
},
{
"path": "model/encoder/bevformer/attention/image_cross_attention.py",
"chars": 15352,
"preview": "\r\nimport warnings\r\nimport torch, math\r\nimport torch.nn as nn\r\nfrom mmengine.model import xavier_init, constant_init, Bas"
},
{
"path": "model/encoder/bevformer/bevformer_encoder.py",
"chars": 8174,
"preview": "from mmseg.registry import MODELS\nfrom mmcv.cnn.bricks.transformer import build_positional_encoding, build_transformer_l"
},
{
"path": "model/encoder/bevformer/bevformer_encoder_layer.py",
"chars": 9243,
"preview": "\r\nimport copy, torch\r\nfrom mmengine import ConfigDict\r\nfrom mmengine.model import BaseModule, ModuleList\r\nfrom mmengine."
},
{
"path": "model/encoder/bevformer/bevformer_pos_embed.py",
"chars": 2463,
"preview": "import torch, torch.nn as nn\nfrom mmengine.model import BaseModule\nfrom mmengine.registry import MODELS\n\n\n@MODELS.regist"
},
{
"path": "model/encoder/bevformer/mappings.py",
"chars": 11591,
"preview": "import torch\nfrom typing_extensions import Literal\n\nclass LinearMapping:\n\n def __init__(\n self,\n "
},
{
"path": "model/encoder/bevformer/mappings_old.py",
"chars": 4864,
"preview": "import torch\nfrom typing_extensions import Literal\n\n\nclass GridMeterMapping:\n\n def __init__(\n self,\n be"
},
{
"path": "model/encoder/bevformer/utils.py",
"chars": 10131,
"preview": "import torch\nimport numpy as np\n\n\ndef get_cross_view_ref_points(tpv_h, tpv_w, tpv_z, num_points_in_pillar):\n # ref po"
},
{
"path": "model/encoder/tpvformer/__init__.py",
"chars": 202,
"preview": "from .tpvformer_encoder import TPVFormerEncoder\nfrom .tpvformer_pos_embed import TPVPositionalEncoding\nfrom .tpvformer_e"
},
{
"path": "model/encoder/tpvformer/attention/__init__.py",
"chars": 118,
"preview": "from .image_cross_attention import TPVCrossAttention\nfrom .cross_view_hybrid_attention import CrossViewHybridAttention"
},
{
"path": "model/encoder/tpvformer/attention/cross_view_hybrid_attention.py",
"chars": 5701,
"preview": "from typing import Optional, no_type_check\nfrom mmcv.ops.multi_scale_deform_attn import \\\n MultiScaleDeformableAttent"
},
{
"path": "model/encoder/tpvformer/attention/image_cross_attention.py",
"chars": 3234,
"preview": "\r\nfrom mmengine.model import BaseModule\r\nfrom mmengine.registry import MODELS\r\nfrom mmcv.cnn.bricks.transformer import b"
},
{
"path": "model/encoder/tpvformer/modules/__init__.py",
"chars": 116,
"preview": "from .split_fpn import MultiPlaneFFN\nfrom .split_norm import MultiPlaneNorm\nfrom .camera_se_net import CameraAwareSE"
},
{
"path": "model/encoder/tpvformer/modules/camera_se_net.py",
"chars": 4596,
"preview": "import torch, torch.nn as nn\nimport numpy as np\n\nclass Mlp(nn.Module):\n\n def __init__(self,\n in_featu"
},
{
"path": "model/encoder/tpvformer/modules/split_fpn.py",
"chars": 1198,
"preview": "import torch, torch.nn as nn\nfrom mmengine.model import BaseModule\nfrom mmengine.registry import MODELS\nfrom mmcv.cnn.br"
},
{
"path": "model/encoder/tpvformer/modules/split_norm.py",
"chars": 844,
"preview": "from mmengine.registry import MODELS\nfrom mmengine.model import BaseModule\nfrom mmcv.cnn import build_norm_layer\nimport "
},
{
"path": "model/encoder/tpvformer/tpvformer_encoder.py",
"chars": 11750,
"preview": "from mmseg.registry import MODELS\nfrom mmcv.cnn.bricks.transformer import build_positional_encoding, build_transformer_l"
},
{
"path": "model/encoder/tpvformer/tpvformer_encoder_layer.py",
"chars": 9859,
"preview": "\r\nimport copy, torch\r\nfrom mmengine import ConfigDict\r\nfrom mmengine.model import BaseModule, ModuleList\r\nfrom mmengine."
},
{
"path": "model/encoder/tpvformer/tpvformer_pos_embed.py",
"chars": 3554,
"preview": "import torch, torch.nn as nn\nfrom mmengine.model import BaseModule\nfrom mmengine.registry import MODELS\n\n\ndef get_feat_f"
},
{
"path": "model/encoder/tpvformer/utils.py",
"chars": 4298,
"preview": "import torch\nimport numpy as np\n\n\ndef get_cross_view_ref_points(tpv_h, tpv_w, tpv_z, num_points_in_pillar, offset=0):\n "
},
{
"path": "model/head/__init__.py",
"chars": 57,
"preview": "import nerfacc\nfrom .neus_head.neus_head import NeuSHead\n"
},
{
"path": "model/head/base_head.py",
"chars": 503,
"preview": "from mmengine.model import BaseModule\nfrom mmseg.models import HEADS\n\n@HEADS.register_module()\nclass BaseTaskHead(BaseMo"
},
{
"path": "model/head/nerfacc_head/bev_nerf.py",
"chars": 6512,
"preview": "import torch, torch.nn as nn\nfrom mmengine.model import BaseModule\nfrom ...encoder.bevformer.mappings import GridMeterMa"
},
{
"path": "model/head/nerfacc_head/estimator.py",
"chars": 6467,
"preview": "from nerfacc import OccGridEstimator\nimport torch\nfrom torch import Tensor\nfrom typing import Callable, Optional, Tuple\n"
},
{
"path": "model/head/nerfacc_head/img2lidar.py",
"chars": 2542,
"preview": "import torch, torch.nn as nn\nimport numpy as np, os\nfrom dataset.utils import get_rm\n\n\nclass Img2LiDAR(nn.Module):\n\n "
},
{
"path": "model/head/nerfacc_head/nerfacc_head.py",
"chars": 18413,
"preview": "import os\nfrom ..base_head import BaseTaskHead\nfrom .ray_sampler import RaySampler\nfrom .img2lidar import Img2LiDAR\nfrom"
},
{
"path": "model/head/nerfacc_head/ray_sampler.py",
"chars": 3212,
"preview": "import torch, torch.nn as nn\nimport numpy as np\n\n\nclass RaySampler(nn.Module):\n\n def __init__(\n self,\n "
},
{
"path": "model/head/nerfacc_head/rendering.py",
"chars": 6751,
"preview": "\"\"\"\nModified from NeRFAcc.\nCopyright (c) 2022 Ruilong Li, UC Berkeley.\n\"\"\"\n\nfrom typing import Callable, Dict, Optional,"
},
{
"path": "model/head/neus_head/neus_head.py",
"chars": 31708,
"preview": "import torch, torch.distributed as dist, torch.nn as nn, os\nfrom nerfstudio.models.neus_custom import NeuSCustomModelCon"
},
{
"path": "model/head/utils/sh_render.py",
"chars": 3199,
"preview": "import torch\n\n################## sh function ##################\nC0 = 0.28209479177387814\nC1 = 0.4886025119029199\nC2 = [\n"
},
{
"path": "model/lifter/__init__.py",
"chars": 135,
"preview": "from .bev_query_lifter import BEVQueryLifter\nfrom .tpv_query_lifter import TPVQueryLifter\nfrom .tpv_pos_lifter import TP"
},
{
"path": "model/lifter/base_lifter.py",
"chars": 522,
"preview": "from mmseg.registry import MODELS\nfrom mmengine.model import BaseModule\n\n\n@MODELS.register_module()\nclass BaseLifter(Bas"
},
{
"path": "model/lifter/bev_query_lifter.py",
"chars": 682,
"preview": "from .base_lifter import BaseLifter\nfrom mmseg.registry import MODELS\nimport torch, torch.nn as nn\n\n\n@MODELS.register_mo"
},
{
"path": "model/lifter/tpv_pos_lifter.py",
"chars": 3789,
"preview": "from .base_lifter import BaseLifter\nfrom mmseg.registry import MODELS\nimport torch, torch.nn as nn\nfrom model.encoder.be"
},
{
"path": "model/lifter/tpv_query_lifter.py",
"chars": 1176,
"preview": "from .base_lifter import BaseLifter\nfrom mmseg.registry import MODELS\nimport torch, torch.nn as nn\n\n\n@MODELS.register_mo"
},
{
"path": "model/neck/__init__.py",
"chars": 115,
"preview": "from mmseg.models.necks import *\nfrom mmdet3d.models.necks import SECONDFPN\nfrom .identity_neck import IdentityNeck"
},
{
"path": "model/neck/identity_neck.py",
"chars": 571,
"preview": "from mmseg.registry import MODELS\nfrom mmengine.model import BaseModule\n\n\n@MODELS.register_module()\nclass IdentityNeck(B"
},
{
"path": "model/segmentor/__init__.py",
"chars": 40,
"preview": "from .tpv_segmentor import TPVSegmentor\n"
},
{
"path": "model/segmentor/base_segmentor.py",
"chars": 1672,
"preview": "from mmseg.models import SEGMENTORS, builder\nfrom mmdet3d.registry import MODELS\nfrom mmengine.model import BaseModule\n\n"
},
{
"path": "model/segmentor/tpv_segmentor.py",
"chars": 4560,
"preview": "import torch, torch.nn as nn\nimport numpy as np\nfrom mmseg.models import SEGMENTORS\nfrom mmseg.models import build_backb"
},
{
"path": "train.py",
"chars": 16480,
"preview": "import time, argparse, os.path as osp, os\nimport torch, numpy as np\nimport torch.distributed as dist\nfrom copy import de"
},
{
"path": "utils/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "utils/config_tools.py",
"chars": 3535,
"preview": "NUM_RAYS = {\n # 'nuscenes': [96, 200],\n # 'nuscenes': [384, 800],\n 'nuscenes': [450, 800],\n 'kitti': [176, 6"
},
{
"path": "utils/feat_tools.py",
"chars": 1273,
"preview": "import torch\nimport torch.nn.functional as F\n\ndef multi2single_scale(\n ms_feats, imgs=None, \n max_shape=No"
},
{
"path": "utils/metric_util.py",
"chars": 16923,
"preview": "\nimport numpy as np\n# from mmseg.utils import get_root_logger\nfrom mmengine import MMLogger\nlogger = MMLogger.get_instan"
},
{
"path": "utils/misc.py",
"chars": 4917,
"preview": "import os.path as osp\nimport torch, numpy as np\nimport torch.nn.functional as F\nfrom mmengine import Config\nfrom mmengin"
},
{
"path": "utils/openseed_utils.py",
"chars": 2520,
"preview": "import os, numpy as np\n\nfrom openseed.BaseModel import BaseModel\nfrom openseed import build_model\nfrom openseed.utils.ar"
},
{
"path": "utils/scenerf_metric.py",
"chars": 8091,
"preview": "\"\"\"\nPart of the code is taken from https://github.com/waterljwant/SSC/blob/master/sscMetrics.py\n\"\"\"\nimport numpy as np\ni"
},
{
"path": "utils/tb_wrapper.py",
"chars": 273,
"preview": "from torch.utils.tensorboard import SummaryWriter\nfrom mmengine.utils import ManagerMixin\n\nclass WrappedTBWriter(Summary"
},
{
"path": "utils/temporal_pkl.py",
"chars": 1956,
"preview": "import pickle\nimport argparse\n\ndef arange_according_to_scene(infos, nusc):\n scenes = dict()\n\n for i, info in enume"
},
{
"path": "vis_3d.py",
"chars": 28116,
"preview": "import os\noffscreen = False\nif os.environ.get('DISP', 'f') == 'f':\n from pyvirtualdisplay import Display\n display "
},
{
"path": "vis_3d_scene.py",
"chars": 25768,
"preview": "import os\noffscreen = False\nif os.environ.get('DISP', 'f') == 'f':\n from pyvirtualdisplay import Display\n display "
},
{
"path": "vis_pics.py",
"chars": 13295,
"preview": "import os\nimport time, argparse, math, os.path as osp, numpy as np\nimport torch\nimport torch.distributed as dist\nimport "
}
]
About this extraction
This page contains the full source code of the huang-yh/SelfOcc GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 181 files (9.5 MB), approximately 2.5M tokens, and a symbol index with 448 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — a free GitHub-repo-to-text converter for AI. Built by Nikandr Surkov.