Repository: CyberAgentAILab/SuperNormal
Branch: main
Commit: 09e26150f7e8
Files: 135
Total size: 479.3 KB

Directory structure:
gitextract_10qul1w_/

├── .gitignore
├── LICENSE
├── README.md
├── __init__.py
├── config/
│   ├── diligent.conf
│   └── own_objects.conf
├── create_env.sh
├── data_capture_and_preprocessing/
│   ├── README.md
│   ├── gather_and_convert_normal_map.py
│   ├── iPhone_mvps_data_preprocessing.py
│   ├── metashape2neus.py
│   ├── metashape2neus2_json_and_images.py
│   └── sam_mvps.py
├── download_data.sh
├── exp_runner.py
├── models/
│   ├── cd_and_fscore.py
│   ├── dataset_loader.py
│   ├── fields.py
│   └── renderer.py
├── run_diligent.sh
├── run_own_object.sh
├── third_parties/
│   └── nerfacc-0.3.5/
│       └── nerfacc-0.3.5/
│           ├── .github/
│           │   └── workflows/
│           │       ├── building.yml
│           │       ├── code_checks.yml
│           │       ├── cuda/
│           │       │   ├── cu101-Linux-env.sh
│           │       │   ├── cu101-Linux.sh
│           │       │   ├── cu101-Windows-env.sh
│           │       │   ├── cu101-Windows.sh
│           │       │   ├── cu102-Linux-env.sh
│           │       │   ├── cu102-Linux.sh
│           │       │   ├── cu102-Windows-env.sh
│           │       │   ├── cu102-Windows.sh
│           │       │   ├── cu111-Linux-env.sh
│           │       │   ├── cu111-Linux.sh
│           │       │   ├── cu111-Windows-env.sh
│           │       │   ├── cu111-Windows.sh
│           │       │   ├── cu113-Linux-env.sh
│           │       │   ├── cu113-Linux.sh
│           │       │   ├── cu113-Windows-env.sh
│           │       │   ├── cu113-Windows.sh
│           │       │   ├── cu115-Linux-env.sh
│           │       │   ├── cu115-Linux.sh
│           │       │   ├── cu115-Windows-env.sh
│           │       │   ├── cu115-Windows.sh
│           │       │   ├── cu116-Linux-env.sh
│           │       │   ├── cu116-Linux.sh
│           │       │   ├── cu116-Windows-env.sh
│           │       │   ├── cu116-Windows.sh
│           │       │   ├── cu117-Linux-env.sh
│           │       │   ├── cu117-Linux.sh
│           │       │   ├── cu117-Windows-env.sh
│           │       │   └── cu117-Windows.sh
│           │       └── publish.yml
│           ├── .gitignore
│           ├── .gitmodules
│           ├── .pre-commit-config.yaml
│           ├── .readthedocs.yaml
│           ├── CMakeLists.txt
│           ├── LICENSE
│           ├── MANIFEST.in
│           ├── README.md
│           ├── docs/
│           │   ├── Makefile
│           │   ├── requirements.txt
│           │   └── source/
│           │       ├── _static/
│           │       │   └── css/
│           │       │       └── readthedocs.css
│           │       ├── apis/
│           │       │   ├── generated/
│           │       │   │   ├── nerfacc.accumulate_along_rays.rst
│           │       │   │   ├── nerfacc.pack_data.rst
│           │       │   │   ├── nerfacc.ray_aabb_intersect.rst
│           │       │   │   ├── nerfacc.ray_resampling.rst
│           │       │   │   ├── nerfacc.render_transmittance_from_alpha.rst
│           │       │   │   ├── nerfacc.render_transmittance_from_density.rst
│           │       │   │   ├── nerfacc.render_visibility.rst
│           │       │   │   ├── nerfacc.render_weight_from_alpha.rst
│           │       │   │   ├── nerfacc.render_weight_from_density.rst
│           │       │   │   ├── nerfacc.unpack_data.rst
│           │       │   │   └── nerfacc.unpack_info.rst
│           │       │   ├── grid.rst
│           │       │   ├── rendering.rst
│           │       │   └── utils.rst
│           │       ├── conf.py
│           │       ├── examples/
│           │       │   ├── dnerf.rst
│           │       │   ├── ngp.rst
│           │       │   ├── unbounded.rst
│           │       │   └── vanilla.rst
│           │       └── index.rst
│           ├── examples/
│           │   ├── datasets/
│           │   │   ├── __init__.py
│           │   │   ├── dnerf_synthetic.py
│           │   │   ├── nerf_360_v2.py
│           │   │   ├── nerf_synthetic.py
│           │   │   └── utils.py
│           │   ├── radiance_fields/
│           │   │   ├── __init__.py
│           │   │   ├── mlp.py
│           │   │   └── ngp.py
│           │   ├── requirements.txt
│           │   ├── train_mlp_dnerf.py
│           │   ├── train_mlp_nerf.py
│           │   ├── train_ngp_nerf.py
│           │   └── utils.py
│           ├── nerfacc/
│           │   ├── __init__.py
│           │   ├── cdf.py
│           │   ├── contraction.py
│           │   ├── cuda/
│           │   │   ├── __init__.py
│           │   │   ├── _backend.py
│           │   │   └── csrc/
│           │   │       ├── cdf.cu
│           │   │       ├── contraction.cu
│           │   │       ├── include/
│           │   │       │   ├── helpers_contraction.h
│           │   │       │   ├── helpers_cuda.h
│           │   │       │   └── helpers_math.h
│           │   │       ├── intersection.cu
│           │   │       ├── pack.cu
│           │   │       ├── pybind.cu
│           │   │       ├── ray_marching.cu
│           │   │       ├── render_transmittance.cu
│           │   │       ├── render_transmittance_cub.cu
│           │   │       └── render_weight.cu
│           │   ├── grid.py
│           │   ├── intersection.py
│           │   ├── losses.py
│           │   ├── pack.py
│           │   ├── ray_marching.py
│           │   ├── sampling.py
│           │   ├── version.py
│           │   └── vol_rendering.py
│           ├── scripts/
│           │   ├── run_aws_listing.py
│           │   ├── run_dev_checks.py
│           │   └── run_profiler.py
│           ├── setup.cfg
│           ├── setup.py
│           └── tests/
│               ├── test_contraction.py
│               ├── test_grid.py
│               ├── test_intersection.py
│               ├── test_loss.py
│               ├── test_pack.py
│               ├── test_ray_marching.py
│               ├── test_rendering.py
│               └── test_resampling.py
└── utilities/
    └── utils.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
data/
exp/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# poetry
#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
#   This is especially recommended for binary packages to ensure reproducibility, and is more
#   commonly ignored for libraries.
#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
#   in version control.
#   https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
#  and can be added to the global gitignore or merged into this file.  For a more nuclear
#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

================================================
FILE: LICENSE
================================================
MIT License

Copyright (c) 2024 CyberAgent AI Lab

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: README.md
================================================
<h2 align="center">SuperNormal: Neural Surface Reconstruction via Multi-View Normal Integration</h2>
<h4 align="center">
    <a href="https://xucao-42.github.io/homepage/"><strong>Xu Cao</strong></a>
    ·
    <a href="https://taketomitakafumi.sakura.ne.jp/web/en/"><strong>Takafumi Taketomi</strong></a>
<br>
CyberAgent </h4>
<h4 align="center"><a href="https://cvpr.thecvf.com/">CVPR 2024 </a></h4>
<p align="center">
  <br>
    <a href="https://arxiv.org/abs/2312.04803">
      <img src='https://img.shields.io/badge/arXiv-Paper-981E32?style=for-the-badge&Color=B31B1B' alt='arXiv PDF'>
    </a>

[//]: # (    <a href='https://xucao-42.github.io/mvas_homepage/'>)

[//]: # (      <img src='https://img.shields.io/badge/MVAS-Project Page-5468FF?style=for-the-badge' alt='Project Page'></a>)
</p>


### Update
- **2024/09/30**: Real-world raw data and step-by-step data pre-processing instructions are available. See [here](./data_capture_and_preprocessing/README.md).

<div align="center">
<img src="./media/teaser.png" alt="Teaser" width="100%">
Fast and fine-grained 3D reconstruction from multi-view surface normal maps. 
</div>

### Quick Start
Code was tested on Ubuntu 18.04 (WSL2) using Python 3.8, PyTorch 2.1.0, and CUDA 11.8 on an Nvidia RTX4090Ti (24GB). 

**Before getting started, please ensure CUDA is installed in your environment ([11.8 can be found here](https://developer.nvidia.com/cuda-11-8-0-download-archive?target_os=Linux&target_arch=x86_64&Distribution=WSL-Ubuntu&target_version=2.0&target_type=deb_local)).**
It is required by [tiny-cuda-nn](https://github.com/NVlabs/tiny-cuda-nn).

<details><summary> You should see something like the following after typing `nvcc --version` </summary>

```commandline
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0
```
</details>

Clone the repository and prepare the conda environment:
```commandline
git clone https://github.com/CyberAgentAILab/SuperNormal.git
cd SuperNormal
. ./create_env.sh
```

Download data (~1.8GB):
```commandline
./download_data.sh
```

Run on the DiLiGenT-MV benchmark objects or on our captured objects:
```commandline
./run_diligent.sh  # Training should take about 50 seconds per object
```
or 
```commandline
./run_own_object.sh  # Training should take about 5 minutes per object
```
Results are saved under `./exp`.

NOTE: If a RuntimeError like the one below occurs, running `apt install ninja-build` may resolve it.
```
RuntimeError: Ninja is required to load C++ extensions
```

### Hyperparameter tuning tips
Training hyperparameters are defined in `./config/*.conf`.
Some important hyperparameters are:
- `dataset.normal_dir`: You can choose normal maps estimated by different methods as input for DiLiGenT-MV benchmark objects.
- `train.end_iter`: The number of iterations for training. Should be adjusted according to the number of views and normal map resolutions.
- `train.increase_bindwidth_every`: A strategy used in [Neuralangelo](https://research.nvidia.com/labs/dir/neuralangelo/) to progressively activate finer hash grid levels during training. A value smaller than `end_iter`/`model.encoding.n_levels` should be fine.
- `train.batch_size`: Number of patches in each batch for training. Should be adjusted according to the GPU memory.
- `train.patch_size`: Better to be fixed to 3, i.e., each patch is 3x3. A larger patch size will cause inaccurate volume rendering results for boundary pixels in a patch.

### Modifications to NerfAcc
We add several functions to the original [NerfAcc](https://www.nerfacc.com) to adapt it to patch-based volume rendering.
The key new functions (which are indicated by `patch_based` in function name) are in 
[third_parties/nerfacc-0.3.5/nerfacc-0.3.5/nerfacc/cuda/csrc/render_weight.cu/](./third_parties/nerfacc-0.3.5/nerfacc-0.3.5/nerfacc/cuda/csrc/render_weight.cu) 
and [third_parties/nerfacc-0.3.5/nerfacc-0.3.5/nerfacc/vol_rendering.py](./third_parties/nerfacc-0.3.5/nerfacc-0.3.5/nerfacc/vol_rendering.py).


### Acknowledgement
This repo is built upon [NeuS](https://github.com/Totoro97/NeuS) and benefits from the amazing [tiny-cuda-nn](https://github.com/NVlabs/tiny-cuda-nn) and [NerfAcc](https://www.nerfacc.com).
We also learned a lot from [instant-nsr-pl](https://github.com/bennyguo/instant-nsr-pl).

### Citation
If you find our work useful in your research, please consider citing:
```bibtex
@inproceedings{supernormal2024cao,
  title={SuperNormal: {N}eural Surface Reconstruction via Multi-View Normal Integration},
  author={Cao Xu and Taketomi Takafumi},
  booktitle={CVPR},
  year={2024}
}
```


================================================
FILE: __init__.py
================================================


================================================
FILE: config/diligent.conf
================================================
general {
    dataset_class = models.dataset_loader.Dataset
    renderer_class = models.renderer.NeuSRenderer

    base_exp_dir = ./exp/diligent_mv/CASE_NAME
    recording = [
        ./,
        ./models
    ]
}

dataset {
    data_dir = data/diligent_mv_normals/CASE_NAME/
    normal_dir = normal_world_space_sdmunips # choose normal maps estimated by different methods, should be in the world space
    cameras_name = cameras_sphere.npz
    exclude_views = [0, 4, 8, 12, 16]  # index of views to exclude for test purpose, 0-based
    upsample_factor = 1
}

train {
    learning_rate = 5e-4
    learning_rate_alpha = 0.05
    end_iter = 5000
    increase_bindwidth_every = 350  # following neuralangelo's strategy

    gradient_method = dfd  # dfd or fd or ad, for directional finite difference, finite difference, and auto-differentiation

    batch_size = 2048
    patch_size = 3  # i.e., each training step samples 2048 patches of 3x3 pixels

    warm_up_end = 50
    use_white_bkgd = False

    loss_type = l2  # for normal loss
    normal_weight = 1
    eikonal_weight = 1
    mask_weight = 1
}

val {
    save_freq = 1000

    val_normal_freq = 5001
    val_normal_resolution_level = 1
    gradient_method = dfd  # dfd or fd or ad, can be different from training

    val_mesh_freq = 10000
    val_mesh_res = 512

    report_freq = 100
    eval_metric_freq = 5000
}

model {
    sdf_network {
        d_out = 1
        d_in = 3
        d_hidden = 64
        n_layers = 1
        skip_in = [-1]  # -1 for no skip connection
        bias = 0.6
        geometric_init = True
        weight_norm = True
        input_concat = True  # concat input positions and encoded features
    }

    variance_network {
        init_val = 0.5
    }

    ray_marching {
        start_step_size = 1e-2
        end_step_size = 1e-3
        occ_threshold = 0.1
        occ_sigmoid_k = 80.0
        occ_resolution = 128
        occ_update_freq = 8  # batches
    }

    encoding{
        otype=HashGrid,
		n_levels=14
		n_features_per_level=2
		log2_hashmap_size=19
		base_resolution=32
		per_level_scale=1.3195079107728942
   }
}

================================================
FILE: config/own_objects.conf
================================================
general {
    dataset_class = models.dataset_loader.Dataset
    renderer_class = models.renderer.NeuSRenderer

    base_exp_dir = ./exp/own_objects/CASE_NAME
    recording = [
        ./,
        ./models
    ]
}

dataset {
    data_dir = data/own_objects_normals/CASE_NAME/
    normal_dir = normal_world_space_sdmunips
    cameras_name = cameras_sphere.npz
    exclude_views = []  # index of views to exclude, 0-based
    upsample_factor = 1
}

train {
    learning_rate = 5e-4
    learning_rate_alpha = 0.05
    end_iter = 30000
    increase_bindwidth_every = 2000  # following neuralangelo's strategy

    gradient_method = dfd  # dfd or fd or ad, for directional finite difference, finite difference, and auto-differentiation

    batch_size = 2048
    patch_size = 3  # i.e., each training step samples 2048 patches of 3x3 pixels

    warm_up_end = 500
    use_white_bkgd = False

    loss_type = l2  # for normal loss
    normal_weight = 1
    eikonal_weight = 1
    mask_weight = 1
}

val {
    save_freq = 10000

    val_normal_freq = 30000
    val_normal_resolution_level = 2
    gradient_method = dfd  # dfd or fd or ad, can be different from training

    val_mesh_freq = 30000
    val_mesh_res = 1024

    report_freq = 100
    eval_metric_freq = 30000
}

model {
    sdf_network {
        d_out = 1
        d_in = 3
        d_hidden = 64
        n_layers = 1
        skip_in = [-1]
        bias = 0.8
        geometric_init = True
        weight_norm = True
        input_concat = True  # concat input positions and encoded features
    }

    variance_network {
        init_val = 0.5
    }

    ray_marching
    {
        start_step_size = 1e-2
        end_step_size = 1e-3
        occ_threshold = 0.1
        occ_sigmoid_k = 80.0
        occ_resolution = 128
        occ_update_freq = 8  # batches
    }


    encoding{
        otype=HashGrid,
		n_levels=14
		n_features_per_level=2
		log2_hashmap_size=19
		base_resolution=32
		per_level_scale=1.3195079107728942
    }
}

================================================
FILE: create_env.sh
================================================
# Create a fresh conda environment named `sn` and install all dependencies.
# The README runs this script by sourcing it (`. ./create_env.sh`).
# NOTE(review): sourcing is presumably needed so that `conda activate` affects the calling shell - confirm.

# Leave any active environment, then delete an existing `sn` environment so the install starts clean.
conda deactivate
conda remove -y -n sn --all
conda create -y -n sn python=3.8
conda activate sn

# PyTorch wheels built against CUDA 11.8 (cu118 index).
pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu118

# install tiny-cuda-nn (pinned to a fixed commit); its PyTorch bindings are built from source,
# so the CUDA toolkit under /usr/local/cuda is put on PATH / LIBRARY_PATH first.
export PATH="/usr/local/cuda/bin:$PATH"
export LIBRARY_PATH="/usr/local/cuda/lib64/stubs:$LIBRARY_PATH"
pip install git+https://github.com/NVlabs/tiny-cuda-nn/@2ec562e853e6f482b5d09168705205f46358fb39#subdirectory=bindings/torch

# Install the bundled NerfAcc 0.3.5 copy from third_parties in editable mode, then the remaining pinned Python packages.
pip install -e ./third_parties/nerfacc-0.3.5/nerfacc-0.3.5/
pip install opencv-python==4.8.1.78 trimesh==3.23.5 open3d==0.17 pyvista==0.42.3 scipy==1.10.1 scikit-image==0.21.0 pyhocon==0.3.59 pyexr==0.3.10 tensorboard==2.14.0 icecream==2.1.3 PyMCubes==0.1.4 pyembree==0.2.11

================================================
FILE: data_capture_and_preprocessing/README.md
================================================

This is a step-by-step guide to preprocess the raw images captured by an iPhone for the MVPS task.
You can download our raw images using the following command *(~6 GB per object)*.

```
gdown 'https://drive.google.com/file/d/1BcCuZR0C-snmCNf8iGhkFgkQ6arfcQ-L/view?usp=sharing' --fuzzy
unzip flower_girl.zip
rm flower_girl.zip

gdown 'https://drive.google.com/file/d/12QzgRbOjBSx295BS4zihnOjcdYh7ZaP9/view?usp=sharing' --fuzzy
unzip lion.zip
rm lion.zip

gdown 'https://drive.google.com/file/d/1cvKbI5VvDhsuA4a06rYqqoAtQd8GtyeI/view?usp=sharing'  --fuzzy
unzip dog.zip
rm dog.zip
```

## File structure
You should have the following file structure under each object's folder:
```
 - RAW
 - mask
 - cameras.xml
```

The `RAW` folder contains all the DNG images captured by an iPhone. 
The `mask` folder contains the foreground masks for each view.
The `cameras.xml` contains the calibrated camera parameters using [Metashape](https://oakcorp.net/agisoft/download/). 

## Step-by-step data pre-processing
First we convert the DNG images to PNG file format.
```
# pip install rawpy
python iPhone_mvps_data_preprocessing.py --data_dir <path/to/obj_folder>
```
Now the file structure looks like this
```
    - RAW
    - mvps_png_full
    - sfm_png_full
    - mask
    - cameras.xml
```
The `mvps_png_full` folder contains the pre-processed images for photometric stereo, and the `sfm_png_full` folder contains the images for camera calibration using Structure from Motion.
In each view, we first take an image in ambient light and then additionally illuminate the object with an active light source.
So the first image in each view is collected in `sfm_png_full`.

### Mask preparation
Now we prepare the foreground masks for each view.
We used SAM to interactively segment the foreground objects.
Please install SAM according to the [official instructions](https://github.com/facebookresearch/segment-anything).
After installation, run the following command to segment the foreground objects for all views:

```
python sam_mvps.py --data_dir <path/to/obj_folder/mvps_png_full> --checkpoint <path/to/sam_vit_h_4b8939.pth>
```
This will pop up a window where you can interactively segment the foreground objects.
Select points on the object to segment the foreground object, and press `Esc` to check the intermediate results.
Continue to select points until you are satisfied with the segmentation results, and press `Enter` to save the mask.
The process will be repeated for all views.

The same mask will be saved in two places: `obj_folder/mask` and the corresponding folder containing the image from the same viewpoint. 
The latter will be used for normal map estimation.

### Camera calibration
In [Metashape](https://oakcorp.net/agisoft/download/), import the images in the `sfm_png_full` folder and run the camera calibration process.
```
[Workflow] -> [Add Folder] -> select `sfm_png_full` -> select single cameras -> [Workflow] -> [Align Photos]
```

After camera calibration, export the camera parameters to `cameras.xml`.
```
[File] -> [Export] -> [Export Cameras]
```

The resulting `cameras.xml` file is what we have put in the object folder.


### Normal map estimation
Install [SDM-UniPS](https://github.com/satoshi-ikehata/SDM-UniPS-CVPR2023) and run the following command to generate the normal maps for each view:
```
python <path/to/sdm_unips/main.py> --session_name YOUR_SESSION_NAME --test_dir <path/to/obj_folder/mvps_png_full> --checkpoint <path/to/sdm_unips_checkpoint_dir> --scalable --target normal
```
Tips: Prepare the mask for each view to improve the normal estimation results. This should be done when you have completed the previous mask segmentation step.

The original SDM-UniPS code outputs normal maps in the PNG format. You can instead get EXR format by replacing [this line](https://github.com/satoshi-ikehata/SDM-UniPS-CVPR2023/blob/96e68f353173c2ae85bfe609e4728a19a2f8c92e/sdm_unips/modules/builder/builder.py#L162) with the following one:
```
pyexr.write(f'{testdata.data.data_workspace}/normal.exr', nout)
```
Remember to install the [pyexr](https://github.com/tvogels/pyexr) package and import it in the file.
After normal estimation, we collect the normal maps in the same folder.
Since SDM-UniPS estimates normal maps in camera space, we also convert them to the world space using the camera parameters from the previous step.

```
python gather_and_convert_normal_map.py --data_dir <path/to/obj_folder> --sdm_unips_result_dir <path/to/YOUR_SESSION_NAME/results>
```
The file structure is now as follows:
```
    - RAW
    - mvps_png_full
    - sfm_png_full
    - mask
    - normal_camera_space_sdmunips
    - normal_world_space_sdmunips
    - cameras.xml
    - results # if your SDM-UniPS output is in this folder
```

### Convert camera parameters to NeuS format
The last step is to convert the camera parameters to the NeuS format.
```
python metashape2neus.py --xml_path <path/to/obj_folder/cameras.xml>
```
This will create a `cameras_sphere.npz` file in the same folder as `cameras.xml`.
We also provide the converter to NeuS2 format. Check `metashape2neus2_json_and_images.py` for more details.

## Tips for capturing your own data
We used the iPhone's built-in camera app to take the images. Here are some tips for successful reconstruction:
- Use a tripod to stabilize the camera.
- Use a remote shutter release to avoid camera shake.
- Keep the same focus point in each view. On iPhone, you can press and hold the screen to lock the focus point.
- Use a white/black background to simplify the segmentation process.
- Use a turntable to capture the object from different angles. 
- Place the object on a textured surface to help the Structure from Motion process.
- Place the object in the center of the image.
- We used a [video light](https://www.ulanzi.com/collections/lighting/products/mini-led-video-light-ulanzi-vl49-1672) to illuminate the object from different angles in each view. Other light sources like a ring light/flashlight may also work.
- In each view, vary the light source's position sufficiently around the camera. We used 12 different light positions in our setup. 
- Reduce the exposure if the captured images are overexposed.

The above capture process can be done with off-the-shelf equipment, but it is tedious. 
It would be more convenient if you could build a custom rig to automate the capture process, such as [this example](https://youtu.be/zyEw-1QUlkU?si=8RvYC23emoP8TXrU).

================================================
FILE: data_capture_and_preprocessing/gather_and_convert_normal_map.py
================================================
import os
import cv2
import pyexr
from glob import glob
import numpy as np
import shutil
from bs4 import BeautifulSoup  # $ pip install beautifulsoup4 lxml
import argparse

# Gather the per-view normal maps estimated by SDM-UniPS into one folder and
# rotate them from camera space into world space using the Metashape poses.
#
# Inputs:
#   --sdm_unips_result_dir : folder holding `view_XX.data/normal.exr` results
#   --data_dir             : object folder holding `cameras.xml`
# Outputs (created under --data_dir):
#   normal_camera_space_sdmunips/XX.exr : copies of the estimated normal maps
#   normal_world_space_sdmunips/XX.exr  : the same maps rotated to world space
parser = argparse.ArgumentParser()
parser.add_argument("--sdm_unips_result_dir", type=str, default="../../SDM-UniPS-CVPR2023/flower_girl/results")
parser.add_argument("--data_dir", type=str, default="./flower_girl")
args = parser.parse_args()

xml_path = os.path.join(args.data_dir, "cameras.xml")
# Views are expected to be numbered 0..num_views-1 (view_00.data, view_01.data, ...).
num_views = len(glob(os.path.join(args.sdm_unips_result_dir, "view_*.data")))

normal_map_camera_dir = os.path.join(args.data_dir, "normal_camera_space_sdmunips")
normal_map_world_dir = os.path.join(args.data_dir, "normal_world_space_sdmunips")

# create directories
os.makedirs(normal_map_camera_dir, exist_ok=True)
os.makedirs(normal_map_world_dir, exist_ok=True)

with open(xml_path, "r") as f:
    xml_data = f.read()
bs_data = BeautifulSoup(xml_data, "xml")
b_unique = bs_data.find_all('camera')

# Parse the XML once: view index -> 3x3 camera-to-world rotation.
# The view index is the trailing "_"-separated token of the camera label.
rotation_by_view = {}
for tag in b_unique:
    transform_tag = tag.find("transform")
    if transform_tag is None:
        # A camera without a <transform> element carries no pose to use.
        continue
    view_idx = int(tag.get("label").split("_")[-1])
    # camera to world transform; split() tolerates repeated spaces / newlines
    C2W = np.array([float(v) for v in transform_tag.text.split()]).reshape((4, 4))
    # Keep the first camera found for a given index.
    rotation_by_view.setdefault(view_idx, C2W[:3, :3])

for i in range(num_views):
    view_dir = os.path.join(args.sdm_unips_result_dir, f"view_{i:02d}.data")
    if not os.path.exists(view_dir):
        continue

    # copy normal map
    normal_map_file = os.path.join(view_dir, "normal.exr")
    new_normal_map_file = os.path.join(normal_map_camera_dir, f"{i:02d}.exr")
    shutil.copy(normal_map_file, new_normal_map_file)

    R = rotation_by_view.get(i)
    if R is None:
        # Without a pose the world-space map cannot be computed; reusing the
        # rotation of another view would silently write wrong normals.
        print(f"Warning: no camera pose for view {i:02d} in {xml_path}; skipping world-space conversion.")
        continue

    # convert normal map to world space
    normal_map_camera = pyexr.read(new_normal_map_file)
    normal_map_camera[..., [1, 2]] *= -1  # flip y and z axes to match the OpenCV convention: X right, Y down, Z front
    H, W = normal_map_camera.shape[:2]
    normal_world = (R @ normal_map_camera.reshape(-1, 3).T).T.reshape([H, W, 3])
    pyexr.write(os.path.join(normal_map_world_dir, f"{i:02d}.exr"), normal_world)


================================================
FILE: data_capture_and_preprocessing/iPhone_mvps_data_preprocessing.py
================================================
import rawpy, os
from glob import glob
import cv2
import numpy as np
import os
from tqdm import tqdm
import argparse

# Convert the iPhone DNG captures under <data_dir>/RAW into 16-bit PNG files.
#
# Outputs (created under --data_dir, suffix depends on `resize_factor`):
#   mvps_png_<suffix>/view_XX.data/LYY.png : every image of view XX
#   sfm_png_<suffix>/XX.png                : the first image of view XX
parser = argparse.ArgumentParser()
parser.add_argument("--data_dir", type=str, default="./flower_girl")
parser.add_argument("--num_img_per_view", type=int, default=13)
arg = parser.parse_args()

# Files are grouped into views by position in the sorted list.
# NOTE(review): assumes lexicographic file-name order equals capture order - confirm.
dng_list = glob(os.path.join(arg.data_dir, "RAW", "*.DNG"))
dng_list.sort()
num_image_per_view = arg.num_img_per_view
num_view = len(dng_list) // num_image_per_view

num_leftover = len(dng_list) % num_image_per_view
if num_leftover:
    # Trailing images that do not fill a complete view are never processed.
    print(f"Warning: {num_leftover} trailing DNG file(s) do not form a complete view and are ignored.")

resize_factor = 1  # resize the png image to 1/2, 1/4, or 1

# Output folder suffix for each supported resize factor.
suffix_by_resize_factor = {1: "full", 1 / 2: "half", 1 / 4: "quarter"}
if resize_factor not in suffix_by_resize_factor:
    # Fail here with a clear message instead of a NameError further down.
    raise ValueError(f"Unsupported resize_factor {resize_factor}; expected 1, 1/2, or 1/4.")
dir_suffix = suffix_by_resize_factor[resize_factor]
sfm_data_dir = os.path.join(arg.data_dir, f"sfm_png_{dir_suffix}")
mvps_data_dir = os.path.join(arg.data_dir, f"mvps_png_{dir_suffix}")

os.makedirs(sfm_data_dir, exist_ok=True)
os.makedirs(mvps_data_dir, exist_ok=True)

for view_idx in tqdm(range(num_view)):
    view_dir = os.path.join(mvps_data_dir, f"view_{view_idx:02d}.data")
    # An existing view folder is treated as already converted, so the script
    # can be re-run. NOTE: a folder left incomplete by an interrupted run is
    # skipped as well; delete it manually to force re-conversion.
    if os.path.exists(view_dir):
        continue
    os.makedirs(view_dir, exist_ok=True)
    view_dng_list = dng_list[view_idx * num_image_per_view: (view_idx + 1) * num_image_per_view]

    for dng_idx, dng_path in enumerate(view_dng_list):
        with rawpy.imread(dng_path) as raw:
            # 16-bit RGB output; the channel axis is reversed to BGR for OpenCV.
            # Copying straight to a contiguous uint16 array avoids the former
            # (lossless but memory-hungry) detour through float32.
            rgb = np.ascontiguousarray(
                raw.postprocess(no_auto_bright=True, output_bps=16)[..., ::-1],
                dtype=np.uint16,
            )
        rgb_resized = cv2.resize(rgb, (0, 0), fx=resize_factor, fy=resize_factor)

        # choose the first image in each view for SfM
        if dng_idx == 0:
            cv2.imwrite(os.path.join(sfm_data_dir, f"{view_idx:02d}.png"), rgb_resized)

        cv2.imwrite(os.path.join(view_dir, f"L{dng_idx:02d}.png"), rgb_resized)


================================================
FILE: data_capture_and_preprocessing/metashape2neus.py
================================================
import os.path
import xml
from bs4 import BeautifulSoup  # pip install beautifulsoup4 lxml
import numpy as np

# details of camera normalization can be found in Sec. C.3 in https://openaccess.thecvf.com/content/CVPR2023/supplemental/Cao_Multi-View_Azimuth_Stereo_CVPR_2023_supplemental.pdf
def normalize_camera(R_list, t_list, camera2object_ratio=3):
    """Estimate a translation and a scale that normalize a set of cameras.

    Each camera contributes the line through its center along the third row
    of its rotation matrix. ``offset`` is the least-squares solution of the
    accumulated system ``sum(I - v v^T) x = sum((I - v v^T) c)``, i.e. the
    point closest to all of those lines. ``scale`` is the largest distance
    from ``offset`` to a camera center divided by ``camera2object_ratio``.

    Args:
        R_list: sequence of 3x3 world-to-camera rotation matrices.
        t_list: sequence of world-to-camera translations, one per rotation.
        camera2object_ratio: divisor applied to the largest camera distance.

    Returns:
        Tuple ``(offset, scale)``.
    """
    lhs = 0
    rhs = 0
    centers = []
    for idx, rotation in enumerate(R_list):
        # camera center expressed in world coordinates
        center = - rotation.T @ t_list[idx]
        centers.append(center)
        # principal axis of the camera in world coordinates, as a column vector
        axis = rotation[2][:, None]
        # projector onto the plane orthogonal to the principal axis
        off_axis = np.eye(3) - axis @ axis.T
        lhs += off_axis
        rhs += center.T @ off_axis

    offset = np.linalg.lstsq(lhs, np.squeeze(rhs), rcond=None)[0]

    distances = []
    for center in centers:
        delta = np.squeeze(center) - offset
        distances.append(np.sqrt(np.sum(delta ** 2)))
    scale = np.max(distances) / camera2object_ratio
    return offset, scale

def make4x4(P):
    """Embed a 2-D matrix with 3 or 4 rows and 3 or 4 columns into a 4x4 identity.

    The input is copied into the top-left corner of ``np.eye(4)``; entries it
    does not cover keep their identity values.
    """
    assert P.shape[-1] in (4, 3)
    assert P.ndim == 2
    assert P.shape[0] in (3, 4)
    n_rows = P.shape[0]
    n_cols = P.shape[1]
    padded = np.eye(4)
    padded[:n_rows, :n_cols] = P
    return padded

class MetashapePoseLoader:
    """Convert a camera XML file into a ``cameras_sphere.npz`` file.

    The XML is expected to contain ``resolution``, ``calibration`` (with
    ``f``, ``cx``, ``cy``) and ``camera`` (with a ``label`` attribute and a
    ``transform`` child) elements. All work happens in the constructor; the
    result is written next to the XML file.
    """

    def __init__(self, xml_path, camera2object_ratio):
        """Parse ``xml_path`` and save projection and scale matrices.

        Args:
            xml_path: path to the camera XML file.
            camera2object_ratio: forwarded to ``normalize_camera``.
        """
        with open(xml_path, "r") as xml_file:
            soup = BeautifulSoup(xml_file.read(), "xml")

        # Only the first resolution / calibration entry is used.
        resolution = soup.find_all('resolution')[0]
        img_width = int(resolution.get("width"))
        img_height = int(resolution.get("height"))

        calibration = soup.find_all('calibration')[0]
        f = float(calibration.find("f").text)
        cx_offset = float(calibration.find("cx").text)
        cy_offset = float(calibration.find("cy").text)
        # cx / cy are stored as offsets from the image center.
        K = np.array([[f, 0, (img_width-1)/2 + cx_offset],
                      [0, f, (img_height-1)/2 + cy_offset],
                      [0, 0, 1]])
        K_homogeneous = make4x4(K)

        rotations = []
        translations = []
        cam2world_all = []
        camera_sphere = {}
        for camera_tag in soup.find_all('camera'):
            label = camera_tag.get("label")
            view_idx = int(label.split("_")[-1])
            # camera to world transform, stored as 16 space-separated numbers
            entries = [float(value) for value in camera_tag.find("transform").text.split(" ")]
            C2W = np.array(entries).reshape((4, 4))
            cam2world_all.append(C2W)

            # the whole label must itself be the integer view index
            assert int(label) == view_idx

            W2C = np.linalg.inv(C2W)
            rotations.append(W2C[:3, :3])
            translations.append(W2C[:3, 3])

            camera_sphere[f"world_mat_{view_idx}"] = K_homogeneous @ W2C

        offset, scale = normalize_camera(rotations, translations,
                                         camera2object_ratio=camera2object_ratio)
        print("offset", offset, "scale", scale)

        # One shared similarity transform (uniform scale plus translation) is
        # stored under every scale_mat_<i> key.
        scale_mat = np.eye(4)
        scale_mat[:3, :3] *= scale
        scale_mat[:3, 3] = offset
        for im_idx in range(len(cam2world_all)):
            camera_sphere[f"scale_mat_{im_idx}"] = scale_mat

        out_path = os.path.join(os.path.dirname(xml_path), 'cameras_sphere.npz')
        np.savez(out_path, **camera_sphere)


if __name__ == "__main__":
    # Command-line entry point: build cameras_sphere.npz next to the given XML.
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument("--xml_path", type=str, required=True)
    cli.add_argument("--ratio", type=float, default=10)
    cli_args = cli.parse_args()

    MetashapePoseLoader(cli_args.xml_path, camera2object_ratio=cli_args.ratio)

================================================
FILE: data_capture_and_preprocessing/metashape2neus2_json_and_images.py
================================================
from glob import glob
import os
import numpy as np
import cv2
from bs4 import BeautifulSoup
from metashape2neus import normalize_camera, make4x4
import json
import argparse

def create_json_file(data, filename):
    """Serialize ``data`` as JSON with 4-space indentation into ``filename``."""
    with open(filename, 'w') as json_out:
        json.dump(data, json_out, indent=4)

# Script body: pack the SfM images and their masks into RGBA PNGs and write a
# transform.json (per-frame poses, intrinsics, scene scale/offset) under
# <data_dir>/neus2_input.
parser = argparse.ArgumentParser()
parser.add_argument('--data_dir', type=str, default="./flower_girl")
arg = parser.parse_args()

# Input locations, all relative to --data_dir.
data_dir = os.path.join(arg.data_dir, "sfm_png_full")
mask_dir = os.path.join(arg.data_dir, "mask")
xml_path = os.path.join(arg.data_dir, "cameras.xml")
obj_name = os.path.basename(arg.data_dir)

target_dir = os.path.join(arg.data_dir, "neus2_input", "images")
os.makedirs(target_dir, exist_ok=True)

# load images and masks and save them as rgba images
img_list = glob(os.path.join(data_dir, "*.png"))
img_list.sort()
num_view = len(img_list)
print(num_view)
# Image size is taken from the first image and reported in the JSON for all frames.
img_h, img_w = cv2.imread(img_list[0]).shape[:2]

for i in range(num_view):
    img_path = img_list[i]
    # Masks are looked up by the position of the image in the sorted list.
    mask_path = os.path.join(mask_dir, f"{i:02d}.png")
    img = cv2.imread(img_path)
    # NOTE(review): cv2.imread returns None for a missing file; a missing mask
    # would fail on the indexing below — confirm every view has a mask.
    mask = cv2.imread(mask_path)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2BGRA)
    # The first mask channel becomes the alpha channel.
    img[..., 3] = mask[..., 0]
    new_img_path = os.path.join(target_dir, f"{i:02d}.png")
    cv2.imwrite(new_img_path, img)
    print(f"Saved {new_img_path}")

# Skeleton of the JSON file; "scale" and "offset" are placeholders that are
# overwritten after camera normalization below.
data = {
    "from_na": True,
    "w": img_w,
    "h": img_h,
    "aabb_scale": 1.0,
    "frames": [],
    "scale": 1,
    "offset": [1, 1, 1],
}

# Parse intrinsics from the camera XML (first resolution / calibration entry only).
with open(xml_path, "r") as f:
    xml_data = f.read()
bs_data = BeautifulSoup(xml_data, "xml")
c_unique = bs_data.find_all('resolution')
img_width = int(c_unique[0].get("width"))
img_height = int(c_unique[0].get("height"))
c_intrinsics = bs_data.find_all('calibration')
# NOTE: `f` is rebound here from the (already closed) file handle to the focal length.
f = float(c_intrinsics[0].find("f").text)
cx_offset = float(c_intrinsics[0].find("cx").text)
cy_offset = float(c_intrinsics[0].find("cy").text)
# cx / cy are stored as offsets from the image center.
K = np.array([[f, 0, (img_width - 1) / 2 + cx_offset],
              [0, f, (img_height - 1) / 2 + cy_offset],
              [0, 0, 1]])

# Collect one pose per <camera> element.
b_unique = bs_data.find_all('camera')
R_list = []
t_list = []
C2W_list = []
# NOTE(review): camera_sphere is filled below but never written out by this script.
camera_sphere = dict()
for tag in b_unique:
    img_name = tag.get("label")
    view_idx = int(img_name.split("_")[-1])
    # camera to world transform
    C2W = np.array([float(i) for i in tag.find("transform").text.split(" ")]).reshape((4, 4))
    C2W_list.append(C2W)

    print(img_name, view_idx)
    W2C = np.linalg.inv(C2W)
    R_list.append(W2C[:3, :3])
    t_list.append(W2C[:3, 3])

    camera_sphere[f"world_mat_{view_idx}"] = make4x4(K) @ W2C
    print(img_name)
    # NOTE(review): frames reference images by camera label, while the RGBA
    # images above are saved by sorted index ("{i:02d}.png"); these agree only
    # if the labels are the same zero-padded indices — confirm.
    data["frames"].append({
        "file_path": f"images/{img_name}.png",
        "transform_matrix": C2W.tolist(),
        "intrinsic_matrix": make4x4(K).tolist()
    })

# Scene normalization: scale from normalize_camera, offset derived from its
# estimated scene center.
offset, scale = normalize_camera(R_list, t_list, camera2object_ratio=10)
data["scale"] = scale
data["offset"] = list((-offset*scale + 0.5))


create_json_file(data, os.path.join(arg.data_dir, "neus2_input", 'transform.json'))

================================================
FILE: data_capture_and_preprocessing/sam_mvps.py
================================================
import os.path
from glob import glob
import argparse
import torch.cuda
from segment_anything import SamPredictor, sam_model_registry

# Command-line options: path to the SAM checkpoint and the folder that holds
# the per-view "*.data" directories.
parser = argparse.ArgumentParser()
# NOTE(review): the default of None is passed straight to the model builder —
# presumably that yields a model without pretrained weights; confirm.
parser.add_argument("--checkpoint", type=str, default=None)
parser.add_argument("--data_dir", type=str, default="./")
args = parser.parse_args()

# Build the "vit_h" SAM variant, move it to the GPU and wrap it in a predictor.
sam = sam_model_registry["vit_h"](checkpoint=args.checkpoint)
sam.to(device="cuda")
predictor = SamPredictor(sam)

import cv2
import numpy as np
import matplotlib.pyplot as plt
import time
from IPython.display import display, clear_output

# Every entry of data_dir whose name contains ".data" is treated as one view folder.
obj_dir = os.listdir(args.data_dir)
obj_dir = [os.path.join(args.data_dir, obj) for obj in obj_dir if ".data" in obj]
# Masks are collected in a "mask" folder next to data_dir. The path is
# normalised first so the result is the same whether or not --data_dir was
# given with a trailing slash (the previous double dirname only resolved to
# the parent folder when the trailing slash was present).
mask_dir = os.path.join(os.path.dirname(os.path.normpath(args.data_dir)), "mask")
os.makedirs(mask_dir, exist_ok=True)

def pick_point(event, x, y, flags, param):
    """Mouse callback: record the pixel of every left-button press.

    The clicked coordinate is appended to the module-level ``points`` list as
    a ``(1, 2)`` array; every other event is ignored.
    """
    if event != cv2.EVENT_LBUTTONDOWN:
        return
    print(f'You selected point ({x}, {y})')
    clicked = np.array([[x, y]])
    points.append(clicked)

def show_mask(mask, ax, random_color=False):
    """Draw ``mask`` on ``ax`` as a semi-transparent RGBA overlay.

    Args:
        mask: array whose last two dimensions are (height, width).
        ax: object exposing ``imshow`` (e.g. a matplotlib axes).
        random_color: draw a random RGB color instead of the fixed blue;
            the alpha value is 0.6 in both cases.
    """
    if random_color:
        rgba = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
    else:
        rgba = np.array([30 / 255, 144 / 255, 255 / 255, 0.6])
    height, width = mask.shape[-2:]
    # broadcast (H, W, 1) against (1, 1, 4) to obtain an (H, W, 4) image
    overlay = mask.reshape(height, width, 1) * rgba.reshape(1, 1, -1)
    ax.imshow(overlay)


def show_points(coords, labels, ax, marker_size=375):
    """Scatter prompt points on ``ax``: label 1 in green, then label 0 in red.

    Args:
        coords: ``(N, 2)`` array of x/y positions.
        labels: length-N array of 1 / 0 labels.
        ax: object exposing ``scatter`` (e.g. a matplotlib axes).
        marker_size: marker area passed as ``s``.
    """
    pos_points = coords[labels == 1]
    neg_points = coords[labels == 0]
    for selected, colour in ((pos_points, 'green'), (neg_points, 'red')):
        ax.scatter(selected[:, 0], selected[:, 1], color=colour, marker='*',
                   s=marker_size, edgecolor='white', linewidth=1.25)


def show_box(box, ax):
    """Outline ``box`` (``x0, y0, x1, y1``) on ``ax`` with an unfilled green rectangle."""
    left, top = box[0], box[1]
    width = box[2] - box[0]
    height = box[3] - box[1]
    outline = plt.Rectangle((left, top), width, height,
                            edgecolor='green', facecolor=(0, 0, 0, 0), lw=2)
    ax.add_patch(outline)


# Interactive segmentation, one view folder at a time: the user clicks
# foreground points on an image of the view, SAM predicts a mask, and the mask
# is saved both inside the view folder and in the shared mask folder.
for obj_dir_path in obj_dir:
    mask_path = os.path.join(obj_dir_path, "mask.png")
    # A view that already has a mask is not processed again.
    if os.path.exists(mask_path):
        continue
    # use the first image found in the view directory as the segmentation target
    img_list = glob(os.path.join(obj_dir_path, "*.png")) + glob(os.path.join(obj_dir_path, "*.jpg"))
    if not img_list:
        print(f"No image found in {obj_dir_path}, skipping")
        continue
    img_test_path = img_list[0]
    img_test = cv2.imread(img_test_path)

    # cv2.imread returns channels in BGR order while SamPredictor.set_image
    # assumes RGB unless told otherwise, so state the format explicitly.
    predictor.set_image(img_test, image_format="BGR")
    torch.cuda.synchronize()

    # Filled by the pick_point mouse callback; clicks accumulate across retries.
    points = []

    while True:
        # Create a window
        cv2.namedWindow('image', cv2.WINDOW_NORMAL)

        # Bind the callback function to the window
        cv2.setMouseCallback('image', pick_point)

        while True:
            cv2.imshow('image', img_test)
            if cv2.waitKey(20) & 0xFF == 27:  # Break the loop when 'ESC' is pressed
                break

        cv2.destroyAllWindows()
        print(f'Selected points: {points}')

        # Without any click there is no prompt to give to SAM (and
        # np.concatenate would raise on the empty list): reopen the window.
        if not points:
            print("No point selected; click on the object at least once before pressing ESC.")
            continue

        input_point = np.concatenate(points, axis=0).reshape(-1, 2)
        # every clicked point is used as a foreground prompt
        input_label = np.ones(input_point.shape[0], dtype=np.int64)
        print(f'Input point: {input_point}')

        masks, scores, logits = predictor.predict(
            point_coords=input_point,
            point_labels=input_label,
            multimask_output=False,
        )

        # Preview each predicted mask for a few seconds.
        for i, (mask, score) in enumerate(zip(masks, scores)):
            plt.figure(figsize=(10,10))
            plt.imshow(img_test[:, :, ::-1])
            show_mask(mask, plt.gca())
            show_points(input_point, input_label, plt.gca())
            plt.title(f"Mask {i+1}, Score: {score:.3f}", fontsize=18)
            plt.axis('off')
            plt.show(block=False)
            plt.pause(3)
            plt.close()

        # Any answer other than an empty line goes back to point selection.
        value = input("Press enter to save the mask, or c to continue selecting points: ")
        if value == "c":
            continue
        elif value == "":
            break

    # save the mask (the last one previewed above)
    base_dir = os.path.dirname(img_test_path)
    # The view index is the trailing number of the folder name, e.g. "view_03.data" -> 3.
    # os.path.basename keeps this independent of the platform's path separator.
    view_idx = int(os.path.basename(base_dir).split(".")[0].split("_")[-1])
    mask_path1 = os.path.join(base_dir, "mask.png")
    mask_path2 = os.path.join(mask_dir, f"{view_idx:02d}.png")
    cv2.imwrite(mask_path1, mask.astype(np.uint8) * 255)
    cv2.imwrite(mask_path2, mask.astype(np.uint8) * 255)
    print(f"Mask saved at {mask_path1} and {mask_path2}")


================================================
FILE: download_data.sh
================================================
pip install gdown==5.1.0
gdown 'https://drive.google.com/file/d/1Y3-v5jo-IRyTsPh8srZxIc2v5WZdPly_/view?usp=sharing' --fuzzy
unzip data.zip
rm data.zip

================================================
FILE: exp_runner.py
================================================
import os
import logging
import argparse
import numpy as np
import cv2 as cv
import trimesh
import torch
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
from shutil import copyfile
from tqdm.auto import tqdm
from pyhocon import ConfigFactory
from models.fields import SDFNetwork, SingleVarianceNetwork

import pyexr
import time
from utilities.utils import crop_image_by_mask, toRGBA

import open3d as o3d
import pyvista as pv
pv.set_plot_theme("document")
pv.global_theme.transparent_background = True
from models.cd_and_fscore import chamfer_distance_and_f1_score
import csv
from collections import OrderedDict


def get_class(kls):
    """Resolve a dotted path such as ``"package.module.Name"`` to the object it names.

    Everything before the last dot is imported as a module and the last
    component is looked up on it.

    Args:
        kls: fully qualified name, ``"<module path>.<attribute>"``.

    Returns:
        The attribute found on the imported module.

    Raises:
        ValueError: if ``kls`` has no module part.
        ModuleNotFoundError: if the module cannot be imported.
        AttributeError: if the module has no such attribute.
    """
    import importlib

    module_name, _, attr_name = kls.rpartition('.')
    if not module_name:
        raise ValueError(
            "Expected a fully qualified name like 'package.module.Class', got {!r}".format(kls))
    # import_module returns the leaf module directly, so no attribute walk from
    # the top-level package is needed.
    return getattr(importlib.import_module(module_name), attr_name)

class Runner:
    def __init__(self, conf_text, mode='train', is_continue=False, datadir=None):
        """Build dataset, networks, renderer and optimizer from a config string.

        Args:
            conf_text: configuration text parsed with ``ConfigFactory.parse_string``.
            mode: run mode; code and config are backed up when it starts with ``'train'``.
            is_continue: resume from the most recent ``exp_*`` folder under
                ``general.base_exp_dir`` and load its latest checkpoint whose
                iteration does not exceed ``train.end_iter``.
            datadir: accepted for interface compatibility; not used here.

        Raises:
            FileNotFoundError: if ``is_continue`` is set but no previous run or
                no usable checkpoint exists.
        """
        self.device = torch.device('cuda')
        self.conf_text = conf_text

        self.conf = ConfigFactory.parse_string(conf_text)
        exp_root = self.conf['general.base_exp_dir']
        if is_continue:
            # Resume inside the most recent run folder. Folder names embed a
            # zero-padded timestamp, so lexicographic order is chronological.
            previous_runs = []
            if os.path.isdir(exp_root):
                previous_runs = sorted(
                    name for name in os.listdir(exp_root)
                    if name.startswith('exp_') and os.path.isdir(os.path.join(exp_root, name)))
            if not previous_runs:
                raise FileNotFoundError(
                    'is_continue is set but no previous "exp_*" run exists in {}'.format(exp_root))
            exp_time_dir = previous_runs[-1]
        else:
            exp_time = str(time.strftime('%Y_%m_%d_%H_%M_%S', time.localtime(time.time())))
            exp_time_dir = f"exp_{exp_time}"

        self.base_exp_dir = os.path.join(exp_root, exp_time_dir)
        os.makedirs(self.base_exp_dir, exist_ok=True)
        self.dataset = get_class(self.conf['general.dataset_class'])(self.conf['dataset'])
        self.iter_step = 0

        # Training parameters
        self.end_iter = self.conf.get_int('train.end_iter')
        self.batch_size = self.conf.get_int('train.batch_size')
        self.patch_size = self.conf.get_int('train.patch_size', default=3)

        self.learning_rate = self.conf.get_float('train.learning_rate')
        self.learning_rate_alpha = self.conf.get_float('train.learning_rate_alpha')
        self.use_white_bkgd = self.conf.get_bool('train.use_white_bkgd')
        self.warm_up_end = self.conf.get_float('train.warm_up_end', default=0.0)

        self.loss_type = self.conf.get('train.loss_type', 'l1')
        self.normal_weight = self.conf.get_float('train.normal_weight')
        self.eikonal_weight = self.conf.get_float('train.eikonal_weight')
        self.mask_weight = self.conf.get_float('train.mask_weight')

        self.increase_bindwidth_every = self.conf.get_int('train.increase_bindwidth_every', default=350)

        # validation parameters
        self.val_normal_freq = self.conf.get_int('val.val_normal_freq')
        self.val_normal_resolution_level = self.conf.get_int('val.val_normal_resolution_level')
        self.val_gradient_method = self.conf.get('val.gradient_method', 'dfd')

        self.val_mesh_freq = self.conf.get_int('val.val_mesh_freq')
        self.val_mesh_res = self.conf.get_int('val.val_mesh_res')

        self.eval_metric_freq = self.conf.get_int('val.eval_metric_freq')
        self.report_freq = self.conf.get_int('val.report_freq')
        self.save_freq = self.conf.get_int('val.save_freq')

        # Ray marching parameters
        self.start_step_size = self.conf.get_float('model.ray_marching.start_step_size', default=1e-2)
        self.end_step_size = self.conf.get_float('model.ray_marching.end_step_size', default=5e-4)
        # Per-iteration decrement of log10(step size), so the step size goes
        # from start_step_size to end_step_size over end_iter iterations.
        self.slop_step = (np.log10(self.start_step_size) - np.log10(self.end_step_size)) / self.end_iter

        # Networks
        params_to_train = []
        self.sdf_network = SDFNetwork(**self.conf['model.sdf_network'], encoding_config=self.conf['model.encoding']).to(self.device)
        self.deviation_network = SingleVarianceNetwork(**self.conf['model.variance_network']).to(self.device)

        params_to_train += list(self.sdf_network.parameters())
        params_to_train += list(self.deviation_network.parameters())

        self.renderer = get_class(self.conf['general.renderer_class'])(self.sdf_network,
                                                                       self.deviation_network,
                                                                       self.conf["train"]["gradient_method"])

        self.optimizer = torch.optim.Adam(params_to_train, lr=self.learning_rate)

        self.is_continue = is_continue
        self.mode = mode

        # Load checkpoint
        latest_model_name = None
        if is_continue:
            checkpoint_dir = os.path.join(self.base_exp_dir, 'checkpoints')
            model_list = []
            if os.path.isdir(checkpoint_dir):
                for model_name in os.listdir(checkpoint_dir):
                    # file names look like "ckpt_<iteration>.pth"
                    if model_name[-3:] == 'pth' and int(model_name[5:-4]) <= self.end_iter:
                        model_list.append(model_name)
            if not model_list:
                raise FileNotFoundError(
                    'is_continue is set but no usable checkpoint exists in {}'.format(checkpoint_dir))
            model_list.sort()
            latest_model_name = model_list[-1]

        if latest_model_name is not None:
            logging.info('Find checkpoint: {}'.format(latest_model_name))
            self.load_checkpoint(latest_model_name)

        # Backup codes and configs for debug
        if self.mode[:5] == 'train':
            self.file_backup()

    def train(self):
        """Run the optimization loop from ``self.iter_step`` up to ``self.end_iter``.

        Each iteration updates the ray-marching step size and the occupancy
        grid, samples random pixel patches, renders them, and takes one
        optimizer step on the weighted sum of normal, mask and eikonal losses.
        Depending on the configured frequencies it also reports progress,
        saves checkpoints, extracts a mesh, renders normal maps and appends
        evaluation metrics to ``eval_metrics.csv`` in the experiment folder.
        """
        print("Start training...")
        self.writer = SummaryWriter(log_dir=os.path.join(self.base_exp_dir, 'logs'))
        self.writer.add_graph(self.sdf_network, verbose=False, input_to_model=torch.randn(1, 3))
        self.update_learning_rate()

        # create a csv file to save the evaluation metrics
        csv_file_name = f"eval_metrics.csv"
        csv_file_path = os.path.join(self.base_exp_dir, csv_file_name)
        # The header is only written when the file does not exist yet; the
        # column set depends on whether some views are excluded from training.
        if not os.path.exists(csv_file_path):
            with open(csv_file_path, 'w') as f:
                writer = csv.writer(f)
                if len(self.dataset.exclude_view_list)>0:
                    writer.writerow(['iter',
                                     'mae_all_view',
                                     'mae_test_view',
                                     'CD',
                                     'fscore'])
                else:
                    writer.writerow(['iter',
                                     'mae_all_view',
                                     'CD',
                                     'fscore'])

        # Number of iterations left to run.
        res_step = self.end_iter - self.iter_step
        pbar = tqdm(range(res_step))
        for iter_i in pbar:
            # update ray marching step size
            # NOTE(review): the schedule is driven by iter_i (which restarts
            # at 0 on every call) rather than self.iter_step — confirm this is
            # intended when resuming from a checkpoint.
            self.renderer.sampling_step_size = 10 ** (np.log10(self.start_step_size) - self.slop_step*iter_i)

            # update occupancy grid
            self.renderer.occupancy_grid.every_n_step(step=iter_i,
                                                      occ_eval_fn=self.renderer.occ_eval_fn,
                                                      occ_thre=self.conf["model.ray_marching"]["occ_threshold"],
                                                      n=self.conf["model.ray_marching"]["occ_update_freq"])

            # following neuralangelo, gradually increase ingp bandwidth
            if self.iter_step % self.increase_bindwidth_every == 0:
                self.renderer.sdf_network.increase_bandwidth()

            # sample patches of pixels for training
            rays_o_patch_all, rays_d_patch_all, marching_plane_normal, V_inverse_patch_all, true_normal, mask = \
                self.dataset.gen_random_patches(self.batch_size, patch_H=self.patch_size, patch_W=self.patch_size)

            # Near/far bounds are computed from the center ray of each patch.
            rays_o_patch_center = rays_o_patch_all[:, self.patch_size // 2, self.patch_size // 2]  # (num_patch, 3)
            rays_d_patch_center = rays_d_patch_all[:, self.patch_size // 2, self.patch_size// 2]  # (num_patch, 3)
            near, far = self.dataset.near_far_from_sphere(rays_o_patch_center, rays_d_patch_center)

            # Binarize the mask when mask supervision is on; otherwise treat
            # every pixel as foreground.
            if self.mask_weight > 0.0:
                mask = (mask > 0.5).float()
            else:
                mask = torch.ones_like(mask)

            # Small epsilon avoids a division by zero when the mask is empty.
            mask_sum = mask.sum() + 1e-5

            # forward rendering
            render_out = self.renderer.render(rays_o_patch_all,
                                              rays_d_patch_all,
                                              marching_plane_normal,
                                              near, far, V_inverse_patch_all)

            # Nothing to optimize for this batch; note that self.iter_step is
            # not advanced in this case.
            if render_out['gradients'] is None:  # all rays are in the zero region of the occupancy grid
                self.update_learning_rate()
                continue

            comp_normal = render_out['comp_normal']  # rendered normal at pixels
            gradients = render_out['gradients']  # gradients at all sampled 3D points
            comp_mask = render_out['weight_sum']  # rendered occupancy at pixels
            samples_per_ray = render_out['samples_per_ray']

            # Masked normal residual, summed and normalized by the mask area.
            # NOTE(review): a loss_type other than 'l1' / 'l2' leaves
            # normal_loss unassigned and fails at the loss sum below.
            normal_error = (comp_normal - true_normal) * mask
            if self.loss_type == 'l1':
                normal_loss = F.l1_loss(normal_error, torch.zeros_like(normal_error), reduction='sum') / mask_sum
            elif self.loss_type == 'l2':
                normal_loss = F.mse_loss(normal_error, torch.zeros_like(normal_error), reduction='sum') / mask_sum

            # Eikonal term: push the gradient norm at sampled points towards 1.
            gradients_norm = torch.linalg.norm(gradients, ord=2, dim=-1)
            eikonal_loss = F.mse_loss(gradients_norm, torch.ones_like(gradients_norm), reduction='mean')
            # Clipping keeps the binary cross entropy finite at 0 and 1.
            mask_loss = F.binary_cross_entropy(comp_mask.clip(1e-5, 1.0 - 1e-5), mask)

            loss = self.normal_weight * normal_loss + \
                   self.mask_weight * mask_loss + \
                   self.eikonal_weight * eikonal_loss

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            self.iter_step += 1
            self.update_learning_rate()

            # Progress-bar report.
            if self.iter_step % self.report_freq == 0:
                message_postfix = OrderedDict(loss=f"{loss:.3e}",
                                              s=f"{self.deviation_network.variance.item():.3e}",
                                              rm_step=f"{self.renderer.sampling_step_size.item():.3e}",
                                              samples_per_ray=f"{samples_per_ray:.1f}")
                pbar.set_postfix(ordered_dict=message_postfix)

            if self.iter_step % self.save_freq == 0:
                self.save_checkpoint()

            if self.iter_step % self.val_mesh_freq == 0:
                self.validate_mesh(resolution=self.val_mesh_res)

            # Render a normal map for every image of the dataset.
            if self.iter_step % self.val_normal_freq == 0:
                for val_idx in range(self.dataset.n_images):
                    self.validate_normal_patch_based(idx=val_idx, resolution_level=self.val_normal_resolution_level,
                                                     gradient_method=self.val_gradient_method)

            if self.iter_step % self.eval_metric_freq == 0:
                # no gt mesh, skip the evaluation
                if self.dataset.mesh_gt is None:
                    continue

                # remove invisible faces in the gt mesh
                # Done once: the ground-truth vertices are moved into the
                # normalized space (in place), visible points are extracted,
                # and those points are mapped back with the same scale matrix.
                if self.dataset.mesh_gt is not None and self.dataset.points_gt is None:
                    self.dataset.mesh_gt.vertices = o3d.utility.Vector3dVector(
                        (np.asarray(self.dataset.mesh_gt.vertices) -
                         self.dataset.scale_mats_np[0][:3, 3][None]) /
                        self.dataset.scale_mats_np[0][0, 0])
                    mesh = trimesh.Trimesh(np.asarray(self.dataset.mesh_gt.vertices),
                                           np.asarray(self.dataset.mesh_gt.triangles), process=False)
                    self.dataset.points_gt = self.find_visible_points(mesh) * self.dataset.scale_mats_np[0][0, 0] + \
                                             self.dataset.scale_mats_np[0][:3, 3][None]

                cd, fscore = self.eval_geo(resolution=512)
                print(f'iter: {self.iter_step} cd: {cd:.3e}, fscore: {fscore:.3e}')
                if len(self.dataset.exclude_view_list)>0:
                    mae_allview, mae_test_view = self.eval_mae(gradient_method=self.val_gradient_method)

                    print('MAE (all views) {0}: {1:.5f}'.format(self.val_gradient_method, mae_allview))
                    print('MAE (test views) {0}: {1:.5f}'.format(self.val_gradient_method, mae_test_view))

                    with open(csv_file_path, 'a') as f:
                        writer = csv.writer(f)
                        writer.writerow([self.iter_step,
                                         mae_allview,
                                         mae_test_view,
                                         cd, fscore])

                else:
                    # NOTE(review): this branch hard-codes "dfd" while the
                    # branch above uses self.val_gradient_method — confirm.
                    mae_allview = self.eval_mae(gradient_method="dfd")
                    # write to csv file
                    with open(csv_file_path, 'a') as f:
                        writer = csv.writer(f)
                        writer.writerow([self.iter_step,
                                         mae_allview,
                                         cd, fscore])

    def update_learning_rate(self):
        if self.iter_step < self.warm_up_end:
            learning_factor = self.iter_step / self.warm_up_end
        else:
            alpha = self.learning_rate_alpha
            progress = (self.iter_step - self.warm_up_end) / (self.end_iter - self.warm_up_end)
            learning_factor = (np.cos(np.pi * progress) + 1.0) * 0.5 * (1 - alpha) + alpha

        for g in self.optimizer.param_groups:
            g['lr'] = self.learning_rate * learning_factor

    def file_backup(self):
        dir_lis = self.conf['general.recording']
        os.makedirs(os.path.join(self.base_exp_dir, 'recording'), exist_ok=True)
        for dir_name in dir_lis:
            cur_dir = os.path.join(self.base_exp_dir, 'recording', dir_name)
            os.makedirs(cur_dir, exist_ok=True)
            files = os.listdir(dir_name)
            for f_name in files:
                if f_name[-3:] == '.py':
                    copyfile(os.path.join(dir_name, f_name), os.path.join(cur_dir, f_name))
        try:
            copyfile(self.conf_path, os.path.join(self.base_exp_dir, 'recording', 'config.conf'))
        except:
            # save conf_text into a txt file
            with open(os.path.join(self.base_exp_dir, 'recording', 'config.conf'), 'w') as f:
                f.write(self.conf_text)

    def load_checkpoint(self, checkpoint_name):
        checkpoint = torch.load(os.path.join(self.base_exp_dir, 'checkpoints', checkpoint_name), map_location=self.device)
        self.sdf_network.load_state_dict(checkpoint['sdf_network_fine'])
        self.deviation_network.load_state_dict(checkpoint['variance_network_fine'])
        self.optimizer.load_state_dict(checkpoint['optimizer'])
        self.iter_step = checkpoint['iter_step']
        logging.info('End')

    def save_checkpoint(self):
        checkpoint = {
            'sdf_network_fine': self.sdf_network.state_dict(),
            'variance_network_fine': self.deviation_network.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'iter_step': self.iter_step,
        }

        os.makedirs(os.path.join(self.base_exp_dir, 'checkpoints'), exist_ok=True)
        torch.save(checkpoint, os.path.join(self.base_exp_dir, 'checkpoints', 'ckpt_{:0>6d}.pth'.format(self.iter_step)))

    def validate_normal_pixel_based(self, idx=-1, resolution_level=-1):
        """Render per-pixel normal and depth maps for one view and save them.

        Writes three PNGs (raw normals, unit-normalized normals and a
        color-coded map of the normal length) into ``<base_exp_dir>/normals``
        and the depth map as ``.npy`` into ``<base_exp_dir>/depth``.

        Args:
            idx: view index; a negative value selects a random view.
            resolution_level: passed to ``dataset.gen_rays_at``; a negative
                value falls back to the configured validation level.

        Returns:
            Tuple ``(idx, normals)`` where ``normals`` is the saved raw normal
            image mapped back from the 0-255 range via ``(img - 128) / 128``.
        """
        if idx < 0:
            idx = np.random.randint(self.dataset.n_images)

        print('Validate: iter: {}, camera: {}'.format(self.iter_step, idx))

        if resolution_level < 0:
            # The constructor stores the configured level as
            # val_normal_resolution_level; an explicitly assigned
            # validate_resolution_level still takes precedence if present.
            resolution_level = getattr(self, 'validate_resolution_level',
                                       self.val_normal_resolution_level)
        rays_o, rays_d = self.dataset.gen_rays_at(idx, resolution_level=resolution_level, within_mask=False)
        H, W, _ = rays_o.shape
        # Render in chunks of 8192 rays.
        rays_o = rays_o.reshape(-1, 3).split(8192)
        rays_d = rays_d.reshape(-1, 3).split(8192)

        out_normal_fine = []
        out_depth_fine = []

        # Resize the view's mask to the rendered resolution.
        mask_np = self.dataset.masks_np[idx].astype(bool)[..., 0]
        mask_np = cv.resize(mask_np.astype(np.uint8),
                            ((int(W), int(H))),
                            interpolation=cv.INTER_NEAREST).astype(bool)

        for rays_o_batch, rays_d_batch in tqdm(zip(rays_o, rays_d)):
            near, far = self.dataset.near_far_from_sphere(rays_o_batch, rays_d_batch)

            batch_normal, batch_depth = self.renderer.render_normal_pixel_based(rays_o_batch,
                                                                                rays_d_batch,
                                                                                near,
                                                                                far)

            out_normal_fine.append(batch_normal.detach().cpu().numpy())
            out_depth_fine.append(batch_depth.detach().cpu().numpy())

        if len(out_normal_fine) > 0:
            normal_img = np.concatenate(out_normal_fine, axis=0)
            rot = np.linalg.inv(self.dataset.pose_all[idx, :3, :3].detach().cpu().numpy())  # W2C rotation
            # Rotate the normals into the camera frame, then flip the y and z axes.
            normal_img = np.matmul(rot[None, :, :], normal_img[:, :, None]).reshape([H, W, 3, -1])
            normal_img[:, :, [1, 2]] *= -1
            normal_img_norm = np.linalg.norm(np.squeeze(normal_img), axis=2, keepdims=True)
            normal_img_normalized = np.squeeze(normal_img) / (normal_img_norm+1e-7)

            # Map [-1, 1] to [0, 255] for saving.
            normal_img = (np.squeeze(normal_img) * 128 + 128).clip(0, 255)
            normal_img_normalized = (np.squeeze(normal_img_normalized) * 128 + 128).clip(0, 255)

            depth_img = np.concatenate(out_depth_fine, axis=0).reshape([H, W])

        os.makedirs(os.path.join(self.base_exp_dir, 'normals'), exist_ok=True)
        os.makedirs(os.path.join(self.base_exp_dir, "depth"), exist_ok=True)

        # Pixels outside the mask are excluded from the min/max statistics below.
        normal_img_norm[~mask_np] = np.nan
        depth_img[~mask_np] = np.nan

        # Color-code the normal length: clip to [0.8, 1.2], rescale to 0-255, apply JET.
        normal_img_norm = np.squeeze(normal_img_norm.clip(0.8, 1.2))
        normal_img_norm = (normal_img_norm - np.nanmin(normal_img_norm)) / (np.nanmax(normal_img_norm) - np.nanmin(normal_img_norm))
        normal_img_norm = np.nan_to_num(normal_img_norm)
        normal_img_norm = (normal_img_norm * 255).astype(np.uint8)
        normal_img_norm = cv.applyColorMap(normal_img_norm, cv.COLORMAP_JET)
        normal_img_norm[~mask_np] = 0
        cv.imwrite(os.path.join(self.base_exp_dir,
                                'normals',
                                '{:0>8d}_{}_{}_norm.png'.format(self.iter_step, 0, idx)),
                   normal_img_norm[..., ::-1])

        cv.imwrite(os.path.join(self.base_exp_dir,
                                'normals',
                                '{:0>8d}_{}_{}.png'.format(self.iter_step, 0, idx)),
                   normal_img[..., ::-1])
        cv.imwrite(os.path.join(self.base_exp_dir,
                                'normals',
                                '{:0>8d}_{}_{}_normalized.png'.format(self.iter_step, 0, idx)),
                   normal_img_normalized[..., ::-1])
        np.save(os.path.join(self.base_exp_dir,
                             'depth',
                             '{:0>8d}_{}_{}.npy'.format(self.iter_step, 0, idx)),
                depth_img)
        return idx, (normal_img - 128) / 128.

    def validate_normal_patch_based(self, idx=-1, resolution_level=-1, gradient_method="dfd"):
        """Render the normal map of one view patch by patch and save it as PNG images.

        Parameters:
        idx (int): View index. A negative value picks a random view.
        resolution_level (int): Forwarded to ``dataset.gen_patches_at``. A negative value
            falls back to ``self.validate_resolution_level``.
        gradient_method (str): Forwarded to the renderer; also used in the output folder name.

        Returns:
        normal_img_world (numpy.ndarray): The assembled ``comp_normal`` map, (img_h, img_w, 3),
            exactly as produced by the renderer (before the camera-space conversion below).
        normal_dir (str): Folder the PNG files were written to.
        """
        if idx < 0:
            idx = np.random.randint(self.dataset.n_images)

        print('Rendering normal maps...  iter: {}, camera: {}'.format(self.iter_step, idx))

        if resolution_level < 0:
            resolution_level = self.validate_resolution_level

        patch_size = self.patch_size
        (center_origins, center_dirs,
         patch_origins, patch_dirs,
         plane_normals, patch_A_inverse,
         horizontal_num_patch, vertical_num_patch) = self.dataset.gen_patches_at(idx,
                                                                                 resolution_level=resolution_level,
                                                                                 patch_H=patch_size,
                                                                                 patch_W=patch_size)

        # the rendered image only covers whole patches
        img_w = horizontal_num_patch * patch_size
        img_h = vertical_num_patch * patch_size

        # bring the object mask to the size of the rendered image
        full_mask = self.dataset.masks_np[idx].astype(bool)  # (H, W)
        mask_np = cv.resize(full_mask.astype(np.uint8),
                            ((int(img_w), int(img_h))),
                            interpolation=cv.INTER_NEAREST).astype(bool)

        # render the patches in chunks to bound memory usage
        chunk_size = 1024
        rendered_chunks = []
        for start in range(0, patch_origins.shape[0], chunk_size):
            batch = slice(start, start + chunk_size)
            near, far = self.dataset.near_far_from_sphere(center_origins[batch],
                                                          center_dirs[batch])
            render_out = self.renderer.render(patch_origins[batch],
                                              patch_dirs[batch],
                                              plane_normals[batch],
                                              near, far,
                                              patch_A_inverse[batch], gradient_method, mode='eval')
            rendered_chunks.append(render_out['comp_normal'].detach().cpu().numpy())
        patch_normals = np.concatenate(rendered_chunks, axis=0)

        # paste the patches back in row-major order
        normal_img_world = np.zeros([img_h, img_w, 3])
        patch_id = 0
        for row in range(0, img_h, patch_size):
            for col in range(0, img_w, patch_size):
                normal_img_world[row:row + patch_size, col:col + patch_size] = patch_normals[patch_id]
                patch_id += 1

        # world -> camera rotation, then flip the y and z components for visualization
        w2c_rot = np.linalg.inv(self.dataset.pose_all[idx, :3, :3].detach().cpu().numpy())
        normal_cam = np.matmul(w2c_rot, normal_img_world[..., None]).squeeze()
        normal_cam[..., [1, 2]] *= -1

        rendered_png = (normal_cam * 128 + 128).clip(0, 255)
        normal_length = np.linalg.norm(normal_cam, axis=2, keepdims=True)

        normal_dir = os.path.join(self.base_exp_dir, f'normals_validation_{gradient_method}', 'iter_{:0>6d}'.format(self.iter_step))
        os.makedirs(normal_dir, exist_ok=True)

        # unit-length version of the same map
        unit_png = (normal_cam / (normal_length + 1e-7) * 128 + 128).clip(0, 255)

        rendered_canvas = np.zeros((img_h, img_w, 3))
        rendered_canvas[:rendered_png.shape[0], :rendered_png.shape[1]] = rendered_png

        unit_canvas = np.zeros((img_h, img_w, 3))
        unit_canvas[:unit_png.shape[0], :unit_png.shape[1]] = unit_png

        # RGB -> BGR for OpenCV, alpha from the mask, cropped to the mask
        unit_rgba = crop_image_by_mask(toRGBA(unit_canvas.astype(np.uint8)[...,::-1], mask_np), mask_np)

        file_stem = '{:0>8d}_{}_{}'.format(self.iter_step, 0, idx)
        cv.imwrite(os.path.join(normal_dir, file_stem + '_rendered.png'),
                   rendered_canvas[..., ::-1])
        cv.imwrite(os.path.join(normal_dir, file_stem + '_normalized.png'),
                   unit_rgba)
        return normal_img_world, normal_dir

    def validate_mesh(self, world_space=True, resolution=256, threshold=0.0):
        """Extract a mesh from the current model, log it and save its largest component.

        Parameters:
        world_space (bool): If True, vertices are scaled and shifted with ``scale_mats_np[0]``
            before logging/saving.
        resolution (int): Forwarded to ``renderer.extract_geometry``.
        threshold (float): Forwarded to ``renderer.extract_geometry``.
        """
        print('Extracting mesh...  iter: {}'.format(self.iter_step))
        bbox_lo = torch.tensor(self.dataset.object_bbox_min, dtype=torch.float32)
        bbox_hi = torch.tensor(self.dataset.object_bbox_max, dtype=torch.float32)

        raw_vertices, raw_triangles = self.renderer.extract_geometry(
            bbox_lo, bbox_hi, resolution=resolution, threshold=threshold)

        # round-trip through trimesh so its default processing is applied
        processed = trimesh.Trimesh(raw_vertices, raw_triangles)
        vertices = processed.vertices
        triangles = processed.faces

        save_dir = os.path.join(self.base_exp_dir, 'meshes_validation')
        os.makedirs(save_dir, exist_ok=True)

        if world_space:
            scale_mat = self.dataset.scale_mats_np[0]
            vertices = vertices * scale_mat[0, 0] + scale_mat[:3, 3][None]

        self.writer.add_mesh('mesh_eval', vertices=vertices[None,...], faces=triangles[None,...], global_step=self.iter_step)

        cleaned = self.remove_isolated_clusters(trimesh.Trimesh(vertices, triangles))
        mesh_path = os.path.join(save_dir, 'iter_{:0>8d}.ply'.format(self.iter_step))
        o3d.io.write_triangle_mesh((mesh_path), cleaned)

        print(f'Mesh saved at {mesh_path}')

    def remove_isolated_clusters(self, mesh):
        """Keep only the connected component with the most triangles.

        Used to clean a marching-cubes mesh from floating fragments.

        Parameters:
        mesh: Object exposing an ``as_open3d`` attribute (the callers in this file pass a
            ``trimesh.Trimesh``).

        Returns:
        A copy of the Open3D mesh in which every triangle outside the largest cluster and
        every vertex left unreferenced has been removed. A mesh without any triangle is
        returned unchanged instead of raising.
        """
        import copy
        mesh = mesh.as_open3d
        triangle_clusters, cluster_n_triangles, _ = mesh.cluster_connected_triangles()
        triangle_clusters = np.asarray(triangle_clusters)
        cluster_n_triangles = np.asarray(cluster_n_triangles)

        mesh_eval = copy.deepcopy(mesh)
        if cluster_n_triangles.size == 0:
            # Nothing was extracted (e.g. no zero-level set yet): argmax() on an empty
            # array would raise ValueError, so hand back the empty mesh as is.
            return mesh_eval

        largest_cluster_idx = cluster_n_triangles.argmax()
        triangles_to_remove = triangle_clusters != largest_cluster_idx
        mesh_eval.remove_triangles_by_mask(triangles_to_remove)
        mesh_eval.remove_unreferenced_vertices()
        return mesh_eval

    @torch.no_grad()
    def eval_mae(self, gradient_method):
        """Compute the mean angular error (degrees) of rendered normals against GT normals.

        For every view, the world-space normal map is rendered with
        ``validate_normal_patch_based``, normalized, and compared with
        ``<data_dir>/normal_world_space_GT/<idx:02d>.exr``. Pixels outside the mask are set
        to NaN and therefore ignored by the ``nanmean``. A jet-colored error map (saturating
        at 20 degrees) is written next to the rendered normals of each view.

        Parameters:
        gradient_method (str): Forwarded to ``validate_normal_patch_based``.

        Returns:
        mae (float) when no view is listed in ``dataset.exclude_view_list``; otherwise the
        tuple ``(mae, mae_test)`` where ``mae_test`` only averages the excluded views.
        """
        print("Computing mean angular errors...")
        normal_gt_dir = os.path.join(self.dataset.data_dir, "normal_world_space_GT")

        # Only the raw error maps are needed for the statistics; colored maps and padded
        # normal maps are written/dropped per view instead of being accumulated.
        ae_map_list = []
        ae_map_test_list = []
        for idx in range(self.dataset.n_images):
            normal_gt = pyexr.read(os.path.join(normal_gt_dir, "{:02d}.exr".format(idx)))[..., :3]

            mask_np = self.dataset.masks_np[idx].astype(bool)

            normal_map_world, save_dir = self.validate_normal_patch_based(
                idx, resolution_level=self.val_normal_resolution_level, gradient_method=gradient_method)

            normal_map_world = normal_map_world / (1e-10 + np.linalg.norm(normal_map_world, axis=-1, keepdims=True))

            # the rendered map only covers whole patches; pad it to the full image size
            normal_eval = np.zeros((self.dataset.H, self.dataset.W, 3))
            normal_eval[:normal_map_world.shape[0], :normal_map_world.shape[1]] = normal_map_world
            normal_eval[~mask_np] = np.nan

            angular_error_map = np.rad2deg(np.arccos(np.clip(np.sum(normal_gt * normal_eval, axis=-1), -1, 1)))

            # keep copies: the map is modified in place below for visualization
            ae_map_list.append(angular_error_map.copy())
            if idx in self.dataset.exclude_view_list:
                ae_map_test_list.append(angular_error_map.copy())

            # apply jet to angular error map (0..20 degrees mapped to 0..255)
            angular_error_map[~mask_np] = 0
            angular_error_map_jet = cv.applyColorMap((angular_error_map / 20 * 255).clip(0, 255).astype(np.uint8),
                                                     cv.COLORMAP_JET)
            angular_error_map_jet[~mask_np] = 255
            angular_error_map_jet = crop_image_by_mask(toRGBA(angular_error_map_jet, mask_np), mask_np)
            cv.imwrite(os.path.join(save_dir, '{:0>8d}_{}_{}_ae_up_{}.png'.format(self.iter_step, 0, idx, 20)), angular_error_map_jet)

        mae = np.nanmean(np.stack(ae_map_list, axis=0))
        self.writer.add_scalar('Statistics/mae_allview', mae, self.iter_step)

        if len(ae_map_test_list) > 0:
            mae_test = np.nanmean(np.stack(ae_map_test_list, axis=0))
            self.writer.add_scalar('Statistics/mae_testview', mae_test, self.iter_step)
            return mae, mae_test

        return mae

    @torch.no_grad()
    def eval_geo(self, resolution=1024):
        """Extract a mesh and score its visible surface points against ``dataset.points_gt``.

        Writes the GT point cloud (once), the world-space mesh and the evaluated points under
        ``<base_exp_dir>/points_val`` and logs both metrics to the summary writer.

        Parameters:
        resolution (int): Forwarded to ``renderer.extract_geometry``.

        Returns:
        cd, fscore: The values returned by ``chamfer_distance_and_f1_score``.
        """
        out_dir = os.path.join(self.base_exp_dir, 'points_val')
        os.makedirs(out_dir, exist_ok=True)

        # GT points are written only if the file is not there yet
        gt_cloud = o3d.geometry.PointCloud()
        gt_cloud.points = o3d.utility.Vector3dVector(self.dataset.points_gt)
        gt_cloud_path = os.path.join(out_dir, "pcd_gt.ply")
        if not os.path.exists(gt_cloud_path):
            o3d.io.write_point_cloud(gt_cloud_path, gt_cloud)

        # marching cubes
        bbox_lo = torch.tensor(self.dataset.object_bbox_min, dtype=torch.float32)
        bbox_hi = torch.tensor(self.dataset.object_bbox_max, dtype=torch.float32)
        vertices, triangles = self.renderer.extract_geometry(
            bbox_lo, bbox_hi, resolution=resolution, threshold=0)

        # uniform scale and translation taken from the first scale matrix
        scale = self.dataset.scale_mats_np[0][0, 0]
        offset = self.dataset.scale_mats_np[0][:3, 3][None]

        # `mesh` stays in the normalized space used for ray casting; `mesh_world` is exported
        mesh = trimesh.Trimesh(np.asarray(vertices), np.asarray(triangles), process=False)
        mesh_world = trimesh.Trimesh(np.asarray(vertices * scale + offset), np.asarray(triangles), process=False)
        mesh_world.export(os.path.join(out_dir, f"{self.iter_step}_world.obj"))

        points_eval = self.find_visible_points(mesh) * scale + offset

        # save the sampled points
        eval_cloud = o3d.geometry.PointCloud()
        eval_cloud.points = o3d.utility.Vector3dVector(points_eval)
        o3d.io.write_point_cloud(os.path.join(out_dir, f"{self.iter_step}_points_eval.ply"), eval_cloud)

        cd, fscore = chamfer_distance_and_f1_score(points_eval, self.dataset.points_gt)
        self.writer.add_scalar('Statistics/cd', cd, self.iter_step)
        self.writer.add_scalar('Statistics/fscore', fscore, self.iter_step)
        return cd, fscore

    def find_visible_points(self, mesh):
        """Collect the first ray/mesh hit of every in-mask pixel ray over all views.

        Parameters:
        mesh: Object exposing ``ray.intersects_location`` (the callers in this file pass a
            ``trimesh.Trimesh``).

        Returns:
        numpy.ndarray: The hit locations of all views concatenated along axis 0.
        """
        hits_per_view = []
        for view in range(self.dataset.n_images):
            origins, directions = self.dataset.gen_rays_at(view, resolution_level=1, within_mask=True)
            origins = origins.cpu().detach().numpy()
            directions = directions.cpu().detach().numpy()
            # unit-length directions
            directions = directions / np.linalg.norm(directions, axis=-1, keepdims=True)
            # only the locations are needed; ray and triangle indices are discarded
            hit_points, _, _ = mesh.ray.intersects_location(
                ray_origins=origins,
                ray_directions=directions,
                multiple_hits=False)
            hits_per_view.append(hit_points)
        return np.concatenate(hits_per_view, axis=0)


if __name__ == '__main__':
    # Script entry point: parse the command line, load the config text and run the Runner.
    import warnings
    warnings.filterwarnings("ignore")

    # every tensor created without an explicit device lands on the GPU
    torch.set_default_tensor_type('torch.cuda.FloatTensor')

    parser = argparse.ArgumentParser()
    parser.add_argument('--conf', type=str, default='./confs/base.conf')
    parser.add_argument('--mode', type=str, default='eval_normal')
    parser.add_argument('--mcube_threshold', type=float, default=0.0)
    parser.add_argument('--is_continue', default=False, action="store_true")
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--obj_name', type=str, default='')

    args = parser.parse_args()
    torch.cuda.set_device(args.gpu)

    print(f'Running on the object: {args.obj_name}')

    # read the config through a context manager so the file handle is always closed
    with open(args.conf) as f:
        conf_text = f.read()
    # the config uses the placeholder CASE_NAME for the object name
    conf_text = conf_text.replace('CASE_NAME', args.obj_name)

    runner = Runner(conf_text, args.mode, args.is_continue)
    runner.train()


================================================
FILE: models/cd_and_fscore.py
================================================
from scipy.spatial import KDTree
import numpy as np


def chamfer_distance_and_f1_score(ref_points, eval_points, f_threshold=0.5):
    """
    This function calculates the chamfer distance and f1 score between two sets of points.

    Parameters:
    ref_points (numpy.ndarray): Reference points. A (p, 3) array representing points in the world space.
    eval_points (numpy.ndarray): Points to be evaluated. A (q, 3) array representing points in the world space.
    f_threshold (float, optional): Distance threshold for the f1 score, in the unit of the points.
        Default is 0.5 (0.5mm when the points are given in millimeters).

    Returns:
    chamfer_dist (float): The chamfer distance between ref_points and eval_points, i.e. the average of
        the two mean nearest-neighbor distances.
    f_score (float): The f1 score between ref_points and eval_points. It is 0.0 when no point of either
        set lies within f_threshold of the other set.
    """
    print("computing chamfer distance and f1 score...")
    # nearest reference point for every evaluated point, and vice versa
    distance_eval2ref, _ = KDTree(ref_points).query(eval_points, k=1, p=2)   # p=2 for Euclidean distance
    distance_ref2eval, _ = KDTree(eval_points).query(ref_points, k=1, p=2)

    # following Uncertainty-aware deep multi-view photometric stereo
    chamfer_dist = (np.mean(distance_eval2ref) + np.mean(distance_ref2eval)) / 2

    precision = np.mean(distance_eval2ref < f_threshold)
    recall = np.mean(distance_ref2eval < f_threshold)
    denominator = precision + recall
    # precision == recall == 0 would otherwise evaluate 0/0 and yield NaN
    f_score = 2 * precision * recall / denominator if denominator > 0 else 0.0

    return chamfer_dist, f_score


================================================
FILE: models/dataset_loader.py
================================================
import torch
import torch.nn.functional as F
import cv2 as cv
import numpy as np
import os
from glob import glob
from icecream import ic
import pyexr
import open3d as o3d
import time
from concurrent.futures import ThreadPoolExecutor


def load_K_Rt_from_P(filename, P=None):
    """Decompose a 3x4 projection matrix into intrinsics and a camera-to-world pose.

    This function is borrowed from IDR: https://github.com/lioryariv/idr

    Parameters:
    filename (str or None): Text file holding the projection matrix, one row of
        space-separated values per line (a 4-line file is treated as having one header
        line). Only read when ``P`` is None.
    P (numpy.ndarray, optional): The projection matrix. When given, ``filename`` is ignored.

    Returns:
    intrinsics (numpy.ndarray): (4, 4) matrix whose upper-left 3x3 block is K scaled so that K[2, 2] == 1.
    C2W (numpy.ndarray): (4, 4) float32 camera-to-world matrix.
    """
    if P is None:
        # context manager so the file handle is closed right after reading
        with open(filename) as f:
            lines = f.read().splitlines()
        if len(lines) == 4:
            lines = lines[1:]
        lines = [[tokens[0], tokens[1], tokens[2], tokens[3]]
                 for tokens in (line.split(" ") for line in lines)]
        P = np.asarray(lines).astype(np.float32).squeeze()

    K, R, t, *_ = cv.decomposeProjectionMatrix(P)
    # CAUTION: R is the W2C rotation matrix but t is the camera position in world coordinate.
    K = K / K[2, 2]

    intrinsics = np.eye(4)
    intrinsics[:3, :3] = K

    C2W = np.eye(4, dtype=np.float32)
    C2W[:3, :3] = R.T
    # t is homogeneous (4, 1): divide by the last entry to get the 3D camera position
    C2W[:3, 3] = (t[:3] / t[3])[:, 0]

    return intrinsics, C2W


class Dataset:
    """Multi-view normal maps, masks and cameras, plus helpers that generate rays from them.

    After construction the instance holds (among others):
      normals            torch tensor, [n_images, H, W, 3], on ``self.device``
      masks              torch tensor, [n_images, H, W], on ``self.device``
      masks_np           numpy array,  [n_images, H, W], values in [0, 1]
      intrinsics_all     torch tensor, [n_images, 4, 4] (and ``intrinsics_all_inv``)
      pose_all           torch tensor, [n_images, 4, 4], camera-to-world
      V_inverse_all      torch tensor, [n_images, H, W, 3, 3]
      scale_mats_np      list of (4, 4) arrays used for coordinate normalization
    """

    def __init__(self, conf):
        """Load every normal map, mask and camera referenced by ``conf``.

        Parameters:
        conf: Config object offering ``get_string``/``get_int`` and item access. The keys read
            are ``normal_dir``, ``data_dir``, ``cameras_name``, ``exclude_views`` and the
            optional ``upsample_factor`` (default 1).
        """
        super(Dataset, self).__init__()
        print('Load data: Begin')
        self.device = torch.device('cuda')
        self.conf = conf
        normal_dir = conf.get_string('normal_dir')

        self.data_dir = conf.get_string('data_dir')
        self.cameras_name = conf.get_string('cameras_name')
        self.exclude_view_list = conf['exclude_views']  # list of views to exclude from training. Used in novel-view normal synthesis evaluation.
        self.upsample_factor = conf.get_int('upsample_factor', default=1)
        ic(self.exclude_view_list)

        # load the GT mesh for evaluation if any
        mesh_path = os.path.join(self.data_dir, 'mesh_Gt.ply')
        if os.path.exists(mesh_path):
            self.mesh_gt = o3d.io.read_triangle_mesh(mesh_path)
        else:
            self.mesh_gt = None
        self.points_gt = None  # will be computed from the mesh at evaluation time

        camera_dict = np.load(os.path.join(self.data_dir, self.cameras_name))
        self.camera_dict = camera_dict
        self.normal_lis = sorted(glob(os.path.join(self.data_dir, normal_dir, '*.exr')))
        self.n_images = len(self.normal_lis)
        self.train_images = set(range(self.n_images)) - set(self.exclude_view_list)
        # camera entries are looked up by the integer file stem of each normal map
        self.img_idx_list = [int(os.path.basename(x).split('.')[0]) for x in self.normal_lis]

        # os.cpu_count() may return None; fall back to a single CPU in that case
        num_workers = min(64, (os.cpu_count() or 1) * 5)

        print("loading normal maps...")
        with ThreadPoolExecutor(max_workers=num_workers) as executor:
            def read_normal(im_name):
                return pyexr.read(im_name)[..., :3]
            self.normal_np = np.stack(list(executor.map(read_normal, self.normal_lis)))

        if self.upsample_factor > 1:
            # resize normal maps
            self.normal_np = F.interpolate(torch.from_numpy(self.normal_np).permute(0, 3, 1, 2), scale_factor=self.upsample_factor, mode='bilinear', align_corners=False).permute(0, 2, 3, 1).numpy()
        self.normals = torch.from_numpy(self.normal_np.astype(np.float32)).to(self.device)  # [n_images, H, W, 3]
        print("loading normal maps done.")

        self.masks_lis = sorted(glob(os.path.join(self.data_dir, 'mask/*.png')))
        with ThreadPoolExecutor(max_workers=num_workers) as executor:
            def read_mask(im_name):
                return cv.imread(im_name)
            self.masks_np = np.stack(list(executor.map(read_mask, self.masks_lis))) / 255.0

        if self.upsample_factor > 1:
            # resize mask
            self.masks_np = F.interpolate(torch.from_numpy(self.masks_np).permute(0, 3, 1, 2), scale_factor=self.upsample_factor, mode='nearest').permute(0, 2, 3, 1).numpy()
        self.masks_np = self.masks_np[..., 0]  # keep one channel: [n_images, H, W]
        self.total_pixel = np.sum(self.masks_np)

        # set background of normal map to 0
        # NOTE(review): self.normals was built above from a copy of normal_np, so this
        # zeroing only affects the numpy array, not the GPU tensor -- confirm this is intended.
        self.normal_np[self.masks_np == 0] = 0

        # world_mat is a projection matrix from world to image
        self.world_mats_np = [camera_dict['world_mat_%d' % idx].astype(np.float32) for idx in self.img_idx_list]

        # scale_mat: used for coordinate normalization, we assume the scene to render is inside a unit sphere at origin.
        self.scale_mats_np = [camera_dict['scale_mat_%d' % idx].astype(np.float32) for idx in self.img_idx_list]

        self.intrinsics_all = []
        self.pose_all = []
        self.V_inverse_all = []

        self.H, self.W = self.normal_np.shape[1], self.normal_np.shape[2]

        # homogeneous pixel coordinates; identical for every view, so built once
        tx = torch.linspace(0, self.W - 1, int(self.W))
        ty = torch.linspace(0, self.H - 1, int(self.H))
        pixels_x, pixels_y = torch.meshgrid(tx, ty)
        pixels_homo = torch.stack([pixels_x, pixels_y, torch.ones_like(pixels_y)], dim=-1)  # W, H, 3

        for scale_mat, world_mat, _normal_map, _mask in zip(self.scale_mats_np, self.world_mats_np, self.normals, self.masks_np):
            P = world_mat @ scale_mat
            P = P[:3, :4]
            intrinsics, C2W = load_K_Rt_from_P(None, P)
            if self.upsample_factor > 1:
                # resize intrinsics
                intrinsics[0, 0] *= self.upsample_factor
                intrinsics[1, 1] *= self.upsample_factor
                intrinsics[0, 2] *= self.upsample_factor
                intrinsics[1, 2] *= self.upsample_factor
            self.intrinsics_all.append(torch.from_numpy(intrinsics).float())
            self.pose_all.append(torch.from_numpy(C2W).float())

            intrinsics_inverse = torch.inverse(torch.from_numpy(intrinsics).float())
            pose = torch.from_numpy(C2W).float()
            # compute the V_inverse
            p = pixels_homo.to(intrinsics_inverse.device)  # W, H, 3
            p = torch.matmul(intrinsics_inverse[None, None, :3, :3],
                             p[:, :, :, None]).squeeze()  # W, H, 3
            rays_v = p / torch.linalg.norm(p, ord=2, dim=-1, keepdim=True)  # W, H, 3
            rays_v = torch.matmul(pose[None, None, :3, :3],
                                  rays_v[:, :, :, None]).squeeze()  # W, H, 3
            rays_v = rays_v.transpose(0, 1).to(self.device)  # H, W, 3

            # the axis direction of the camera coordinate system in the world coordinate system
            rays_right = pose[None, :3, 0].expand(rays_v.shape).to(self.device)  # H, W, 3
            rays_down = pose[None, :3, 1].expand(rays_v.shape).to(self.device)  # H, W, 3

            # rows of V: view direction, camera x-axis, camera y-axis (all in world space)
            V_concat = torch.cat([rays_v[..., None, :],
                                  rays_right[..., None, :],
                                  rays_down[..., None, :]], dim=-2)  # (H, W, 3, 3)

            # computing the inverse may take a while if the resolution is high
            # For 512x612, it takes about 0.8ms
            V_inverse = torch.inverse(V_concat)  # (H, W, 3, 3)
            self.V_inverse_all.append(V_inverse)

        self.masks = torch.from_numpy(self.masks_np.astype(np.float32)).to(self.device)  # [n_images, H, W]
        self.intrinsics_all = torch.stack(self.intrinsics_all).to(self.device)   # [n_images, 4, 4]
        self.intrinsics_all_inv = torch.inverse(self.intrinsics_all)  # [n_images, 4, 4]
        self.focal_length = self.intrinsics_all[0][0, 0]
        self.pose_all = torch.stack(self.pose_all).to(self.device)  # [n_images, 4, 4]
        self.image_pixels = self.H * self.W
        self.V_inverse_all = torch.stack(self.V_inverse_all).to(self.device)  # [n_images, H, W, 3, 3]

        # for mesh extraction
        self.object_bbox_min = np.array([-1., -1., -1.])
        self.object_bbox_max = np.array([1.,  1.,  1.])
        print('Load data: End')

    def gen_rays_at(self, img_idx, resolution_level=1, within_mask=False):
        """
        Generate all rays at world space from one camera.

        Parameters:
        img_idx (int): View index.
        resolution_level (int, optional): Downsampling factor; the ray grid has (H // level, W // level) rays.
        within_mask (bool, optional): If True, only the rays of pixels inside the (resized) mask are returned.

        Returns:
        rays_o (torch.Tensor): Ray origins, (H', W', 3), or (N, 3) when within_mask is True.
        rays_v (torch.Tensor): Unit ray directions with the same shape as rays_o.
        """
        mask_np = self.masks_np[img_idx].astype(bool)
        # resize the mask using resolution_level
        mask_np = cv.resize(mask_np.astype(np.uint8)*255, (int(self.W // resolution_level), int(self.H // resolution_level)), interpolation=cv.INTER_NEAREST).astype(bool)

        level = resolution_level
        tx = torch.linspace(0, self.W - 1, int(self.W // level))
        ty = torch.linspace(0, self.H - 1, int(self.H // level))
        pixels_x, pixels_y = torch.meshgrid(tx, ty)
        p = torch.stack([pixels_x, pixels_y, torch.ones_like(pixels_y)], dim=-1)  # W, H, 3
        p = torch.matmul(self.intrinsics_all_inv[img_idx, None, None, :3, :3], p[:, :, :, None]).squeeze()  # W, H, 3
        rays_v = p / torch.linalg.norm(p, ord=2, dim=-1, keepdim=True)  # W, H, 3
        rays_v = torch.matmul(self.pose_all[img_idx, None, None, :3, :3], rays_v[:, :, :, None]).squeeze()  # W, H, 3
        rays_o = self.pose_all[img_idx, None, None, :3, 3].expand(rays_v.shape)  # W, H, 3
        rays_o = rays_o.transpose(0, 1)  # H, W, 3
        rays_v = rays_v.transpose(0, 1)  # H, W, 3

        if within_mask:
            return rays_o[mask_np], rays_v[mask_np]
        else:
            return rays_o, rays_v

    def gen_patches_at(self, img_idx, resolution_level=1, patch_H=3, patch_W=3):
        """
        Split the rays of one view into non-overlapping patches, in row-major order.

        Only whole patches are produced: the trailing rows/columns that do not fill a
        complete patch are dropped, so the number of patches always equals
        ``horizontal_num_patch * vertical_num_patch``.

        NOTE(review): the V-inverse patches are sliced from the full-resolution
        ``V_inverse_all`` with the indices of the (possibly downsampled) ray grid, so they only
        line up with the rays when ``resolution_level == 1`` -- confirm before using other levels.

        Parameters:
        img_idx (int): View index.
        resolution_level (int, optional): Downsampling factor of the ray grid.
        patch_H (int, optional): The height of the patches. Default is 3.
        patch_W (int, optional): The width of the patches. Default is 3.

        Returns:
        rays_o_patch_center (torch.Tensor): Ray origin of every patch center. (num_patch, 3).
        rays_d_patch_center (torch.Tensor): Ray direction of every patch center. (num_patch, 3).
        rays_o_patches_all (torch.Tensor): Ray origins. (num_patch, patch_H, patch_W, 3).
        rays_v_patches_all (torch.Tensor): Ray directions. (num_patch, patch_H, patch_W, 3).
        marching_plane_normal_patches_all (torch.Tensor): Third column of the camera-to-world
            rotation (camera z-axis in world space), repeated per patch. (num_patch, 3).
        rays_V_inverse_patches_all (torch.Tensor): (num_patch, patch_H, patch_W, 3, 3).
        horizontal_num_patch (int): Number of patches per row.
        vertical_num_patch (int): Number of patches per column.
        """
        tx = torch.linspace(0, self.W - 1, int(self.W // resolution_level))
        ty = torch.linspace(0, self.H - 1, int(self.H // resolution_level))
        pixels_y, pixels_x = torch.meshgrid(ty, tx)

        p = torch.stack([pixels_x, pixels_y, torch.ones_like(pixels_y)], dim=-1)  # H, W, 3
        p = torch.matmul(self.intrinsics_all_inv[img_idx, :3, :3], p[..., None])[..., 0]  # H, W, 3
        rays_v = p / torch.linalg.norm(p, ord=2, dim=-1, keepdim=True)  # H, W, 3
        rays_v = torch.matmul(self.pose_all[img_idx, :3, :3], rays_v[:, :, :, None])[..., 0]  # H, W, 3

        # split rays_v into non-overlapping patches
        height, width, _ = rays_v.shape
        horizontal_num_patch = width // patch_W
        vertical_num_patch = height // patch_H
        V_inverse = self.V_inverse_all[img_idx]
        rays_v_patches_all = []
        rays_V_inverse_patches_all = []
        # iterate exactly over the whole patches so the patch count matches the returned
        # horizontal/vertical counts for every patch size
        for i in range(0, vertical_num_patch * patch_H, patch_H):
            for j in range(0, horizontal_num_patch * patch_W, patch_W):
                rays_v_patches_all.append(rays_v[i:i + patch_H, j:j + patch_W])
                rays_V_inverse_patches_all.append(V_inverse[i:i + patch_H, j:j + patch_W])
        rays_v_patches_all = torch.stack(rays_v_patches_all, dim=0)  # (num_patch, patch_H, patch_W, 3)
        rays_V_inverse_patches_all = torch.stack(rays_V_inverse_patches_all, dim=0)  # (num_patch, patch_H, patch_W, 3, 3)
        rays_o_patches_all = self.pose_all[img_idx, :3, 3].expand(rays_v_patches_all.shape)  # (num_patch, patch_H, patch_W, 3)

        rays_o_patch_center = rays_o_patches_all[:, patch_H//2, patch_W//2]  # (num_patch, 3)
        rays_d_patch_center = rays_v_patches_all[:, patch_H//2, patch_W//2]  # (num_patch, 3)

        marching_plane_normal_patches_all = self.pose_all[img_idx, :3, 2].expand(rays_d_patch_center.shape)  # (num_patch, 3)

        return rays_o_patch_center, \
            rays_d_patch_center, \
            rays_o_patches_all, \
            rays_v_patches_all, \
            marching_plane_normal_patches_all, \
            rays_V_inverse_patches_all, horizontal_num_patch, vertical_num_patch

    def gen_random_patches(self, num_patch, patch_H=3, patch_W=3):
        """
        Generate random patches of rays at world space from all training viewpoints.
        X-axis right, Y-axis down

        Parameters:
        num_patch (int): The number of patches to generate.
        patch_H (int, optional): The height of the patches. Default is 3.
        patch_W (int, optional): The width of the patches. Default is 3.

        Returns:
        rays_o_patch_all (torch.Tensor): The origins of the rays in each patch. A tensor of shape (num_patch, patch_H, patch_W, 3).
        rays_d_patch_all (torch.Tensor): The directions of the rays in each patch. A tensor of shape (num_patch, patch_H, patch_W, 3).
        marching_plane_normal (torch.Tensor): The normal direction of the image/marching plane.
                Since patches are sampled from random viewpoints, this normal is only shared by the rays within one patch. A tensor of shape (num_patch, 3).
        V_inverse_patch_all (torch.Tensor): The inverse of the V matrix at patches of pixels. A tensor of shape (num_patch, patch_H, patch_W, 3, 3).
        normal (torch.Tensor): The normals at patches of pixels. A tensor of shape (num_patch, patch_H, patch_W, 3).
        mask (torch.Tensor): The mask values at patches of pixels. A tensor of shape (num_patch, patch_H, patch_W, 1).
        """
        # randomly sample center pixel locations of patches
        # assume all images have the same resolution
        patch_center_x = torch.randint(low=0+patch_W//2, high=self.W-1-patch_W//2, size=[num_patch], device=self.device)  # (num_patch, )
        patch_center_y = torch.randint(low=0+patch_H//2, high=self.H-1-patch_H//2, size=[num_patch], device=self.device)  # (num_patch, )

        # compute all pixel locations within the patches given patch size (patch_H, patch_W)
        patch_center_x_all = patch_center_x[:, None, None] + torch.arange(-patch_W//2+1, patch_W//2+1, device=self.device).repeat(patch_H, 1)   # (num_patch, patch_H, patch_W)
        patch_center_y_all = patch_center_y[:, None, None] + torch.arange(-patch_H//2+1, patch_H//2+1, device=self.device).reshape(-1, 1).repeat(1, patch_W)   # (num_patch, patch_H, patch_W)

        # randomly sample viewpoints (excluded views are never drawn)
        img_idx = np.random.choice(list(self.train_images), size=[num_patch])  # (num_patch, )
        img_idx = torch.tensor(img_idx, device=self.device)
        img_idx_expand = img_idx.view(-1, 1, 1).expand_as(patch_center_x_all)  # (num_patch, patch_H, patch_W)

        # input normals and mask values for supervision
        normal = self.normals[img_idx_expand, patch_center_y_all, patch_center_x_all]  # (num_patch, patch_H, patch_W, 3)
        V_inverse_patch_all = self.V_inverse_all[img_idx_expand, patch_center_y_all, patch_center_x_all]  # (num_patch, patch_H, patch_W, 3, 3)
        mask = self.masks[img_idx_expand, patch_center_y_all, patch_center_x_all].unsqueeze(-1)  # (num_patch, patch_H, patch_W, 1)

        # compute all ray directions within patches
        p_all = torch.stack([patch_center_x_all, patch_center_y_all, torch.ones_like(patch_center_y_all)], dim=-1).float().to(self.device)  # (num_patch, patch_H, patch_W, 3)
        p_all = torch.matmul(self.intrinsics_all_inv[img_idx_expand, :3, :3], p_all[..., None])[..., 0]  # (num_patch, patch_H, patch_W, 3)
        p_norm_all = torch.linalg.norm(p_all, ord=2, dim=-1, keepdim=True)  # (num_patch, patch_H, patch_W, 1)
        rays_d_patch_all = p_all / p_norm_all  # (num_patch, patch_H, patch_W, 3)
        rays_d_patch_all = torch.matmul(self.pose_all[img_idx, None, None, :3, :3], rays_d_patch_all[..., None])[..., 0]  # (num_patch, patch_H, patch_W, 3)
        rays_o_patch_all = self.pose_all[img_idx, None, None, :3, 3].expand(rays_d_patch_all.shape)  # (num_patch, patch_H, patch_W, 3)

        # the normal direction of the image/marching plane is the 3rd column of the
        # camera-to-world rotation, i.e. the camera z-axis expressed in world space
        marching_plane_normal = self.pose_all[img_idx, :3, 2].expand((num_patch, 3))  # (num_patch, 3)

        return rays_o_patch_all, \
            rays_d_patch_all, \
            marching_plane_normal, \
            V_inverse_patch_all, \
            normal, \
            mask

    def near_far_from_sphere(self, rays_o, rays_d):
        """
        This function calculates the ray parameters at which rays enter and leave the unit sphere at the origin.

        Parameters:
        rays_o (torch.Tensor): Origin of the rays. A tensor of shape (N, 3) where N is the number of rays.
        rays_d (torch.Tensor): Direction of the rays. A tensor of shape (N, 3) where N is the number of rays.

        Returns:
        near (torch.Tensor): Ray parameter t of the near intersection (rays_o + t * rays_d). A tensor of shape (N, ).
        far (torch.Tensor): Ray parameter t of the far intersection. A tensor of shape (N, ).
        Rays that miss the sphere have a negative discriminant and yield NaN for both values.
        """
        # solve |rays_o + t * rays_d|^2 = 1, i.e. a*t^2 + b*t + c = 0
        a = torch.sum(rays_d**2, dim=-1, keepdim=True)
        b = 2.0 * torch.sum(rays_o * rays_d, dim=-1, keepdim=True)
        c = torch.sum(rays_o**2, dim=-1, keepdim=True) - 1.0
        mid = 0.5 * (-b) / a
        half_chord = torch.sqrt(b ** 2 - 4 * a * c) / (2 * a)
        near = mid - half_chord
        far = mid + half_chord
        return near[..., 0], far[..., 0]

    def image_at(self, idx, resolution_level):
        """Read image ``idx`` from ``self.images_lis`` and downsample it by ``resolution_level``.

        Raises:
        AttributeError: If ``self.images_lis`` has not been set. ``__init__`` does not populate
            it, so it must be assigned by the caller before this method can be used.
        """
        images_lis = getattr(self, 'images_lis', None)
        if images_lis is None:
            raise AttributeError(
                "Dataset.image_at requires `images_lis` (a list of image paths), "
                "which is not populated by Dataset.__init__")
        img = cv.imread(images_lis[idx])
        return (cv.resize(img, (self.W // resolution_level, self.H // resolution_level))).clip(0, 255)


================================================
FILE: models/fields.py
================================================
import torch
import torch.nn as nn
import numpy as np
import tinycudann as tcnn
from icecream import ic

class SDFNetwork(nn.Module):
    """
    MLP that maps input points to a signed distance value (channel 0) plus any extra output channels.

    The points can optionally be passed through a tiny-cuda-nn encoding first. When an encoding is
    used, the first layer consumes either the encoding alone or, with `input_concat=True`, the raw
    input concatenated with the encoding. Without an encoding the raw input is fed to the MLP.

    Parameters:
    d_in: Dimension of the input points.
    d_out: Number of output channels.
    d_hidden: Width of every hidden layer.
    n_layers: Number of hidden layers.
    skip_in: Layer indices whose input is the previous activation concatenated with the network input.
    bias: Magnitude of the constant used to initialise the last layer's bias under geometric_init.
    geometric_init: Use the hand-crafted weight initialisation below instead of the PyTorch default.
    weight_norm: Wrap every linear layer in nn.utils.weight_norm.
    inside_outside: Flips the sign of the last layer's initialisation.
    encoding_config: Config dict forwarded to tcnn.Encoding, or None for no encoding.
    input_concat: Concatenate the raw input in front of the encoding (only meaningful with an encoding).
    """

    def __init__(self,
                 d_in,
                 d_out,
                 d_hidden,
                 n_layers,
                 skip_in=(4,),
                 bias=0.5,
                 geometric_init=True,
                 weight_norm=True,
                 inside_outside=False,
                 encoding_config=None,
                 input_concat=False):
        super(SDFNetwork, self).__init__()
        self.input_concat = input_concat

        dims = [d_in] + [d_hidden for _ in range(n_layers)] + [d_out]

        if encoding_config is not None:
            self.encoding = tcnn.Encoding(d_in, encoding_config).to(torch.float32)
            dims[0] = self.encoding.n_output_dims
            if input_concat:
                dims[0] += d_in
        else:
            self.encoding = None

        self.num_layers = len(dims)
        self.skip_in = skip_in

        # number of currently enabled encoding "bands"; raised step by step via increase_bandwidth()
        # (attribute name kept as-is because code outside this class may read it)
        self.bindwidth = 0
        # the encoding is optional, so only query its width when it exists
        self.enc_dim = self.encoding.n_output_dims if self.encoding is not None else 0

        for l in range(0, self.num_layers - 1):
            # the layer feeding a skip connection outputs fewer channels so that, after the
            # network input is concatenated back in, the width is dims[l + 1] again
            if l + 1 in self.skip_in:
                out_dim = dims[l + 1] - dims[0]
            else:
                out_dim = dims[l + 1]

            lin = nn.Linear(dims[l], out_dim)

            if geometric_init:
                if l == self.num_layers - 2:
                    # last layer: near-constant weights and a constant bias
                    # (presumably so the initial SDF resembles a sphere of radius `bias` - TODO confirm)
                    if not inside_outside:
                        torch.nn.init.normal_(lin.weight, mean=np.sqrt(np.pi) / np.sqrt(dims[l]), std=0.0001)
                        torch.nn.init.constant_(lin.bias, -bias)
                    else:
                        torch.nn.init.normal_(lin.weight, mean=-np.sqrt(np.pi) / np.sqrt(dims[l]), std=0.0001)
                        torch.nn.init.constant_(lin.bias, bias)
                elif self.encoding is not None and l == 0:
                    # first layer: only the first 3 input columns start non-zero
                    # NOTE(review): the hard-coded 3 assumes those columns are the raw xyz input,
                    # i.e. d_in == 3 and input_concat=True - confirm for other configurations
                    torch.nn.init.constant_(lin.bias, 0.0)
                    torch.nn.init.constant_(lin.weight[:, 3:], 0.0)
                    torch.nn.init.normal_(lin.weight[:, :3], 0.0, np.sqrt(2) / np.sqrt(out_dim))
                elif self.encoding is not None and l in self.skip_in:
                    # skip layer: zero the trailing (dims[0] - 3) input columns
                    torch.nn.init.constant_(lin.bias, 0.0)
                    torch.nn.init.normal_(lin.weight, 0.0, np.sqrt(2) / np.sqrt(out_dim))
                    torch.nn.init.constant_(lin.weight[:, -(dims[0] - 3):], 0.0)
                else:
                    torch.nn.init.constant_(lin.bias, 0.0)
                    torch.nn.init.normal_(lin.weight, 0.0, np.sqrt(2) / np.sqrt(out_dim))
            if weight_norm:
                lin = nn.utils.weight_norm(lin)

            setattr(self, "lin" + str(l), lin)
        self.activation = nn.Softplus(beta=100)

    def increase_bandwidth(self):
        """Enable two more encoding channels in subsequent forward passes."""
        self.bindwidth += 1

    def forward(self, inputs):
        """
        Evaluate the network.

        Parameters:
        inputs (torch.Tensor): Points of shape (N, d_in).

        Returns:
        torch.Tensor of shape (N, d_out).
        """
        if self.encoding is not None:
            encoded = self.encoding(inputs).to(torch.float32)

            # zero every encoding channel beyond the currently enabled bandwidth
            enc_mask = torch.ones(self.enc_dim, dtype=torch.bool, device=encoded.device, requires_grad=False)
            enc_mask[self.bindwidth*2:] = 0
            encoded = encoded * enc_mask

            if self.input_concat:
                inputs = torch.cat([inputs, encoded], dim=1)
            else:
                # the first layer was sized for the encoding alone, so feed it the encoding
                inputs = encoded

        x = inputs
        for l in range(0, self.num_layers - 1):
            lin = getattr(self, "lin" + str(l))

            if l in self.skip_in:
                x = torch.cat([x, inputs], 1) / np.sqrt(2)

            x = lin(x)

            # no activation after the output layer
            if l < self.num_layers - 2:
                x = self.activation(x)
        return x

    def sdf(self, x):
        """Return only the first output channel, shape (N, 1)."""
        return self.forward(x)[:, :1]

    def sdf_hidden_appearance(self, x):
        """Return all output channels, shape (N, d_out)."""
        return self.forward(x)

    @torch.enable_grad()
    def gradient(self, x):
        """
        Gradient of the SDF channel with respect to the input points via autograd.

        Note: enables requires_grad on `x` in place.

        Returns:
        torch.Tensor of shape (N, 1, d_in); the graph is kept so the result can be differentiated again.
        """
        x.requires_grad_(True)
        y = self.sdf(x)
        d_output = torch.ones_like(y, requires_grad=False, device=y.device)
        gradients = torch.autograd.grad(
            outputs=y,
            inputs=x,
            grad_outputs=d_output,
            create_graph=True,
            retain_graph=True,
            only_inputs=True)[0]
        return gradients.unsqueeze(1)

    @torch.enable_grad()
    def divergence(self, y, x):
        """Sum over i of d y[..., i] / d x[..., i], computed with one autograd call per component."""
        div = 0.
        for i in range(y.shape[-1]):
            div += torch.autograd.grad(y[..., i], x, torch.ones_like(y[..., i]), create_graph=True)[0][..., i:i + 1]
        return div

    @torch.enable_grad()
    def laplace(self, x):
        """Laplacian of the SDF channel at `x`, computed as the divergence of its gradient."""
        return self.divergence(self.gradient(x), x)


class SingleVarianceNetwork(nn.Module):
    """
    Holds one learnable scalar `variance` and exposes exp(10 * variance) as a per-sample column.

    Parameters:
    init_val: Initial value of the `variance` parameter.
    """

    def __init__(self, init_val):
        super(SingleVarianceNetwork, self).__init__()
        self.register_parameter('variance', nn.Parameter(torch.tensor(init_val)))

    def forward(self, x):
        """
        Parameters:
        x: Any sized object; only len(x) is used.

        Returns:
        torch.Tensor of shape (len(x), 1) filled with exp(variance * 10).
        """
        # allocate on the parameter's device so the module does not depend on a
        # globally configured default tensor type
        ones = torch.ones([len(x), 1], device=self.variance.device)
        return ones * torch.exp(self.variance * 10.0)

================================================
FILE: models/renderer.py
================================================
import torch
import numpy as np
import mcubes
from tqdm import tqdm
from nerfacc import ContractionType, OccupancyGrid, ray_marching, \
    render_weight_from_alpha_patch_based, accumulate_along_rays_patch_based, \
    render_weight_from_alpha, accumulate_along_rays

def extract_fields(bound_min, bound_max, resolution, query_func):
    """
    Sample `query_func` on a regular resolution^3 grid spanning [bound_min, bound_max].

    The grid is evaluated in sub-blocks of at most 64 samples per axis.

    Parameters:
    bound_min, bound_max: Indexable with 0..2; per-axis lower and upper grid limits.
    resolution: Number of samples along each axis.
    query_func: Callable taking an (M, 3) tensor of points and returning M values.

    Returns:
    np.ndarray of shape (resolution, resolution, resolution), dtype float32.
    """
    chunk = 64
    x_chunks, y_chunks, z_chunks = [
        torch.linspace(bound_min[axis], bound_max[axis], resolution).split(chunk)
        for axis in range(3)
    ]

    volume = np.zeros([resolution, resolution, resolution], dtype=np.float32)
    with torch.no_grad():
        for xi, xs in tqdm(enumerate(x_chunks)):
            x0 = xi * chunk
            for yi, ys in enumerate(y_chunks):
                y0 = yi * chunk
                for zi, zs in enumerate(z_chunks):
                    z0 = zi * chunk
                    grid_x, grid_y, grid_z = torch.meshgrid(xs, ys, zs)
                    # one row per grid point: (x, y, z)
                    pts = torch.stack([grid_x.reshape(-1), grid_y.reshape(-1), grid_z.reshape(-1)], dim=-1)
                    values = query_func(pts).reshape(len(xs), len(ys), len(zs))
                    volume[x0: x0 + len(xs), y0: y0 + len(ys), z0: z0 + len(zs)] = values.detach().cpu().numpy()
    return volume


def extract_geometry(bound_min, bound_max, resolution, threshold, query_func):
    """
    Extract a triangle mesh at level `threshold` of `query_func` with marching cubes.

    Parameters:
    bound_min, bound_max: Tensors with the per-axis limits of the sampled box.
    resolution: Number of grid samples per axis.
    threshold: Iso-value passed to mcubes.marching_cubes.
    query_func: Callable evaluated on the grid by extract_fields.

    Returns:
    (vertices, triangles): vertices rescaled from grid-index units into the box coordinates,
    and the triangles exactly as returned by mcubes.
    """
    field = extract_fields(bound_min, bound_max, resolution, query_func)
    vertices, triangles = mcubes.marching_cubes(field, threshold)

    upper = bound_max.detach().cpu().numpy()
    lower = bound_min.detach().cpu().numpy()
    extent = (upper - lower)[None, :]

    # grid index i in [0, resolution - 1] maps linearly onto [lower, upper]
    vertices = vertices / (resolution - 1.0) * extent + lower[None, :]
    return vertices, triangles


class NeuSRenderer:
    """
    Renders normals from an SDF network by alpha-compositing SDF gradients along rays.

    Samples are placed with NerfAcc ray marching on an occupancy grid over the box [-1, 1]^3.
    `render` works on pixel patches (one marched centre ray per patch, remaining pixels sampled
    on the same planes); `render_normal_pixel_based` works on independent rays.

    Parameters:
    sdf_network: Callable mapping (N, 3) points to SDF values; must also offer `gradient` and `sdf`.
    deviation_network: Callable returning the sharpness `inv_s` of the sigmoid used to turn SDF into alpha.
    gradient_method: Gradient estimator used by `render` in 'train' mode: "dfd", "fd" or "ad".
    """

    def __init__(self, sdf_network, deviation_network,
                 gradient_method="dfd"):
        self.sdf_network = sdf_network
        self.deviation_network = deviation_network

        # define the occ grid, see NerfAcc for more details
        self.scene_aabb = torch.as_tensor([-1., -1., -1., 1., 1., 1.], dtype=torch.float32)
        # define the contraction_type for scene contraction
        self.contraction_type = ContractionType.AABB
        # create Occupancy Grid
        self.occupancy_grid = OccupancyGrid(
            roi_aabb=self.scene_aabb,
            resolution=128,  # if res is different along different axis, use [256,128,64]
            contraction_type=self.contraction_type).to("cuda")
        self.sampling_step_size = 0.01  # ray marching step size, will be modified during training
        self.gradient_method = gradient_method   # dfd or fd or ad

    def occ_eval_fn(self, x):
        """Occupancy estimate sigmoid(-80 * sdf) used to update the occupancy grid from the current SDF."""
        sdf = self.sdf_network(x)[..., :1]
        alpha = torch.sigmoid(- sdf * 80)  # occ grids with alpha below the occ threshold will be set as 0
        return alpha

    def _alpha_along_rays(self, rays_o, rays_d, t_starts, t_ends, ray_indices, ret_sdf=False):
        """
        Compute the opacity of each ray-marching interval from the SDF at its two ends.

        Consecutive intervals usually share an end point (end of sample i == start of sample i + 1),
        so the SDF is queried at every start and only at those ends that differ from the next start.

        Parameters:
        rays_o, rays_d: (num_rays, 3) ray origins and directions.
        t_starts, t_ends: (num_samples, 1) interval limits along the rays.
        ray_indices: (num_samples,) index of the ray each sample belongs to.
        ret_sdf: Also return the SDF at the interval starts and ends.

        Returns:
        alpha of shape (num_samples, 1), optionally followed by the start and end SDF values.
        """
        ray_indices = ray_indices.long()
        t_origins = rays_o[ray_indices]
        t_dirs = rays_d[ray_indices]
        positions_starts = t_origins + t_dirs * t_starts
        positions_ends = t_origins + t_dirs * t_ends

        # start of the following sample; the last sample has no successor and reuses its own start
        t_starts_shift_left = torch.cat([t_starts[1:], t_starts[-1:]], 0)

        # True where the interval end is NOT the next sample's start, so it needs its own SDF query
        diff_mask = ((t_ends - t_starts_shift_left) != 0).squeeze()
        positions_ends_diff = positions_ends[diff_mask].reshape(-1, 3)

        positions_all = torch.cat([positions_starts, positions_ends_diff], 0)

        sdf_all = self.sdf_network(positions_all)
        sdf_start = sdf_all[:positions_starts.shape[0]]
        sdf_end_diff = sdf_all[positions_starts.shape[0]:]

        # end SDF = next sample's start SDF, overwritten where the end was queried separately
        sdf_start_shift_left = torch.cat([sdf_start[1:], sdf_start[-1:]], 0)
        sdf_start_shift_left[diff_mask] = sdf_end_diff

        inv_s = self.deviation_network(torch.zeros([1, 3]))[:, :1].clip(1e-6, 1e6)  # Single parameter
        inv_s = inv_s.expand(sdf_start.shape[0], 1)

        prev_cdf = torch.sigmoid(sdf_start * inv_s)
        next_cdf = torch.sigmoid(sdf_start_shift_left * inv_s)

        p = prev_cdf - next_cdf
        c = prev_cdf

        alpha = ((p + 1e-5) / (c + 1e-5)).view(-1).clip(0.0, 1.0)
        alpha = alpha.reshape(-1, 1)
        if ret_sdf:
            return alpha, sdf_start, sdf_start_shift_left
        return alpha

    def render(self, rays_o_patch_all,  # (num_patch, patch_H, patch_W, 3)
                     rays_d_patch_all,  # (num_patch, patch_H, patch_W, 3)
                     marching_plane_normal,  # (num_patch, 3)
                     near,  # (num_patch,)
                     far,  # (num_patch,)
                     V_inverse_patch_all,  # (num_patch, patch_H, patch_W, 3, 3)
                     val_gradient_method='dfd',
                     mode='train'):
        """
        Render composited normals for pixel patches.

        Only the centre ray of every patch is ray-marched. The other pixels of the patch are sampled
        where their rays cross the planes (normal `marching_plane_normal`) through the centre samples.

        Parameters:
        val_gradient_method: Gradient estimator used when mode == 'eval'.
        mode: 'train' uses self.gradient_method, 'eval' uses val_gradient_method.

        Returns:
        dict with 's_val', 'weight_sum', 'gradients' (num_samples, patch_H, patch_W, 3),
        'comp_normal' (num_patch, patch_H, patch_W, 3) and 'samples_per_ray'.
        If ray marching yields no sample at all, only a zero 'comp_normal' is returned.

        Raises:
        ValueError: for an unknown `mode` or gradient method.
        """
        if mode == 'train':
            gradient_method = self.gradient_method
        elif mode == 'eval':
            gradient_method = val_gradient_method
        else:
            raise ValueError(f"unknown mode {mode!r}; expected 'train' or 'eval'")
        if gradient_method not in ("dfd", "ad", "fd"):
            raise ValueError(f"unknown gradient method {gradient_method!r}; expected 'dfd', 'ad' or 'fd'")

        # patch size, should be odd
        patch_H = rays_o_patch_all.shape[1]
        patch_W = rays_o_patch_all.shape[2]
        num_patch = rays_o_patch_all.shape[0]

        # extract camera location and ray direction of the patches' center pixels
        rays_o_patch_center = rays_o_patch_all[:, patch_H//2, patch_W//2]  # (num_patch, 3)
        rays_d_patch_center = rays_d_patch_all[:, patch_H//2, patch_W//2]  # (num_patch, 3)

        def alpha_fn_patch_center(t_starts, t_ends, ray_indices, ret_sdf=False):
            # the function used in ray marching
            return self._alpha_along_rays(rays_o_patch_center, rays_d_patch_center,
                                          t_starts, t_ends, ray_indices, ret_sdf=ret_sdf)

        with torch.no_grad():
            patch_indices, t_starts_patch_center, t_ends_patch_center = ray_marching(
                rays_o_patch_center, rays_d_patch_center,
                t_min=near,
                t_max=far,
                grid=self.occupancy_grid,
                render_step_size=self.sampling_step_size,
                stratified=True,
                cone_angle=0.0,
                early_stop_eps=1e-8,
                alpha_fn=alpha_fn_patch_center,
            )
            samples_per_ray = patch_indices.shape[0] / num_patch
            if patch_indices.shape[0] == 0:  # all patch center rays are within the zero region of the occ grid. skip this iteration.
                return {
                    "comp_normal": torch.zeros([num_patch, patch_H, patch_W, 3], device=rays_o_patch_center.device)
                }

            num_samples = patch_indices.shape[0]
            patch_indices = patch_indices.long()

            # compute the sampling distance on remaining rays:
            # t_pixel = t_center * (d_center . n) / (d_pixel . n) keeps every pixel's sample on the
            # plane with normal n that passes through the centre ray's sample
            center_dot_normal = (rays_d_patch_center * marching_plane_normal).sum(-1, keepdim=True)[patch_indices][:, None, None, :]  # (num_samples, 1, 1, 1)
            pixel_dot_normal = (rays_d_patch_all * marching_plane_normal[:, None, None, :]).sum(-1, keepdim=True)[patch_indices]  # (num_samples, patch_H, patch_W, 1)
            t_starts_patch_all = t_starts_patch_center[:, None, None, :] * center_dot_normal / pixel_dot_normal
            t_ends_patch_all = t_ends_patch_center[:, None, None, :] * center_dot_normal / pixel_dot_normal

            # same shared-end-point trick as in _alpha_along_rays, decided on the centre rays
            t_starts_patch_center_shift_left = torch.cat([t_starts_patch_center[1:], t_starts_patch_center[-1:]], 0)
            diff_mask = ((t_ends_patch_center - t_starts_patch_center_shift_left) != 0)[..., 0]
            positions_starts_patch_all = rays_o_patch_all[patch_indices] + rays_d_patch_all[patch_indices] * t_starts_patch_all
            positions_ends_patch_all = rays_o_patch_all[patch_indices] + rays_d_patch_all[patch_indices] * t_ends_patch_all  # (num_samples, patch_H, patch_W, 3)
            positions_ends_diff = positions_ends_patch_all[diff_mask]
            positions_all = torch.cat([positions_starts_patch_all, positions_ends_diff], 0)
            positions_all_flat = positions_all.reshape(-1, 3)

        sdf_all = self.sdf_network(positions_all_flat)
        sdf_all = sdf_all.reshape(*positions_all.shape[:-1], 1)

        sdf_starts_patch_all = sdf_all[:positions_starts_patch_all.shape[0]]

        sdf_end_diff = sdf_all[positions_starts_patch_all.shape[0]:]
        sdf_ends_patch_all = torch.cat([sdf_starts_patch_all[1:], sdf_starts_patch_all[-1:]], 0)
        sdf_ends_patch_all[diff_mask] = sdf_end_diff

        inv_s = self.deviation_network(torch.zeros([1, 3]))[:, :1].clip(1e-6, 1e6)  # Single parameter

        prev_cdf = torch.sigmoid(sdf_starts_patch_all * inv_s)  # (num_samples, patch_H, patch_W, 1)
        next_cdf = torch.sigmoid(sdf_ends_patch_all * inv_s)   # (num_samples, patch_H, patch_W, 1)

        p = prev_cdf - next_cdf
        c = prev_cdf

        alpha = ((p + 1e-5) / (c + 1e-5)).clip(0.0, 1.0)  # (num_samples, patch_H, patch_W, 1)
        weights_cuda = render_weight_from_alpha_patch_based(alpha.reshape(num_samples, patch_H*patch_W, 1), patch_indices)

        if gradient_method == "dfd":
            with torch.no_grad():
                # distance between neighboring points on the same marching plane
                dist_x = torch.norm(positions_starts_patch_all[:, :, 1:, :] -
                                    positions_starts_patch_all[:, :, :-1, :], dim=-1, keepdim=True)  # (num_samples, patch_H, patch_W-1, 1)
                dist_y = torch.norm(positions_starts_patch_all[:, 1:, :, :] -
                                    positions_starts_patch_all[:, :-1, :, :], dim=-1, keepdim=True)  # (num_samples, patch_H-1, patch_W, 1)

            # directional derivatives along the ray direction
            # forward difference
            df_dt = (sdf_ends_patch_all - sdf_starts_patch_all) / (t_ends_patch_all - t_starts_patch_all)  # (num_samples, patch_H, patch_W, 1)
            # directional derivatives along the image's x-direction
            # central difference
            df_dx = (sdf_starts_patch_all[:, :, 2:] - sdf_starts_patch_all[:, :, :-2]) / (dist_x[:, :, :-1] + dist_x[:, :, 1:] )  # (num_samples, patch_H, patch_W-2, 1)
            # directional derivatives along the image's y-direction
            # central difference
            df_dy = (sdf_starts_patch_all[:, 2:, :] - sdf_starts_patch_all[:, :-2, :]) / (dist_y[:, 1:, :] + dist_y[:, :-1, :])  # (num_samples, patch_H-2, patch_W, 1)

            # for points only have one-side neighbor point,
            # we use forward or backward difference correspondingly
            df_dx_left_boundary = (sdf_starts_patch_all[:, :, 1:2] - sdf_starts_patch_all[:, :, 0:1]) / dist_x[:, :, 0:1]  # (num_samples, patch_H, 1, 1)
            df_dx_right_boundary = (sdf_starts_patch_all[:, :, -1:] - sdf_starts_patch_all[:, :, -2:-1]) / dist_x[:, :, -1:]  # (num_samples, patch_H, 1, 1)
            df_dy_top_boundary = (sdf_starts_patch_all[:, 1:2, :] - sdf_starts_patch_all[:, 0:1, :]) / dist_y[:, 0:1, :]  # (num_samples, 1, patch_W, 1)
            df_dy_bottom_boundary = (sdf_starts_patch_all[:, -1:, :] - sdf_starts_patch_all[:, -2:-1, :]) / dist_y[:, -1:, :]  # (num_samples, 1, patch_W, 1)

            # concat the directional derivatives for boundary points and central points
            df_dx = torch.cat([df_dx_left_boundary, df_dx, df_dx_right_boundary], dim=2)  # (num_samples, patch_H, patch_W, 1)
            df_dy = torch.cat([df_dy_top_boundary, df_dy, df_dy_bottom_boundary], dim=1)  # (num_samples, patch_H, patch_W, 1)

            # concat the directional partial derivatives in three directions
            projected_gradients = torch.cat([df_dt,
                                             df_dx,
                                             df_dy], dim=-1)  # (num_samples, patch_H, patch_W, 3)

            # recover the gradients from directional partial derivatives using the inverse of known directions
            V_inverse = V_inverse_patch_all[patch_indices]  # (num_samples, patch_H, patch_W, 3, 3)
            gradients = (V_inverse @ projected_gradients[..., None])[..., 0]  # (num_samples, patch_H, patch_W, 3)

        elif gradient_method == "ad":
            gradients = self.sdf_network.gradient(positions_starts_patch_all.reshape(-1, 3)).reshape(num_samples, patch_H, patch_W, 3)

        else:  # "fd"
            # 6-point finite difference
            self.fd_epsilon = 1e-3
            eps = self.fd_epsilon
            device = positions_starts_patch_all.device
            # query order: x-, x+, y-, y+, z-, z+
            offsets = ([-eps, 0, 0], [eps, 0, 0],
                       [0, -eps, 0], [0, eps, 0],
                       [0, 0, -eps], [0, 0, eps])
            positions_concat = torch.cat(
                [positions_starts_patch_all + torch.tensor(offset, device=device) for offset in offsets], 0).to(
                torch.float32).reshape(-1, 3)

            sdf_concat = self.sdf_network(positions_concat).reshape(6, num_samples, patch_H, patch_W, 1)
            sdf_xn, sdf_xp, sdf_yn, sdf_yp, sdf_zn, sdf_zp = sdf_concat

            df_dx = (sdf_xp - sdf_xn) / (2 * eps)
            df_dy = (sdf_yp - sdf_yn) / (2 * eps)
            df_dz = (sdf_zp - sdf_zn) / (2 * eps)

            # concatenate (not stack) so the result is (num_samples, patch_H, patch_W, 3),
            # the same layout the "dfd" and "ad" branches return
            gradients = torch.cat([df_dx, df_dy, df_dz], -1)

        weights_sum_cuda = accumulate_along_rays_patch_based(weights_cuda, patch_indices, n_patches=num_patch)
        weights_sum = weights_sum_cuda.reshape(num_patch, patch_H, patch_W, 1)

        comp_normals_cuda = accumulate_along_rays_patch_based(weights_cuda, patch_indices, values=gradients.reshape(num_samples,patch_H * patch_W, 3),n_patches=num_patch)
        comp_normal = comp_normals_cuda.reshape(num_patch, patch_H, patch_W, 3)

        return {
            's_val': 1/inv_s,
            'weight_sum': weights_sum,
            'gradients': gradients,
            "comp_normal": comp_normal,
            "samples_per_ray": samples_per_ray,
        }

    @torch.no_grad()
    def render_normal_pixel_based(self, rays_o, rays_d, near, far):
        """
        Render a composited normal and depth for independent rays.

        Parameters:
        rays_o, rays_d: (num_rays, 3) ray origins and directions.
        near, far: Per-ray marching limits; squeezed before being passed to ray_marching.

        Returns:
        (comp_normal, comp_depth): weighted sums over each ray's samples of the autograd SDF
        gradient and of the interval midpoint t, respectively.
        """
        def alpha_fn(t_starts, t_ends, ray_indices, ret_sdf=False):
            return self._alpha_along_rays(rays_o, rays_d, t_starts, t_ends, ray_indices, ret_sdf=ret_sdf)

        ray_indices, t_starts, t_ends = ray_marching(
            rays_o, rays_d,
            t_min=near.squeeze(),
            t_max=far.squeeze(),
            grid=self.occupancy_grid,
            render_step_size=self.sampling_step_size,
            stratified=True,
            cone_angle=0.0,
            alpha_thre=0.0,
            early_stop_eps=1e-3,
            alpha_fn=alpha_fn,
        )

        alpha = alpha_fn(t_starts, t_ends, ray_indices)

        ray_indices = ray_indices.long()
        t_origins = rays_o[ray_indices]
        t_dirs = rays_d[ray_indices]
        midpoints = (t_starts + t_ends) / 2.
        positions = t_origins + t_dirs * midpoints
        # sdf_network.gradient re-enables autograd internally, so it works under no_grad
        gradients = self.sdf_network.gradient(positions).reshape(-1, 3)

        n_rays = rays_o.shape[0]
        weights = render_weight_from_alpha(alpha, ray_indices=ray_indices, n_rays=n_rays)  # [n_samples, 1]
        comp_normal = accumulate_along_rays(weights, ray_indices, values=gradients, n_rays=n_rays)
        comp_depth = accumulate_along_rays(weights, ray_indices, values=midpoints, n_rays=n_rays)
        return comp_normal, comp_depth

    def extract_geometry(self, bound_min, bound_max, resolution, threshold=0.0):
        """Run the module-level extract_geometry on the negated SDF and return (vertices, triangles)."""
        return extract_geometry(bound_min,
                                bound_max,
                                resolution=resolution,
                                threshold=threshold,
                                query_func=lambda pts: -self.sdf_network.sdf(pts))


================================================
FILE: run_diligent.sh
================================================
# Run exp_runner.py with the DiLiGenT config once per listed object, sequentially.
for obj_name in buddha pot2 reading bear cow; do
     # quote the expansion so the name always reaches the script as a single argument
     python exp_runner.py --conf config/diligent.conf --obj_name "$obj_name"
done


================================================
FILE: run_own_object.sh
================================================
# Run exp_runner.py with the own-objects config once per listed object, sequentially.
for obj_name in lion dog1 woman; do
     # quote the expansion so the name always reaches the script as a single argument
     python exp_runner.py --conf config/own_objects.conf --obj_name "$obj_name"
done

================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/.github/workflows/building.yml
================================================
name: Building Wheels

on: [workflow_dispatch]

jobs:

  # Builds one binary wheel per (os, python, torch, cuda) matrix combination and
  # uploads it to the nerfacc S3 bucket.
  wheel:
    runs-on: ${{ matrix.os }}
    environment: production

    strategy:
      # keep building the remaining combinations when one of them fails
      fail-fast: false
      matrix:
        os: [ubuntu-18.04, windows-2019]
        python-version: ['3.7', '3.8', '3.9']
        torch-version: [1.10.0, 1.11.0, 1.12.0, 1.13.0]
        cuda-version: ['cu102', 'cu113', 'cu116', 'cu117']
        # os: [ubuntu-18.04]
        # python-version: ['3.9']
        # torch-version: [1.10.0]
        # cuda-version: ['cu102']
        # torch/CUDA (and OS) pairings that are skipped
        # (presumably because no matching PyTorch build exists - TODO confirm)
        exclude:
          - torch-version: 1.10.0
            cuda-version: 'cu116'
          - torch-version: 1.10.0
            cuda-version: 'cu117'
          - torch-version: 1.11.0
            cuda-version: 'cu116'
          - torch-version: 1.11.0
            cuda-version: 'cu117'
          - torch-version: 1.12.0
            cuda-version: 'cu117'
          - torch-version: 1.13.0
            cuda-version: 'cu102'
          - torch-version: 1.13.0
            cuda-version: 'cu113'
          - os: windows-2019
            torch-version: 1.11.0
            cuda-version: 'cu102'
          - os: windows-2019
            torch-version: 1.12.0
            cuda-version: 'cu102'
          # - os: macos-10.15
          #   cuda-version: 'cu102'
          # - os: macos-10.15
          #   cuda-version: 'cu113'
          # - os: macos-10.15
          #   cuda-version: 'cu116'
          # - os: macos-10.15
          #   cuda-version: 'cu117'

    steps:
      - uses: actions/checkout@v2

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}

      - name: Upgrade pip
        run: |
          pip install --upgrade setuptools
          pip install ninja

      - name: Free up disk space
        if: ${{ runner.os == 'Linux' }}
        run: |
          sudo rm -rf /usr/share/dotnet

      # runs the per-CUDA-version, per-OS install script from .github/workflows/cuda/
      # (the matrix above has no 'cpu' entry, so this condition is always true for it)
      - name: Install CUDA ${{ matrix.cuda-version }}
        if: ${{ matrix.cuda-version != 'cpu' }}
        run: |
          bash .github/workflows/cuda/${{ matrix.cuda-version }}-${{ runner.os }}.sh

      - name: Install PyTorch ${{ matrix.torch-version }}+${{ matrix.cuda-version }}
        run: |
          pip install torch==${{ matrix.torch-version }} --extra-index-url https://download.pytorch.org/whl/${{ matrix.cuda-version }}
          python -c "import torch; print('PyTorch:', torch.__version__)"
          python -c "import torch; print('CUDA:', torch.version.cuda)"
          python -c "import torch; print('CUDA Available:', torch.cuda.is_available())"

      # replaces lines 31-38 of the installed ATen/Parallel.h with a single declaration
      - name: Patch PyTorch static constexpr on Windows
        if: ${{ runner.os == 'Windows' }}
        run: |
          Torch_DIR=`python -c 'import os; import torch; print(os.path.dirname(torch.__file__))'`
          sed -i '31,38c\
          TORCH_API void lazy_init_num_threads();' ${Torch_DIR}/include/ATen/Parallel.h
        shell: bash

      # rewrites nerfacc/version.py to "<version>+pt<torch><cuda>"; the torch tag drops the
      # last two characters of the version and all dots (e.g. 1.10.0 -> pt110)
      - name: Set version
        if: ${{ runner.os != 'macOS' }}
        run: |
          VERSION=`sed -n 's/^__version__ = "\(.*\)"/\1/p' nerfacc/version.py`
          TORCH_VERSION=`echo "pt${{ matrix.torch-version }}" | sed "s/..$//" | sed "s/\.//g"`
          CUDA_VERSION=`echo ${{ matrix.cuda-version }}`
          echo "New version name: $VERSION+$TORCH_VERSION$CUDA_VERSION"
          sed -i "s/$VERSION/$VERSION+$TORCH_VERSION$CUDA_VERSION/" nerfacc/version.py
        shell:
          bash

      # only runs for a 'cpu' matrix entry, which the current matrix does not contain
      - name: Install main package for CPU
        if: ${{ matrix.cuda-version == 'cpu' }}
        run: |
          FORCE_ONLY_CPU=1 pip install -e .
        shell:
          bash

      # the *-env.sh script is sourced first so its variables apply to the build
      - name: Install main package for GPU
        if: ${{ matrix.cuda-version != 'cpu' }}
        run: |
          source .github/workflows/cuda/${{ matrix.cuda-version }}-${{ runner.os }}-env.sh
          pip install .
        shell:
          bash

      - name: Test installation
        run: |
          python -c "import nerfacc; print('nerfacc:', nerfacc.__version__)"

      - name: Build wheel
        run: |
          pip install wheel
          source .github/workflows/cuda/${{ matrix.cuda-version }}-${{ runner.os }}-env.sh
          python setup.py bdist_wheel --dist-dir=dist
        shell: bash

      - name: Configure AWS
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: us-west-2

      # syncs dist/ into a per-(torch, cuda) prefix of the bucket with a read grant for AllUsers
      - name: Upload wheel
        run: |
          aws s3 sync dist s3://nerfacc-bucket/whl/torch-${{ matrix.torch-version }}_${{ matrix.cuda-version }} --grants read=uri=http://acs.amazonaws.com/groups/global/AllUsers

  # Runs after every `wheel` matrix job has finished and invokes scripts/run_aws_listing.py
  # against the wheel bucket (presumably to regenerate its index pages - TODO confirm in the script).
  update_aws_listing:
    needs: [wheel]
    runs-on: ubuntu-latest
    environment: production

    steps:
      - uses: actions/checkout@v2
      
      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: 3.9

      - name: Upgrade pip
        run: |
          pip install --upgrade setuptools
          pip install boto3
      
      - name: Configure AWS
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: us-west-2

      # NOTE(review): the credentials are passed again as command-line arguments although the
      # previous step already configured them - confirm whether the script needs them explicitly
      - name: Update AWS listing
        run: |
          python scripts/run_aws_listing.py \
            --access_key_id=${{ secrets.AWS_ACCESS_KEY_ID }} \
            --secret_access_key=${{ secrets.AWS_SECRET_ACCESS_KEY }} \
            --bucket="nerfacc-bucket" \
            --region="us-west-2"


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/.github/workflows/code_checks.yml
================================================
# Formatting checks (isort and black, both in --check mode) for pushes and pull requests
# targeting master. No tests are executed by this workflow despite its name.
name: Core Tests.

on:
  push:
    branches: [master]
  pull_request:
    branches: [master]

# the job only needs to read the repository contents
permissions:
  contents: read

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v3
      - name: Set up Python 3.8.12
        uses: actions/setup-python@v4
        with:
          python-version: "3.8.12"
      # formatter versions are pinned so results do not change with new releases
      - name: Install dependencies
        run: |
          pip install isort==5.10.1 black[jupyter]==22.3.0
      # --check makes both tools fail on files they would modify, without rewriting them
      - name: Run isort
        run: isort docs/ nerfacc/ scripts/ examples/ tests/ --profile black --skip examples/pycolmap --line-length 80 --check
      - name: Run Black
        run: black docs/ nerfacc/ scripts/ examples/ tests/ --exclude examples/pycolmap --line-length 80 --check
      # - name: Python Pylint
      #   run: |
      #     pylint nerfacc/ tests/ scripts/ examples/


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/.github/workflows/cuda/cu101-Linux-env.sh
================================================
#!/bin/bash

# Build environment for CUDA 10.1 on Linux. Meant to be sourced, since it only
# sets variables for the calling shell.

CUDA_HOME=/usr/local/cuda-10.1
LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
PATH=${CUDA_HOME}/bin:${PATH}

# Export explicitly: a plain assignment is only inherited by child processes
# (pip, setup.py, nvcc) when the variable was already exported beforehand.
export CUDA_HOME LD_LIBRARY_PATH PATH

export FORCE_CUDA=1
export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5"


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/.github/workflows/cuda/cu101-Linux.sh
================================================
#!/bin/bash

# Installs the CUDA 10.1 compiler (nvcc) and development libraries on
# Ubuntu 18.04 from NVIDIA's local-repository .deb package.

OS=ubuntu1804

# Install NVIDIA's apt pin file for the CUDA repository.
wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin
sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600

# Download the local repository package, register it, and add its signing key.
wget -nv https://developer.download.nvidia.com/compute/cuda/10.1/Prod/local_installers/cuda-repo-${OS}-10-1-local-10.1.243-418.87.00_1.0-1_amd64.deb
sudo dpkg -i cuda-repo-${OS}-10-1-local-10.1.243-418.87.00_1.0-1_amd64.deb
sudo apt-key add /var/cuda-repo-10-1-local-10.1.243-418.87.00/7fa2af80.pub

# Only nvcc and the -dev libraries are installed, not the whole toolkit.
sudo apt-get -qq update
sudo apt install -y cuda-nvcc-10-1 cuda-libraries-dev-10-1
sudo apt clean

# Delete the downloaded package. wget saved it in the current directory under
# its base name, so rm must be given that file name rather than the URL.
rm -f cuda-repo-${OS}-10-1-local-10.1.243-418.87.00_1.0-1_amd64.deb


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/.github/workflows/cuda/cu101-Windows-env.sh
================================================
#!/bin/bash

# Build environment for Windows (paths use the /c/... form): prepends the
# CUDA v10.1 bin directory and the Visual Studio 2017 BuildTools MSBuild
# directory to PATH.
# NOTE(review): CUDA_HOME is assigned without `export` — confirm the build
# does not need it in child processes.
CUDA_HOME=/c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit/CUDA/v10.1
PATH=${CUDA_HOME}/bin:$PATH
PATH=/c/Program\ Files\ \(x86\)/Microsoft\ Visual\ Studio/2017/BuildTools/MSBuild/15.0/Bin:$PATH

# Exported build switches; presumably read by the extension build — TODO confirm.
export FORCE_CUDA=1
export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5"


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/.github/workflows/cuda/cu101-Windows.sh
================================================
#!/bin/bash

# Installs selected CUDA 10.1 components on a Windows runner: unpacks GPU
# driver DLLs into System32, downloads the CUDA installer, and runs it with
# the `-s` flag and an explicit component list.

# Install NVIDIA drivers, see:
# https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102
# NOTE(review): `curl -k` skips TLS certificate verification — confirm this is still required.
curl -k -L "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download" --output "/tmp/gpu_driver_dlls.zip"
7z x "/tmp/gpu_driver_dlls.zip" -o"/c/Windows/System32"

export CUDA_SHORT=10.1
# No trailing slash: the download below joins URL and file name with "/".
export CUDA_URL=https://developer.download.nvidia.com/compute/cuda/${CUDA_SHORT}/Prod/local_installers
export CUDA_FILE=cuda_${CUDA_SHORT}.243_426.00_win10.exe

# Install CUDA:
curl -k -L "${CUDA_URL}/${CUDA_FILE}" --output "${CUDA_FILE}"
echo ""
echo "Installing from ${CUDA_FILE}..."
# Start-Process -Wait blocks until the installer process has exited.
PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"-s nvcc_${CUDA_SHORT} cuobjdump_${CUDA_SHORT} nvprune_${CUDA_SHORT} cupti_${CUDA_SHORT} cublas_dev_${CUDA_SHORT} cudart_${CUDA_SHORT} cufft_dev_${CUDA_SHORT} curand_dev_${CUDA_SHORT} cusolver_dev_${CUDA_SHORT} cusparse_dev_${CUDA_SHORT} npp_dev_${CUDA_SHORT} nvrtc_dev_${CUDA_SHORT} nvml_dev_${CUDA_SHORT}\" -Wait -NoNewWindow"
echo "Done!"
# Remove the downloaded installer executable.
rm -f "${CUDA_FILE}"


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/.github/workflows/cuda/cu102-Linux-env.sh
================================================
#!/bin/bash

# Build environment for the CUDA 10.2 toolkit under /usr/local: prepends the
# toolkit's lib64/ and bin/ directories to LD_LIBRARY_PATH and PATH.
# NOTE(review): CUDA_HOME is assigned without `export`, so child processes
# will not see it unless the caller exported it already — confirm intended.
CUDA_HOME=/usr/local/cuda-10.2
LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
PATH=${CUDA_HOME}/bin:${PATH}

# Exported build switches; presumably read by the extension build — TODO confirm.
export FORCE_CUDA=1
export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5"


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/.github/workflows/cuda/cu102-Linux.sh
================================================
#!/bin/bash

# Installs the CUDA 10.2 compiler (nvcc) and development libraries on
# Ubuntu 18.04 from NVIDIA's local-repository .deb package.

OS=ubuntu1804

# Install NVIDIA's apt pin file for the CUDA repository.
wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin
sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600

# Download the local repository package, register it, and add its signing key.
wget -nv https://developer.download.nvidia.com/compute/cuda/10.2/Prod/local_installers/cuda-repo-${OS}-10-2-local-10.2.89-440.33.01_1.0-1_amd64.deb
sudo dpkg -i cuda-repo-${OS}-10-2-local-10.2.89-440.33.01_1.0-1_amd64.deb
sudo apt-key add /var/cuda-repo-10-2-local-10.2.89-440.33.01/7fa2af80.pub

# Only nvcc and the -dev libraries are installed, not the whole toolkit.
sudo apt-get -qq update
sudo apt install -y cuda-nvcc-10-2 cuda-libraries-dev-10-2
sudo apt clean

# Delete the downloaded package. wget saved it in the current directory under
# its base name, so rm must be given that file name rather than the URL.
rm -f cuda-repo-${OS}-10-2-local-10.2.89-440.33.01_1.0-1_amd64.deb


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/.github/workflows/cuda/cu102-Windows-env.sh
================================================
#!/bin/bash

# Build environment for Windows (paths use the /c/... form): prepends the
# CUDA v10.2 bin directory and the Visual Studio 2017 BuildTools MSBuild
# directory to PATH.
# NOTE(review): CUDA_HOME is assigned without `export` — confirm the build
# does not need it in child processes.
CUDA_HOME=/c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit/CUDA/v10.2
PATH=${CUDA_HOME}/bin:$PATH
PATH=/c/Program\ Files\ \(x86\)/Microsoft\ Visual\ Studio/2017/BuildTools/MSBuild/15.0/Bin:$PATH

# Exported build switches; presumably read by the extension build — TODO confirm.
export FORCE_CUDA=1
export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5"


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/.github/workflows/cuda/cu102-Windows.sh
================================================
#!/bin/bash

# Installs selected CUDA 10.2 components on a Windows runner: unpacks GPU
# driver DLLs into System32, downloads the CUDA installer, and runs it with
# the `-s` flag and an explicit component list.

# Install NVIDIA drivers, see:
# https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102
# NOTE(review): `curl -k` skips TLS certificate verification — confirm this is still required.
curl -k -L "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download" --output "/tmp/gpu_driver_dlls.zip"
7z x "/tmp/gpu_driver_dlls.zip" -o"/c/Windows/System32"

# CUDA_URL and CUDA_FILE are both derived from CUDA_SHORT.
export CUDA_SHORT=10.2
export CUDA_URL=https://developer.download.nvidia.com/compute/cuda/${CUDA_SHORT}/Prod/local_installers
export CUDA_FILE=cuda_${CUDA_SHORT}.89_441.22_win10.exe

# Install CUDA:
curl -k -L "${CUDA_URL}/${CUDA_FILE}" --output "${CUDA_FILE}"
echo ""
echo "Installing from ${CUDA_FILE}..."
# Start-Process -Wait blocks until the installer process has exited.
PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"-s nvcc_${CUDA_SHORT} cuobjdump_${CUDA_SHORT} nvprune_${CUDA_SHORT} cupti_${CUDA_SHORT} cublas_dev_${CUDA_SHORT} cudart_${CUDA_SHORT} cufft_dev_${CUDA_SHORT} curand_dev_${CUDA_SHORT} cusolver_dev_${CUDA_SHORT} cusparse_dev_${CUDA_SHORT} npp_dev_${CUDA_SHORT} nvrtc_dev_${CUDA_SHORT} nvml_dev_${CUDA_SHORT}\" -Wait -NoNewWindow"
echo "Done!"
# Remove the downloaded installer executable.
rm -f "${CUDA_FILE}"


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/.github/workflows/cuda/cu111-Linux-env.sh
================================================
#!/bin/bash

# Build environment for the CUDA 11.1 toolkit under /usr/local: prepends the
# toolkit's lib64/ and bin/ directories to LD_LIBRARY_PATH and PATH.
# NOTE(review): CUDA_HOME is assigned without `export`, so child processes
# will not see it unless the caller exported it already — confirm intended.
CUDA_HOME=/usr/local/cuda-11.1
LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
PATH=${CUDA_HOME}/bin:${PATH}

# Exported build switches; presumably read by the extension build — TODO confirm.
export FORCE_CUDA=1
export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6"


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/.github/workflows/cuda/cu111-Linux.sh
================================================
#!/bin/bash

# Installs the CUDA 11.1 compiler (nvcc) and development libraries on
# Ubuntu 18.04 from NVIDIA's local-repository .deb package.

OS=ubuntu1804

# Install NVIDIA's apt pin file for the CUDA repository.
wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin
sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600

# Download the local repository package, register it, and add its signing key.
wget -nv https://developer.download.nvidia.com/compute/cuda/11.1.1/local_installers/cuda-repo-${OS}-11-1-local_11.1.1-455.32.00-1_amd64.deb
sudo dpkg -i cuda-repo-${OS}-11-1-local_11.1.1-455.32.00-1_amd64.deb
sudo apt-key add /var/cuda-repo-${OS}-11-1-local/7fa2af80.pub

# Only nvcc and the -dev libraries are installed, not the whole toolkit.
sudo apt-get -qq update
sudo apt install -y cuda-nvcc-11-1 cuda-libraries-dev-11-1
sudo apt clean

# Delete the downloaded package. wget saved it in the current directory under
# its base name, so rm must be given that file name rather than the URL.
rm -f cuda-repo-${OS}-11-1-local_11.1.1-455.32.00-1_amd64.deb


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/.github/workflows/cuda/cu111-Windows-env.sh
================================================
#!/bin/bash

# Build environment for Windows (paths use the /c/... form): prepends the
# CUDA v11.1 bin directory and the Visual Studio 2017 BuildTools MSBuild
# directory to PATH.
# NOTE(review): CUDA_HOME is assigned without `export` — confirm the build
# does not need it in child processes.
CUDA_HOME=/c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit/CUDA/v11.1
PATH=${CUDA_HOME}/bin:$PATH
PATH=/c/Program\ Files\ \(x86\)/Microsoft\ Visual\ Studio/2017/BuildTools/MSBuild/15.0/Bin:$PATH

# Exported build switches; presumably read by the extension build — TODO confirm.
export FORCE_CUDA=1
export TORCH_CUDA_ARCH_LIST="6.0+PTX"


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/.github/workflows/cuda/cu111-Windows.sh
================================================
#!/bin/bash

# Installs selected CUDA 11.1 components on a Windows runner: unpacks GPU
# driver DLLs into System32, downloads the CUDA installer, and runs it with
# the `-s` flag and an explicit component list.

# Install NVIDIA drivers, see:
# https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102
# NOTE(review): `curl -k` skips TLS certificate verification — confirm this is still required.
curl -k -L "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download" --output "/tmp/gpu_driver_dlls.zip"
7z x "/tmp/gpu_driver_dlls.zip" -o"/c/Windows/System32"

# CUDA_URL and CUDA_FILE are both derived from CUDA_SHORT (patch level ".1").
export CUDA_SHORT=11.1
export CUDA_URL=https://developer.download.nvidia.com/compute/cuda/${CUDA_SHORT}.1/local_installers
export CUDA_FILE=cuda_${CUDA_SHORT}.1_456.81_win10.exe

# Install CUDA:
curl -k -L "${CUDA_URL}/${CUDA_FILE}" --output "${CUDA_FILE}"
echo ""
echo "Installing from ${CUDA_FILE}..."
# Start-Process -Wait blocks until the installer process has exited.
PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"-s nvcc_${CUDA_SHORT} cuobjdump_${CUDA_SHORT} nvprune_${CUDA_SHORT} cupti_${CUDA_SHORT} cublas_dev_${CUDA_SHORT} cudart_${CUDA_SHORT} cufft_dev_${CUDA_SHORT} curand_dev_${CUDA_SHORT} cusolver_dev_${CUDA_SHORT} cusparse_dev_${CUDA_SHORT} npp_dev_${CUDA_SHORT} nvrtc_dev_${CUDA_SHORT} nvml_dev_${CUDA_SHORT}\" -Wait -NoNewWindow"
echo "Done!"
# Remove the downloaded installer executable.
rm -f "${CUDA_FILE}"


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/.github/workflows/cuda/cu113-Linux-env.sh
================================================
#!/bin/bash

# Build environment for the CUDA 11.3 toolkit under /usr/local: prepends the
# toolkit's lib64/ and bin/ directories to LD_LIBRARY_PATH and PATH.
# NOTE(review): CUDA_HOME is assigned without `export`, so child processes
# will not see it unless the caller exported it already — confirm intended.
CUDA_HOME=/usr/local/cuda-11.3
LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
PATH=${CUDA_HOME}/bin:${PATH}

# Exported build switches; presumably read by the extension build — TODO confirm.
export FORCE_CUDA=1
export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6"


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/.github/workflows/cuda/cu113-Linux.sh
================================================
#!/bin/bash

# Installs the CUDA 11.3 compiler (nvcc) and development libraries on
# Ubuntu 18.04 from NVIDIA's local-repository .deb package.

OS=ubuntu1804

# Install NVIDIA's apt pin file for the CUDA repository.
wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin
sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600

# Download the local repository package, register it, and add its signing key.
wget -nv https://developer.download.nvidia.com/compute/cuda/11.3.0/local_installers/cuda-repo-${OS}-11-3-local_11.3.0-465.19.01-1_amd64.deb
sudo dpkg -i cuda-repo-${OS}-11-3-local_11.3.0-465.19.01-1_amd64.deb
sudo apt-key add /var/cuda-repo-${OS}-11-3-local/7fa2af80.pub

# Only nvcc and the -dev libraries are installed, not the whole toolkit.
sudo apt-get -qq update
sudo apt install -y cuda-nvcc-11-3 cuda-libraries-dev-11-3
sudo apt clean

# Delete the downloaded package. wget saved it in the current directory under
# its base name, so rm must be given that file name rather than the URL.
rm -f cuda-repo-${OS}-11-3-local_11.3.0-465.19.01-1_amd64.deb


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/.github/workflows/cuda/cu113-Windows-env.sh
================================================
#!/bin/bash

# Build environment for Windows (paths use the /c/... form): prepends the
# CUDA v11.3 bin directory and the Visual Studio 2017 BuildTools MSBuild
# directory to PATH.
# NOTE(review): CUDA_HOME is assigned without `export` — confirm the build
# does not need it in child processes.
CUDA_HOME=/c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit/CUDA/v11.3
PATH=${CUDA_HOME}/bin:$PATH
PATH=/c/Program\ Files\ \(x86\)/Microsoft\ Visual\ Studio/2017/BuildTools/MSBuild/15.0/Bin:$PATH

# Exported build switches; presumably read by the extension build — TODO confirm.
export FORCE_CUDA=1
export TORCH_CUDA_ARCH_LIST="6.0+PTX"


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/.github/workflows/cuda/cu113-Windows.sh
================================================
#!/bin/bash

# Installs selected CUDA 11.3 components on a Windows runner: unpacks GPU
# driver DLLs into System32, downloads the CUDA installer, and runs it with
# the `-s` flag and an explicit component list.

# Install NVIDIA drivers, see:
# https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102
# NOTE(review): `curl -k` skips TLS certificate verification — confirm this is still required.
curl -k -L "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download" --output "/tmp/gpu_driver_dlls.zip"
7z x "/tmp/gpu_driver_dlls.zip" -o"/c/Windows/System32"

# CUDA_URL and CUDA_FILE are both derived from CUDA_SHORT (patch level ".0").
export CUDA_SHORT=11.3
export CUDA_URL=https://developer.download.nvidia.com/compute/cuda/${CUDA_SHORT}.0/local_installers
export CUDA_FILE=cuda_${CUDA_SHORT}.0_465.89_win10.exe

# Install CUDA:
curl -k -L "${CUDA_URL}/${CUDA_FILE}" --output "${CUDA_FILE}"
echo ""
echo "Installing from ${CUDA_FILE}..."
# Start-Process -Wait blocks until the installer process has exited.
PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"-s nvcc_${CUDA_SHORT} cuobjdump_${CUDA_SHORT} nvprune_${CUDA_SHORT} cupti_${CUDA_SHORT} cublas_dev_${CUDA_SHORT} cudart_${CUDA_SHORT} cufft_dev_${CUDA_SHORT} curand_dev_${CUDA_SHORT} cusolver_dev_${CUDA_SHORT} cusparse_dev_${CUDA_SHORT} thrust_${CUDA_SHORT} npp_dev_${CUDA_SHORT} nvrtc_dev_${CUDA_SHORT} nvml_dev_${CUDA_SHORT}\" -Wait -NoNewWindow"
echo "Done!"
# Remove the downloaded installer executable.
rm -f "${CUDA_FILE}"


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/.github/workflows/cuda/cu115-Linux-env.sh
================================================
#!/bin/bash

# Build environment for the CUDA 11.5 toolkit under /usr/local: prepends the
# toolkit's lib64/ and bin/ directories to LD_LIBRARY_PATH and PATH.
# NOTE(review): CUDA_HOME is assigned without `export`, so child processes
# will not see it unless the caller exported it already — confirm intended.
CUDA_HOME=/usr/local/cuda-11.5
LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
PATH=${CUDA_HOME}/bin:${PATH}

# Exported build switches; presumably read by the extension build — TODO confirm.
export FORCE_CUDA=1
export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6"


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/.github/workflows/cuda/cu115-Linux.sh
================================================
#!/bin/bash

# Installs the CUDA 11.5 compiler (nvcc) and development libraries on
# Ubuntu 18.04 from NVIDIA's local-repository .deb package.

OS=ubuntu1804

# Install NVIDIA's apt pin file for the CUDA repository.
wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin
sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600

# Download the local repository package, register it, and add its signing key.
wget -nv https://developer.download.nvidia.com/compute/cuda/11.5.2/local_installers/cuda-repo-${OS}-11-5-local_11.5.2-495.29.05-1_amd64.deb
sudo dpkg -i cuda-repo-${OS}-11-5-local_11.5.2-495.29.05-1_amd64.deb
sudo apt-key add /var/cuda-repo-${OS}-11-5-local/7fa2af80.pub

# Only nvcc and the -dev libraries are installed, not the whole toolkit.
sudo apt-get -qq update
sudo apt install -y cuda-nvcc-11-5 cuda-libraries-dev-11-5
sudo apt clean

# Delete the downloaded package. wget saved it in the current directory under
# its base name, so rm must be given that file name rather than the URL.
rm -f cuda-repo-${OS}-11-5-local_11.5.2-495.29.05-1_amd64.deb


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/.github/workflows/cuda/cu115-Windows-env.sh
================================================
#!/bin/bash

# Build environment for Windows (paths use the /c/... form): prepends a CUDA
# bin directory and the Visual Studio 2017 BuildTools MSBuild directory to PATH.
# CUDA_HOME points at v11.3, not v11.5; the cu115 Windows install script
# carries a TODO saying CUDA 11.3 is used to build the CUDA 11.5 wheels.
# NOTE(review): CUDA_HOME is assigned without `export` — confirm the build
# does not need it in child processes.
CUDA_HOME=/c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit/CUDA/v11.3
PATH=${CUDA_HOME}/bin:$PATH
PATH=/c/Program\ Files\ \(x86\)/Microsoft\ Visual\ Studio/2017/BuildTools/MSBuild/15.0/Bin:$PATH

# Exported build switches; presumably read by the extension build — TODO confirm.
export FORCE_CUDA=1
export TORCH_CUDA_ARCH_LIST="6.0+PTX"


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/.github/workflows/cuda/cu115-Windows.sh
================================================
#!/bin/bash

# TODO We currently use CUDA 11.3 to build CUDA 11.5 Windows wheels

# Installs selected CUDA 11.3 components on a Windows runner: unpacks GPU
# driver DLLs into System32, downloads the CUDA installer, and runs it with
# the `-s` flag and an explicit component list.

# Install NVIDIA drivers, see:
# https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102
# NOTE(review): `curl -k` skips TLS certificate verification — confirm this is still required.
curl -k -L "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download" --output "/tmp/gpu_driver_dlls.zip"
7z x "/tmp/gpu_driver_dlls.zip" -o"/c/Windows/System32"

# CUDA_URL and CUDA_FILE are both derived from CUDA_SHORT (patch level ".0").
export CUDA_SHORT=11.3
export CUDA_URL=https://developer.download.nvidia.com/compute/cuda/${CUDA_SHORT}.0/local_installers
export CUDA_FILE=cuda_${CUDA_SHORT}.0_465.89_win10.exe

# Install CUDA:
curl -k -L "${CUDA_URL}/${CUDA_FILE}" --output "${CUDA_FILE}"
echo ""
echo "Installing from ${CUDA_FILE}..."
# Start-Process -Wait blocks until the installer process has exited.
PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"-s nvcc_${CUDA_SHORT} cuobjdump_${CUDA_SHORT} nvprune_${CUDA_SHORT} cupti_${CUDA_SHORT} cublas_dev_${CUDA_SHORT} cudart_${CUDA_SHORT} cufft_dev_${CUDA_SHORT} curand_dev_${CUDA_SHORT} cusolver_dev_${CUDA_SHORT} cusparse_dev_${CUDA_SHORT} thrust_${CUDA_SHORT} npp_dev_${CUDA_SHORT} nvrtc_dev_${CUDA_SHORT} nvml_dev_${CUDA_SHORT}\" -Wait -NoNewWindow"
echo "Done!"
# Remove the downloaded installer executable.
rm -f "${CUDA_FILE}"


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/.github/workflows/cuda/cu116-Linux-env.sh
================================================
#!/bin/bash

# Build environment for the CUDA 11.6 toolkit under /usr/local: prepends the
# toolkit's lib64/ and bin/ directories to LD_LIBRARY_PATH and PATH.
# NOTE(review): CUDA_HOME is assigned without `export`, so child processes
# will not see it unless the caller exported it already — confirm intended.
CUDA_HOME=/usr/local/cuda-11.6
LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
PATH=${CUDA_HOME}/bin:${PATH}

# Exported build switches; presumably read by the extension build — TODO confirm.
export FORCE_CUDA=1
export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6"


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/.github/workflows/cuda/cu116-Linux.sh
================================================
#!/bin/bash

# Installs the CUDA 11.6 compiler (nvcc) and development libraries on
# Ubuntu 18.04 from NVIDIA's local-repository .deb package.

OS=ubuntu1804

# Install NVIDIA's apt pin file for the CUDA repository.
wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin
sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600

# Download the local repository package, register it, and add its signing key.
wget -nv https://developer.download.nvidia.com/compute/cuda/11.6.2/local_installers/cuda-repo-${OS}-11-6-local_11.6.2-510.47.03-1_amd64.deb
sudo dpkg -i cuda-repo-${OS}-11-6-local_11.6.2-510.47.03-1_amd64.deb
sudo apt-key add /var/cuda-repo-${OS}-11-6-local/7fa2af80.pub

# Only nvcc and the -dev libraries are installed, not the whole toolkit.
sudo apt-get -qq update
sudo apt install -y cuda-nvcc-11-6 cuda-libraries-dev-11-6
sudo apt clean

# Delete the downloaded package. wget saved it in the current directory under
# its base name, so rm must be given that file name rather than the URL.
rm -f cuda-repo-${OS}-11-6-local_11.6.2-510.47.03-1_amd64.deb


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/.github/workflows/cuda/cu116-Windows-env.sh
================================================
#!/bin/bash

# Build environment for Windows (paths use the /c/... form): prepends a CUDA
# bin directory and the Visual Studio 2017 BuildTools MSBuild directory to PATH.
# CUDA_HOME points at v11.3, not v11.6; the cu116 Windows install script
# carries a TODO saying CUDA 11.3 is used to build the CUDA 11.6 wheels.
# NOTE(review): CUDA_HOME is assigned without `export` — confirm the build
# does not need it in child processes.
CUDA_HOME=/c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit/CUDA/v11.3
PATH=${CUDA_HOME}/bin:$PATH
PATH=/c/Program\ Files\ \(x86\)/Microsoft\ Visual\ Studio/2017/BuildTools/MSBuild/15.0/Bin:$PATH

# Exported build switches; presumably read by the extension build — TODO confirm.
export FORCE_CUDA=1
export TORCH_CUDA_ARCH_LIST="6.0+PTX"


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/.github/workflows/cuda/cu116-Windows.sh
================================================
#!/bin/bash

# TODO We currently use CUDA 11.3 to build CUDA 11.6 Windows wheels

# Installs selected CUDA 11.3 components on a Windows runner: unpacks GPU
# driver DLLs into System32, downloads the CUDA installer, and runs it with
# the `-s` flag and an explicit component list.

# Install NVIDIA drivers, see:
# https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102
# NOTE(review): `curl -k` skips TLS certificate verification — confirm this is still required.
curl -k -L "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download" --output "/tmp/gpu_driver_dlls.zip"
7z x "/tmp/gpu_driver_dlls.zip" -o"/c/Windows/System32"

# CUDA_URL and CUDA_FILE are both derived from CUDA_SHORT (patch level ".0").
export CUDA_SHORT=11.3
export CUDA_URL=https://developer.download.nvidia.com/compute/cuda/${CUDA_SHORT}.0/local_installers
export CUDA_FILE=cuda_${CUDA_SHORT}.0_465.89_win10.exe

# Install CUDA:
curl -k -L "${CUDA_URL}/${CUDA_FILE}" --output "${CUDA_FILE}"
echo ""
echo "Installing from ${CUDA_FILE}..."
# Start-Process -Wait blocks until the installer process has exited.
PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"-s nvcc_${CUDA_SHORT} cuobjdump_${CUDA_SHORT} nvprune_${CUDA_SHORT} cupti_${CUDA_SHORT} cublas_dev_${CUDA_SHORT} cudart_${CUDA_SHORT} cufft_dev_${CUDA_SHORT} curand_dev_${CUDA_SHORT} cusolver_dev_${CUDA_SHORT} cusparse_dev_${CUDA_SHORT} thrust_${CUDA_SHORT} npp_dev_${CUDA_SHORT} nvrtc_dev_${CUDA_SHORT} nvml_dev_${CUDA_SHORT}\" -Wait -NoNewWindow"
echo "Done!"
# Remove the downloaded installer executable.
rm -f "${CUDA_FILE}"


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/.github/workflows/cuda/cu117-Linux-env.sh
================================================
#!/bin/bash

# Build environment for the CUDA 11.7 toolkit under /usr/local: prepends the
# toolkit's lib64/ and bin/ directories to LD_LIBRARY_PATH and PATH.
# NOTE(review): CUDA_HOME is assigned without `export`, so child processes
# will not see it unless the caller exported it already — confirm intended.
CUDA_HOME=/usr/local/cuda-11.7
LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
PATH=${CUDA_HOME}/bin:${PATH}

# Exported build switches; presumably read by the extension build — TODO confirm.
export FORCE_CUDA=1
export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6"


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/.github/workflows/cuda/cu117-Linux.sh
================================================
#!/bin/bash

# Installs the CUDA 11.7 compiler (nvcc) and development libraries on
# Ubuntu 18.04 from NVIDIA's local-repository .deb package.

OS=ubuntu1804

# Install NVIDIA's apt pin file for the CUDA repository.
wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin
sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600

# Download the local repository package and register it. This release ships a
# keyring file, which is copied into /usr/share/keyrings/.
wget -nv https://developer.download.nvidia.com/compute/cuda/11.7.1/local_installers/cuda-repo-${OS}-11-7-local_11.7.1-515.65.01-1_amd64.deb
sudo dpkg -i cuda-repo-${OS}-11-7-local_11.7.1-515.65.01-1_amd64.deb
sudo cp /var/cuda-repo-${OS}-11-7-local/cuda-*-keyring.gpg /usr/share/keyrings/

# Only nvcc and the -dev libraries are installed, not the whole toolkit.
sudo apt-get -qq update
sudo apt install -y cuda-nvcc-11-7 cuda-libraries-dev-11-7
sudo apt clean

# Delete the downloaded package. wget saved it in the current directory under
# its base name, so rm must be given that file name rather than the URL.
rm -f cuda-repo-${OS}-11-7-local_11.7.1-515.65.01-1_amd64.deb


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/.github/workflows/cuda/cu117-Windows-env.sh
================================================
#!/bin/bash

# Build environment for Windows (paths use the /c/... form): prepends a CUDA
# bin directory and the Visual Studio 2017 BuildTools MSBuild directory to PATH.
# CUDA_HOME points at v11.3, not v11.7; the cu117 Windows install script
# carries a TODO saying CUDA 11.3 is used to build the CUDA 11.7 wheels.
# NOTE(review): CUDA_HOME is assigned without `export` — confirm the build
# does not need it in child processes.
CUDA_HOME=/c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit/CUDA/v11.3
PATH=${CUDA_HOME}/bin:$PATH
PATH=/c/Program\ Files\ \(x86\)/Microsoft\ Visual\ Studio/2017/BuildTools/MSBuild/15.0/Bin:$PATH

# Exported build switches; presumably read by the extension build — TODO confirm.
export FORCE_CUDA=1
export TORCH_CUDA_ARCH_LIST="6.0+PTX"


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/.github/workflows/cuda/cu117-Windows.sh
================================================
#!/bin/bash

# TODO We currently use CUDA 11.3 to build CUDA 11.7 Windows wheels

# Installs selected CUDA 11.3 components on a Windows runner: unpacks GPU
# driver DLLs into System32, downloads the CUDA installer, and runs it with
# the `-s` flag and an explicit component list.

# Install NVIDIA drivers, see:
# https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102
# NOTE(review): `curl -k` skips TLS certificate verification — confirm this is still required.
curl -k -L "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download" --output "/tmp/gpu_driver_dlls.zip"
7z x "/tmp/gpu_driver_dlls.zip" -o"/c/Windows/System32"

# CUDA_URL and CUDA_FILE are both derived from CUDA_SHORT (patch level ".0").
export CUDA_SHORT=11.3
export CUDA_URL=https://developer.download.nvidia.com/compute/cuda/${CUDA_SHORT}.0/local_installers
export CUDA_FILE=cuda_${CUDA_SHORT}.0_465.89_win10.exe

# Install CUDA:
curl -k -L "${CUDA_URL}/${CUDA_FILE}" --output "${CUDA_FILE}"
echo ""
echo "Installing from ${CUDA_FILE}..."
# Start-Process -Wait blocks until the installer process has exited.
PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"-s nvcc_${CUDA_SHORT} cuobjdump_${CUDA_SHORT} nvprune_${CUDA_SHORT} cupti_${CUDA_SHORT} cublas_dev_${CUDA_SHORT} cudart_${CUDA_SHORT} cufft_dev_${CUDA_SHORT} curand_dev_${CUDA_SHORT} cusolver_dev_${CUDA_SHORT} cusparse_dev_${CUDA_SHORT} thrust_${CUDA_SHORT} npp_dev_${CUDA_SHORT} nvrtc_dev_${CUDA_SHORT} nvml_dev_${CUDA_SHORT}\" -Wait -NoNewWindow"
echo "Done!"
# Remove the downloaded installer executable.
rm -f "${CUDA_FILE}"


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/.github/workflows/publish.yml
================================================
# This workflow uploads a Python package using twine when a release is created.
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries

name: Upload Python Package

# NOTE(review): confirm that the `release` event honours a `branches` filter;
# it may only accept `types`.
on:
  release:
    types: [created]
    branches: [master]

jobs:
  deploy:
    runs-on: ubuntu-latest
    environment: production

    steps:
      - uses: actions/checkout@v2
      - name: Set up Python
        uses: actions/setup-python@v1
        with:
          python-version: '3.7'
      - name: Install dependencies
        run: |
          python -m pip install build twine
      # Removes everything between the pypi-strip marker comments in README.md
      # before the package is built.
      - name: Strip unsupported tags in README
        run: |
          sed -i '/<!-- pypi-strip -->/,/<!-- \/pypi-strip -->/d' README.md
      - name: Build and publish
        env:
          PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
        run: |
          BUILD_NO_CUDA=1 python -m build
          # Quote the token so the shell passes it as one literal argument
          # (no word splitting or glob expansion).
          twine upload --username __token__ --password "$PYPI_TOKEN" dist/*

================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/.gitignore
================================================
# Visual Studio Code configs.
.vscode/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
# lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

.DS_Store

# Direnv config.
.envrc

# line_profiler
*.lprof

# vscode
.vscode

benchmarks/
outputs/

================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/.gitmodules
================================================
[submodule "examples/pycolmap"]
	path = examples/pycolmap
	url = https://github.com/rmbrualla/pycolmap.git

================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/.pre-commit-config.yaml
================================================
# pre-commit hooks: generic file checks, black formatting, and isort.
repos:
-   repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v2.3.0
    hooks:
    -   id: end-of-file-fixer
    -   id: trailing-whitespace
    -   id: check-yaml
    -   id: check-merge-conflict
    -   id: requirements-txt-fixer
# NOTE(review): black is pinned to 22.10.0 here, while the code_checks.yml
# workflow installs black 22.3.0 — confirm the versions are meant to differ.
-   repo: https://github.com/psf/black
    rev: 22.10.0
    hooks:
      - id: black
        language_version: python3.8.12
        args: # arguments to configure black
          - --line-length=80

# NOTE(review): isort gets no arguments here, while the code_checks.yml
# workflow runs it with `--profile black --line-length 80` — confirm the
# same settings are supplied through a config file.
-   repo: https://github.com/pycqa/isort
    rev: 5.10.1
    hooks:
      - id: isort


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/.readthedocs.yaml
================================================
version: 2

build:
  os: ubuntu-20.04
  tools:
    python: "3.9"

sphinx:
  fail_on_warning: true
  configuration: docs/source/conf.py

python:
  install:
    # Equivalent to 'pip install .'
    - method: pip
      path: .
    # Equivalent to 'pip install -r docs/requirements.txt'
    - requirements: docs/requirements.txt

================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/CMakeLists.txt
================================================
# cmake_minimum_required(VERSION 3.3)
# project(nerfacc LANGUAGES CXX CUDA)

# find_package(pybind11 REQUIRED)
# find_package(Torch REQUIRED)
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")

# set(SOURCE_DIR nerfacc/cuda/csrc)
# set(INCLUDE_DIR nerfacc/cuda/csrc/include)
# file(GLOB SOURCES ${SOURCE_DIR}/*.cu)

# pybind11_add_module(${PROJECT_NAME} SHARED ${SOURCES})
# target_link_libraries(${PROJECT_NAME} PRIVATE "${TORCH_LIBRARIES}")
# target_include_directories(${PROJECT_NAME} PRIVATE "${INCLUDE_DIR}")


# # message(STATUS "CUDA enabled")

# # set( CMAKE_CUDA_STANDARD 14 )
# # set( CMAKE_CUDA_STANDARD_REQUIRED ON)

# # find_package(pybind11 REQUIRED)

# # # find_package(Python3 REQUIRED COMPONENTS Development)
# # # target_link_libraries(${PROJECT_NAME} PRIVATE Python3::Python)

# # find_package(Torch REQUIRED)
# # set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
# # target_link_libraries(${PROJECT_NAME} PRIVATE ${TORCH_LIBRARIES})

# # set(CSRC nerfacc/cuda/csrc)
# # file(GLOB_RECURSE ALL_SOURCES ${ALL_SOURCES} ${CSRC}/*.cu)
# # file(GLOB_RECURSE ALL_HEADERS ${CSRC}/include/*.h)
# # add_library(${PROJECT_NAME} SHARED ${ALL_SOURCES})
# # target_include_directories(${PROJECT_NAME} PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}")

# # set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0")

# # message("-- CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}")
# # message("-- CMAKE_CXX_FLAGS_DEBUG: ${CMAKE_CXX_FLAGS_DEBUG}")
# # message("-- CMAKE_CXX_FLAGS_RELEASE: ${CMAKE_CXX_FLAGS_RELEASE}")

# # set_target_properties(${PROJECT_NAME} PROPERTIES
# #   EXPORT_NAME nerfacc
# #   INSTALL_RPATH ${TORCH_INSTALL_PREFIX}/lib)

# # Cmake creates *.dylib by default, but python expects *.so by default
# # if (APPLE)
# #   set_property(TARGET ${PROJECT_NAME} PROPERTY SUFFIX .so)
# # endif()

================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/LICENSE
================================================
MIT License

Copyright (c) 2022 Ruilong Li

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/MANIFEST.in
================================================
include nerfacc/cuda/csrc/include/*
include nerfacc/cuda/csrc/*


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/README.md
================================================
<p>
  <!-- pypi-strip -->
  <picture>
  <source media="(prefers-color-scheme: dark)" srcset="https://user-images.githubusercontent.com/3310961/199083722-881a2372-62c1-4255-8521-31a95a721851.png" />
  <source media="(prefers-color-scheme: light)" srcset="https://user-images.githubusercontent.com/3310961/199084143-0d63eb40-3f35-48d2-a9d5-78d1d60b7d66.png" />
  <!-- /pypi-strip -->
  <img alt="nerfacc logo" src="https://user-images.githubusercontent.com/3310961/199084143-0d63eb40-3f35-48d2-a9d5-78d1d60b7d66.png" width="350px" />
  <!-- pypi-strip -->
  </picture>
  <!-- /pypi-strip -->
</p>

[![Core Tests.](https://github.com/KAIR-BAIR/nerfacc/actions/workflows/code_checks.yml/badge.svg)](https://github.com/KAIR-BAIR/nerfacc/actions/workflows/code_checks.yml)
[![Documentation Status](https://readthedocs.com/projects/plenoptix-nerfacc/badge/?version=latest)](https://www.nerfacc.com/en/latest/?badge=latest)
[![Downloads](https://pepy.tech/badge/nerfacc)](https://pepy.tech/project/nerfacc)

https://www.nerfacc.com/

NerfAcc is a PyTorch NeRF acceleration toolbox for both training and inference. It focuses on efficient volumetric rendering of radiance fields, which is universal and plug-and-play for most NeRFs.

Using NerfAcc, 

- The `vanilla NeRF` model with 8-layer MLPs can be trained to *better quality* (+~0.5 PSNR)
  in *1 hour* rather than *days* as in the paper.
- The `Instant-NGP NeRF` model can be trained to *equal quality* in *4.5 minutes*,
  compared to the official pure-CUDA implementation.
- The `D-NeRF` model for *dynamic* objects can also be trained in *1 hour*
  rather than *2 days* as in the paper, and with *better quality* (+~2.5 PSNR).
- Both *bounded* and *unbounded* scenes are supported.

**And it is a pure Python interface with flexible APIs!**

## Installation

**Dependency**: Please install [PyTorch](https://pytorch.org/get-started/locally/) first.

The easiest way is to install from PyPI. In this way it will build the CUDA code **on the first run** (JIT).
```
pip install nerfacc
```

Or install from source. In this way it will build the CUDA code during installation.
```
pip install git+https://github.com/KAIR-BAIR/nerfacc.git
```

We also provide pre-built wheels covering major combinations of PyTorch + CUDA supported by [official PyTorch](https://pytorch.org/get-started/previous-versions/).

```
# e.g., torch 1.13.0 + cu117
pip install nerfacc -f https://nerfacc-bucket.s3.us-west-2.amazonaws.com/whl/torch-1.13.0_cu117.html
```

| Windows & Linux | `cu102` | `cu113` | `cu116` | `cu117` |
|-----------------|---------|---------|---------|---------|
| torch 1.10.0    | ✅      | ✅      |         |         |
| torch 1.11.0    | ✅*     | ✅      |         |         |
| torch 1.12.0    | ✅*     | ✅      | ✅      |         |
| torch 1.13.0    |         |         | ✅      | ✅      |

\* PyTorch does not provide pre-built Windows wheels for those combinations, so we do not support them either.

## Usage

The idea of NerfAcc is to perform efficient ray marching and volumetric rendering. So NerfAcc can work with any user-defined radiance field. To plug the NerfAcc rendering pipeline into your code and enjoy the acceleration, you only need to define two functions with your radiance field.
- `sigma_fn`: Compute density at each sample. It will be used by `nerfacc.ray_marching()` to skip the empty and occluded space during ray marching, which is where the major speedup comes from. 
- `rgb_sigma_fn`: Compute color and density at each sample. It will be used by `nerfacc.rendering()` to conduct differentiable volumetric rendering. This function will receive gradients to update your network.

A simple example is like this:

``` python
import torch
from torch import Tensor
import nerfacc 

radiance_field = ...  # network: a NeRF model
rays_o: Tensor = ...  # ray origins. (n_rays, 3)
rays_d: Tensor = ...  # ray normalized directions. (n_rays, 3)
optimizer = ...  # optimizer

def sigma_fn(
    t_starts: Tensor, t_ends:Tensor, ray_indices: Tensor
) -> Tensor:
    """ Query density values from a user-defined radiance field.
    :params t_starts: Start of the sample interval along the ray. (n_samples, 1).
    :params t_ends: End of the sample interval along the ray. (n_samples, 1).
    :params ray_indices: Ray indices that each sample belongs to. (n_samples,).
    :returns The post-activation density values. (n_samples, 1).
    """
    t_origins = rays_o[ray_indices]  # (n_samples, 3)
    t_dirs = rays_d[ray_indices]  # (n_samples, 3)
    positions = t_origins + t_dirs * (t_starts + t_ends) / 2.0
    sigmas = radiance_field.query_density(positions) 
    return sigmas  # (n_samples, 1)

def rgb_sigma_fn(
    t_starts: Tensor, t_ends: Tensor, ray_indices: Tensor
) -> Tuple[Tensor, Tensor]:
    """ Query rgb and density values from a user-defined radiance field.
    :params t_starts: Start of the sample interval along the ray. (n_samples, 1).
    :params t_ends: End of the sample interval along the ray. (n_samples, 1).
    :params ray_indices: Ray indices that each sample belongs to. (n_samples,).
    :returns The post-activation rgb and density values. 
        (n_samples, 3), (n_samples, 1).
    """
    t_origins = rays_o[ray_indices]  # (n_samples, 3)
    t_dirs = rays_d[ray_indices]  # (n_samples, 3)
    positions = t_origins + t_dirs * (t_starts + t_ends) / 2.0
    rgbs, sigmas = radiance_field(positions, condition=t_dirs)  
    return rgbs, sigmas  # (n_samples, 3), (n_samples, 1)

# Efficient Raymarching: Skip empty and occluded space, pack samples from all rays.
# ray_indices: (n_samples,). t_starts: (n_samples, 1). t_ends: (n_samples, 1).
with torch.no_grad():
    ray_indices, t_starts, t_ends = nerfacc.ray_marching(
        rays_o, rays_d, sigma_fn=sigma_fn, near_plane=0.2, far_plane=1.0, 
        early_stop_eps=1e-4, alpha_thre=1e-2, 
    )

# Differentiable Volumetric Rendering.
# colors: (n_rays, 3). opacity: (n_rays, 1). depth: (n_rays, 1).
color, opacity, depth = nerfacc.rendering(
    t_starts, t_ends, ray_indices, n_rays=rays_o.shape[0], rgb_sigma_fn=rgb_sigma_fn
)

# Optimize: Both the network and rays will receive gradients
optimizer.zero_grad()
loss = F.mse_loss(color, color_gt)
loss.backward()
optimizer.step()
```

## Examples: 

Before running the example scripts, please check which dataset each script needs and download that dataset first.

```bash
# clone the repo with submodules.
git clone --recursive https://github.com/KAIR-BAIR/nerfacc/
```

``` bash
# Instant-NGP NeRF in 4.5 minutes with reproduced performance!
# See results at here: https://www.nerfacc.com/en/latest/examples/ngp.html
python examples/train_ngp_nerf.py --train_split train --scene lego
```

``` bash
# Vanilla MLP NeRF in 1 hour with better performance!
# See results at here: https://www.nerfacc.com/en/latest/examples/vanilla.html
python examples/train_mlp_nerf.py --train_split train --scene lego
```

```bash
# D-NeRF for Dynamic objects in 1 hour with better performance!
# See results at here: https://www.nerfacc.com/en/latest/examples/dnerf.html
python examples/train_mlp_dnerf.py --train_split train --scene lego
```

```bash
# Instant-NGP on unbounded scenes in 20 minutes!
# See results at here: https://www.nerfacc.com/en/latest/examples/unbounded.html
python examples/train_ngp_nerf.py --train_split train --scene garden --auto_aabb --unbounded --cone_angle=0.004
```

Used by:
- [nerfstudio](https://github.com/nerfstudio-project/nerfstudio): A collaboration friendly studio for NeRFs.
- [instant-nsr-pl](https://github.com/bennyguo/instant-nsr-pl): NeuS in 10 minutes.


## Common Installation Issues


<details>
    <summary>ImportError: .../csrc.so: undefined symbol</summary>
    If you are installing a pre-built wheel, make sure the Pytorch and CUDA versions match the nerfacc version (nerfacc.__version__).
</details>

## Citation

```bibtex
@article{li2022nerfacc,
  title={NerfAcc: A General NeRF Acceleration Toolbox.},
  author={Li, Ruilong and Tancik, Matthew and Kanazawa, Angjoo},
  journal={arXiv preprint arXiv:2210.04847},
  year={2022}
}
```


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/docs/Makefile
================================================
# Minimal makefile for Sphinx documentation
#
# Every target is forwarded to sphinx-build's "make mode" (-M), so the
# target name given on the command line is passed to Sphinx as its first
# argument after -M.

# You can set these variables from the command line, and also
# from the environment for the first two.
# ("?=" only assigns when the variable is not already defined.)
SPHINXOPTS    ?=
SPHINXBUILD   ?= sphinx-build
# Source and output directories handed to sphinx-build.
SOURCEDIR     = source
BUILDDIR      = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

# "Makefile" is declared phony because it is the prerequisite of the
# catch-all rule below: a target with a phony prerequisite is always
# considered out of date, so its recipe runs on every invocation.
.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/docs/requirements.txt
================================================
pytorch_sphinx_theme @ git+https://github.com/liruilong940607/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
sphinx==5.2.1
sphinx-copybutton==0.5.0
sphinx-design==0.2.0

================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/docs/source/_static/css/readthedocs.css
================================================
/* Header logo: draw logo4x.png as a background image scaled to 156x35 px
   inside an element of the same size. (The "4x" in the file name presumably
   means a high-resolution asset that is scaled down here; verify against
   the image.) */
.header-logo {
    background-image: url("../images/logo4x.png");
    background-size: 156px 35px;
    height: 35px;
    width: 156px;
}
/* Use the default word-breaking rule for <code> elements (presumably
   overriding a theme style; confirm against the theme's CSS). */
code {
    word-break: normal;
}

================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/docs/source/apis/generated/nerfacc.accumulate_along_rays.rst
================================================
﻿nerfacc.accumulate\_along\_rays
===============================

.. currentmodule:: nerfacc

.. autofunction:: accumulate_along_rays

================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/docs/source/apis/generated/nerfacc.pack_data.rst
================================================
﻿nerfacc.pack\_data
==================

.. currentmodule:: nerfacc

.. autofunction:: pack_data

================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/docs/source/apis/generated/nerfacc.ray_aabb_intersect.rst
================================================
﻿nerfacc.ray\_aabb\_intersect
============================

.. currentmodule:: nerfacc

.. autofunction:: ray_aabb_intersect

================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/docs/source/apis/generated/nerfacc.ray_resampling.rst
================================================
﻿nerfacc.ray\_resampling
=======================

.. currentmodule:: nerfacc

.. autofunction:: ray_resampling

================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/docs/source/apis/generated/nerfacc.render_transmittance_from_alpha.rst
================================================
﻿nerfacc.render\_transmittance\_from\_alpha
==========================================

.. currentmodule:: nerfacc

.. autofunction:: render_transmittance_from_alpha

================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/docs/source/apis/generated/nerfacc.render_transmittance_from_density.rst
================================================
﻿nerfacc.render\_transmittance\_from\_density
============================================

.. currentmodule:: nerfacc

.. autofunction:: render_transmittance_from_density

================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/docs/source/apis/generated/nerfacc.render_visibility.rst
================================================
﻿nerfacc.render\_visibility
==========================

.. currentmodule:: nerfacc

.. autofunction:: render_visibility

================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/docs/source/apis/generated/nerfacc.render_weight_from_alpha.rst
================================================
﻿nerfacc.render\_weight\_from\_alpha
===================================

.. currentmodule:: nerfacc

.. autofunction:: render_weight_from_alpha

================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/docs/source/apis/generated/nerfacc.render_weight_from_density.rst
================================================
﻿nerfacc.render\_weight\_from\_density
=====================================

.. currentmodule:: nerfacc

.. autofunction:: render_weight_from_density

================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/docs/source/apis/generated/nerfacc.unpack_data.rst
================================================
﻿nerfacc.unpack\_data
====================

.. currentmodule:: nerfacc

.. autofunction:: unpack_data

================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/docs/source/apis/generated/nerfacc.unpack_info.rst
================================================
﻿nerfacc.unpack\_info
====================

.. currentmodule:: nerfacc

.. autofunction:: unpack_info

================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/docs/source/apis/grid.rst
================================================
.. _`Occupancy Grid`:

Occupancy Grid
===================================

.. currentmodule:: nerfacc

.. autoclass:: ContractionType
    :members:

.. autoclass:: Grid
    :members:

.. autoclass:: OccupancyGrid
    :members:


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/docs/source/apis/rendering.rst
================================================
Volumetric Rendering
===================================

In `nerfacc`, the volumetric rendering pipeline is broken down into 2 steps:

1. **Raymarching**: This is the process of shooting a ray through the scene and
   generating samples along the way. To perform efficient volumetric rendering, here we aim
   at skipping as many areas as possible. The empty space is skipped by using the cached
   occupancy grid (see :class:`nerfacc.OccupancyGrid`), and the invisible space is skipped by
   checking the transmittance of the ray while marching. In almost all cases, skipping these
   regions won't result in a noticeable loss of quality, as they would contribute very little
   to the final rendered image, but it will bring a significant speedup.

2. **Rendering**: This is the process of accumulating samples along the rays into the final image.
   In this step we also need to query the attributes (i.e., color and density) of those samples
   generated by raymarching. Early stopping is supported in this step.

|

.. currentmodule:: nerfacc

.. autofunction:: ray_marching
.. autofunction:: rendering


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/docs/source/apis/utils.rst
================================================
Utils
===================================

.. currentmodule:: nerfacc

.. autosummary::
   :nosignatures:
   :toctree: generated/

   ray_aabb_intersect
   unpack_info

   accumulate_along_rays
   render_transmittance_from_density
   render_transmittance_from_alpha
   render_weight_from_density
   render_weight_from_alpha
   render_visibility

   ray_resampling
   pack_data
   unpack_data
   

================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/docs/source/conf.py
================================================
import pytorch_sphinx_theme

# Sphinx configuration for the nerfacc documentation.

# version.py is executed in this module's namespace and is expected to rebind
# ``__version__``; the ``None`` default keeps the name defined if it does not.
__version__ = None
# The relative path assumes the working directory is the directory that
# contains this conf.py -- NOTE(review): confirm for any non-Sphinx caller.
# The context manager closes the file handle as soon as it has been read
# (the previous bare ``open(...).read()`` left it to the garbage collector).
with open("../../nerfacc/version.py", "r") as version_file:
    exec(version_file.read())

# -- Project information

project = "nerfacc"
copyright = "2022, Ruilong"
author = "Ruilong"

release = __version__

# -- General configuration

extensions = [
    "sphinx.ext.napoleon",
    "sphinx.ext.duration",
    "sphinx.ext.doctest",
    "sphinx.ext.autodoc",
    "sphinx.ext.autosummary",
    "sphinx.ext.intersphinx",
]

# External projects whose object inventories can be cross-referenced.
intersphinx_mapping = {
    "python": ("https://docs.python.org/3/", None),
    "sphinx": ("https://www.sphinx-doc.org/en/master/", None),
}
intersphinx_disabled_domains = ["std"]

templates_path = ["_templates"]

# -- Options for HTML output

# html_theme = "furo"

html_theme = "pytorch_sphinx_theme"
html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()]
html_static_path = ["_static"]
# Extra stylesheet, given relative to ``html_static_path``.
html_css_files = ["css/readthedocs.css"]

# Ignore >>> when copying code
copybutton_prompt_text = r">>> |\.\.\. "
copybutton_prompt_is_regexp = True

# Theme options are theme-specific and customize the look and feel of a theme
# further.  For a list of options available for each theme, see the
# documentation.
html_theme_options = {
    # The target url that the logo directs to. Unset to do nothing
    "logo_url": "https://www.nerfacc.com/en/latest/index.html",
    # "menu" is a list of dictionaries where you can specify the content and the
    # behavior of each item in the menu. Each item can either be a link or a
    # dropdown menu containing a list of links.
    "menu": [
        # A link
        {"name": "GitHub", "url": "https://github.com/KAIR-BAIR/nerfacc"},
        # A dropdown menu
        # {
        #     "name": "Projects",
        #     "children": [
        #         # A vanilla dropdown item
        #         {
        #             "name": "nerfstudio",
        #             "url": "https://docs.nerf.studio/",
        #             "description": "The all-in-one repo for NeRFs",
        #         },
        #     ],
        #     # Optional, determining whether this dropdown menu will always be
        #     # highlighted.
        #     # "active": True,
        # },
    ],
}
# html_theme_options = {
#     "canonical_url": "",
#     "analytics_id": "",
#     "logo_only": False,
#     "display_version": True,
#     "prev_next_buttons_location": "bottom",
#     "style_external_links": False,
#     # Toc options
#     "collapse_navigation": True,
#     "sticky_navigation": True,
#     "navigation_depth": 4,
#     "includehidden": True,
#     "titles_only": False
# }

# -- Options for EPUB output
epub_show_urls = "footnote"

# typehints
autodoc_typehints = "description"


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/docs/source/examples/dnerf.rst
================================================
Dynamic Scene
====================

See code `examples/train_mlp_dnerf.py` at our `github repository`_ for details.

Benchmarks
------------
*updated on 2022-10-08*

Here we trained an 8-layer MLP for the radiance field and a 4-layer MLP for the warping field
(similar to the T-Nerf model in the `D-Nerf`_ paper) on the `D-Nerf dataset`_. We used the train
split for training and the test split for evaluation. Our experiments are conducted on a
single NVIDIA TITAN RTX GPU. The training memory footprint is about 11GB.

.. note::

    The :ref:`Occupancy Grid` used in this example is shared by all the frames. In other words,
    instead of using it to indicate the opacity of an area at a single timestamp,
    here we use it to indicate the `maximum` opacity at this area `over all the timestamps`.
    It is not optimal but still makes the rendering very efficient.

+----------------------+----------+---------+-------+---------+-------+--------+---------+-------+-------+
| PSNR                 | bouncing | hell    | hook  | jumping | lego  | mutant | standup | trex  | MEAN  |
|                      | balls    | warrior |       | jacks   |       |        |         |       |       |
+======================+==========+=========+=======+=========+=======+========+=========+=======+=======+
| D-Nerf (~ days)      | 32.80    | 25.02   | 29.25 | 32.80   | 21.64 | 31.29  | 32.79   | 31.75 | 29.67 |
+----------------------+----------+---------+-------+---------+-------+--------+---------+-------+-------+
| Ours  (~ 1 hr)       | 39.49    | 25.58   | 31.86 | 32.73   | 24.32 | 35.55  | 35.90   | 32.33 | 32.22 |
+----------------------+----------+---------+-------+---------+-------+--------+---------+-------+-------+
| Ours  (Training time)| 37min    | 52min   | 69min | 64min   | 44min | 79min  | 79min   | 39min | 58min |
+----------------------+----------+---------+-------+---------+-------+--------+---------+-------+-------+

.. _`D-Nerf`: https://arxiv.org/abs/2011.13961
.. _`D-Nerf dataset`: https://www.dropbox.com/s/0bf6fl0ye2vz3vr/data.zip?dl=0
.. _`github repository`: https://github.com/KAIR-BAIR/nerfacc/tree/76c0f9817da4c9c8b5ccf827eb069ee2ce854b75


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/docs/source/examples/ngp.rst
================================================
.. _`Instant-NGP Example`:

Instant-NGP
====================

See code `examples/train_ngp_nerf.py` at our `github repository`_ for details.

Benchmarks
------------
*updated on 2022-10-12*

Here we trained an `Instant-NGP Nerf`_ model on the `Nerf-Synthetic dataset`_. We follow the same
settings as the Instant-NGP paper, which uses the train split for training and the test split for
evaluation. All experiments are conducted on a single NVIDIA TITAN RTX GPU. The training
memory footprint is about 3GB.

.. note::
    
    The Instant-NGP paper makes use of the alpha channel in the images to apply random background
    augmentation during training. For a fair comparison, we reran their code with a constant white
    background during both training and testing. It is also worth mentioning that we didn't strictly
    follow the training recipe in the Instant-NGP paper (e.g., the learning rate schedule), as
    the purpose of this benchmark is to showcase the library rather than to reproduce the paper.

+-----------------------+-------+-------+---------+-------+-------+-------+-------+-------+-------+
| PSNR                  | Lego  | Mic   |Materials| Chair |Hotdog | Ficus | Drums | Ship  | MEAN  |
|                       |       |       |         |       |       |       |       |       |       |
+=======================+=======+=======+=========+=======+=======+=======+=======+=======+=======+
|Instant-NGP 35k steps  | 35.87 | 36.22 | 29.08   | 35.10 | 37.48 | 30.61 | 23.85 | 30.62 | 32.35 |
+-----------------------+-------+-------+---------+-------+-------+-------+-------+-------+-------+
|(training time)        | 309s  | 258s  | 256s    | 316s  | 292s  | 207s  | 218s  | 250s  | 263s  |
+-----------------------+-------+-------+---------+-------+-------+-------+-------+-------+-------+
|Ours 20k steps         | 35.50 | 36.16 | 29.14   | 35.23 | 37.15 | 31.71 | 24.88 | 29.91 | 32.46 |
+-----------------------+-------+-------+---------+-------+-------+-------+-------+-------+-------+
|(training time)        | 287s  | 274s  | 269s    | 317s  | 269s  | 244s  | 249s  | 257s  | 271s  |
+-----------------------+-------+-------+---------+-------+-------+-------+-------+-------+-------+

.. _`Instant-NGP Nerf`: https://github.com/NVlabs/instant-ngp/tree/51e4107edf48338e9ab0316d56a222e0adf87143
.. _`github repository`: https://github.com/KAIR-BAIR/nerfacc/tree/76c0f9817da4c9c8b5ccf827eb069ee2ce854b75
.. _`Nerf-Synthetic dataset`: https://drive.google.com/drive/folders/1JDdLGDruGNXWnM1eqY1FNL9PlStjaKWi


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/docs/source/examples/unbounded.rst
================================================
Unbounded Scene
====================

See code `examples/train_ngp_nerf.py` at our `github repository`_ for details.

Benchmarks
------------
*updated on 2022-11-07*

Here we trained an `Instant-NGP Nerf`_ on the `MipNerf360`_ dataset. We used the train
split for training and the test split for evaluation. Our experiments are conducted on a
single NVIDIA TITAN RTX GPU. The training memory footprint is about 6-9GB.

The main difference between working with unbounded scenes and bounded scenes is that
a contraction method is needed to map the infinite space to a finite :ref:`Occupancy Grid`.
We provide different options for this (see :ref:`Occupancy Grid`). The experiments
here are basically the Instant-NGP experiments (see :ref:`Instant-NGP Example`) with a contraction method
taken from `MipNerf360`_.

.. note:: 
    Even though we are comparing with `Nerf++`_ and `MipNerf360`_, our model and setup are
    totally different from theirs. There are plenty of ideas from those papers that would be very
    helpful for the performance, but we didn't adopt them. As this is just a simple example to
    show how to use the library, we didn't want to make it too complicated.


+----------------------+-------+-------+-------+-------+-------+-------+-------+-------+
| PSNR                 |Garden |Bicycle|Bonsai |Counter|Kitchen| Room  | Stump | MEAN  |
|                      |       |       |       |       |       |       |       |       |
+======================+=======+=======+=======+=======+=======+=======+=======+=======+
| Nerf++ (~days)       | 24.32 | 22.64 | 29.15 | 26.38 | 27.80 | 28.87 | 24.34 | 26.21 |
+----------------------+-------+-------+-------+-------+-------+-------+-------+-------+
| MipNerf360 (~days)   | 26.98 | 24.37 | 33.46 | 29.55 | 32.23 | 31.63 | 26.40 | 29.23 |
+----------------------+-------+-------+-------+-------+-------+-------+-------+-------+
| Ours (~20 mins)      | 25.41 | 22.97 | 30.71 | 27.34 | 30.32 | 31.00 | 23.43 | 27.31 |
+----------------------+-------+-------+-------+-------+-------+-------+-------+-------+
| Ours (Training time) | 25min | 17min | 19min | 23min | 28min | 20min | 17min | 21min |
+----------------------+-------+-------+-------+-------+-------+-------+-------+-------+

.. _`Instant-NGP Nerf`: https://arxiv.org/abs/2201.05989
.. _`MipNerf360`: https://arxiv.org/abs/2111.12077
.. _`Nerf++`: https://arxiv.org/abs/2010.07492
.. _`github repository`: https://github.com/KAIR-BAIR/nerfacc/tree/76c0f9817da4c9c8b5ccf827eb069ee2ce854b75


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/docs/source/examples/vanilla.rst
================================================
Vanilla Nerf 
====================

See code `examples/train_mlp_nerf.py` at our `github repository`_ for details.

Benchmarks
------------
*updated on 2022-10-08*

Here we trained an 8-layer MLP for the radiance field as in the `vanilla Nerf`_. We used the
train split for training and the test split for evaluation as in the Nerf paper. Our experiments are
conducted on a single NVIDIA TITAN RTX GPU. The training memory footprint is about 10GB.

.. note:: 
    The vanilla Nerf paper uses two MLPs for coarse-to-fine sampling. Here we instead use only a
    single MLP with more samples (1024). Both approaches share the same goal of sampling densely
    around the surface. Our fast rendering inherently skips samples away from the surface,
    so we can simply increase the number of samples with a single MLP to achieve the same goal
    as coarse-to-fine sampling, without runtime or memory issues.

+----------------------+-------+-------+---------+-------+-------+-------+-------+-------+-------+
| PSNR                 | Lego  | Mic   |Materials| Chair |Hotdog | Ficus | Drums | Ship  | MEAN  |
|                      |       |       |         |       |       |       |       |       |       |
+======================+=======+=======+=========+=======+=======+=======+=======+=======+=======+
| NeRF  (~ days)       | 32.54 | 32.91 | 29.62   | 33.00 | 36.18 | 30.13 | 25.01 | 28.65 | 31.00 |
+----------------------+-------+-------+---------+-------+-------+-------+-------+-------+-------+
| Ours  (~ 50min)      | 33.69 | 33.76 | 29.73   | 33.32 | 35.80 | 32.52 | 25.39 | 28.18 | 31.55 |
+----------------------+-------+-------+---------+-------+-------+-------+-------+-------+-------+
| Ours  (Training time)| 58min | 53min | 46min   | 62min | 56min | 42min | 52min | 49min | 52min |
+----------------------+-------+-------+---------+-------+-------+-------+-------+-------+-------+

.. _`github repository`: https://github.com/KAIR-BAIR/nerfacc/tree/76c0f9817da4c9c8b5ccf827eb069ee2ce854b75
.. _`vanilla Nerf`: https://arxiv.org/abs/2003.08934


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/docs/source/index.rst
================================================
NerfAcc Documentation
===================================

NerfAcc is a PyTorch Nerf acceleration toolbox for both training and inference. It focuses on
efficient volumetric rendering of radiance fields, which is universal and plug-and-play for most of the NeRFs.

Using NerfAcc, 

- The `vanilla Nerf`_ model with 8-layer MLPs can be trained to *better quality* (+~0.5 PSNR) \
  in *1 hour* rather than *1~2 days* as in the paper.
- The `Instant-NGP Nerf`_ model can be trained to *equal quality* in *4.5 minutes*, \
  comparing to the official pure-CUDA implementation.
- The `D-Nerf`_ model for *dynamic* objects can also be trained in *1 hour* \
  rather than *2 days* as in the paper, and with *better quality* (+~2.5 PSNR).
- Both *bounded* and *unbounded* scenes are supported.

**And it is a pure Python interface with flexible APIs!**

| Github: https://github.com/KAIR-BAIR/nerfacc
| Paper: https://arxiv.org/pdf/2210.04847.pdf
| Authors: `Ruilong Li`_, `Matthew Tancik`_, `Angjoo Kanazawa`_

.. note::

   This repo focuses on the single-scene setting. Generalizable Nerfs across
   multiple scenes are currently out of the scope of this repo, but you may still find
   some useful tricks in this repo. :)


Installation:
-------------

.. code-block:: console

   $ pip install nerfacc

Usage:
-------------

The idea of NerfAcc is to perform efficient ray marching and volumetric rendering.
So NerfAcc can work with any user-defined radiance field. To plug the NerfAcc rendering
pipeline into your code and enjoy the acceleration, you only need to define two functions
with your radiance field.

- `sigma_fn`: Compute density at each sample. It will be used by :func:`nerfacc.ray_marching` to skip the empty and occluded space during ray marching, which is where the major speedup comes from. 
- `rgb_sigma_fn`: Compute color and density at each sample. It will be used by :func:`nerfacc.rendering` to conduct differentiable volumetric rendering. This function will receive gradients to update your network.

A simple example is like this:

.. code-block:: python

   import torch
   from torch import Tensor
   import nerfacc 

   radiance_field = ...  # network: a NeRF model
   rays_o: Tensor = ...  # ray origins. (n_rays, 3)
   rays_d: Tensor = ...  # ray normalized directions. (n_rays, 3)
   optimizer = ...  # optimizer

   def sigma_fn(
      t_starts: Tensor, t_ends:Tensor, ray_indices: Tensor
   ) -> Tensor:
      """ Query density values from a user-defined radiance field.
      :params t_starts: Start of the sample interval along the ray. (n_samples, 1).
      :params t_ends: End of the sample interval along the ray. (n_samples, 1).
      :params ray_indices: Ray indices that each sample belongs to. (n_samples,).
      :returns The post-activation density values. (n_samples, 1).
      """
      t_origins = rays_o[ray_indices]  # (n_samples, 3)
      t_dirs = rays_d[ray_indices]  # (n_samples, 3)
      positions = t_origins + t_dirs * (t_starts + t_ends) / 2.0
      sigmas = radiance_field.query_density(positions) 
      return sigmas  # (n_samples, 1)

   def rgb_sigma_fn(
      t_starts: Tensor, t_ends: Tensor, ray_indices: Tensor
   ) -> Tuple[Tensor, Tensor]:
      """ Query rgb and density values from a user-defined radiance field.
      :params t_starts: Start of the sample interval along the ray. (n_samples, 1).
      :params t_ends: End of the sample interval along the ray. (n_samples, 1).
      :params ray_indices: Ray indices that each sample belongs to. (n_samples,).
      :returns The post-activation rgb and density values. 
         (n_samples, 3), (n_samples, 1).
      """
      t_origins = rays_o[ray_indices]  # (n_samples, 3)
      t_dirs = rays_d[ray_indices]  # (n_samples, 3)
      positions = t_origins + t_dirs * (t_starts + t_ends) / 2.0
      rgbs, sigmas = radiance_field(positions, condition=t_dirs)  
      return rgbs, sigmas  # (n_samples, 3), (n_samples, 1)

   # Efficient Raymarching: Skip empty and occluded space, pack samples from all rays.
   # ray_indices: (n_samples,). t_starts: (n_samples, 1). t_ends: (n_samples, 1).
   with torch.no_grad():
      ray_indices, t_starts, t_ends = nerfacc.ray_marching(
         rays_o, rays_d, sigma_fn=sigma_fn, near_plane=0.2, far_plane=1.0, 
         early_stop_eps=1e-4, alpha_thre=1e-2, 
      )

   # Differentiable Volumetric Rendering.
   # colors: (n_rays, 3). opacity: (n_rays, 1). depth: (n_rays, 1).
   color, opacity, depth = nerfacc.rendering(
      t_starts, t_ends, ray_indices, n_rays=rays_o.shape[0], rgb_sigma_fn=rgb_sigma_fn
   )

   # Optimize: Both the network and rays will receive gradients
   optimizer.zero_grad()
   loss = F.mse_loss(color, color_gt)
   loss.backward()
   optimizer.step()


Links:
-------------

.. toctree::
   :glob:
   :maxdepth: 1
   :caption: Python API

   apis/*

.. toctree::
   :glob:
   :maxdepth: 1
   :caption: Example Usages

   examples/*

.. toctree::
   :maxdepth: 1
   :caption: Projects

   nerfstudio <https://docs.nerf.studio/>


.. _`vanilla Nerf`: https://arxiv.org/abs/2003.08934
.. _`Instant-NGP Nerf`: https://arxiv.org/abs/2201.05989
.. _`D-Nerf`: https://arxiv.org/abs/2011.13961
.. _`MipNerf360`: https://arxiv.org/abs/2111.12077
.. _`pixel-Nerf`: https://arxiv.org/abs/2012.02190
.. _`Nerf++`: https://arxiv.org/abs/2010.07492

.. _`Ruilong Li`: https://www.liruilong.cn/
.. _`Matthew Tancik`: https://www.matthewtancik.com/
.. _`Angjoo Kanazawa`: https://people.eecs.berkeley.edu/~kanazawa/

================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/examples/datasets/__init__.py
================================================


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/examples/datasets/dnerf_synthetic.py
================================================
"""
Copyright (c) 2022 Ruilong Li, UC Berkeley.
"""

import json
import os

import imageio.v2 as imageio
import numpy as np
import torch
import torch.nn.functional as F

from .utils import Rays


def _load_renderings(root_fp: str, subject_id: str, split: str):
    """Load the images, camera poses and timestamps of one subject split.

    Reads ``transforms_{split}.json`` from ``<root_fp>/<subject_id>`` and then
    loads every frame image listed in it (``<file_path>.png``).

    Args:
        root_fp: Dataset root directory. A path that does not start with "/"
            is resolved relative to the directory two levels above this file.
        subject_id: Name of the subject sub-directory inside ``root_fp``.
        split: Split name used to select the ``transforms_{split}.json`` file.

    Returns:
        A tuple ``(images, camtoworlds, focal, timestamps)``:

        * ``images``: all frame images stacked along a new leading axis
          (axes 1 and 2 are treated as height and width).
        * ``camtoworlds``: the per-frame ``transform_matrix`` entries stacked
          along a new leading axis.
        * ``focal``: focal length derived from the image width and the
          ``camera_angle_x`` entry of the metadata.
        * ``timestamps``: one value per frame; the frame's ``"time"`` entry
          when present, otherwise the frame index spread evenly over [0, 1].

    Raises:
        OSError: If the metadata file cannot be opened.
        KeyError: If a required metadata key is missing.
    """
    if not root_fp.startswith("/"):
        # allow relative path. e.g., "./data/dnerf_synthetic/"
        root_fp = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            "..",
            "..",
            root_fp,
        )

    data_dir = os.path.join(root_fp, subject_id)
    meta_path = os.path.join(data_dir, "transforms_{}.json".format(split))
    with open(meta_path, "r") as fp:
        meta = json.load(fp)

    frames = meta["frames"]
    # Fallback timestamps spread the frames evenly over [0, 1]. Clamp the
    # denominator so a single-frame sequence maps to 0.0 instead of raising
    # ZeroDivisionError.
    time_denominator = max(len(frames) - 1, 1)

    images = []
    camtoworlds = []
    timestamps = []
    for i, frame in enumerate(frames):
        fname = os.path.join(data_dir, frame["file_path"] + ".png")
        images.append(imageio.imread(fname))
        camtoworlds.append(frame["transform_matrix"])
        timestamps.append(
            frame["time"] if "time" in frame else float(i) / time_denominator
        )

    images = np.stack(images, axis=0)
    camtoworlds = np.stack(camtoworlds, axis=0)
    timestamps = np.stack(timestamps, axis=0)

    # Pinhole relation between the horizontal field of view and the focal
    # length, using the image width (axis 2 of the stacked images).
    image_width = images.shape[2]
    camera_angle_x = float(meta["camera_angle_x"])
    focal = 0.5 * image_width / np.tan(0.5 * camera_angle_x)

    return images, camtoworlds, focal, timestamps


class SubjectLoader(torch.utils.data.Dataset):
    """Per-subject dataset that turns loaded renderings into ray samples.

    When `training` is set (a ray budget was given and the split is a training
    split) every item is `num_rays` randomly drawn pixels; otherwise an item
    is every pixel of the image at the requested index.
    """

    SPLITS = ["train", "val", "test"]
    SUBJECT_IDS = [
        "bouncingballs",
        "hellwarrior",
        "hook",
        "jumpingjacks",
        "lego",
        "mutant",
        "standup",
        "trex",
    ]

    WIDTH, HEIGHT = 800, 800
    NEAR, FAR = 2.0, 6.0
    OPENGL_CAMERA = True

    def __init__(
        self,
        subject_id: str,
        root_fp: str,
        split: str,
        color_bkgd_aug: str = "white",
        num_rays: int = None,
        near: float = None,
        far: float = None,
        batch_over_images: bool = True,
    ):
        """Load one subject/split and precompute the intrinsic matrix.

        `near` / `far` fall back to the class constants when omitted.
        """
        super().__init__()
        assert split in self.SPLITS, "%s" % split
        assert subject_id in self.SUBJECT_IDS, "%s" % subject_id
        assert color_bkgd_aug in ["white", "black", "random"]

        self.split = split
        self.num_rays = num_rays
        self.near = near if near is not None else self.NEAR
        self.far = far if far is not None else self.FAR
        # Random ray sampling needs both a ray budget and a training split.
        self.training = num_rays is not None and split in ["train", "trainval"]
        self.color_bkgd_aug = color_bkgd_aug
        self.batch_over_images = batch_over_images

        images, camtoworlds, focal, timestamps = _load_renderings(
            root_fp, subject_id, split
        )
        self.focal = focal
        self.images = torch.from_numpy(images).to(torch.uint8)
        self.camtoworlds = torch.from_numpy(camtoworlds).to(torch.float32)
        # Trailing singleton axis: one timestamp column per image.
        per_image_time = torch.from_numpy(timestamps).to(torch.float32)
        self.timestamps = per_image_time[:, None]

        # Pinhole intrinsics with the principal point at the image centre.
        center_x = self.WIDTH / 2.0
        center_y = self.HEIGHT / 2.0
        self.K = torch.tensor(
            [
                [self.focal, 0, center_x],
                [0, self.focal, center_y],
                [0, 0, 1],
            ],
            dtype=torch.float32,
        )  # (3, 3)
        assert self.images.shape[1:3] == (self.HEIGHT, self.WIDTH)

    def __len__(self):
        """Number of loaded images."""
        return len(self.images)

    @torch.no_grad()
    def __getitem__(self, index):
        """Fetch the raw sample for `index` and composite its background."""
        return self.preprocess(self.fetch_data(index))

    def preprocess(self, data):
        """Composite the fetched RGBA sample over a background color.

        The background follows `color_bkgd_aug` while training and is always
        white otherwise. Keys of `data` other than "rgba" and "rays" are
        forwarded unchanged.
        """
        rays = data["rays"]
        pixels, alpha = torch.split(data["rgba"], [3, 1], dim=-1)
        device = self.images.device

        if not self.training:
            # just use white during inference
            color_bkgd = torch.ones(3, device=device)
        elif self.color_bkgd_aug == "random":
            color_bkgd = torch.rand(3, device=device)
        elif self.color_bkgd_aug == "white":
            color_bkgd = torch.ones(3, device=device)
        elif self.color_bkgd_aug == "black":
            color_bkgd = torch.zeros(3, device=device)

        result = {
            "pixels": pixels * alpha + color_bkgd * (1.0 - alpha),
            "rays": rays,  # [n_rays,] or [h, w]
            "color_bkgd": color_bkgd,  # [3,]
        }
        result.update(
            (key, value)
            for key, value in data.items()
            if key not in ["rgba", "rays"]
        )
        return result

    def update_num_rays(self, num_rays):
        """Change the number of rays drawn per training item."""
        self.num_rays = num_rays

    def fetch_data(self, index):
        """Gather colors, rays and timestamps for one item (no compositing)."""
        num_rays = self.num_rays
        device = self.images.device

        # Source image(s). The random image draw comes before the pixel draws
        # so the random stream is always consumed in the same order.
        if self.training and self.batch_over_images:
            image_id = torch.randint(
                0, len(self.images), size=(num_rays,), device=device
            )
        else:
            image_id = [index]

        # Pixel coordinates: random while training, the full grid otherwise.
        if self.training:
            x = torch.randint(0, self.WIDTH, size=(num_rays,), device=device)
            y = torch.randint(0, self.HEIGHT, size=(num_rays,), device=device)
        else:
            cols, rows = torch.meshgrid(
                torch.arange(self.WIDTH, device=device),
                torch.arange(self.HEIGHT, device=device),
                indexing="xy",
            )
            x, y = cols.flatten(), rows.flatten()

        rgba = self.images[image_id, y, x] / 255.0
        c2w = self.camtoworlds[image_id]

        # Camera-space directions through the pixel centres; the y and z
        # components are negated when OPENGL_CAMERA is set.
        axis_sign = -1.0 if self.OPENGL_CAMERA else 1.0
        dir_x = (x - self.K[0, 2] + 0.5) / self.K[0, 0]
        dir_y = (y - self.K[1, 2] + 0.5) / self.K[1, 1] * axis_sign
        camera_dirs = F.pad(
            torch.stack([dir_x, dir_y], dim=-1),
            (0, 1),
            value=axis_sign,
        )  # [num_rays, 3]

        # Apply the rotation part of the pose to every direction.
        rotation = c2w[:, :3, :3]
        directions = (camera_dirs[:, None, :] * rotation).sum(dim=-1)
        origins = torch.broadcast_to(c2w[:, :3, -1], directions.shape)
        viewdirs = directions / torch.linalg.norm(
            directions, dim=-1, keepdims=True
        )

        if self.training:
            lead_shape = (num_rays,)
        else:
            lead_shape = (self.HEIGHT, self.WIDTH)
        origins = torch.reshape(origins, lead_shape + (3,))
        viewdirs = torch.reshape(viewdirs, lead_shape + (3,))
        rgba = torch.reshape(rgba, lead_shape + (4,))

        return {
            "rgba": rgba,  # [h, w, 4] or [num_rays, 4]
            "rays": Rays(origins=origins, viewdirs=viewdirs),
            "timestamps": self.timestamps[image_id],
        }


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/examples/datasets/nerf_360_v2.py
================================================
"""
Copyright (c) 2022 Ruilong Li, UC Berkeley.
"""

import collections
import os
import sys

import imageio
import numpy as np
import torch
import torch.nn.functional as F
import tqdm

from .utils import Rays

# Absolute path of this module file; used below to locate the pycolmap folder.
_PATH = os.path.abspath(__file__)

# Put `../pycolmap/pycolmap` (relative to this file's directory) at the front
# of the module search path so that `scene_manager` can be imported as a
# top-level module on the next line.
sys.path.insert(
    0, os.path.join(os.path.dirname(_PATH), "..", "pycolmap", "pycolmap")
)
from scene_manager import SceneManager


def _load_colmap(root_fp: str, subject_id: str, split: str, factor: int = 1):
    """Load images, camera-to-world poses and intrinsics of a COLMAP scene.

    Args:
        root_fp: Dataset root directory.
        subject_id: Scene sub-directory under `root_fp`.
        split: "test" selects every 8th image (in filename order), "train"
            selects the remaining ones.
        factor: Downsampling factor, one of 1, 2, 4, 8. For a factor above 1
            images are read from the `images_{factor}` folder and the first
            two rows of the intrinsic matrix are divided by `factor`.

    Returns:
        A tuple `(images, camtoworlds, K)` restricted to the requested split,
        where `K` is the 3x3 intrinsic matrix shared by all images.

    Raises:
        AssertionError: If `factor` is unsupported, or if the camera model
            carries distortion parameters (only pinhole models are accepted).
        ValueError: If an image folder is missing or the camera model is not
            one of the recognized types.
        KeyError: If `split` is neither "train" nor "test".
    """
    assert factor in [1, 2, 4, 8]

    data_dir = os.path.join(root_fp, subject_id)
    colmap_dir = os.path.join(data_dir, "sparse/0/")

    manager = SceneManager(colmap_dir)
    manager.load_cameras()
    manager.load_images()

    # Assume shared intrinsics between all cameras.
    cam = manager.cameras[1]
    fx, fy, cx, cy = cam.fx, cam.fy, cam.cx, cam.cy
    K = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]])
    K[:2, :] /= factor

    # Extract extrinsic matrices in world-to-camera format.
    imdata = manager.images
    w2c_mats = []
    bottom = np.array([0, 0, 0, 1]).reshape(1, 4)
    for k in imdata:
        im = imdata[k]
        rot = im.R()
        trans = im.tvec.reshape(3, 1)
        w2c = np.concatenate([np.concatenate([rot, trans], 1), bottom], axis=0)
        w2c_mats.append(w2c)
    w2c_mats = np.stack(w2c_mats, axis=0)

    # Convert extrinsics to camera-to-world.
    camtoworlds = np.linalg.inv(w2c_mats)

    # Image names from COLMAP. No need for permuting the poses according to
    # image names anymore.
    image_names = [imdata[k].name for k in imdata]

    # Get distortion parameters. This is one exclusive chain: previously the
    # dispatch was split into two separate `if` statements and an unknown
    # camera type left `params` unbound, so the assertion below failed with
    # an UnboundLocalError instead of a meaningful message.
    type_ = cam.camera_type

    if type_ in (0, "SIMPLE_PINHOLE", 1, "PINHOLE"):
        params = None

    elif type_ in (2, "SIMPLE_RADIAL"):
        params = {k: 0.0 for k in ["k1", "k2", "k3", "p1", "p2"]}
        params["k1"] = cam.k1

    elif type_ in (3, "RADIAL"):
        params = {k: 0.0 for k in ["k1", "k2", "k3", "p1", "p2"]}
        params["k1"] = cam.k1
        params["k2"] = cam.k2

    elif type_ in (4, "OPENCV"):
        params = {k: 0.0 for k in ["k1", "k2", "k3", "p1", "p2"]}
        params["k1"] = cam.k1
        params["k2"] = cam.k2
        params["p1"] = cam.p1
        params["p2"] = cam.p2

    elif type_ in (5, "OPENCV_FISHEYE"):
        params = {k: 0.0 for k in ["k1", "k2", "k3", "k4"]}
        params["k1"] = cam.k1
        params["k2"] = cam.k2
        params["k3"] = cam.k3
        params["k4"] = cam.k4

    else:
        raise ValueError(f"Unrecognized COLMAP camera type: {type_!r}")

    assert params is None, "Only support pinhole camera model."

    # Previous Nerf results were generated with images sorted by filename,
    # ensure metrics are reported on the same test set.
    inds = np.argsort(image_names)
    image_names = [image_names[i] for i in inds]
    camtoworlds = camtoworlds[inds]

    # Load images.
    if factor > 1:
        image_dir_suffix = f"_{factor}"
    else:
        image_dir_suffix = ""
    colmap_image_dir = os.path.join(data_dir, "images")
    image_dir = os.path.join(data_dir, "images" + image_dir_suffix)
    for d in [image_dir, colmap_image_dir]:
        if not os.path.exists(d):
            raise ValueError(f"Image folder {d} does not exist.")
    # Downsampled images may have different names vs images used for COLMAP,
    # so we need to map between the two sorted lists of files.
    colmap_files = sorted(os.listdir(colmap_image_dir))
    image_files = sorted(os.listdir(image_dir))
    colmap_to_image = dict(zip(colmap_files, image_files))
    image_paths = [
        os.path.join(image_dir, colmap_to_image[f]) for f in image_names
    ]
    print("loading images")
    images = [imageio.imread(x) for x in tqdm.tqdm(image_paths)]
    images = np.stack(images, axis=0)

    # Select the split.
    all_indices = np.arange(images.shape[0])
    split_indices = {
        "test": all_indices[all_indices % 8 == 0],
        "train": all_indices[all_indices % 8 != 0],
    }
    indices = split_indices[split]
    # All per-image quantities must be re-indexed using the split indices.
    images = images[indices]
    camtoworlds = camtoworlds[indices]

    return images, camtoworlds, K


class SubjectLoader(torch.utils.data.Dataset):
    """Per-subject dataset that turns COLMAP-posed images into ray samples.

    When `training` is set (a ray budget was given and the split is a training
    split) every item is `num_rays` randomly drawn pixels; otherwise an item
    is every pixel of the image at the requested index.
    """

    SPLITS = ["train", "test"]
    SUBJECT_IDS = [
        "garden",
        "bicycle",
        "bonsai",
        "counter",
        "kitchen",
        "room",
        "stump",
    ]

    OPENGL_CAMERA = False

    def __init__(
        self,
        subject_id: str,
        root_fp: str,
        split: str,
        color_bkgd_aug: str = "white",
        num_rays: int = None,
        near: float = None,
        far: float = None,
        batch_over_images: bool = True,
        factor: int = 1,
    ):
        """Load one subject/split; image size is taken from the loaded data."""
        super().__init__()
        assert split in self.SPLITS, "%s" % split
        assert subject_id in self.SUBJECT_IDS, "%s" % subject_id
        assert color_bkgd_aug in ["white", "black", "random"]

        self.split = split
        self.num_rays = num_rays
        # Stored as given; no class-level default is applied here.
        self.near = near
        self.far = far
        # Random ray sampling needs both a ray budget and a training split.
        self.training = num_rays is not None and split in ["train", "trainval"]
        self.color_bkgd_aug = color_bkgd_aug
        self.batch_over_images = batch_over_images

        images, camtoworlds, intrinsics = _load_colmap(
            root_fp, subject_id, split, factor
        )
        self.images = torch.from_numpy(images).to(torch.uint8)
        self.camtoworlds = torch.from_numpy(camtoworlds).to(torch.float32)
        self.K = torch.tensor(intrinsics).to(torch.float32)
        self.height, self.width = self.images.shape[1:3]

    def __len__(self):
        """Number of loaded images."""
        return len(self.images)

    @torch.no_grad()
    def __getitem__(self, index):
        """Fetch the raw sample for `index` and attach a background color."""
        return self.preprocess(self.fetch_data(index))

    def preprocess(self, data):
        """Rename the fetched colors to "pixels" and pick a background color.

        The background color is returned alongside the pixels but is not
        blended into them. Keys of `data` other than "rgb" and "rays" are
        forwarded unchanged.
        """
        pixels = data["rgb"]
        rays = data["rays"]
        device = self.images.device

        if not self.training:
            # just use white during inference
            color_bkgd = torch.ones(3, device=device)
        elif self.color_bkgd_aug == "random":
            color_bkgd = torch.rand(3, device=device)
        elif self.color_bkgd_aug == "white":
            color_bkgd = torch.ones(3, device=device)
        elif self.color_bkgd_aug == "black":
            color_bkgd = torch.zeros(3, device=device)

        result = {
            "pixels": pixels,  # [n_rays, 3] or [h, w, 3]
            "rays": rays,  # [n_rays,] or [h, w]
            "color_bkgd": color_bkgd,  # [3,]
        }
        result.update(
            (key, value)
            for key, value in data.items()
            if key not in ["rgb", "rays"]
        )
        return result

    def update_num_rays(self, num_rays):
        """Change the number of rays drawn per training item."""
        self.num_rays = num_rays

    def fetch_data(self, index):
        """Gather colors and rays for one item."""
        num_rays = self.num_rays
        device = self.images.device

        # Source image(s). The random image draw comes before the pixel draws
        # so the random stream is always consumed in the same order.
        if self.training and self.batch_over_images:
            image_id = torch.randint(
                0, len(self.images), size=(num_rays,), device=device
            )
        else:
            image_id = [index]

        # Pixel coordinates: random while training, the full grid otherwise.
        if self.training:
            x = torch.randint(0, self.width, size=(num_rays,), device=device)
            y = torch.randint(0, self.height, size=(num_rays,), device=device)
        else:
            cols, rows = torch.meshgrid(
                torch.arange(self.width, device=device),
                torch.arange(self.height, device=device),
                indexing="xy",
            )
            x, y = cols.flatten(), rows.flatten()

        rgb = self.images[image_id, y, x] / 255.0
        c2w = self.camtoworlds[image_id]

        # Camera-space directions through the pixel centres; the y and z
        # components are negated when OPENGL_CAMERA is set.
        axis_sign = -1.0 if self.OPENGL_CAMERA else 1.0
        dir_x = (x - self.K[0, 2] + 0.5) / self.K[0, 0]
        dir_y = (y - self.K[1, 2] + 0.5) / self.K[1, 1] * axis_sign
        camera_dirs = F.pad(
            torch.stack([dir_x, dir_y], dim=-1),
            (0, 1),
            value=axis_sign,
        )  # [num_rays, 3]

        # Apply the rotation part of the pose to every direction.
        rotation = c2w[:, :3, :3]
        directions = (camera_dirs[:, None, :] * rotation).sum(dim=-1)
        origins = torch.broadcast_to(c2w[:, :3, -1], directions.shape)
        viewdirs = directions / torch.linalg.norm(
            directions, dim=-1, keepdims=True
        )

        if self.training:
            lead_shape = (num_rays,)
        else:
            lead_shape = (self.height, self.width)
        origins = torch.reshape(origins, lead_shape + (3,))
        viewdirs = torch.reshape(viewdirs, lead_shape + (3,))
        rgb = torch.reshape(rgb, lead_shape + (3,))

        return {
            "rgb": rgb,  # [h, w, 3] or [num_rays, 3]
            "rays": Rays(origins=origins, viewdirs=viewdirs),
        }


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/examples/datasets/nerf_synthetic.py
================================================
"""
Copyright (c) 2022 Ruilong Li, UC Berkeley.
"""

import collections
import json
import os

import imageio.v2 as imageio
import numpy as np
import torch
import torch.nn.functional as F

from .utils import Rays


def _load_renderings(root_fp: str, subject_id: str, split: str):
    """Read the renderings and camera poses listed for one split.

    A `root_fp` that does not start with "/" is resolved relative to the
    directory two levels above this file. Returns `(images, camtoworlds,
    focal)`, the first two stacked over frames in file order and `focal`
    derived from `camera_angle_x` and the image width.
    """
    if not root_fp.startswith("/"):
        # allow relative path. e.g., "./data/nerf_synthetic/"
        here = os.path.dirname(os.path.abspath(__file__))
        root_fp = os.path.join(here, "..", "..", root_fp)

    data_dir = os.path.join(root_fp, subject_id)
    transforms_path = os.path.join(
        data_dir, "transforms_{}.json".format(split)
    )
    with open(transforms_path, "r") as fp:
        meta = json.load(fp)

    image_list = []
    pose_list = []
    for frame in meta["frames"]:
        image_file = os.path.join(data_dir, frame["file_path"] + ".png")
        image_list.append(imageio.imread(image_file))
        pose_list.append(frame["transform_matrix"])

    images = np.stack(image_list, axis=0)
    camtoworlds = np.stack(pose_list, axis=0)

    # Axes 1 and 2 of the stacked images are height and width.
    _, width = images.shape[1:3]
    half_fov = 0.5 * float(meta["camera_angle_x"])
    focal = 0.5 * width / np.tan(half_fov)

    return images, camtoworlds, focal


class SubjectLoader(torch.utils.data.Dataset):
    """Per-subject dataset that turns loaded renderings into ray samples.

    When `training` is set (a ray budget was given and the split is "train"
    or "trainval") every item is `num_rays` randomly drawn pixels; otherwise
    an item is every pixel of the image at the requested index.
    """

    SPLITS = ["train", "val", "trainval", "test"]
    SUBJECT_IDS = [
        "chair",
        "drums",
        "ficus",
        "hotdog",
        "lego",
        "materials",
        "mic",
        "ship",
    ]

    WIDTH, HEIGHT = 800, 800
    NEAR, FAR = 2.0, 6.0
    OPENGL_CAMERA = True

    def __init__(
        self,
        subject_id: str,
        root_fp: str,
        split: str,
        color_bkgd_aug: str = "white",
        num_rays: int = None,
        near: float = None,
        far: float = None,
        batch_over_images: bool = True,
    ):
        """Load one subject/split and precompute the intrinsic matrix.

        The "trainval" split is the concatenation of "train" followed by
        "val". `near` / `far` fall back to the class constants when omitted.
        """
        super().__init__()
        assert split in self.SPLITS, "%s" % split
        assert subject_id in self.SUBJECT_IDS, "%s" % subject_id
        assert color_bkgd_aug in ["white", "black", "random"]

        self.split = split
        self.num_rays = num_rays
        self.near = near if near is not None else self.NEAR
        self.far = far if far is not None else self.FAR
        # Random ray sampling needs both a ray budget and a training split.
        self.training = num_rays is not None and split in ["train", "trainval"]
        self.color_bkgd_aug = color_bkgd_aug
        self.batch_over_images = batch_over_images

        if split != "trainval":
            images, camtoworlds, focal = _load_renderings(
                root_fp, subject_id, split
            )
        else:
            train_images, train_poses, train_focal = _load_renderings(
                root_fp, subject_id, "train"
            )
            val_images, val_poses, _ = _load_renderings(
                root_fp, subject_id, "val"
            )
            images = np.concatenate([train_images, val_images])
            camtoworlds = np.concatenate([train_poses, val_poses])
            # The focal length of the train split is used for both parts.
            focal = train_focal

        self.focal = focal
        self.images = torch.from_numpy(images).to(torch.uint8)
        self.camtoworlds = torch.from_numpy(camtoworlds).to(torch.float32)

        # Pinhole intrinsics with the principal point at the image centre.
        center_x = self.WIDTH / 2.0
        center_y = self.HEIGHT / 2.0
        self.K = torch.tensor(
            [
                [self.focal, 0, center_x],
                [0, self.focal, center_y],
                [0, 0, 1],
            ],
            dtype=torch.float32,
        )  # (3, 3)
        assert self.images.shape[1:3] == (self.HEIGHT, self.WIDTH)

    def __len__(self):
        """Number of loaded images."""
        return len(self.images)

    @torch.no_grad()
    def __getitem__(self, index):
        """Fetch the raw sample for `index` and composite its background."""
        return self.preprocess(self.fetch_data(index))

    def preprocess(self, data):
        """Composite the fetched RGBA sample over a background color.

        The background follows `color_bkgd_aug` while training and is always
        white otherwise. Keys of `data` other than "rgba" and "rays" are
        forwarded unchanged.
        """
        rays = data["rays"]
        pixels, alpha = torch.split(data["rgba"], [3, 1], dim=-1)
        device = self.images.device

        if not self.training:
            # just use white during inference
            color_bkgd = torch.ones(3, device=device)
        elif self.color_bkgd_aug == "random":
            color_bkgd = torch.rand(3, device=device)
        elif self.color_bkgd_aug == "white":
            color_bkgd = torch.ones(3, device=device)
        elif self.color_bkgd_aug == "black":
            color_bkgd = torch.zeros(3, device=device)

        result = {
            "pixels": pixels * alpha + color_bkgd * (1.0 - alpha),
            "rays": rays,  # [n_rays,] or [h, w]
            "color_bkgd": color_bkgd,  # [3,]
        }
        result.update(
            (key, value)
            for key, value in data.items()
            if key not in ["rgba", "rays"]
        )
        return result

    def update_num_rays(self, num_rays):
        """Change the number of rays drawn per training item."""
        self.num_rays = num_rays

    def fetch_data(self, index):
        """Gather colors and rays for one item (no compositing)."""
        num_rays = self.num_rays
        device = self.images.device

        # Source image(s). The random image draw comes before the pixel draws
        # so the random stream is always consumed in the same order.
        if self.training and self.batch_over_images:
            image_id = torch.randint(
                0, len(self.images), size=(num_rays,), device=device
            )
        else:
            image_id = [index]

        # Pixel coordinates: random while training, the full grid otherwise.
        if self.training:
            x = torch.randint(0, self.WIDTH, size=(num_rays,), device=device)
            y = torch.randint(0, self.HEIGHT, size=(num_rays,), device=device)
        else:
            cols, rows = torch.meshgrid(
                torch.arange(self.WIDTH, device=device),
                torch.arange(self.HEIGHT, device=device),
                indexing="xy",
            )
            x, y = cols.flatten(), rows.flatten()

        rgba = self.images[image_id, y, x] / 255.0
        c2w = self.camtoworlds[image_id]

        # Camera-space directions through the pixel centres; the y and z
        # components are negated when OPENGL_CAMERA is set.
        axis_sign = -1.0 if self.OPENGL_CAMERA else 1.0
        dir_x = (x - self.K[0, 2] + 0.5) / self.K[0, 0]
        dir_y = (y - self.K[1, 2] + 0.5) / self.K[1, 1] * axis_sign
        camera_dirs = F.pad(
            torch.stack([dir_x, dir_y], dim=-1),
            (0, 1),
            value=axis_sign,
        )  # [num_rays, 3]

        # Apply the rotation part of the pose to every direction.
        rotation = c2w[:, :3, :3]
        directions = (camera_dirs[:, None, :] * rotation).sum(dim=-1)
        origins = torch.broadcast_to(c2w[:, :3, -1], directions.shape)
        viewdirs = directions / torch.linalg.norm(
            directions, dim=-1, keepdims=True
        )

        if self.training:
            lead_shape = (num_rays,)
        else:
            lead_shape = (self.HEIGHT, self.WIDTH)
        origins = torch.reshape(origins, lead_shape + (3,))
        viewdirs = torch.reshape(viewdirs, lead_shape + (3,))
        rgba = torch.reshape(rgba, lead_shape + (4,))

        return {
            "rgba": rgba,  # [h, w, 4] or [num_rays, 4]
            "rays": Rays(origins=origins, viewdirs=viewdirs),
        }


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/examples/datasets/utils.py
================================================
"""
Copyright (c) 2022 Ruilong Li, UC Berkeley.
"""

import collections

# A bundle of rays: `origins` holds the ray start points and `viewdirs` the
# ray directions.
Rays = collections.namedtuple("Rays", ("origins", "viewdirs"))


def namedtuple_map(fn, tup):
    """Map `fn` over the fields of `tup`, keeping `tup`'s namedtuple type.

    Fields that are `None` are passed through without calling `fn`.
    """
    mapped = [fn(item) if item is not None else None for item in tup]
    return type(tup)(*mapped)


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/examples/radiance_fields/__init__.py
================================================


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/examples/radiance_fields/mlp.py
================================================
"""
Copyright (c) 2022 Ruilong Li, UC Berkeley.
"""

import functools
import math
from typing import Callable, Optional

import torch
import torch.nn as nn
import torch.nn.functional as F


class MLP(nn.Module):
    """Stack of linear layers with optional re-injection of the input.

    After hidden layer `i` (for `i > 0` and `i % skip_layer == 0`) the
    original input is concatenated to the activations. When `output_enabled`
    is False no output layer is created and `output_dim` is set to the width
    of the final hidden features.
    """

    def __init__(
        self,
        input_dim: int,  # The number of input tensor channels.
        output_dim: int = None,  # The number of output tensor channels.
        net_depth: int = 8,  # The depth of the MLP.
        net_width: int = 256,  # The width of the MLP.
        skip_layer: int = 4,  # The layer to add skip layers to.
        hidden_init: Callable = nn.init.xavier_uniform_,
        hidden_activation: Callable = nn.ReLU(),
        output_enabled: bool = True,
        output_init: Optional[Callable] = nn.init.xavier_uniform_,
        output_activation: Optional[Callable] = nn.Identity(),
        bias_enabled: bool = True,
        bias_init: Callable = nn.init.zeros_,
    ):
        super().__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.net_depth = net_depth
        self.net_width = net_width
        self.skip_layer = skip_layer
        self.hidden_init = hidden_init
        self.hidden_activation = hidden_activation
        self.output_enabled = output_enabled
        self.output_init = output_init
        self.output_activation = output_activation
        self.bias_enabled = bias_enabled
        self.bias_init = bias_init

        # Build the hidden stack, tracking the input width of the next layer.
        fan_in = self.input_dim
        stack = []
        for idx in range(self.net_depth):
            stack.append(nn.Linear(fan_in, self.net_width, bias=bias_enabled))
            fan_in = self.net_width
            if self._reinjects_input(idx):
                fan_in += self.input_dim
        self.hidden_layers = nn.ModuleList(stack)

        if not self.output_enabled:
            # Headless network: expose the width of the last features.
            self.output_dim = fan_in
        else:
            self.output_layer = nn.Linear(
                fan_in, self.output_dim, bias=bias_enabled
            )

        self.initialize()

    def _reinjects_input(self, idx):
        """Whether the input is concatenated after hidden layer `idx`."""
        return (
            (self.skip_layer is not None)
            and (idx % self.skip_layer == 0)
            and (idx > 0)
        )

    def initialize(self):
        """(Re-)initialize the weights and biases of all linear layers."""

        def reset(module, weight_init):
            # Only linear layers carry parameters that need initializing.
            if not isinstance(module, nn.Linear):
                return
            if weight_init is not None:
                weight_init(module.weight)
            if self.bias_enabled and self.bias_init is not None:
                self.bias_init(module.bias)

        self.hidden_layers.apply(lambda m: reset(m, self.hidden_init))
        if self.output_enabled:
            self.output_layer.apply(lambda m: reset(m, self.output_init))

    def forward(self, x):
        """Run the network on `x` (channels on the last axis)."""
        features = x
        for idx, layer in enumerate(self.hidden_layers):
            features = self.hidden_activation(layer(features))
            if self._reinjects_input(idx):
                features = torch.cat([features, x], dim=-1)
        if not self.output_enabled:
            return features
        return self.output_activation(self.output_layer(features))


class DenseLayer(MLP):
    """An `MLP` without hidden layers; extra keyword arguments go to `MLP`."""

    def __init__(self, input_dim, output_dim, **kwargs):
        # net_depth=0 means no hidden layers are built.
        super().__init__(input_dim, output_dim, net_depth=0, **kwargs)


class NerfMLP(nn.Module):
    """Shared trunk with a density head and an (optionally conditioned) color head.

    With `condition_dim > 0` the color head consumes a bottleneck projection
    of the trunk features concatenated with the condition; otherwise it is a
    single dense layer on the trunk features.
    """

    def __init__(
        self,
        input_dim: int,  # The number of input tensor channels.
        condition_dim: int,  # The number of condition tensor channels.
        net_depth: int = 8,  # The depth of the MLP.
        net_width: int = 256,  # The width of the MLP.
        skip_layer: int = 4,  # The layer to add skip layers to.
        net_depth_condition: int = 1,  # The depth of the second part of MLP.
        net_width_condition: int = 128,  # The width of the second part of MLP.
    ):
        super().__init__()
        # Trunk without an output layer; its feature width feeds both heads.
        self.base = MLP(
            input_dim=input_dim,
            net_depth=net_depth,
            net_width=net_width,
            skip_layer=skip_layer,
            output_enabled=False,
        )
        trunk_width = self.base.output_dim
        self.sigma_layer = DenseLayer(trunk_width, 1)

        if condition_dim <= 0:
            self.rgb_layer = DenseLayer(trunk_width, 3)
        else:
            self.bottleneck_layer = DenseLayer(trunk_width, net_width)
            self.rgb_layer = MLP(
                input_dim=net_width + condition_dim,
                output_dim=3,
                net_depth=net_depth_condition,
                net_width=net_width_condition,
                skip_layer=None,
            )

    def query_density(self, x):
        """Return the raw (un-activated) density for `x`."""
        return self.sigma_layer(self.base(x))

    def forward(self, x, condition=None):
        """Return `(raw_rgb, raw_sigma)` for `x`, optionally conditioned."""
        features = self.base(x)
        raw_sigma = self.sigma_layer(features)

        if condition is None:
            return self.rgb_layer(features), raw_sigma

        if condition.shape[:-1] != features.shape[:-1]:
            # Insert singleton axes after the first one and expand so the
            # condition lines up with the leading axes of the features.
            num_rays, n_dim = condition.shape
            extra_axes = [1] * (features.dim() - condition.dim())
            condition = condition.view(
                [num_rays] + extra_axes + [n_dim]
            ).expand(list(features.shape[:-1]) + [n_dim])

        bottleneck = self.bottleneck_layer(features)
        rgb_input = torch.cat([bottleneck, condition], dim=-1)
        return self.rgb_layer(rgb_input), raw_sigma


class SinusoidalEncoder(nn.Module):
    """Positional encoding with sines at power-of-two frequencies."""

    def __init__(self, x_dim, min_deg, max_deg, use_identity: bool = True):
        super().__init__()
        self.x_dim = x_dim
        self.min_deg = min_deg
        self.max_deg = max_deg
        self.use_identity = use_identity
        # One scale 2**d per degree d in [min_deg, max_deg).
        frequencies = [2**deg for deg in range(min_deg, max_deg)]
        self.register_buffer("scales", torch.tensor(frequencies))

    @property
    def latent_dim(self) -> int:
        """Channel count of the encoding produced by `forward`."""
        num_bands = self.max_deg - self.min_deg
        per_channel = int(self.use_identity) + num_bands * 2
        return per_channel * self.x_dim

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: [..., x_dim]
        Returns:
            latent: [..., latent_dim]
        """
        # No frequency bands configured: the input is returned untouched.
        if self.max_deg == self.min_deg:
            return x

        num_bands = self.max_deg - self.min_deg
        scaled = x[..., None, :] * self.scales[:, None]
        flat_shape = list(x.shape[:-1]) + [num_bands * self.x_dim]
        xb = torch.reshape(scaled, flat_shape)
        # The phase-shifted second half equals the cosine of the first half.
        latent = torch.sin(torch.cat([xb, xb + 0.5 * math.pi], dim=-1))
        if not self.use_identity:
            return latent
        return torch.cat([x, latent], dim=-1)


class VanillaNeRFRadianceField(nn.Module):
    """Radiance field built from sinusoidal encoders and a ``NerfMLP``.

    Positions are encoded with degrees ``[0, 10)`` and the optional
    condition with degrees ``[0, 4)`` before being fed to the MLP.
    """

    def __init__(
        self,
        net_depth: int = 8,  # The depth of the MLP.
        net_width: int = 256,  # The width of the MLP.
        skip_layer: int = 4,  # The layer to add skip layers to.
        net_depth_condition: int = 1,  # The depth of the second part of MLP.
        net_width_condition: int = 128,  # The width of the second part of MLP.
    ) -> None:
        super().__init__()
        self.posi_encoder = SinusoidalEncoder(3, 0, 10, True)
        self.view_encoder = SinusoidalEncoder(3, 0, 4, True)
        # The MLP input sizes are taken from the encoders' output sizes.
        mlp_config = {
            "input_dim": self.posi_encoder.latent_dim,
            "condition_dim": self.view_encoder.latent_dim,
            "net_depth": net_depth,
            "net_width": net_width,
            "skip_layer": skip_layer,
            "net_depth_condition": net_depth_condition,
            "net_width_condition": net_width_condition,
        }
        self.mlp = NerfMLP(**mlp_config)

    def query_opacity(self, x, step_size):
        """Return ``density * step_size`` for positions ``x``.

        This is the linear form of ``1 - exp(-density * step_size)``; the two
        agree when the density is small enough.
        """
        return self.query_density(x) * step_size

    def query_density(self, x):
        """Return the ReLU-activated density for positions ``x``."""
        encoded = self.posi_encoder(x)
        return F.relu(self.mlp.query_density(encoded))

    def forward(self, x, condition=None):
        """Return ``(rgb, sigma)``: sigmoid colour and ReLU density.

        ``condition`` is encoded with the view encoder only when provided;
        otherwise ``None`` is forwarded to the MLP.
        """
        encoded_x = self.posi_encoder(x)
        encoded_condition = (
            None if condition is None else self.view_encoder(condition)
        )
        raw_rgb, raw_sigma = self.mlp(encoded_x, condition=encoded_condition)
        return torch.sigmoid(raw_rgb), F.relu(raw_sigma)


class DNeRFRadianceField(nn.Module):
    """Time-dependent radiance field.

    A small MLP predicts a 3-vector offset from the encoded position and
    encoded time; the offset is added to the position, and the shifted
    position is evaluated by a ``VanillaNeRFRadianceField``.
    """

    def __init__(self) -> None:
        super().__init__()
        self.posi_encoder = SinusoidalEncoder(3, 0, 4, True)
        self.time_encoder = SinusoidalEncoder(1, 0, 4, True)
        # The warp network consumes the concatenated position/time encodings.
        warp_input_dim = (
            self.posi_encoder.latent_dim + self.time_encoder.latent_dim
        )
        self.warp = MLP(
            input_dim=warp_input_dim,
            output_dim=3,
            net_depth=4,
            net_width=64,
            skip_layer=2,
            output_init=functools.partial(torch.nn.init.uniform_, b=1e-4),
        )
        self.nerf = VanillaNeRFRadianceField()

    def query_opacity(self, x, timestamps, step_size):
        """Return ``density * step_size`` at a random timestamp per point.

        One entry of ``timestamps`` is drawn uniformly (with replacement) for
        every row of ``x``. The product is the linear form of
        ``1 - exp(-density * step_size)``, which it matches for small density.
        """
        picks = torch.randint(
            0, len(timestamps), (x.shape[0],), device=x.device
        )
        return self.query_density(x, timestamps[picks]) * step_size

    def query_density(self, x, t):
        """Return the density at positions ``x`` shifted by the warp for ``t``."""
        warp_input = torch.cat(
            [self.posi_encoder(x), self.time_encoder(t)], dim=-1
        )
        shifted = x + self.warp(warp_input)
        return self.nerf.query_density(shifted)

    def forward(self, x, t, condition=None):
        """Evaluate the inner radiance field at the warped positions."""
        warp_input = torch.cat(
            [self.posi_encoder(x), self.time_encoder(t)], dim=-1
        )
        shifted = x + self.warp(warp_input)
        return self.nerf(shifted, condition=condition)


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/examples/radiance_fields/ngp.py
================================================
"""
Copyright (c) 2022 Ruilong Li, UC Berkeley.
"""

from typing import Callable, List, Union

import torch
from torch.autograd import Function
from torch.cuda.amp import custom_bwd, custom_fwd

try:
    import tinycudann as tcnn
except ImportError as e:
    # tiny-cuda-nn backs every network defined in this module, so there is
    # nothing useful to do without it: print install instructions and stop.
    print(
        f"Error: {e}! "
        "Please install tinycudann by: "
        "pip install git+https://github.com/NVlabs/tiny-cuda-nn/#subdirectory=bindings/torch"
    )
    # Raise SystemExit directly rather than calling the ``site``-provided
    # ``exit()`` helper (meant for interactive use), and use a non-zero
    # status so shells and job runners can see that the import failed.
    raise SystemExit(1) from e


class _TruncExp(Function):  # pylint: disable=abstract-method
    """Exponential whose gradient is computed from a clamped input.

    The forward pass is a plain ``exp``. The backward pass evaluates
    ``exp`` on the saved input clamped to at most 15, which bounds the
    gradient factor for large inputs.
    """

    # Implementation from torch-ngp:
    # https://github.com/ashawkey/torch-ngp/blob/93b08a0d4ec1cc6e69d85df7f0acdfb99603b628/activation.py
    @staticmethod
    @custom_fwd(cast_inputs=torch.float32)
    def forward(ctx, x):  # pylint: disable=arguments-differ
        # Keep the input around; backward recomputes the exponential from it.
        ctx.save_for_backward(x)
        return x.exp()

    @staticmethod
    @custom_bwd
    def backward(ctx, g):  # pylint: disable=arguments-differ
        (saved_input,) = ctx.saved_tensors
        clamped_input = saved_input.clamp(max=15)
        return g * clamped_input.exp()


# Functional entry point used as an activation.
trunc_exp = _TruncExp.apply


def contract_to_unisphere(
    x: torch.Tensor,
    aabb: torch.Tensor,
    eps: float = 1e-6,
    derivative: bool = False,
):
    """Contract points into the unit cube ``[0, 1]`` around an aabb.

    ``x`` is first normalised so that the aabb spans ``[-1, 1]`` on every
    axis. Points whose normalised norm ``mag`` is at most 1 are kept; points
    further out are rescaled to norm ``2 - 1 / mag`` (always below 2). The
    result is finally mapped from ``[-2, 2]`` to ``[0, 1]``.

    Args:
        x: points, ``[..., D]``.
        aabb: ``[..., 2 * D]`` tensor laid out as ``(min..., max...)``.
        eps: lower bound applied to the returned derivative
            (only used when ``derivative`` is true).
        derivative: when true, return the per-coordinate derivative term
            instead of the contracted points.

    Returns:
        The contracted points, or the clamped derivative term, with the
        same shape as ``x``. The caller's ``x`` is not modified.
    """
    # Split on half of the last dimension instead of a literal 3 so the
    # function agrees with NGPradianceField, which splits by ``num_dim``.
    # For the usual 6-element aabb this is the same split as before.
    aabb_min, aabb_max = torch.split(aabb, aabb.shape[-1] // 2, dim=-1)
    x = (x - aabb_min) / (aabb_max - aabb_min)
    x = x * 2 - 1  # aabb is at [-1, 1]
    mag = x.norm(dim=-1, keepdim=True)
    mask = mag.squeeze(-1) > 1  # points outside the unit ball

    if derivative:
        dev = (2 * mag - 1) / mag**2 + 2 * x**2 * (
            1 / mag**3 - (2 * mag - 1) / mag**4
        )
        # Inside the unit ball the mapping is the identity; this also
        # overwrites the inf/nan produced by ``mag == 0``.
        dev[~mask] = 1.0
        return torch.clamp(dev, min=eps)

    # ``x`` is a fresh tensor produced by the arithmetic above, so writing
    # into it does not touch the caller's input.
    x[mask] = (2 - 1 / mag[mask]) * (x[mask] / mag[mask])
    return x / 4 + 0.5  # [-2, 2] is at [0, 1]


class NGPradianceField(torch.nn.Module):
    """Instance-NGP radiance Field

    A hash-grid encoded base network predicts one density value plus
    ``geo_feat_dim`` feature channels per point. A head network turns those
    features (optionally concatenated with a spherical-harmonics encoding of
    the view direction) into a 3-channel, sigmoid-activated colour.

    Args:
        aabb: scene bounds as a tensor or list laid out ``(min..., max...)``.
        num_dim: number of position dimensions.
        use_viewdirs: whether colour is conditioned on view directions.
        density_activation: callable applied to the raw density output.
        unbounded: when true, positions are contracted with
            ``contract_to_unisphere``; otherwise they are linearly
            normalised by ``aabb``.
        geo_feat_dim: number of feature channels passed from the base
            network to the colour head. The head is only created when
            this is positive.
        n_levels: number of hash-grid levels.
        log2_hashmap_size: log2 of the hash-table size per level.
    """

    def __init__(
        self,
        aabb: Union[torch.Tensor, List[float]],
        num_dim: int = 3,
        use_viewdirs: bool = True,
        density_activation: Callable = lambda x: trunc_exp(x - 1),
        unbounded: bool = False,
        geo_feat_dim: int = 15,
        n_levels: int = 16,
        log2_hashmap_size: int = 19,
    ) -> None:
        super().__init__()
        if not isinstance(aabb, torch.Tensor):
            aabb = torch.tensor(aabb, dtype=torch.float32)
        # Registered as a buffer so it moves with the module across devices.
        self.register_buffer("aabb", aabb)
        self.num_dim = num_dim
        self.use_viewdirs = use_viewdirs
        self.density_activation = density_activation
        self.unbounded = unbounded

        self.geo_feat_dim = geo_feat_dim
        per_level_scale = 1.4472692012786865

        if self.use_viewdirs:
            self.direction_encoding = tcnn.Encoding(
                n_input_dims=num_dim,
                encoding_config={
                    "otype": "Composite",
                    "nested": [
                        {
                            "n_dims_to_encode": 3,
                            "otype": "SphericalHarmonics",
                            "degree": 4,
                        },
                        # {"otype": "Identity", "n_bins": 4, "degree": 4},
                    ],
                },
            )

        # Base network: hash-grid encoding followed by a small fused MLP that
        # outputs [density, geo_feat_dim features].
        self.mlp_base = tcnn.NetworkWithInputEncoding(
            n_input_dims=num_dim,
            n_output_dims=1 + self.geo_feat_dim,
            encoding_config={
                "otype": "HashGrid",
                "n_levels": n_levels,
                "n_features_per_level": 2,
                "log2_hashmap_size": log2_hashmap_size,
                "base_resolution": 16,
                "per_level_scale": per_level_scale,
            },
            network_config={
                "otype": "FullyFusedMLP",
                "activation": "ReLU",
                "output_activation": "None",
                "n_neurons": 64,
                "n_hidden_layers": 1,
            },
        )
        if self.geo_feat_dim > 0:
            # Colour head: consumes the geometry features and, when view
            # directions are used, their encoding as well.
            self.mlp_head = tcnn.Network(
                n_input_dims=(
                    (
                        self.direction_encoding.n_output_dims
                        if self.use_viewdirs
                        else 0
                    )
                    + self.geo_feat_dim
                ),
                n_output_dims=3,
                network_config={
                    "otype": "FullyFusedMLP",
                    "activation": "ReLU",
                    "output_activation": "Sigmoid",
                    "n_neurons": 64,
                    "n_hidden_layers": 2,
                },
            )

    def query_density(self, x, return_feat: bool = False):
        """Return the activated density at positions ``x``.

        Args:
            x: positions, ``[..., num_dim]``.
            return_feat: when true, also return the base network's feature
                channels.

        Returns:
            ``density`` of shape ``[..., 1]``, or ``(density, features)``
            when ``return_feat`` is true. Density is forced to zero for
            points whose normalised coordinates are not strictly inside
            ``(0, 1)`` on every axis.
        """
        if self.unbounded:
            x = contract_to_unisphere(x, self.aabb)
        else:
            aabb_min, aabb_max = torch.split(self.aabb, self.num_dim, dim=-1)
            x = (x - aabb_min) / (aabb_max - aabb_min)
        selector = ((x > 0.0) & (x < 1.0)).all(dim=-1)
        # The network runs on a flat [N, num_dim] batch; its output is then
        # reshaped back and cast to the dtype/device of the positions.
        x = (
            self.mlp_base(x.view(-1, self.num_dim))
            .view(list(x.shape[:-1]) + [1 + self.geo_feat_dim])
            .to(x)
        )
        density_before_activation, base_mlp_out = torch.split(
            x, [1, self.geo_feat_dim], dim=-1
        )
        density = (
            self.density_activation(density_before_activation)
            * selector[..., None]
        )
        if return_feat:
            return density, base_mlp_out
        else:
            return density

    def _query_rgb(self, dir, embedding):
        """Return the colour for feature ``embedding`` (and ``dir`` if used).

        ``dir`` is ignored when ``use_viewdirs`` is false. The result has the
        leading shape of ``embedding`` and 3 channels.
        """
        # tcnn requires directions in the range [0, 1]
        if self.use_viewdirs:
            dir = (dir + 1.0) / 2.0
            d = self.direction_encoding(dir.view(-1, dir.shape[-1]))
            h = torch.cat([d, embedding.view(-1, self.geo_feat_dim)], dim=-1)
        else:
            h = embedding.view(-1, self.geo_feat_dim)
        rgb = (
            self.mlp_head(h)
            .view(list(embedding.shape[:-1]) + [3])
            .to(embedding)
        )
        return rgb

    def forward(
        self,
        positions: torch.Tensor,
        directions: torch.Tensor = None,
    ):
        """Return ``(rgb, density)`` for the given sample positions.

        Args:
            positions: sample positions, ``[..., num_dim]``.
            directions: view directions with the same shape as
                ``positions``. Required when ``use_viewdirs`` is true and
                ignored otherwise.

        Raises:
            ValueError: ``use_viewdirs`` is true but ``directions`` is None.
            AssertionError: ``positions`` and ``directions`` shapes differ.
        """
        if self.use_viewdirs:
            if directions is None:
                raise ValueError(
                    "directions must be provided when use_viewdirs is True"
                )
            assert (
                positions.shape == directions.shape
            ), f"{positions.shape} v.s. {directions.shape}"
        # Previously density/rgb were only computed inside the view-direction
        # branch, so a field built with use_viewdirs=False failed with an
        # unbound-variable error. Both paths are now evaluated here.
        density, embedding = self.query_density(positions, return_feat=True)
        rgb = self._query_rgb(directions, embedding=embedding)
        return rgb, density


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/examples/requirements.txt
================================================
git+https://github.com/NVlabs/tiny-cuda-nn/#subdirectory=bindings/torch
opencv-python
imageio
numpy
tqdm
scipy

================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/examples/train_mlp_dnerf.py
================================================
"""
Copyright (c) 2022 Ruilong Li, UC Berkeley.
"""

import argparse
import math
import os
import time

import imageio
import numpy as np
import torch
import torch.nn.functional as F
import tqdm
from datasets.dnerf_synthetic import SubjectLoader
from radiance_fields.mlp import DNeRFRadianceField
from utils import render_image, set_random_seed

from nerfacc import ContractionType, OccupancyGrid

if __name__ == "__main__":
    # Script entry point: parse the CLI options, set up the scene bounds,
    # model, datasets and occupancy grid, then train for ``max_steps`` steps
    # and run one evaluation pass over the test split before exiting.

    device = "cuda:0"
    set_random_seed(42)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--train_split",
        type=str,
        default="train",
        choices=["train"],
        help="which train split to use",
    )
    parser.add_argument(
        "--scene",
        type=str,
        default="lego",
        choices=[
            # dnerf
            "bouncingballs",
            "hellwarrior",
            "hook",
            "jumpingjacks",
            "lego",
            "mutant",
            "standup",
            "trex",
        ],
        help="which scene to use",
    )
    # argparse runs ``type`` on string defaults too, so the default below is
    # parsed into a list of floats just like a user-supplied value.
    parser.add_argument(
        "--aabb",
        type=lambda s: [float(item) for item in s.split(",")],
        default="-1.5,-1.5,-1.5,1.5,1.5,1.5",
        help="delimited list input",
    )
    parser.add_argument(
        "--test_chunk_size",
        type=int,
        default=8192,
    )
    parser.add_argument("--cone_angle", type=float, default=0.0)
    # The dataset location used to be hard-coded; the default keeps the
    # previous path so existing invocations behave the same.
    parser.add_argument(
        "--data_root",
        type=str,
        default="/home/ruilongli/data/dnerf/",
        help="root directory of the dnerf dataset",
    )
    args = parser.parse_args()

    render_n_samples = 1024

    # setup the scene bounding box.
    contraction_type = ContractionType.AABB
    scene_aabb = torch.tensor(args.aabb, dtype=torch.float32, device=device)
    near_plane = None
    far_plane = None
    # Step size: longest aabb edge times sqrt(3), divided by the sample count.
    render_step_size = (
        (scene_aabb[3:] - scene_aabb[:3]).max()
        * math.sqrt(3)
        / render_n_samples
    ).item()

    # setup the radiance field we want to train.
    max_steps = 30000
    grad_scaler = torch.cuda.amp.GradScaler(1)
    radiance_field = DNeRFRadianceField().to(device)
    optimizer = torch.optim.Adam(radiance_field.parameters(), lr=5e-4)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer,
        milestones=[
            max_steps // 2,
            max_steps * 3 // 4,
            max_steps * 5 // 6,
            max_steps * 9 // 10,
        ],
        gamma=0.33,
    )
    # setup the dataset
    data_root_fp = args.data_root
    target_sample_batch_size = 1 << 16
    grid_resolution = 128

    train_dataset = SubjectLoader(
        subject_id=args.scene,
        root_fp=data_root_fp,
        split=args.train_split,
        num_rays=target_sample_batch_size // render_n_samples,
    )
    train_dataset.images = train_dataset.images.to(device)
    train_dataset.camtoworlds = train_dataset.camtoworlds.to(device)
    train_dataset.K = train_dataset.K.to(device)
    train_dataset.timestamps = train_dataset.timestamps.to(device)

    test_dataset = SubjectLoader(
        subject_id=args.scene,
        root_fp=data_root_fp,
        split="test",
        num_rays=None,
    )
    test_dataset.images = test_dataset.images.to(device)
    test_dataset.camtoworlds = test_dataset.camtoworlds.to(device)
    test_dataset.K = test_dataset.K.to(device)
    test_dataset.timestamps = test_dataset.timestamps.to(device)

    occupancy_grid = OccupancyGrid(
        roi_aabb=args.aabb,
        resolution=grid_resolution,
        contraction_type=contraction_type,
    ).to(device)

    # training
    step = 0
    tic = time.time()
    for epoch in range(10000000):
        for i in range(len(train_dataset)):
            radiance_field.train()
            data = train_dataset[i]

            render_bkgd = data["color_bkgd"]
            rays = data["rays"]
            pixels = data["pixels"]
            timestamps = data["timestamps"]

            # update occupancy grid
            occupancy_grid.every_n_step(
                step=step,
                occ_eval_fn=lambda x: radiance_field.query_opacity(
                    x, timestamps, render_step_size
                ),
            )

            # render
            rgb, acc, depth, n_rendering_samples = render_image(
                radiance_field,
                occupancy_grid,
                rays,
                scene_aabb,
                # rendering options
                near_plane=near_plane,
                far_plane=far_plane,
                render_step_size=render_step_size,
                render_bkgd=render_bkgd,
                cone_angle=args.cone_angle,
                alpha_thre=0.01 if step > 1000 else 0.00,
                # dnerf options
                timestamps=timestamps,
            )
            # Nothing was sampled for this batch: skip it without counting
            # it as a training step.
            if n_rendering_samples == 0:
                continue

            # dynamic batch size for rays to keep sample batch size constant.
            num_rays = len(pixels)
            num_rays = int(
                num_rays
                * (target_sample_batch_size / float(n_rendering_samples))
            )
            train_dataset.update_num_rays(num_rays)
            alive_ray_mask = acc.squeeze(-1) > 0

            # compute loss
            loss = F.smooth_l1_loss(rgb[alive_ray_mask], pixels[alive_ray_mask])

            optimizer.zero_grad()
            # do not unscale it because we are using Adam.
            grad_scaler.scale(loss).backward()
            optimizer.step()
            scheduler.step()

            if step % 5000 == 0:
                elapsed_time = time.time() - tic
                # The logged value is the MSE over alive rays, not the
                # smooth-L1 value that was optimised above.
                loss = F.mse_loss(rgb[alive_ray_mask], pixels[alive_ray_mask])
                print(
                    f"elapsed_time={elapsed_time:.2f}s | step={step} | "
                    f"loss={loss:.5f} | "
                    f"alive_ray_mask={alive_ray_mask.long().sum():d} | "
                    f"n_rendering_samples={n_rendering_samples:d} | num_rays={len(pixels):d} |"
                )

            if step > 0 and step % max_steps == 0:
                # evaluation
                radiance_field.eval()

                psnrs = []
                with torch.no_grad():
                    # Dedicated names keep the evaluation from overwriting
                    # the training-loop variables (i, data, rays, ...).
                    for test_idx in tqdm.tqdm(range(len(test_dataset))):
                        test_data = test_dataset[test_idx]
                        test_bkgd = test_data["color_bkgd"]
                        test_rays = test_data["rays"]
                        test_pixels = test_data["pixels"]
                        test_timestamps = test_data["timestamps"]

                        # rendering
                        test_rgb, test_acc, test_depth, _ = render_image(
                            radiance_field,
                            occupancy_grid,
                            test_rays,
                            scene_aabb,
                            # rendering options (same planes as training;
                            # both are None in this script)
                            near_plane=near_plane,
                            far_plane=far_plane,
                            render_step_size=render_step_size,
                            render_bkgd=test_bkgd,
                            cone_angle=args.cone_angle,
                            alpha_thre=0.01,
                            # test options
                            test_chunk_size=args.test_chunk_size,
                            # dnerf options
                            timestamps=test_timestamps,
                        )
                        mse = F.mse_loss(test_rgb, test_pixels)
                        psnr = -10.0 * torch.log(mse) / np.log(10.0)
                        psnrs.append(psnr.item())
                        # imageio.imwrite(
                        #     "acc_binary_test.png",
                        #     ((acc > 0).float().cpu().numpy() * 255).astype(np.uint8),
                        # )
                        # imageio.imwrite(
                        #     "rgb_test.png",
                        #     (rgb.cpu().numpy() * 255).astype(np.uint8),
                        # )
                        # break
                psnr_avg = sum(psnrs) / len(psnrs)
                print(f"evaluation: psnr_avg={psnr_avg}")
                train_dataset.training = True

            if step == max_steps:
                print("training stops")
                # Equivalent to the previous ``exit()`` (status 0) without
                # relying on the ``site``-provided helper.
                raise SystemExit(0)

            step += 1


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/examples/train_mlp_nerf.py
================================================
"""
Copyright (c) 2022 Ruilong Li, UC Berkeley.
"""

import argparse
import math
import os
import time

import imageio
import numpy as np
import torch
import torch.nn.functional as F
import tqdm
from radiance_fields.mlp import VanillaNeRFRadianceField
from utils import render_image, set_random_seed

from nerfacc import ContractionType, OccupancyGrid

if __name__ == "__main__":
    # Script entry point: parse the CLI options, set up the scene bounds,
    # model, datasets and occupancy grid, then train for ``max_steps`` steps
    # and run one evaluation pass over the test split before exiting.

    device = "cuda:0"
    set_random_seed(42)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--train_split",
        type=str,
        default="trainval",
        choices=["train", "trainval"],
        help="which train split to use",
    )
    parser.add_argument(
        "--scene",
        type=str,
        default="lego",
        choices=[
            # nerf synthetic
            "chair",
            "drums",
            "ficus",
            "hotdog",
            "lego",
            "materials",
            "mic",
            "ship",
            # mipnerf360 unbounded
            "garden",
        ],
        help="which scene to use",
    )
    # argparse runs ``type`` on string defaults too, so the default below is
    # parsed into a list of floats just like a user-supplied value.
    parser.add_argument(
        "--aabb",
        type=lambda s: [float(item) for item in s.split(",")],
        default="-1.5,-1.5,-1.5,1.5,1.5,1.5",
        help="delimited list input",
    )
    parser.add_argument(
        "--test_chunk_size",
        type=int,
        default=8192,
    )
    parser.add_argument(
        "--unbounded",
        action="store_true",
        help="whether to use unbounded rendering",
    )
    parser.add_argument("--cone_angle", type=float, default=0.0)
    # The dataset locations used to be hard-coded. When this option is left
    # unset the previous per-dataset paths are still used.
    parser.add_argument(
        "--data_root",
        type=str,
        default=None,
        help="root directory of the dataset (defaults to the built-in path)",
    )
    args = parser.parse_args()

    render_n_samples = 1024

    # setup the scene bounding box.
    if args.unbounded:
        print("Using unbounded rendering")
        contraction_type = ContractionType.UN_BOUNDED_SPHERE
        # contraction_type = ContractionType.UN_BOUNDED_TANH
        scene_aabb = None
        near_plane = 0.2
        far_plane = 1e4
        render_step_size = 1e-2
    else:
        contraction_type = ContractionType.AABB
        scene_aabb = torch.tensor(args.aabb, dtype=torch.float32, device=device)
        near_plane = None
        far_plane = None
        # Step size: longest aabb edge times sqrt(3), divided by the sample
        # count.
        render_step_size = (
            (scene_aabb[3:] - scene_aabb[:3]).max()
            * math.sqrt(3)
            / render_n_samples
        ).item()

    # setup the radiance field we want to train.
    max_steps = 50000
    grad_scaler = torch.cuda.amp.GradScaler(1)
    radiance_field = VanillaNeRFRadianceField().to(device)
    optimizer = torch.optim.Adam(radiance_field.parameters(), lr=5e-4)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer,
        milestones=[
            max_steps // 2,
            max_steps * 3 // 4,
            max_steps * 5 // 6,
            max_steps * 9 // 10,
        ],
        gamma=0.33,
    )

    # setup the dataset
    train_dataset_kwargs = {}
    test_dataset_kwargs = {}
    if args.scene == "garden":
        from datasets.nerf_360_v2 import SubjectLoader

        default_data_root = "/home/ruilongli/data/360_v2/"
        target_sample_batch_size = 1 << 16
        train_dataset_kwargs = {"color_bkgd_aug": "random", "factor": 4}
        test_dataset_kwargs = {"factor": 4}
        grid_resolution = 128
    else:
        from datasets.nerf_synthetic import SubjectLoader

        default_data_root = "/home/ruilongli/data/nerf_synthetic/"
        target_sample_batch_size = 1 << 16
        grid_resolution = 128
    data_root_fp = (
        args.data_root if args.data_root is not None else default_data_root
    )

    train_dataset = SubjectLoader(
        subject_id=args.scene,
        root_fp=data_root_fp,
        split=args.train_split,
        num_rays=target_sample_batch_size // render_n_samples,
        **train_dataset_kwargs,
    )

    train_dataset.images = train_dataset.images.to(device)
    train_dataset.camtoworlds = train_dataset.camtoworlds.to(device)
    train_dataset.K = train_dataset.K.to(device)

    test_dataset = SubjectLoader(
        subject_id=args.scene,
        root_fp=data_root_fp,
        split="test",
        num_rays=None,
        **test_dataset_kwargs,
    )
    test_dataset.images = test_dataset.images.to(device)
    test_dataset.camtoworlds = test_dataset.camtoworlds.to(device)
    test_dataset.K = test_dataset.K.to(device)

    occupancy_grid = OccupancyGrid(
        roi_aabb=args.aabb,
        resolution=grid_resolution,
        contraction_type=contraction_type,
    ).to(device)

    # training
    step = 0
    tic = time.time()
    for epoch in range(10000000):
        for i in range(len(train_dataset)):
            radiance_field.train()
            data = train_dataset[i]

            render_bkgd = data["color_bkgd"]
            rays = data["rays"]
            pixels = data["pixels"]

            # update occupancy grid
            occupancy_grid.every_n_step(
                step=step,
                occ_eval_fn=lambda x: radiance_field.query_opacity(
                    x, render_step_size
                ),
            )

            # render
            rgb, acc, depth, n_rendering_samples = render_image(
                radiance_field,
                occupancy_grid,
                rays,
                scene_aabb,
                # rendering options
                near_plane=near_plane,
                far_plane=far_plane,
                render_step_size=render_step_size,
                render_bkgd=render_bkgd,
                cone_angle=args.cone_angle,
            )
            # Nothing was sampled for this batch: skip it without counting
            # it as a training step.
            if n_rendering_samples == 0:
                continue

            # dynamic batch size for rays to keep sample batch size constant.
            num_rays = len(pixels)
            num_rays = int(
                num_rays
                * (target_sample_batch_size / float(n_rendering_samples))
            )
            train_dataset.update_num_rays(num_rays)
            alive_ray_mask = acc.squeeze(-1) > 0

            # compute loss
            loss = F.smooth_l1_loss(rgb[alive_ray_mask], pixels[alive_ray_mask])

            optimizer.zero_grad()
            # do not unscale it because we are using Adam.
            grad_scaler.scale(loss).backward()
            optimizer.step()
            scheduler.step()

            if step % 5000 == 0:
                elapsed_time = time.time() - tic
                # The logged value is the MSE over alive rays, not the
                # smooth-L1 value that was optimised above.
                loss = F.mse_loss(rgb[alive_ray_mask], pixels[alive_ray_mask])
                print(
                    f"elapsed_time={elapsed_time:.2f}s | step={step} | "
                    f"loss={loss:.5f} | "
                    f"alive_ray_mask={alive_ray_mask.long().sum():d} | "
                    f"n_rendering_samples={n_rendering_samples:d} | num_rays={len(pixels):d} |"
                )

            if step > 0 and step % max_steps == 0:
                # evaluation
                radiance_field.eval()

                psnrs = []
                with torch.no_grad():
                    # Dedicated names keep the evaluation from overwriting
                    # the training-loop variables (i, data, rays, ...).
                    for test_idx in tqdm.tqdm(range(len(test_dataset))):
                        test_data = test_dataset[test_idx]
                        test_bkgd = test_data["color_bkgd"]
                        test_rays = test_data["rays"]
                        test_pixels = test_data["pixels"]

                        # rendering
                        test_rgb, test_acc, test_depth, _ = render_image(
                            radiance_field,
                            occupancy_grid,
                            test_rays,
                            scene_aabb,
                            # rendering options: use the same near/far planes
                            # as training. They were previously forced to
                            # None here, which in --unbounded mode rendered
                            # the test views with different ray limits than
                            # the ones used during training.
                            near_plane=near_plane,
                            far_plane=far_plane,
                            render_step_size=render_step_size,
                            render_bkgd=test_bkgd,
                            cone_angle=args.cone_angle,
                            # test options
                            test_chunk_size=args.test_chunk_size,
                        )
                        mse = F.mse_loss(test_rgb, test_pixels)
                        psnr = -10.0 * torch.log(mse) / np.log(10.0)
                        psnrs.append(psnr.item())
                        # imageio.imwrite(
                        #     "acc_binary_test.png",
                        #     ((acc > 0).float().cpu().numpy() * 255).astype(np.uint8),
                        # )
                        # imageio.imwrite(
                        #     "rgb_test.png",
                        #     (rgb.cpu().numpy() * 255).astype(np.uint8),
                        # )
                        # break
                psnr_avg = sum(psnrs) / len(psnrs)
                print(f"evaluation: psnr_avg={psnr_avg}")
                train_dataset.training = True

            if step == max_steps:
                print("training stops")
                # Equivalent to the previous ``exit()`` (status 0) without
                # relying on the ``site``-provided helper.
                raise SystemExit(0)

            step += 1


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/examples/train_ngp_nerf.py
================================================
"""
Copyright (c) 2022 Ruilong Li, UC Berkeley.
"""

import argparse
import math
import os
import time

import imageio
import numpy as np
import torch
import torch.nn.functional as F
import tqdm
from radiance_fields.ngp import NGPradianceField
from utils import render_image, set_random_seed

from nerfacc import ContractionType, OccupancyGrid

if __name__ == "__main__":

    device = "cuda:0"
    set_random_seed(42)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--train_split",
        type=str,
        default="trainval",
        choices=["train", "trainval"],
        help="which train split to use",
    )
    parser.add_argument(
        "--scene",
        type=str,
        default="lego",
        choices=[
            # nerf synthetic
            "chair",
            "drums",
            "ficus",
            "hotdog",
            "lego",
            "materials",
            "mic",
            "ship",
            # mipnerf360 unbounded
            "garden",
            "bicycle",
            "bonsai",
            "counter",
            "kitchen",
            "room",
            "stump",
        ],
        help="which scene to use",
    )
    parser.add_argument(
        "--aabb",
        type=lambda s: [float(item) for item in s.split(",")],
        default="-1.5,-1.5,-1.5,1.5,1.5,1.5",
        help="delimited list input",
    )
    parser.add_argument(
        "--test_chunk_size",
        type=int,
        default=8192,
    )
    parser.add_argument(
        "--unbounded",
        action="store_true",
        help="whether to use unbounded rendering",
    )
    parser.add_argument(
        "--auto_aabb",
        action="store_true",
        help="whether to automatically compute the aabb",
    )
    parser.add_argument("--cone_angle", type=float, default=0.0)
    args = parser.parse_args()

    render_n_samples = 1024

    # setup the dataset
    train_dataset_kwargs = {}
    test_dataset_kwargs = {}
    if args.unbounded:
        from datasets.nerf_360_v2 import SubjectLoader

        data_root_fp = "/home/ruilongli/data/360_v2/"
        target_sample_batch_size = 1 << 20
        train_dataset_kwargs = {"color_bkgd_aug": "random", "factor": 4}
        test_dataset_kwargs = {"factor": 4}
        grid_resolution = 256
    else:
        from datasets.nerf_synthetic import SubjectLoader

        data_root_fp = "/home/ruilongli/data/nerf_synthetic/"
        target_sample_batch_size = 1 << 18
        grid_resolution = 128

    train_dataset = SubjectLoader(
        subject_id=args.scene,
        root_fp=data_root_fp,
        split=args.train_split,
        num_rays=target_sample_batch_size // render_n_samples,
        **train_dataset_kwargs,
    )

    train_dataset.images = train_dataset.images.to(device)
    train_dataset.camtoworlds = train_dataset.camtoworlds.to(device)
    train_dataset.K = train_dataset.K.to(device)

    test_dataset = SubjectLoader(
        subject_id=args.scene,
        root_fp=data_root_fp,
        split="test",
        num_rays=None,
        **test_dataset_kwargs,
    )
    test_dataset.images = test_dataset.images.to(device)
    test_dataset.camtoworlds = test_dataset.camtoworlds.to(device)
    test_dataset.K = test_dataset.K.to(device)

    if args.auto_aabb:
        camera_locs = torch.cat(
            [train_dataset.camtoworlds, test_dataset.camtoworlds]
        )[:, :3, -1]
        args.aabb = torch.cat(
            [camera_locs.min(dim=0).values, camera_locs.max(dim=0).values]
        ).tolist()
        print("Using auto aabb", args.aabb)

    # setup the scene bounding box.
    if args.unbounded:
        print("Using unbounded rendering")
        contraction_type = ContractionType.UN_BOUNDED_SPHERE
        # contraction_type = ContractionType.UN_BOUNDED_TANH
        scene_aabb = None
        near_plane = 0.2
        far_plane = 1e4
        render_step_size = 1e-2
        alpha_thre = 1e-2
    else:
        contraction_type = ContractionType.AABB
        scene_aabb = torch.tensor(args.aabb, dtype=torch.float32, device=device)
        near_plane = None
        far_plane = None
        render_step_size = (
            (scene_aabb[3:] - scene_aabb[:3]).max()
            * math.sqrt(3)
            / render_n_samples
        ).item()
        alpha_thre = 0.0

    # setup the radiance field we want to train.
    max_steps = 20000
    grad_scaler = torch.cuda.amp.GradScaler(2**10)
    radiance_field = NGPradianceField(
        aabb=args.aabb,
        unbounded=args.unbounded,
    ).to(device)
    optimizer = torch.optim.Adam(
        radiance_field.parameters(), lr=1e-2, eps=1e-15
    )
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer,
        milestones=[max_steps // 2, max_steps * 3 // 4, max_steps * 9 // 10],
        gamma=0.33,
    )

    occupancy_grid = OccupancyGrid(
        roi_aabb=args.aabb,
        resolution=grid_resolution,
        contraction_type=contraction_type,
    ).to(device)

    # training
    step = 0
    tic = time.time()
    for epoch in range(10000000):
        for i in range(len(train_dataset)):
            radiance_field.train()
            data = train_dataset[i]

            render_bkgd = data["color_bkgd"]
            rays = data["rays"]
            pixels = data["pixels"]

            def occ_eval_fn(x):
                if args.cone_angle > 0.0:
                    # randomly sample a camera for computing step size.
                    camera_ids = torch.randint(
                        0, len(train_dataset), (x.shape[0],), device=device
                    )
                    origins = train_dataset.camtoworlds[camera_ids, :3, -1]
                    t = (origins - x).norm(dim=-1, keepdim=True)
                    # compute actual step size used in marching, based on the distance to the camera.
                    step_size = torch.clamp(
                        t * args.cone_angle, min=render_step_size
                    )
                    # filter out the points that are not in the near far plane.
                    if (near_plane is not None) and (far_plane is not None):
                        step_size = torch.where(
                            (t > near_plane) & (t < far_plane),
                            step_size,
                            torch.zeros_like(step_size),
                        )
                else:
                    step_size = render_step_size
                # compute occupancy
                density = radiance_field.query_density(x)
                return density * step_size

            # update occupancy grid
            occupancy_grid.every_n_step(step=step, occ_eval_fn=occ_eval_fn)

            # render
            rgb, acc, depth, n_rendering_samples = render_image(
                radiance_field,
                occupancy_grid,
                rays,
                scene_aabb,
                # rendering options
                near_plane=near_plane,
                far_plane=far_plane,
                render_step_size=render_step_size,
                render_bkgd=render_bkgd,
                cone_angle=args.cone_angle,
                alpha_thre=alpha_thre,
            )
            if n_rendering_samples == 0:
                continue

            # dynamic batch size for rays to keep sample batch size constant.
            num_rays = len(pixels)
            num_rays = int(
                num_rays
                * (target_sample_batch_size / float(n_rendering_samples))
            )
            train_dataset.update_num_rays(num_rays)
            alive_ray_mask = acc.squeeze(-1) > 0

            # compute loss
            loss = F.smooth_l1_loss(rgb[alive_ray_mask], pixels[alive_ray_mask])

            optimizer.zero_grad()
            # do not unscale it because we are using Adam.
            grad_scaler.scale(loss).backward()
            optimizer.step()
            scheduler.step()

            if step % 10000 == 0:
                elapsed_time = time.time() - tic
                loss = F.mse_loss(rgb[alive_ray_mask], pixels[alive_ray_mask])
                print(
                    f"elapsed_time={elapsed_time:.2f}s | step={step} | "
                    f"loss={loss:.5f} | "
                    f"alive_ray_mask={alive_ray_mask.long().sum():d} | "
                    f"n_rendering_samples={n_rendering_samples:d} | num_rays={len(pixels):d} |"
                )

            if step >= 0 and step % max_steps == 0 and step > 0:
                # evaluation
                radiance_field.eval()

                psnrs = []
                with torch.no_grad():
                    for i in tqdm.tqdm(range(len(test_dataset))):
                        data = test_dataset[i]
                        render_bkgd = data["color_bkgd"]
                        rays = data["rays"]
                        pixels = data["pixels"]

                        # rendering
                        rgb, acc, depth, _ = render_image(
                            radiance_field,
                            occupancy_grid,
                            rays,
                            scene_aabb,
                            # rendering options
                            near_plane=near_plane,
                            far_plane=far_plane,
                            render_step_size=render_step_size,
                            render_bkgd=render_bkgd,
                            cone_angle=args.cone_angle,
                            alpha_thre=alpha_thre,
                            # test options
                            test_chunk_size=args.test_chunk_size,
                        )
                        mse = F.mse_loss(rgb, pixels)
                        psnr = -10.0 * torch.log(mse) / np.log(10.0)
                        psnrs.append(psnr.item())
                        # imageio.imwrite(
                        #     "acc_binary_test.png",
                        #     ((acc > 0).float().cpu().numpy() * 255).astype(np.uint8),
                        # )
                        # imageio.imwrite(
                        #     "rgb_test.png",
                        #     (rgb.cpu().numpy() * 255).astype(np.uint8),
                        # )
                        # break
                psnr_avg = sum(psnrs) / len(psnrs)
                print(f"evaluation: psnr_avg={psnr_avg}")
                train_dataset.training = True

            if step == max_steps:
                print("training stops")
                exit()

            step += 1


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/examples/utils.py
================================================
"""
Copyright (c) 2022 Ruilong Li, UC Berkeley.
"""

import random
from typing import Optional

import numpy as np
import torch
from datasets.utils import Rays, namedtuple_map

from nerfacc import OccupancyGrid, ray_marching, rendering


def set_random_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Value handed unchanged to ``random.seed``, ``np.random.seed``
            and ``torch.manual_seed``, in that order.
    """
    for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
        seed_rng(seed)


def render_image(
    # scene
    radiance_field: torch.nn.Module,
    occupancy_grid: OccupancyGrid,
    rays: Rays,
    scene_aabb: torch.Tensor,
    # rendering options
    near_plane: Optional[float] = None,
    far_plane: Optional[float] = None,
    render_step_size: float = 1e-3,
    render_bkgd: Optional[torch.Tensor] = None,
    cone_angle: float = 0.0,
    alpha_thre: float = 0.0,
    # test options
    test_chunk_size: int = 8192,
    # only useful for dnerf
    timestamps: Optional[torch.Tensor] = None,
):
    """Render the pixels of an image.

    Rays are marched and volume-rendered chunk by chunk; the per-chunk
    outputs are concatenated and reshaped to the leading shape of the input
    rays.

    Args:
        radiance_field: Module exposing ``query_density(positions[, t])`` and
            callable as ``radiance_field(positions[, t], dirs)``. Its
            ``training`` flag selects stratified sampling and the chunk size.
        occupancy_grid: Passed to ``ray_marching`` as ``grid``.
        rays: Object with ``origins`` and ``viewdirs`` whose ``origins`` has
            either three axes (height, width, channels) or two
            (num_rays, channels).
        scene_aabb: Forwarded to ``ray_marching``.
        near_plane: Forwarded to ``ray_marching``.
        far_plane: Forwarded to ``ray_marching``.
        render_step_size: Forwarded to ``ray_marching``.
        render_bkgd: Forwarded to ``rendering``.
        cone_angle: Forwarded to ``ray_marching``.
        alpha_thre: Forwarded to ``ray_marching``.
        test_chunk_size: Number of rays per chunk when the field is not in
            training mode. In training mode all rays go through one chunk.
        timestamps: Optional dnerf timestamps. In training mode they are
            indexed per sample by ray index; otherwise they are expanded to
            one value per sample.

    Returns:
        Tuple ``(colors, opacities, depths, n_rendering_samples)`` where the
        three tensors are viewed as ``(*rays.origins.shape[:-1], -1)`` and the
        last item is the total number of marched samples over all chunks.
    """
    rays_shape = rays.origins.shape
    if len(rays_shape) == 3:
        # Image-shaped rays: merge the two leading axes into a single ray axis.
        height, width, _ = rays_shape
        num_rays = height * width
        rays = namedtuple_map(
            lambda r: r.reshape([num_rays] + list(r.shape[2:])), rays
        )
    else:
        num_rays, _ = rays_shape

    def _sample_points(t_starts, t_ends, ray_indices):
        # Midpoint of every marched interval plus the direction of its ray.
        # ``ray_chunk`` is rebound by the chunk loop below.
        origins = ray_chunk.origins[ray_indices]
        dirs = ray_chunk.viewdirs[ray_indices]
        return origins + dirs * (t_starts + t_ends) / 2.0, dirs

    def _sample_times(ray_indices, positions):
        # dnerf
        if radiance_field.training:
            return timestamps[ray_indices]
        return timestamps.expand_as(positions[:, :1])

    def sigma_fn(t_starts, t_ends, ray_indices):
        positions, _ = _sample_points(t_starts, t_ends, ray_indices)
        if timestamps is None:
            return radiance_field.query_density(positions)
        sample_t = _sample_times(ray_indices, positions)
        return radiance_field.query_density(positions, sample_t)

    def rgb_sigma_fn(t_starts, t_ends, ray_indices):
        positions, dirs = _sample_points(t_starts, t_ends, ray_indices)
        if timestamps is None:
            return radiance_field(positions, dirs)
        sample_t = _sample_times(ray_indices, positions)
        return radiance_field(positions, sample_t, dirs)

    # Training renders every ray in one pass; evaluation is chunked.
    if radiance_field.training:
        chunk = torch.iinfo(torch.int32).max
    else:
        chunk = test_chunk_size

    per_chunk = []
    for start in range(0, num_rays, chunk):
        ray_chunk = namedtuple_map(lambda r: r[start : start + chunk], rays)
        ray_indices, t_starts, t_ends = ray_marching(
            ray_chunk.origins,
            ray_chunk.viewdirs,
            scene_aabb=scene_aabb,
            grid=occupancy_grid,
            sigma_fn=sigma_fn,
            near_plane=near_plane,
            far_plane=far_plane,
            render_step_size=render_step_size,
            stratified=radiance_field.training,
            cone_angle=cone_angle,
            alpha_thre=alpha_thre,
        )
        rgb, opacity, depth = rendering(
            t_starts,
            t_ends,
            ray_indices,
            n_rays=ray_chunk.origins.shape[0],
            rgb_sigma_fn=rgb_sigma_fn,
            render_bkgd=render_bkgd,
        )
        per_chunk.append([rgb, opacity, depth, len(t_starts)])

    # Transpose chunk-major results into one column per output; tensor columns
    # are concatenated, the sample-count column stays a tuple of ints.
    merged = []
    for column in zip(*per_chunk):
        if isinstance(column[0], torch.Tensor):
            merged.append(torch.cat(column, dim=0))
        else:
            merged.append(column)
    colors, opacities, depths, n_rendering_samples = merged

    out_shape = (*rays_shape[:-1], -1)
    return (
        colors.view(out_shape),
        opacities.view(out_shape),
        depths.view(out_shape),
        sum(n_rendering_samples),
    )


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/nerfacc/__init__.py
================================================
"""
Copyright (c) 2022 Ruilong Li, UC Berkeley.
"""
import warnings

from .cdf import ray_resampling
from .contraction import ContractionType, contract, contract_inv
from .grid import Grid, OccupancyGrid, query_grid
from .intersection import ray_aabb_intersect
from .losses import distortion as loss_distortion
from .pack import pack_data, pack_info, unpack_data, unpack_info
from .ray_marching import ray_marching
from .version import __version__
from .vol_rendering import (
    accumulate_along_rays,
    accumulate_along_rays_patch_based,
    render_transmittance_from_alpha,
    render_transmittance_from_density,
    render_visibility,
    render_visibility_patch_based,
    render_weight_from_alpha,
    render_weight_from_density,
    render_weight_from_alpha_patch_based,
    render_weight_and_transmittance_from_alpha_patch_based,
    rendering,
)


# About to be deprecated
def unpack_to_ray_indices(*args, **kwargs):
    """Forward every argument to ``unpack_info`` after a deprecation warning."""
    deprecation_msg = "`unpack_to_ray_indices` will be deprecated. Please use `unpack_info` instead."
    # stacklevel=2 attributes the warning to the caller of this shim.
    warnings.warn(deprecation_msg, category=DeprecationWarning, stacklevel=2)
    return unpack_info(*args, **kwargs)


# Names exported by ``from nerfacc import *``. Every entry is either imported
# at the top of this module or defined in it (``unpack_to_ray_indices``).
__all__ = [
    "__version__",
    "Grid",
    "OccupancyGrid",
    "query_grid",
    "ContractionType",
    "contract",
    "contract_inv",
    "ray_aabb_intersect",
    "ray_marching",
    "accumulate_along_rays",
    "accumulate_along_rays_patch_based",
    "render_visibility",
    "render_visibility_patch_based",
    "render_weight_from_alpha",
    "render_weight_from_alpha_patch_based",
    "render_weight_from_density",
    "rendering",
    "pack_data",
    "unpack_data",
    "unpack_info",
    "pack_info",
    "ray_resampling",
    "loss_distortion",
    "unpack_to_ray_indices",
    "render_transmittance_from_density",
    "render_transmittance_from_alpha",
    "render_weight_and_transmittance_from_alpha_patch_based"
]


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/nerfacc/cdf.py
================================================
"""
Copyright (c) 2022 Ruilong Li, UC Berkeley.
"""

from typing import Tuple

from torch import Tensor

import nerfacc.cuda as _C


def ray_resampling(
    packed_info: Tensor,
    t_starts: Tensor,
    t_ends: Tensor,
    weights: Tensor,
    n_samples: int,
) -> Tuple[Tensor, Tensor, Tensor]:
    """Resample a set of rays based on the CDF of the weights.

    All tensor inputs are made contiguous before being handed to the CUDA
    backend.

    Args:
        packed_info (Tensor): Stores information on which samples belong to
            the same ray. See :func:`nerfacc.ray_marching` for details.
            Tensor with shape (n_rays, 2).
        t_starts: Where the frustum-shape sample starts along a ray. Tensor
            with shape (n_samples, 1).
        t_ends: Where the frustum-shape sample ends along a ray. Tensor with
            shape (n_samples, 1).
        weights: Volumetric rendering weights for those samples. Tensor with
            shape (n_samples,).
        n_samples (int): Number of samples per ray to resample.

    Returns:
        Resampled packed info (n_rays, 2), t_starts (n_samples, 1), and
        t_ends (n_samples, 1).
    """
    contiguous_inputs = [
        tensor.contiguous()
        for tensor in (packed_info, t_starts, t_ends, weights)
    ]
    new_packed_info, new_t_starts, new_t_ends = _C.ray_resampling(
        *contiguous_inputs, n_samples
    )
    return new_packed_info, new_t_starts, new_t_ends


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/nerfacc/contraction.py
================================================
"""
Copyright (c) 2022 Ruilong Li, UC Berkeley.
"""

from enum import Enum

import torch

import nerfacc.cuda as _C


class ContractionType(Enum):
    """Space contraction options.

    This is an enum class that describes how a :class:`nerfacc.Grid` covers the 3D space.
    It is also used by :func:`nerfacc.ray_marching` to determine how to perform ray marching
    within the grid.

    The options in this enum class are:

    Attributes:
        AABB: Linearly map the region of interest :math:`[x_0, x_1]` to a
            unit cube in :math:`[0, 1]`.

            .. math:: f(x) = \\frac{x - x_0}{x_1 - x_0}

        UN_BOUNDED_TANH: Contract an unbounded space into a unit cube in :math:`[0, 1]`
            using tanh. The region of interest :math:`[x_0, x_1]` is first
            mapped into :math:`[-0.5, +0.5]` before applying tanh.

            .. math:: f(x) = \\frac{1}{2}(tanh(\\frac{x - x_0}{x_1 - x_0} - \\frac{1}{2}) + 1)

        UN_BOUNDED_SPHERE: Contract an unbounded space into a unit sphere. Used in
            `Mip-Nerf 360: Unbounded Anti-Aliased Neural Radiance Fields`_.

            .. math::
                f(x) =
                \\begin{cases}
                z(x) & ||z(x)|| \\leq 1 \\\\
                (2 - \\frac{1}{||z(x)||})(\\frac{z(x)}{||z(x)||}) & ||z(x)|| > 1
                \\end{cases}

            .. math::
                z(x) = \\frac{x - x_0}{x_1 - x_0} * 2 - 1

            .. _Mip-Nerf 360\\: Unbounded Anti-Aliased Neural Radiance Fields:
                https://arxiv.org/abs/2111.12077

    """

    # The integer values are what ``to_cpp_version`` passes to the backend.
    AABB = 0
    UN_BOUNDED_TANH = 1
    UN_BOUNDED_SPHERE = 2

    def to_cpp_version(self):
        """Convert to the C++ version of the enum class.

        Returns:
            The C++ version of the enum class.

        """
        return _C.ContractionTypeGetter(self.value)


@torch.no_grad()
def contract(
    x: torch.Tensor,
    roi: torch.Tensor,
    type: ContractionType = ContractionType.AABB,
) -> torch.Tensor:
    """Contract the space into [0, 1]^3.

    Runs with gradient tracking disabled and makes both tensors contiguous
    before calling the CUDA backend.

    Args:
        x (torch.Tensor): Un-contracted points.
        roi (torch.Tensor): Region of interest.
        type (ContractionType): Contraction type.

    Returns:
        torch.Tensor: Contracted points ([0, 1]^3).
    """
    cpp_type = type.to_cpp_version()
    points = x.contiguous()
    region = roi.contiguous()
    return _C.contract(points, region, cpp_type)


@torch.no_grad()
def contract_inv(
    x: torch.Tensor,
    roi: torch.Tensor,
    type: ContractionType = ContractionType.AABB,
) -> torch.Tensor:
    """Recover the space from [0, 1]^3 by inverse contraction.

    Runs with gradient tracking disabled and makes both tensors contiguous
    before calling the CUDA backend.

    Args:
        x (torch.Tensor): Contracted points ([0, 1]^3).
        roi (torch.Tensor): Region of interest.
        type (ContractionType): Contraction type.

    Returns:
        torch.Tensor: Un-contracted points.
    """
    cpp_type = type.to_cpp_version()
    points = x.contiguous()
    region = roi.contiguous()
    return _C.contract_inv(points, region, cpp_type)


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/nerfacc/cuda/__init__.py
================================================
"""
Copyright (c) 2022 Ruilong Li, UC Berkeley.
"""

from typing import Any, Callable


def _make_lazy_cuda_func(name: str) -> Callable:
    def call_cuda(*args, **kwargs):
        # pylint: disable=import-outside-toplevel
        from ._backend import _C

        return getattr(_C, name)(*args, **kwargs)

    return call_cuda


# Module-level lazy bindings. Each name below is a wrapper produced by
# `_make_lazy_cuda_func`; the string argument is the attribute looked up on
# the backend module `_C` when the wrapper is called.

# Space contraction.
ContractionTypeGetter = _make_lazy_cuda_func("ContractionType")
contract = _make_lazy_cuda_func("contract")
contract_inv = _make_lazy_cuda_func("contract_inv")

# Grid queries.
grid_query = _make_lazy_cuda_func("grid_query")

# Ray intersection, marching and resampling.
ray_aabb_intersect = _make_lazy_cuda_func("ray_aabb_intersect")
ray_marching = _make_lazy_cuda_func("ray_marching")
ray_resampling = _make_lazy_cuda_func("ray_resampling")

# Transmittance, "cub" variants.
is_cub_available = _make_lazy_cuda_func("is_cub_available")
transmittance_from_sigma_forward_cub = _make_lazy_cuda_func(
    "transmittance_from_sigma_forward_cub"
)
transmittance_from_sigma_backward_cub = _make_lazy_cuda_func(
    "transmittance_from_sigma_backward_cub"
)
transmittance_from_alpha_forward_cub = _make_lazy_cuda_func(
    "transmittance_from_alpha_forward_cub"
)
transmittance_from_alpha_backward_cub = _make_lazy_cuda_func(
    "transmittance_from_alpha_backward_cub"
)

# Transmittance, "naive" variants.
transmittance_from_sigma_forward_naive = _make_lazy_cuda_func(
    "transmittance_from_sigma_forward_naive"
)
transmittance_from_sigma_backward_naive = _make_lazy_cuda_func(
    "transmittance_from_sigma_backward_naive"
)
transmittance_from_alpha_forward_naive = _make_lazy_cuda_func(
    "transmittance_from_alpha_forward_naive"
)
transmittance_from_alpha_backward_naive = _make_lazy_cuda_func(
    "transmittance_from_alpha_backward_naive"
)

# Transmittance, patch-based variants.
transmittance_from_alpha_patch_based_forward_naive = _make_lazy_cuda_func(
    "transmittance_from_alpha_patch_based_forward_naive"
)
transmittance_from_alpha_patch_based_backward_naive = _make_lazy_cuda_func(
    "transmittance_from_alpha_patch_based_backward_naive"
)

# Rendering weights.
weight_from_sigma_forward_naive = _make_lazy_cuda_func(
    "weight_from_sigma_forward_naive"
)
weight_from_sigma_backward_naive = _make_lazy_cuda_func(
    "weight_from_sigma_backward_naive"
)
weight_from_alpha_forward_naive = _make_lazy_cuda_func(
    "weight_from_alpha_forward_naive"
)
weight_from_alpha_backward_naive = _make_lazy_cuda_func(
    "weight_from_alpha_backward_naive"
)

# weight_from_alpha_importance_sampling_forward_naive = _make_lazy_cuda_func(
#     "weight_from_alpha_importance_sampling_forward_naive"
# )
#
# weight_from_alpha_importance_sampling_backward_naive = _make_lazy_cuda_func(
#     "weight_from_alpha_importance_sampling_backward_naive"
# )

# Rendering weights, patch-based variants.
weight_from_alpha_patch_based_forward_naive = _make_lazy_cuda_func(
    "weight_from_alpha_patch_based_forward_naive"
)
weight_from_alpha_patch_based_backward_naive = _make_lazy_cuda_func(
    "weight_from_alpha_patch_based_backward_naive"
)
weight_and_transmittance_from_alpha_patch_based_forward_naive = _make_lazy_cuda_func(
    "weight_and_transmittance_from_alpha_patch_based_forward_naive"
)
weight_and_transmittance_from_alpha_patch_based_backward_naive = _make_lazy_cuda_func(
    "weight_and_transmittance_from_alpha_patch_based_backward_naive"
)
# Packed-data helpers.
unpack_data = _make_lazy_cuda_func("unpack_data")
unpack_info = _make_lazy_cuda_func("unpack_info")
unpack_info_to_mask = _make_lazy_cuda_func("unpack_info_to_mask")


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/nerfacc/cuda/_backend.py
================================================
"""
Copyright (c) 2022 Ruilong Li, UC Berkeley.
"""

import glob
import json
import os
import shutil
from subprocess import DEVNULL, call

from rich.console import Console
from torch.utils.cpp_extension import _get_build_directory, load

PATH = os.path.dirname(os.path.abspath(__file__))


def cuda_toolkit_available():
    """Check whether ``nvcc`` is available on this machine.

    Returns:
        bool: ``False`` when launching ``nvcc`` raises ``FileNotFoundError``,
        ``True`` otherwise. The exit status of ``nvcc`` is ignored.
    """
    try:
        # Output is discarded; only the ability to start the process matters.
        call(["nvcc"], stdout=DEVNULL, stderr=DEVNULL)
    except FileNotFoundError:
        return False
    return True


def cuda_toolkit_version():
    """Get the cuda toolkit version.

    The toolkit root is taken to be the parent of the directory containing
    the ``nvcc`` found on ``PATH``. ``version.txt`` is tried first, then
    ``version.json``.

    Returns:
        str: The last whitespace-separated token of ``version.txt``, or the
        ``["cuda"]["version"]`` entry of ``version.json``.

    Raises:
        RuntimeError: If ``nvcc`` is not on ``PATH`` or neither version file
            exists in the toolkit root.
    """
    nvcc_path = shutil.which("nvcc")
    if nvcc_path is None:
        # Without this guard os.path.dirname(None) raises an opaque TypeError.
        raise RuntimeError("Cannot find the cuda version.")

    cuda_home = os.path.join(os.path.dirname(nvcc_path), "..")
    version_txt = os.path.join(cuda_home, "version.txt")
    version_json = os.path.join(cuda_home, "version.json")

    if os.path.exists(version_txt):
        with open(version_txt) as f:
            cuda_version = f.read().strip().split()[-1]
    elif os.path.exists(version_json):
        with open(version_json) as f:
            cuda_version = json.load(f)["cuda"]["version"]
    else:
        raise RuntimeError("Cannot find the cuda version.")
    return cuda_version


# Import-time loader for the CUDA extension. After this section runs, `_C` is
# either the extension module or None (no prebuilt module and no nvcc).

# Extension name and the torch JIT build directory associated with it.
name = "nerfacc_cuda"
build_dir = _get_build_directory(name, verbose=False)
extra_include_paths = []
extra_cflags = ["-O3"]
extra_cuda_cflags = ["-O3"]

# Stays None when neither loading path below succeeds in assigning it.
_C = None

try:
    # try to import the compiled module (via setup.py)
    from nerfacc import csrc as _C
except ImportError:
    # if failed, try with JIT compilation
    if cuda_toolkit_available():
        if os.listdir(build_dir) != []:
            # If the build exists, we assume the extension has been built
            # and we can load it.

            _C = load(
                name=name,
                sources=glob.glob(os.path.join(PATH, "csrc/*.cu")),
                extra_cflags=extra_cflags,
                extra_cuda_cflags=extra_cuda_cflags,
                extra_include_paths=extra_include_paths,
            )
        else:
            # Build from scratch. Remove the (empty) build directory just to be
            # safe: the PyTorch JIT might get stuck if the build directory
            # already exists.
            shutil.rmtree(build_dir)
            # Show a console spinner while the extension compiles.
            with Console().status(
                "[bold yellow]NerfAcc: Setting up CUDA (This may take a few minutes the first time)",
                spinner="bouncingBall",
            ):
                _C = load(
                    name=name,
                    sources=glob.glob(os.path.join(PATH, "csrc/*.cu")),
                    extra_cflags=extra_cflags,
                    extra_cuda_cflags=extra_cuda_cflags,
                    extra_include_paths=extra_include_paths,
                )
    else:
        # No nvcc: `_C` is left as None and only a notice is printed.
        Console().print(
            "[yellow]NerfAcc: No CUDA toolkit found. NerfAcc will be disabled.[/yellow]"
        )


__all__ = ["_C"]


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/nerfacc/cuda/csrc/cdf.cu
================================================
/*
 * Copyright (c) 2022 Ruilong Li, UC Berkeley.
 */

#include "include/helpers_cuda.h"

// Per-ray CDF resampling: each thread handles one ray and places
// `resample_steps + 1` evenly spaced CDF positions onto that ray's weight
// distribution, writing the resulting t values as the edges of
// `resample_steps` new intervals.
//
// packed_info / resample_packed_info hold two ints per ray: the offset of the
// ray's first sample and its sample count, for the input and output sample
// arrays respectively.
template <typename scalar_t>
__global__ void cdf_resampling_kernel(
    const uint32_t n_rays,
    const int *packed_info,  // input ray & point indices.
    const scalar_t *starts,  // input start t
    const scalar_t *ends,    // input end t
    const scalar_t *weights, // transmittance weights
    const int *resample_packed_info,
    scalar_t *resample_starts,
    scalar_t *resample_ends)
{
    // Macro is defined outside this file; presumably binds `i` to the thread's
    // ray index and exits for i >= n_rays — TODO confirm in helpers_cuda.h.
    CUDA_GET_THREAD_ID(i, n_rays);

    // locate
    const int base = packed_info[i * 2 + 0];  // point idx start.
    const int steps = packed_info[i * 2 + 1]; // point idx shift.
    const int resample_base = resample_packed_info[i * 2 + 0];
    const int resample_steps = resample_packed_info[i * 2 + 1];
    // Rays without input samples produce no output.
    if (steps == 0)
        return;

    // Re-base all pointers so index 0 is this ray's first sample.
    starts += base;
    ends += base;
    weights += base;
    resample_starts += resample_base;
    resample_ends += resample_base;

    // normalize weights **per ray**
    scalar_t weights_sum = 0.0f;
    for (int j = 0; j < steps; j++)
        weights_sum += weights[j];
    // Pad so the normalizer is at least 1e-5; the padding is spread evenly
    // over the ray's samples.
    scalar_t padding = fmaxf(1e-5f - weights_sum, 0.0f);
    scalar_t padding_step = padding / steps;
    weights_sum += padding;

    // num_bins CDF positions are visited, starting at 1 / (2 * num_bins) and
    // advancing by cdf_step_size.
    int num_bins = resample_steps + 1;
    scalar_t cdf_step_size = (1.0f - 1.0 / num_bins) / resample_steps;

    // idx walks the input intervals, j counts the CDF positions placed so far.
    // [cdf_prev, cdf_next) is the CDF range covered by input interval idx.
    int idx = 0, j = 0;
    scalar_t cdf_prev = 0.0f, cdf_next = (weights[idx] + padding_step) / weights_sum;
    scalar_t cdf_u = 1.0 / (2 * num_bins);
    while (j < num_bins)
    {
        if (cdf_u < cdf_next)
        {
            // printf("cdf_u: %f, cdf_next: %f\n", cdf_u, cdf_next);
            // resample in this interval
            // Linear interpolation of cdf_u within [starts[idx], ends[idx]].
            scalar_t scaling = (ends[idx] - starts[idx]) / (cdf_next - cdf_prev);
            scalar_t t = (cdf_u - cdf_prev) * scaling + starts[idx];
            // Position j starts output interval j and ends output interval j - 1.
            if (j < num_bins - 1)
                resample_starts[j] = t;
            if (j > 0)
                resample_ends[j - 1] = t;
            // going further to next resample
            cdf_u += cdf_step_size;
            j += 1;
        }
        else
        {
            // going to next interval
            // NOTE(review): idx is not bounded by `steps` here, so if rounding
            // leaves the accumulated cdf_next below cdf_u at the last interval
            // this reads weights[] past the ray's samples — confirm whether a
            // guard is needed.
            idx += 1;
            cdf_prev = cdf_next;
            cdf_next += (weights[idx] + padding_step) / weights_sum;
        }
    }
    // NOTE(review): the loop above only exits once j == num_bins, so this
    // branch looks unreachable as written.
    if (j != num_bins)
    {
        printf("Error: %d %d %f\n", j, num_bins, weights_sum);
    }
    return;
}

// template <typename scalar_t>
// __global__ void cdf_resampling_kernel(
//     const uint32_t n_rays,
//     const int *packed_info,   // input ray & point indices.
//     const scalar_t *starts,   // input start t
//     const scalar_t *ends,     // input end t
//     const scalar_t *weights,  // transmittance weights
//     const int *resample_packed_info,
//     scalar_t *resample_starts,
//     scalar_t *resample_ends)
// {
//     CUDA_GET_THREAD_ID(i, n_rays);

//     // locate
//     const int base = packed_info[i * 2 + 0];  // point idx start.
//     const int steps = packed_info[i * 2 + 1]; // point idx shift.
//     const int resample_base = resample_packed_info[i * 2 + 0];
//     const int resample_steps = resample_packed_info[i * 2 + 1];
//     if (steps == 0)
//         return;

//     starts += base;
//     ends += base;
//     weights += base;
//     resample_starts += resample_base;
//     resample_ends += resample_base;

//     scalar_t cdf_step_size = 1.0f / resample_steps;

//     // normalize weights **per ray**
//     scalar_t weights_sum = 0.0f;
//     for (int j = 0; j < steps; j++)
//         weights_sum += weights[j];

//     scalar_t padding = fmaxf(1e-5f - weights_sum, 0.0f);
//     scalar_t padding_step = padding / steps;
//     weights_sum += padding;

//     int idx = 0, j = 0;
//     scalar_t cdf_prev = 0.0f, cdf_next = (weights[idx] + padding_step) / weights_sum;
//     scalar_t cdf_u = 0.5f * cdf_step_size;
//     while (cdf_u < 1.0f)
//     {
//         if (cdf_u < cdf_next)
//         {
//             // resample in this interval
//             scalar_t scaling = (ends[idx] - starts[idx]) / (cdf_next - cdf_prev);
//             scalar_t resample_mid = (cdf_u - cdf_prev) * scaling + starts[idx];
//             scalar_t resample_half_size = cdf_step_size * scaling * 0.5;
//             resample_starts[j] = fmaxf(resample_mid - resample_half_size, starts[idx]);
//             resample_ends[j] = fminf(resample_mid + resample_half_size, ends[idx]);
//             // going further to next resample
//             cdf_u += cdf_step_size;
//             j += 1;
//         }
//         else
//         {
//             // go to next interval
//             idx += 1;
//             if (idx == steps)
//                 break;
//             cdf_prev = cdf_next;
//             cdf_next += (weights[idx] + padding_step) / weights_sum;
//         }
//     }
//     if (j != resample_steps)
//     {
//         printf("Error: %d %d %f\n", j, resample_steps, weights_sum);
//     }
//     return;
// }

// Host entry point for CDF-based ray resampling.
//
// Args:
//   packed_info: (n_rays, 2) int tensor; per ray, the offset of its first
//                sample and its sample count.
//   starts/ends: (n_samples, 1) interval bounds of the input samples.
//   weights:     (n_samples,) rendering weights of the input samples.
//   steps:       number of output intervals for every ray that has at least
//                one input sample; rays without samples get none.
//
// Returns {resample_packed_info, resample_starts, resample_ends}, laid out
// like the inputs.
std::vector<torch::Tensor> ray_resampling(
    torch::Tensor packed_info,
    torch::Tensor starts,
    torch::Tensor ends,
    torch::Tensor weights,
    const int steps)
{
    DEVICE_GUARD(packed_info);

    CHECK_INPUT(packed_info);
    CHECK_INPUT(starts);
    CHECK_INPUT(ends);
    CHECK_INPUT(weights);

    // Logical && so size(1) is only queried once the rank is known to be 2.
    TORCH_CHECK(packed_info.ndimension() == 2 && packed_info.size(1) == 2,
                "packed_info must have shape (n_rays, 2)");
    TORCH_CHECK(starts.ndimension() == 2 && starts.size(1) == 1,
                "starts must have shape (n_samples, 1)");
    TORCH_CHECK(ends.ndimension() == 2 && ends.size(1) == 1,
                "ends must have shape (n_samples, 1)");
    TORCH_CHECK(weights.ndimension() == 1,
                "weights must have shape (n_samples,)");

    const uint32_t n_rays = packed_info.size(0);

    const int threads = 256;
    const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);

    // Every ray with samples receives exactly `steps` output intervals.
    torch::Tensor num_steps = torch::split(packed_info, 1, 1)[1];
    torch::Tensor resample_num_steps = (num_steps > 0).to(num_steps.options()) * steps;
    torch::Tensor resample_cum_steps = resample_num_steps.cumsum(0, torch::kInt32);
    // Exclusive prefix sum gives each ray's offset into the output arrays.
    torch::Tensor resample_packed_info = torch::cat(
        {resample_cum_steps - resample_num_steps, resample_num_steps}, 1);

    // With no rays there is no last cumulative entry to read.
    const int total_steps =
        n_rays > 0
            ? resample_cum_steps[resample_cum_steps.size(0) - 1].item<int>()
            : 0;
    torch::Tensor resample_starts = torch::zeros({total_steps, 1}, starts.options());
    torch::Tensor resample_ends = torch::zeros({total_steps, 1}, ends.options());

    // Skip the launch for empty input: a grid of zero blocks is not a valid
    // launch configuration.
    if (n_rays > 0)
    {
        AT_DISPATCH_FLOATING_TYPES_AND_HALF(
            weights.scalar_type(),
            "ray_resampling",
            ([&]
             { cdf_resampling_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
                   n_rays,
                   // inputs
                   packed_info.data_ptr<int>(),
                   starts.data_ptr<scalar_t>(),
                   ends.data_ptr<scalar_t>(),
                   weights.data_ptr<scalar_t>(),
                   resample_packed_info.data_ptr<int>(),
                   // outputs
                   resample_starts.data_ptr<scalar_t>(),
                   resample_ends.data_ptr<scalar_t>()); }));
    }

    return {resample_packed_info, resample_starts, resample_ends};
}


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/nerfacc/cuda/csrc/contraction.cu
================================================
/*
 * Copyright (c) 2022 Ruilong Li, UC Berkeley.
 */

#include "include/helpers_cuda.h"
#include "include/helpers_math.h"
#include "include/helpers_contraction.h"

// Applies the selected contraction to every sample: one thread per sample,
// three consecutive floats read from `samples` and three written to
// `out_samples`. `roi` holds the region minimum in [0..2] and maximum in [3..5].
__global__ void contract_kernel(
    // samples info
    const uint32_t n_samples,
    const float *samples, // (n_samples, 3)
    // contraction
    const float *roi,
    const ContractionType type,
    // outputs
    float *out_samples)
{
    CUDA_GET_THREAD_ID(i, n_samples);

    // This thread's input and output triplets.
    const float *src = samples + i * 3;
    float *dst = out_samples + i * 3;

    const float3 lower = make_float3(roi[0], roi[1], roi[2]);
    const float3 upper = make_float3(roi[3], roi[4], roi[5]);
    const float3 point = make_float3(src[0], src[1], src[2]);
    const float3 contracted = apply_contraction(point, lower, upper, type);

    dst[0] = contracted.x;
    dst[1] = contracted.y;
    dst[2] = contracted.z;
}

// Applies the inverse of the selected contraction to every sample: one thread
// per sample, three consecutive floats read from `samples` and three written
// to `out_samples`. `roi` holds the region minimum in [0..2] and maximum in
// [3..5].
__global__ void contract_inv_kernel(
    // samples info
    const uint32_t n_samples,
    const float *samples, // (n_samples, 3)
    // contraction
    const float *roi,
    const ContractionType type,
    // outputs
    float *out_samples)
{
    CUDA_GET_THREAD_ID(i, n_samples);

    // This thread's input and output triplets.
    const float *src = samples + i * 3;
    float *dst = out_samples + i * 3;

    const float3 lower = make_float3(roi[0], roi[1], roi[2]);
    const float3 upper = make_float3(roi[3], roi[4], roi[5]);
    const float3 contracted = make_float3(src[0], src[1], src[2]);
    const float3 point = apply_contraction_inv(contracted, lower, upper, type);

    dst[0] = point.x;
    dst[1] = point.y;
    dst[2] = point.z;
}

// Host wrapper: contracts `samples` with respect to `roi` using `type`.
//
// Args:
//   samples: float tensor holding 3 values per entry along dim 0.
//   roi:     float tensor with 6 values (region minimum, then maximum).
//   type:    contraction mode.
//
// Returns a new (n_samples, 3) tensor with the same options as `samples`.
torch::Tensor contract(
    const torch::Tensor samples,
    // contraction
    const torch::Tensor roi,
    const ContractionType type)
{
    DEVICE_GUARD(samples);
    CHECK_INPUT(samples);
    // The kernel dereferences roi directly, so it needs the same validation
    // as samples.
    CHECK_INPUT(roi);
    TORCH_CHECK(roi.numel() == 6, "roi must hold 6 values (min xyz, max xyz)");

    const int n_samples = samples.size(0);
    // The kernel reads three floats per sample.
    TORCH_CHECK(samples.numel() == static_cast<int64_t>(n_samples) * 3,
                "samples must hold 3 values per sample");
    const int threads = 256;
    const int blocks = CUDA_N_BLOCKS_NEEDED(n_samples, threads);

    torch::Tensor out_samples = torch::empty({n_samples, 3}, samples.options());
    // Nothing to compute; also avoids launching a grid of zero blocks.
    if (n_samples == 0)
        return out_samples;

    contract_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
        n_samples,
        samples.data_ptr<float>(),
        // contraction
        roi.data_ptr<float>(),
        type,
        // outputs
        out_samples.data_ptr<float>());
    return out_samples;
}

// Host wrapper: maps contracted `samples` back to the original space with
// respect to `roi` using `type`.
//
// Args:
//   samples: float tensor holding 3 values per entry along dim 0.
//   roi:     float tensor with 6 values (region minimum, then maximum).
//   type:    contraction mode.
//
// Returns a new (n_samples, 3) tensor with the same options as `samples`.
torch::Tensor contract_inv(
    const torch::Tensor samples,
    // contraction
    const torch::Tensor roi,
    const ContractionType type)
{
    DEVICE_GUARD(samples);
    CHECK_INPUT(samples);
    // The kernel dereferences roi directly, so it needs the same validation
    // as samples.
    CHECK_INPUT(roi);
    TORCH_CHECK(roi.numel() == 6, "roi must hold 6 values (min xyz, max xyz)");

    const int n_samples = samples.size(0);
    // The kernel reads three floats per sample.
    TORCH_CHECK(samples.numel() == static_cast<int64_t>(n_samples) * 3,
                "samples must hold 3 values per sample");
    const int threads = 256;
    const int blocks = CUDA_N_BLOCKS_NEEDED(n_samples, threads);

    torch::Tensor out_samples = torch::empty({n_samples, 3}, samples.options());
    // Nothing to compute; also avoids launching a grid of zero blocks.
    if (n_samples == 0)
        return out_samples;

    contract_inv_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
        n_samples,
        samples.data_ptr<float>(),
        // contraction
        roi.data_ptr<float>(),
        type,
        // outputs
        out_samples.data_ptr<float>());
    return out_samples;
}


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/nerfacc/cuda/csrc/include/helpers_contraction.h
================================================
/*
 * Copyright (c) 2022 Ruilong Li, UC Berkeley.
 */

#pragma once

#include "helpers_math.h"

// Selects the mapping between world space and the unit cube. The value is
// dispatched on by apply_contraction() and apply_contraction_inv().
enum ContractionType
{
    // Linear roi <-> [0, 1]^3 mapping (roi_to_unit / unit_to_roi).
    AABB = 0,
    // Per-axis tanh squashing of unbounded space (inf_to_unit_tanh / unit_to_inf_tanh).
    UN_BOUNDED_TANH = 1,
    // Spherical contraction of unbounded space (inf_to_unit_sphere / unit_sphere_to_inf).
    UN_BOUNDED_SPHERE = 2,
};

// Linearly map a point from the axis-aligned box [roi_min, roi_max] to the
// unit cube. Points outside the box land outside [0, 1]^3; a zero-extent axis
// divides by zero.
inline __device__ __host__ float3 roi_to_unit(
    const float3 xyz, const float3 roi_min, const float3 roi_max)
{
    const float3 offset = xyz - roi_min;
    const float3 extent = roi_max - roi_min;
    return offset / extent;
}

// Inverse of roi_to_unit(): stretch unit-cube coordinates by the box extent
// and shift them to the box origin.
inline __device__ __host__ float3 unit_to_roi(
    const float3 xyz, const float3 roi_min, const float3 roi_max)
{
    const float3 extent = roi_max - roi_min;
    return xyz * extent + roi_min;
}

// Contract unbounded space into the unit cube with a per-axis tanh.
//   [-inf, inf]^3 -> (0, 1)^3
// The roi itself is first normalised to [-0.5, 0.5]^3, so after the tanh it
// occupies roughly [0.27, 0.73]^3 (0.5 +/- 0.5 * tanh(0.5)).
inline __device__ __host__ float3 inf_to_unit_tanh(
    const float3 xyz, float3 roi_min, const float3 roi_max)
{
    // roi -> [0, 1]^3, then centre it on the origin.
    const float3 centred = roi_to_unit(xyz, roi_min, roi_max) - 0.5f;
    // Squash every axis into (-1, 1) ...
    const float3 squashed = make_float3(tanhf(centred.x), tanhf(centred.y), tanhf(centred.z));
    // ... and rescale to (0, 1).
    return squashed * 0.5f + 0.5f;
}

// Inverse of inf_to_unit_tanh().
//   [0, 1]^3 -> [-inf, inf]^3 (capped at +/-1e10 before the roi scaling)
inline __device__ __host__ float3 unit_to_inf_tanh(
    const float3 xyz, float3 roi_min, const float3 roi_max)
{
    // Undo the (0, 1) rescaling and the tanh, one axis at a time.
    const float3 stretched = make_float3(
        atanhf(xyz.x * 2.0f - 1.0f),
        atanhf(xyz.y * 2.0f - 1.0f),
        atanhf(xyz.z * 2.0f - 1.0f));
    // atanh diverges on the cube faces; keep the result finite.
    const float3 bounded = clamp(stretched, -1e10f, 1e10f);
    // Undo the centring, then map the normalised box back onto the roi.
    return unit_to_roi(bounded + 0.5f, roi_min, roi_max);
}

// Spherical contraction (from MipNeRF360).
//   The roi is normalised to [-1, 1]^3. Points with norm <= 1 are kept as they
//   are; points further out are pulled onto radius (2 - 1/norm), i.e. inside a
//   ball of radius 2. The final affine step maps [-2, 2] to [0, 1], so the
//   normalised roi ends up in [0.25, 0.75]^3.
inline __device__ __host__ float3 inf_to_unit_sphere(
    const float3 xyz, const float3 roi_min, const float3 roi_max)
{
    // roi -> [0, 1]^3 -> [-1, 1]^3
    float3 p = roi_to_unit(xyz, roi_min, roi_max) * 2.0f - 1.0f;

    const float norm = sqrt(dot(p, p));
    if (norm > 1.0f)
    {
        // Keep the direction, compress the radius.
        const float radius = 2.0f - 1.0f / norm;
        p = radius * (p / norm);
    }
    return p * 0.25f + 0.5f;
}

// Inverse of inf_to_unit_sphere() (from MipNeRF360).
//   [0, 1]^3 is first mapped back to [-2, 2]^3. A contracted point of norm
//   n > 1 came from radius 1 / (2 - n), so it is rescaled by
//   1 / (2n - n^2); the denominator is floored at 1e-10 because it vanishes
//   as n approaches 2.
inline __device__ __host__ float3 unit_sphere_to_inf(
    const float3 xyz, const float3 roi_min, const float3 roi_max)
{
    float3 p = (xyz - 0.5f) * 4.0f;

    const float norm_sq = dot(p, p);
    const float norm = sqrt(norm_sq);
    if (norm > 1.0f)
    {
        const float denom = fmaxf((2.0f * norm - 1.0f * norm_sq), 1e-10f);
        p = p / denom;
    }
    // [-1, 1]^3 -> [0, 1]^3 -> roi
    return unit_to_roi(p * 0.5f + 0.5f, roi_min, roi_max);
}

// Map a world-space point into the unit cube using the requested contraction.
//
//   xyz              point to contract
//   roi_min/roi_max  corners of the region of interest
//   type             contraction to apply
//
// Returns the contracted point.
inline __device__ __host__ float3 apply_contraction(
    const float3 xyz, const float3 roi_min, const float3 roi_max,
    const ContractionType type)
{
    switch (type)
    {
    case AABB:
        return roi_to_unit(xyz, roi_min, roi_max);
    case UN_BOUNDED_TANH:
        return inf_to_unit_tanh(xyz, roi_min, roi_max);
    case UN_BOUNDED_SPHERE:
        return inf_to_unit_sphere(xyz, roi_min, roi_max);
    }
    // Only reached when `type` holds a value outside the enum (e.g. a bad
    // integer cast). Flowing off the end of a non-void function is undefined
    // behaviour, so fall back to the plain linear mapping. No `default:` label
    // is used so compilers can still warn about unhandled enumerators.
    return roi_to_unit(xyz, roi_min, roi_max);
}

// Map a unit-cube point back to world space, undoing the requested contraction.
//
//   xyz              contracted point
//   roi_min/roi_max  corners of the region of interest
//   type             contraction to invert
//
// Returns the un-contracted point.
inline __device__ __host__ float3 apply_contraction_inv(
    const float3 xyz, const float3 roi_min, const float3 roi_max,
    const ContractionType type)
{
    switch (type)
    {
    case AABB:
        return unit_to_roi(xyz, roi_min, roi_max);
    case UN_BOUNDED_TANH:
        return unit_to_inf_tanh(xyz, roi_min, roi_max);
    case UN_BOUNDED_SPHERE:
        return unit_sphere_to_inf(xyz, roi_min, roi_max);
    }
    // Only reached when `type` holds a value outside the enum (e.g. a bad
    // integer cast). Flowing off the end of a non-void function is undefined
    // behaviour, so fall back to the plain linear mapping. No `default:` label
    // is used so compilers can still warn about unhandled enumerators.
    return unit_to_roi(xyz, roi_min, roi_max);
}


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/nerfacc/cuda/csrc/include/helpers_cuda.h
================================================
/*
 * Copyright (c) 2022 Ruilong Li, UC Berkeley.
 */

#pragma once

#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>
#include <ATen/cuda/Exceptions.h>
#include <cmath>
// #include <ATen/cuda/cub_definitions.cuh>

// cub support for scan by key is added to cub 1.15
// in https://github.com/NVIDIA/cub/pull/376
// CUB_SUPPORTS_SCAN_BY_KEY() expands to 1 when CUB_VERSION >= 101500 and to 0
// otherwise.
// NOTE(review): an undefined CUB_VERSION evaluates to 0 inside #if, and the
// cub_definitions include above is commented out -- this presumably relies on
// a cub header being included before this file; confirm.
#if CUB_VERSION >= 101500
#define CUB_SUPPORTS_SCAN_BY_KEY() 1
#else
#define CUB_SUPPORTS_SCAN_BY_KEY() 0
#endif

// Tensor validation helpers for host-side launch code. Each raises through
// TORCH_CHECK with a message that names the offending argument (#x).
// The argument is parenthesised so that expression arguments bind correctly.
#define CHECK_CUDA(x) TORCH_CHECK((x).is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
    TORCH_CHECK((x).is_contiguous(), #x " must be contiguous")
// Runs both checks. Wrapped in do/while(false) so the macro behaves as a
// single statement: under an unbraced `if`, a bare two-statement expansion
// would guard only the first check. Use as `CHECK_INPUT(t);`.
#define CHECK_INPUT(x)       \
    do                       \
    {                        \
        CHECK_CUDA(x);       \
        CHECK_CONTIGUOUS(x); \
    } while (false)
// Kernel prologue helpers. Each declares the calling thread's global index as
// a `const int` and ends in a dangling `return`, so the statement
//     CUDA_GET_THREAD_ID(i, n);
// makes threads whose index is >= the bound leave the kernel immediately.
// The bounds are parenthesised so that expression arguments (for example a
// conditional expression) are compared as a whole.
#define CUDA_GET_THREAD_ID(tid, Q)                         \
    const int tid = blockIdx.x * blockDim.x + threadIdx.x; \
    if (tid >= (Q))                                        \
    return
// 2D variant: tidx is bounded by P (x dimension), tidy by Q (y dimension).
#define CUDA_GET_THREAD_ID_2D(tidx, tidy, P, Q)             \
    const int tidx = blockIdx.x * blockDim.x + threadIdx.x; \
    const int tidy = blockIdx.y * blockDim.y + threadIdx.y; \
    if (tidx >= (P) || tidy >= (Q))                         \
    return
// Number of blocks of CUDA_N_THREADS threads needed to cover Q work items
// (ceiling division). With signed integers Q == 0 still yields one block.
// Both arguments are parenthesised so expression arguments such as `a + b`
// are evaluated as a unit.
#define CUDA_N_BLOCKS_NEEDED(Q, CUDA_N_THREADS) (((Q) - 1) / (CUDA_N_THREADS) + 1)
// Declares a scoped at::cuda::OptionalCUDAGuard named `device_guard` for the
// device of tensor `_ten`. The macro already ends in a semicolon, and because
// the variable name is fixed it can be used only once per scope.
#define DEVICE_GUARD(_ten) \
    const at::cuda::OptionalCUDAGuard device_guard(device_of(_ten));

// https://github.com/pytorch/pytorch/blob/233305a852e1cd7f319b15b5137074c9eac455f6/aten/src/ATen/cuda/cub.cuh#L38-L46
// Two-phase call of a cub-style function: the first call (null storage
// pointer) fills in temp_storage_bytes, the scratch buffer is then taken from
// the c10 CUDA caching allocator, and the second call does the actual work.
// The last CUDA error is checked afterwards via AT_CUDA_CHECK.
#define CUB_WRAPPER(func, ...) do {                                       \
  size_t temp_storage_bytes = 0;                                          \
  func(nullptr, temp_storage_bytes, __VA_ARGS__);                         \
  auto& caching_allocator = *::c10::cuda::CUDACachingAllocator::get();    \
  auto temp_storage = caching_allocator.allocate(temp_storage_bytes);     \
  func(temp_storage.get(), temp_storage_bytes, __VA_ARGS__);              \
  AT_CUDA_CHECK(cudaGetLastError());                                      \
} while (false)


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/nerfacc/cuda/csrc/include/helpers_math.h
================================================
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 * Modified by Ruilong Li, 2022
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 *  This file implements common mathematical operations on vector types
 *  (float3, float4 etc.) since these are not provided as standard by CUDA.
 *
 *  The syntax is modeled on the Cg standard library.
 *
 *  This is part of the Helper library includes
 *
 *    Thanks to Linh Hah for additions and fixes.
 */

#ifndef HELPER_MATH_H
#define HELPER_MATH_H

#include "cuda_runtime.h"

// Short aliases for unsigned scalar types; `uint` is the scalar type used by
// the uint2/uint3/uint4 overloads in this header.
typedef unsigned int uint;
typedef unsigned short ushort;

// Fallback value, defined only when no earlier header provided EXIT_WAIVED.
// NOTE(review): nothing in the visible part of this header uses it; presumably
// an exit status inherited from the CUDA samples -- confirm.
#ifndef EXIT_WAIVED
#define EXIT_WAIVED 2
#endif

#ifndef __CUDACC__
#include <math.h>

////////////////////////////////////////////////////////////////////////////////
// host implementations of CUDA functions
////////////////////////////////////////////////////////////////////////////////

// Host-only stand-in for CUDA's fminf: the smaller of two floats.
// Yields `b` whenever `a < b` is false (including when either value is NaN).
inline float fminf(float a, float b)
{
    if (a < b)
    {
        return a;
    }
    return b;
}

// Host-only stand-in for CUDA's fmaxf: the larger of two floats.
// Yields `b` whenever `a > b` is false (including when either value is NaN).
inline float fmaxf(float a, float b)
{
    if (a > b)
    {
        return a;
    }
    return b;
}

// Host-only stand-in for CUDA's integer max.
inline int max(int a, int b)
{
    if (a > b)
    {
        return a;
    }
    return b;
}

// Host-only stand-in for CUDA's integer min.
inline int min(int a, int b)
{
    if (a < b)
    {
        return a;
    }
    return b;
}

// Host-only stand-in for CUDA's rsqrtf: reciprocal square root, 1 / sqrt(x).
inline float rsqrtf(float x)
{
    const float root = sqrtf(x);
    return 1.0f / root;
}
#endif

////////////////////////////////////////////////////////////////////////////////
// constructors
////////////////////////////////////////////////////////////////////////////////

// Extra make_float2 overloads: scalar broadcast, truncation of a float3 (z is
// dropped) and component-wise conversion from the integer 2-vectors.
inline __host__ __device__ float2 make_float2(float s)
{
    float2 v;
    v.x = s;
    v.y = s;
    return v;
}
inline __host__ __device__ float2 make_float2(float3 a)
{
    float2 v;
    v.x = a.x;
    v.y = a.y;
    return v;
}
inline __host__ __device__ float2 make_float2(int2 a)
{
    return make_float2(static_cast<float>(a.x), static_cast<float>(a.y));
}
inline __host__ __device__ float2 make_float2(uint2 a)
{
    return make_float2(static_cast<float>(a.x), static_cast<float>(a.y));
}

// Extra make_int2 overloads: scalar broadcast, truncation of an int3 (z is
// dropped) and component-wise conversion from uint2 / float2.
inline __host__ __device__ int2 make_int2(int s)
{
    int2 v;
    v.x = s;
    v.y = s;
    return v;
}
inline __host__ __device__ int2 make_int2(int3 a)
{
    int2 v;
    v.x = a.x;
    v.y = a.y;
    return v;
}
inline __host__ __device__ int2 make_int2(uint2 a)
{
    return make_int2(static_cast<int>(a.x), static_cast<int>(a.y));
}
inline __host__ __device__ int2 make_int2(float2 a)
{
    return make_int2(static_cast<int>(a.x), static_cast<int>(a.y));
}

// Extra make_uint2 overloads: scalar broadcast, truncation of a uint3 (z is
// dropped) and component-wise conversion from int2.
inline __host__ __device__ uint2 make_uint2(uint s)
{
    uint2 v;
    v.x = s;
    v.y = s;
    return v;
}
inline __host__ __device__ uint2 make_uint2(uint3 a)
{
    uint2 v;
    v.x = a.x;
    v.y = a.y;
    return v;
}
inline __host__ __device__ uint2 make_uint2(int2 a)
{
    return make_uint2(static_cast<uint>(a.x), static_cast<uint>(a.y));
}

// Extra make_float3 overloads: scalar broadcast, extension of a float2 (z is
// 0 or a given value), truncation of a float4 (w is dropped) and
// component-wise conversion from the integer 3-vectors.
inline __host__ __device__ float3 make_float3(float s)
{
    float3 v;
    v.x = s;
    v.y = s;
    v.z = s;
    return v;
}
inline __host__ __device__ float3 make_float3(float2 a)
{
    float3 v;
    v.x = a.x;
    v.y = a.y;
    v.z = 0.0f;
    return v;
}
inline __host__ __device__ float3 make_float3(float2 a, float s)
{
    float3 v;
    v.x = a.x;
    v.y = a.y;
    v.z = s;
    return v;
}
inline __host__ __device__ float3 make_float3(float4 a)
{
    float3 v;
    v.x = a.x;
    v.y = a.y;
    v.z = a.z;
    return v;
}
inline __host__ __device__ float3 make_float3(int3 a)
{
    return make_float3(static_cast<float>(a.x), static_cast<float>(a.y), static_cast<float>(a.z));
}
inline __host__ __device__ float3 make_float3(uint3 a)
{
    return make_float3(static_cast<float>(a.x), static_cast<float>(a.y), static_cast<float>(a.z));
}

// Extra make_int3 overloads: scalar broadcast, extension of an int2 (z is 0
// or a given value) and component-wise conversion from uint3 / float3.
inline __host__ __device__ int3 make_int3(int s)
{
    int3 v;
    v.x = s;
    v.y = s;
    v.z = s;
    return v;
}
inline __host__ __device__ int3 make_int3(int2 a)
{
    int3 v;
    v.x = a.x;
    v.y = a.y;
    v.z = 0;
    return v;
}
inline __host__ __device__ int3 make_int3(int2 a, int s)
{
    int3 v;
    v.x = a.x;
    v.y = a.y;
    v.z = s;
    return v;
}
inline __host__ __device__ int3 make_int3(uint3 a)
{
    return make_int3(static_cast<int>(a.x), static_cast<int>(a.y), static_cast<int>(a.z));
}
inline __host__ __device__ int3 make_int3(float3 a)
{
    return make_int3(static_cast<int>(a.x), static_cast<int>(a.y), static_cast<int>(a.z));
}

// Extra make_uint3 overloads: scalar broadcast, extension of a uint2 (z is 0
// or a given value), truncation of a uint4 (w is dropped) and component-wise
// conversion from int3.
inline __host__ __device__ uint3 make_uint3(uint s)
{
    uint3 v;
    v.x = s;
    v.y = s;
    v.z = s;
    return v;
}
inline __host__ __device__ uint3 make_uint3(uint2 a)
{
    uint3 v;
    v.x = a.x;
    v.y = a.y;
    v.z = 0;
    return v;
}
inline __host__ __device__ uint3 make_uint3(uint2 a, uint s)
{
    uint3 v;
    v.x = a.x;
    v.y = a.y;
    v.z = s;
    return v;
}
inline __host__ __device__ uint3 make_uint3(uint4 a)
{
    uint3 v;
    v.x = a.x;
    v.y = a.y;
    v.z = a.z;
    return v;
}
inline __host__ __device__ uint3 make_uint3(int3 a)
{
    return make_uint3(static_cast<uint>(a.x), static_cast<uint>(a.y), static_cast<uint>(a.z));
}

// Extra make_float4 overloads: scalar broadcast, extension of a float3 (w is
// 0 or a given value) and component-wise conversion from the integer 4-vectors.
inline __host__ __device__ float4 make_float4(float s)
{
    float4 v;
    v.x = s;
    v.y = s;
    v.z = s;
    v.w = s;
    return v;
}
inline __host__ __device__ float4 make_float4(float3 a)
{
    float4 v;
    v.x = a.x;
    v.y = a.y;
    v.z = a.z;
    v.w = 0.0f;
    return v;
}
inline __host__ __device__ float4 make_float4(float3 a, float w)
{
    float4 v;
    v.x = a.x;
    v.y = a.y;
    v.z = a.z;
    v.w = w;
    return v;
}
inline __host__ __device__ float4 make_float4(int4 a)
{
    return make_float4(static_cast<float>(a.x), static_cast<float>(a.y),
                       static_cast<float>(a.z), static_cast<float>(a.w));
}
inline __host__ __device__ float4 make_float4(uint4 a)
{
    return make_float4(static_cast<float>(a.x), static_cast<float>(a.y),
                       static_cast<float>(a.z), static_cast<float>(a.w));
}

// Extra make_int4 overloads: scalar broadcast, extension of an int3 (w is 0
// or a given value) and component-wise conversion from uint4 / float4.
inline __host__ __device__ int4 make_int4(int s)
{
    int4 v;
    v.x = s;
    v.y = s;
    v.z = s;
    v.w = s;
    return v;
}
inline __host__ __device__ int4 make_int4(int3 a)
{
    int4 v;
    v.x = a.x;
    v.y = a.y;
    v.z = a.z;
    v.w = 0;
    return v;
}
inline __host__ __device__ int4 make_int4(int3 a, int w)
{
    int4 v;
    v.x = a.x;
    v.y = a.y;
    v.z = a.z;
    v.w = w;
    return v;
}
inline __host__ __device__ int4 make_int4(uint4 a)
{
    return make_int4(static_cast<int>(a.x), static_cast<int>(a.y),
                     static_cast<int>(a.z), static_cast<int>(a.w));
}
inline __host__ __device__ int4 make_int4(float4 a)
{
    return make_int4(static_cast<int>(a.x), static_cast<int>(a.y),
                     static_cast<int>(a.z), static_cast<int>(a.w));
}

// Extra make_uint4 overloads: scalar broadcast, extension of a uint3 (w is 0
// or a given value) and component-wise conversion from int4.
inline __host__ __device__ uint4 make_uint4(uint s)
{
    uint4 v;
    v.x = s;
    v.y = s;
    v.z = s;
    v.w = s;
    return v;
}
inline __host__ __device__ uint4 make_uint4(uint3 a)
{
    uint4 v;
    v.x = a.x;
    v.y = a.y;
    v.z = a.z;
    v.w = 0;
    return v;
}
inline __host__ __device__ uint4 make_uint4(uint3 a, uint w)
{
    uint4 v;
    v.x = a.x;
    v.y = a.y;
    v.z = a.z;
    v.w = w;
    return v;
}
inline __host__ __device__ uint4 make_uint4(int4 a)
{
    return make_uint4(static_cast<uint>(a.x), static_cast<uint>(a.y),
                      static_cast<uint>(a.z), static_cast<uint>(a.w));
}

////////////////////////////////////////////////////////////////////////////////
// negate
////////////////////////////////////////////////////////////////////////////////

// Unary minus: returns a new vector with every component negated.
// The operand is taken by const reference so that const values and
// temporaries (for example `-(a + b)`) can be negated as well; a non-const
// lvalue reference would accept only named, mutable vectors.
inline __host__ __device__ float2 operator-(const float2 &a)
{
    return make_float2(-a.x, -a.y);
}
inline __host__ __device__ int2 operator-(const int2 &a)
{
    return make_int2(-a.x, -a.y);
}
inline __host__ __device__ float3 operator-(const float3 &a)
{
    return make_float3(-a.x, -a.y, -a.z);
}
inline __host__ __device__ int3 operator-(const int3 &a)
{
    return make_int3(-a.x, -a.y, -a.z);
}
inline __host__ __device__ float4 operator-(const float4 &a)
{
    return make_float4(-a.x, -a.y, -a.z, -a.w);
}
inline __host__ __device__ int4 operator-(const int4 &a)
{
    return make_int4(-a.x, -a.y, -a.z, -a.w);
}

////////////////////////////////////////////////////////////////////////////////
// addition
////////////////////////////////////////////////////////////////////////////////

// float2 addition: component-wise; a scalar operand is added to every lane.
inline __host__ __device__ float2 operator+(float2 a, float2 b)
{
    a.x += b.x;
    a.y += b.y;
    return a;
}
inline __host__ __device__ void operator+=(float2 &a, float2 b)
{
    a = make_float2(a.x + b.x, a.y + b.y);
}
inline __host__ __device__ float2 operator+(float2 a, float b)
{
    a.x += b;
    a.y += b;
    return a;
}
inline __host__ __device__ float2 operator+(float b, float2 a)
{
    a.x += b;
    a.y += b;
    return a;
}
inline __host__ __device__ void operator+=(float2 &a, float b)
{
    a = make_float2(a.x + b, a.y + b);
}

// int2 addition: component-wise; a scalar operand is added to every lane.
inline __host__ __device__ int2 operator+(int2 a, int2 b)
{
    a.x += b.x;
    a.y += b.y;
    return a;
}
inline __host__ __device__ void operator+=(int2 &a, int2 b)
{
    a = make_int2(a.x + b.x, a.y + b.y);
}
inline __host__ __device__ int2 operator+(int2 a, int b)
{
    a.x += b;
    a.y += b;
    return a;
}
inline __host__ __device__ int2 operator+(int b, int2 a)
{
    a.x += b;
    a.y += b;
    return a;
}
inline __host__ __device__ void operator+=(int2 &a, int b)
{
    a = make_int2(a.x + b, a.y + b);
}

// uint2 addition: component-wise; a scalar operand is added to every lane.
inline __host__ __device__ uint2 operator+(uint2 a, uint2 b)
{
    a.x += b.x;
    a.y += b.y;
    return a;
}
inline __host__ __device__ void operator+=(uint2 &a, uint2 b)
{
    a = make_uint2(a.x + b.x, a.y + b.y);
}
inline __host__ __device__ uint2 operator+(uint2 a, uint b)
{
    a.x += b;
    a.y += b;
    return a;
}
inline __host__ __device__ uint2 operator+(uint b, uint2 a)
{
    a.x += b;
    a.y += b;
    return a;
}
inline __host__ __device__ void operator+=(uint2 &a, uint b)
{
    a = make_uint2(a.x + b, a.y + b);
}

// float3 addition: component-wise; a scalar operand is added to every lane.
inline __host__ __device__ float3 operator+(float3 a, float3 b)
{
    a.x += b.x;
    a.y += b.y;
    a.z += b.z;
    return a;
}
inline __host__ __device__ void operator+=(float3 &a, float3 b)
{
    a = make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
}
inline __host__ __device__ float3 operator+(float3 a, float b)
{
    a.x += b;
    a.y += b;
    a.z += b;
    return a;
}
inline __host__ __device__ void operator+=(float3 &a, float b)
{
    a = make_float3(a.x + b, a.y + b, a.z + b);
}

// int3 addition: component-wise; a scalar operand is added to every lane.
inline __host__ __device__ int3 operator+(int3 a, int3 b)
{
    a.x += b.x;
    a.y += b.y;
    a.z += b.z;
    return a;
}
inline __host__ __device__ void operator+=(int3 &a, int3 b)
{
    a = make_int3(a.x + b.x, a.y + b.y, a.z + b.z);
}
inline __host__ __device__ int3 operator+(int3 a, int b)
{
    a.x += b;
    a.y += b;
    a.z += b;
    return a;
}
inline __host__ __device__ void operator+=(int3 &a, int b)
{
    a = make_int3(a.x + b, a.y + b, a.z + b);
}

// uint3 addition: component-wise; a scalar operand is added to every lane.
inline __host__ __device__ uint3 operator+(uint3 a, uint3 b)
{
    a.x += b.x;
    a.y += b.y;
    a.z += b.z;
    return a;
}
inline __host__ __device__ void operator+=(uint3 &a, uint3 b)
{
    a = make_uint3(a.x + b.x, a.y + b.y, a.z + b.z);
}
inline __host__ __device__ uint3 operator+(uint3 a, uint b)
{
    a.x += b;
    a.y += b;
    a.z += b;
    return a;
}
inline __host__ __device__ void operator+=(uint3 &a, uint b)
{
    a = make_uint3(a.x + b, a.y + b, a.z + b);
}

// Scalar-on-the-left addition for the 3-component vectors: the scalar is
// added to every lane.
inline __host__ __device__ int3 operator+(int b, int3 a)
{
    a.x += b;
    a.y += b;
    a.z += b;
    return a;
}
inline __host__ __device__ uint3 operator+(uint b, uint3 a)
{
    a.x += b;
    a.y += b;
    a.z += b;
    return a;
}
inline __host__ __device__ float3 operator+(float b, float3 a)
{
    a.x += b;
    a.y += b;
    a.z += b;
    return a;
}

// float4 addition: component-wise; a scalar operand is added to every lane.
inline __host__ __device__ float4 operator+(float4 a, float4 b)
{
    a.x += b.x;
    a.y += b.y;
    a.z += b.z;
    a.w += b.w;
    return a;
}
inline __host__ __device__ void operator+=(float4 &a, float4 b)
{
    a = make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
}
inline __host__ __device__ float4 operator+(float4 a, float b)
{
    a.x += b;
    a.y += b;
    a.z += b;
    a.w += b;
    return a;
}
inline __host__ __device__ float4 operator+(float b, float4 a)
{
    a.x += b;
    a.y += b;
    a.z += b;
    a.w += b;
    return a;
}
inline __host__ __device__ void operator+=(float4 &a, float b)
{
    a = make_float4(a.x + b, a.y + b, a.z + b, a.w + b);
}

// int4 addition: component-wise; a scalar operand is added to every lane.
inline __host__ __device__ int4 operator+(int4 a, int4 b)
{
    a.x += b.x;
    a.y += b.y;
    a.z += b.z;
    a.w += b.w;
    return a;
}
inline __host__ __device__ void operator+=(int4 &a, int4 b)
{
    a = make_int4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
}
inline __host__ __device__ int4 operator+(int4 a, int b)
{
    a.x += b;
    a.y += b;
    a.z += b;
    a.w += b;
    return a;
}
inline __host__ __device__ int4 operator+(int b, int4 a)
{
    a.x += b;
    a.y += b;
    a.z += b;
    a.w += b;
    return a;
}
inline __host__ __device__ void operator+=(int4 &a, int b)
{
    a = make_int4(a.x + b, a.y + b, a.z + b, a.w + b);
}

// uint4 addition: component-wise; a scalar operand is added to every lane.
inline __host__ __device__ uint4 operator+(uint4 a, uint4 b)
{
    a.x += b.x;
    a.y += b.y;
    a.z += b.z;
    a.w += b.w;
    return a;
}
inline __host__ __device__ void operator+=(uint4 &a, uint4 b)
{
    a = make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
}
inline __host__ __device__ uint4 operator+(uint4 a, uint b)
{
    a.x += b;
    a.y += b;
    a.z += b;
    a.w += b;
    return a;
}
inline __host__ __device__ uint4 operator+(uint b, uint4 a)
{
    a.x += b;
    a.y += b;
    a.z += b;
    a.w += b;
    return a;
}
inline __host__ __device__ void operator+=(uint4 &a, uint b)
{
    a = make_uint4(a.x + b, a.y + b, a.z + b, a.w + b);
}

////////////////////////////////////////////////////////////////////////////////
// subtract
////////////////////////////////////////////////////////////////////////////////

// float2 subtraction: component-wise. `v - s` subtracts the scalar from every
// lane; `s - v` subtracts every lane from the scalar.
inline __host__ __device__ float2 operator-(float2 a, float2 b)
{
    a.x -= b.x;
    a.y -= b.y;
    return a;
}
inline __host__ __device__ void operator-=(float2 &a, float2 b)
{
    a = make_float2(a.x - b.x, a.y - b.y);
}
inline __host__ __device__ float2 operator-(float2 a, float b)
{
    a.x -= b;
    a.y -= b;
    return a;
}
inline __host__ __device__ float2 operator-(float b, float2 a)
{
    a.x = b - a.x;
    a.y = b - a.y;
    return a;
}
inline __host__ __device__ void operator-=(float2 &a, float b)
{
    a = make_float2(a.x - b, a.y - b);
}

// int2 subtraction: component-wise. `v - s` subtracts the scalar from every
// lane; `s - v` subtracts every lane from the scalar.
inline __host__ __device__ int2 operator-(int2 a, int2 b)
{
    a.x -= b.x;
    a.y -= b.y;
    return a;
}
inline __host__ __device__ void operator-=(int2 &a, int2 b)
{
    a = make_int2(a.x - b.x, a.y - b.y);
}
inline __host__ __device__ int2 operator-(int2 a, int b)
{
    a.x -= b;
    a.y -= b;
    return a;
}
inline __host__ __device__ int2 operator-(int b, int2 a)
{
    a.x = b - a.x;
    a.y = b - a.y;
    return a;
}
inline __host__ __device__ void operator-=(int2 &a, int b)
{
    a = make_int2(a.x - b, a.y - b);
}

// uint2 subtraction: component-wise, with the usual unsigned wrap-around.
// `v - s` subtracts the scalar from every lane; `s - v` subtracts every lane
// from the scalar.
inline __host__ __device__ uint2 operator-(uint2 a, uint2 b)
{
    a.x -= b.x;
    a.y -= b.y;
    return a;
}
inline __host__ __device__ void operator-=(uint2 &a, uint2 b)
{
    a = make_uint2(a.x - b.x, a.y - b.y);
}
inline __host__ __device__ uint2 operator-(uint2 a, uint b)
{
    a.x -= b;
    a.y -= b;
    return a;
}
inline __host__ __device__ uint2 operator-(uint b, uint2 a)
{
    a.x = b - a.x;
    a.y = b - a.y;
    return a;
}
inline __host__ __device__ void operator-=(uint2 &a, uint b)
{
    a = make_uint2(a.x - b, a.y - b);
}

// float3 subtraction: component-wise. `v - s` subtracts the scalar from every
// lane; `s - v` subtracts every lane from the scalar.
inline __host__ __device__ float3 operator-(float3 a, float3 b)
{
    a.x -= b.x;
    a.y -= b.y;
    a.z -= b.z;
    return a;
}
inline __host__ __device__ void operator-=(float3 &a, float3 b)
{
    a = make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
}
inline __host__ __device__ float3 operator-(float3 a, float b)
{
    a.x -= b;
    a.y -= b;
    a.z -= b;
    return a;
}
inline __host__ __device__ float3 operator-(float b, float3 a)
{
    a.x = b - a.x;
    a.y = b - a.y;
    a.z = b - a.z;
    return a;
}
inline __host__ __device__ void operator-=(float3 &a, float b)
{
    a = make_float3(a.x - b, a.y - b, a.z - b);
}

// int3 subtraction: component-wise. `v - s` subtracts the scalar from every
// lane; `s - v` subtracts every lane from the scalar.
inline __host__ __device__ int3 operator-(int3 a, int3 b)
{
    a.x -= b.x;
    a.y -= b.y;
    a.z -= b.z;
    return a;
}
inline __host__ __device__ void operator-=(int3 &a, int3 b)
{
    a = make_int3(a.x - b.x, a.y - b.y, a.z - b.z);
}
inline __host__ __device__ int3 operator-(int3 a, int b)
{
    a.x -= b;
    a.y -= b;
    a.z -= b;
    return a;
}
inline __host__ __device__ int3 operator-(int b, int3 a)
{
    a.x = b - a.x;
    a.y = b - a.y;
    a.z = b - a.z;
    return a;
}
inline __host__ __device__ void operator-=(int3 &a, int b)
{
    a = make_int3(a.x - b, a.y - b, a.z - b);
}

// uint3 subtraction: component-wise, with the usual unsigned wrap-around.
// `v - s` subtracts the scalar from every lane; `s - v` subtracts every lane
// from the scalar.
inline __host__ __device__ uint3 operator-(uint3 a, uint3 b)
{
    a.x -= b.x;
    a.y -= b.y;
    a.z -= b.z;
    return a;
}
inline __host__ __device__ void operator-=(uint3 &a, uint3 b)
{
    a = make_uint3(a.x - b.x, a.y - b.y, a.z - b.z);
}
inline __host__ __device__ uint3 operator-(uint3 a, uint b)
{
    a.x -= b;
    a.y -= b;
    a.z -= b;
    return a;
}
inline __host__ __device__ uint3 operator-(uint b, uint3 a)
{
    a.x = b - a.x;
    a.y = b - a.y;
    a.z = b - a.z;
    return a;
}
inline __host__ __device__ void operator-=(uint3 &a, uint b)
{
    a = make_uint3(a.x - b, a.y - b, a.z - b);
}

// float4 subtraction: component-wise; `v - s` subtracts the scalar from every
// lane. (This group has no scalar-on-the-left overload.)
inline __host__ __device__ float4 operator-(float4 a, float4 b)
{
    a.x -= b.x;
    a.y -= b.y;
    a.z -= b.z;
    a.w -= b.w;
    return a;
}
inline __host__ __device__ void operator-=(float4 &a, float4 b)
{
    a = make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
}
inline __host__ __device__ float4 operator-(float4 a, float b)
{
    a.x -= b;
    a.y -= b;
    a.z -= b;
    a.w -= b;
    return a;
}
inline __host__ __device__ void operator-=(float4 &a, float b)
{
    a = make_float4(a.x - b, a.y - b, a.z - b, a.w - b);
}

// int4 subtraction: component-wise. `v - s` subtracts the scalar from every
// lane; `s - v` subtracts every lane from the scalar.
inline __host__ __device__ int4 operator-(int4 a, int4 b)
{
    a.x -= b.x;
    a.y -= b.y;
    a.z -= b.z;
    a.w -= b.w;
    return a;
}
inline __host__ __device__ void operator-=(int4 &a, int4 b)
{
    a = make_int4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
}
inline __host__ __device__ int4 operator-(int4 a, int b)
{
    a.x -= b;
    a.y -= b;
    a.z -= b;
    a.w -= b;
    return a;
}
inline __host__ __device__ int4 operator-(int b, int4 a)
{
    a.x = b - a.x;
    a.y = b - a.y;
    a.z = b - a.z;
    a.w = b - a.w;
    return a;
}
inline __host__ __device__ void operator-=(int4 &a, int b)
{
    a = make_int4(a.x - b, a.y - b, a.z - b, a.w - b);
}

// uint4 subtraction: component-wise, with the usual unsigned wrap-around.
// `v - s` subtracts the scalar from every lane; `s - v` subtracts every lane
// from the scalar.
inline __host__ __device__ uint4 operator-(uint4 a, uint4 b)
{
    a.x -= b.x;
    a.y -= b.y;
    a.z -= b.z;
    a.w -= b.w;
    return a;
}
inline __host__ __device__ void operator-=(uint4 &a, uint4 b)
{
    a = make_uint4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
}
inline __host__ __device__ uint4 operator-(uint4 a, uint b)
{
    a.x -= b;
    a.y -= b;
    a.z -= b;
    a.w -= b;
    return a;
}
inline __host__ __device__ uint4 operator-(uint b, uint4 a)
{
    a.x = b - a.x;
    a.y = b - a.y;
    a.z = b - a.z;
    a.w = b - a.w;
    return a;
}
inline __host__ __device__ void operator-=(uint4 &a, uint b)
{
    a = make_uint4(a.x - b, a.y - b, a.z - b, a.w - b);
}

////////////////////////////////////////////////////////////////////////////////
// multiply
////////////////////////////////////////////////////////////////////////////////

// float2 multiplication: component-wise (not a dot product); a scalar operand
// scales every lane.
inline __host__ __device__ float2 operator*(float2 a, float2 b)
{
    a.x *= b.x;
    a.y *= b.y;
    return a;
}
inline __host__ __device__ void operator*=(float2 &a, float2 b)
{
    a = make_float2(a.x * b.x, a.y * b.y);
}
inline __host__ __device__ float2 operator*(float2 a, float b)
{
    a.x *= b;
    a.y *= b;
    return a;
}
inline __host__ __device__ float2 operator*(float b, float2 a)
{
    a.x = b * a.x;
    a.y = b * a.y;
    return a;
}
inline __host__ __device__ void operator*=(float2 &a, float b)
{
    a = make_float2(a.x * b, a.y * b);
}

// int2 multiplication: component-wise; a scalar operand scales every lane.
inline __host__ __device__ int2 operator*(int2 a, int2 b)
{
    a.x *= b.x;
    a.y *= b.y;
    return a;
}
inline __host__ __device__ void operator*=(int2 &a, int2 b)
{
    a = make_int2(a.x * b.x, a.y * b.y);
}
inline __host__ __device__ int2 operator*(int2 a, int b)
{
    a.x *= b;
    a.y *= b;
    return a;
}
inline __host__ __device__ int2 operator*(int b, int2 a)
{
    a.x = b * a.x;
    a.y = b * a.y;
    return a;
}
inline __host__ __device__ void operator*=(int2 &a, int b)
{
    a = make_int2(a.x * b, a.y * b);
}

// uint2 multiplication: component-wise; a scalar operand scales every lane.
inline __host__ __device__ uint2 operator*(uint2 a, uint2 b)
{
    a.x *= b.x;
    a.y *= b.y;
    return a;
}
inline __host__ __device__ void operator*=(uint2 &a, uint2 b)
{
    a = make_uint2(a.x * b.x, a.y * b.y);
}
inline __host__ __device__ uint2 operator*(uint2 a, uint b)
{
    a.x *= b;
    a.y *= b;
    return a;
}
inline __host__ __device__ uint2 operator*(uint b, uint2 a)
{
    a.x = b * a.x;
    a.y = b * a.y;
    return a;
}
inline __host__ __device__ void operator*=(uint2 &a, uint b)
{
    a = make_uint2(a.x * b, a.y * b);
}

// float3 multiplication: component-wise (not a dot or cross product); a
// scalar operand scales every lane.
inline __host__ __device__ float3 operator*(float3 a, float3 b)
{
    a.x *= b.x;
    a.y *= b.y;
    a.z *= b.z;
    return a;
}
inline __host__ __device__ void operator*=(float3 &a, float3 b)
{
    a = make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
}
inline __host__ __device__ float3 operator*(float3 a, float b)
{
    a.x *= b;
    a.y *= b;
    a.z *= b;
    return a;
}
inline __host__ __device__ float3 operator*(float b, float3 a)
{
    a.x = b * a.x;
    a.y = b * a.y;
    a.z = b * a.z;
    return a;
}
inline __host__ __device__ void operator*=(float3 &a, float b)
{
    a = make_float3(a.x * b, a.y * b, a.z * b);
}

// int3 multiplication: component-wise; a scalar operand scales every lane.
inline __host__ __device__ int3 operator*(int3 a, int3 b)
{
    a.x *= b.x;
    a.y *= b.y;
    a.z *= b.z;
    return a;
}
inline __host__ __device__ void operator*=(int3 &a, int3 b)
{
    a = make_int3(a.x * b.x, a.y * b.y, a.z * b.z);
}
inline __host__ __device__ int3 operator*(int3 a, int b)
{
    a.x *= b;
    a.y *= b;
    a.z *= b;
    return a;
}
inline __host__ __device__ int3 operator*(int b, int3 a)
{
    a.x = b * a.x;
    a.y = b * a.y;
    a.z = b * a.z;
    return a;
}
inline __host__ __device__ void operator*=(int3 &a, int b)
{
    a = make_int3(a.x * b, a.y * b, a.z * b);
}

// uint3 multiplication: component-wise; a scalar operand scales every lane.
inline __host__ __device__ uint3 operator*(uint3 a, uint3 b)
{
    a.x *= b.x;
    a.y *= b.y;
    a.z *= b.z;
    return a;
}
inline __host__ __device__ void operator*=(uint3 &a, uint3 b)
{
    a = make_uint3(a.x * b.x, a.y * b.y, a.z * b.z);
}
inline __host__ __device__ uint3 operator*(uint3 a, uint b)
{
    a.x *= b;
    a.y *= b;
    a.z *= b;
    return a;
}
inline __host__ __device__ uint3 operator*(uint b, uint3 a)
{
    a.x = b * a.x;
    a.y = b * a.y;
    a.z = b * a.z;
    return a;
}
inline __host__ __device__ void operator*=(uint3 &a, uint b)
{
    a = make_uint3(a.x * b, a.y * b, a.z * b);
}

// float4 multiplication: component-wise; a scalar operand scales every lane.
inline __host__ __device__ float4 operator*(float4 a, float4 b)
{
    a.x *= b.x;
    a.y *= b.y;
    a.z *= b.z;
    a.w *= b.w;
    return a;
}
inline __host__ __device__ void operator*=(float4 &a, float4 b)
{
    a = make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
}
inline __host__ __device__ float4 operator*(float4 a, float b)
{
    a.x *= b;
    a.y *= b;
    a.z *= b;
    a.w *= b;
    return a;
}
inline __host__ __device__ float4 operator*(float b, float4 a)
{
    a.x = b * a.x;
    a.y = b * a.y;
    a.z = b * a.z;
    a.w = b * a.w;
    return a;
}
inline __host__ __device__ void operator*=(float4 &a, float b)
{
    a = make_float4(a.x * b, a.y * b, a.z * b, a.w * b);
}

// int4 multiplication: component-wise; a scalar operand scales every lane.
inline __host__ __device__ int4 operator*(int4 a, int4 b)
{
    a.x *= b.x;
    a.y *= b.y;
    a.z *= b.z;
    a.w *= b.w;
    return a;
}
inline __host__ __device__ void operator*=(int4 &a, int4 b)
{
    a = make_int4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
}
inline __host__ __device__ int4 operator*(int4 a, int b)
{
    a.x *= b;
    a.y *= b;
    a.z *= b;
    a.w *= b;
    return a;
}
inline __host__ __device__ int4 operator*(int b, int4 a)
{
    a.x = b * a.x;
    a.y = b * a.y;
    a.z = b * a.z;
    a.w = b * a.w;
    return a;
}
inline __host__ __device__ void operator*=(int4 &a, int b)
{
    a = make_int4(a.x * b, a.y * b, a.z * b, a.w * b);
}

// uint4 multiplication: component-wise; a scalar operand scales every lane.
inline __host__ __device__ uint4 operator*(uint4 a, uint4 b)
{
    a.x *= b.x;
    a.y *= b.y;
    a.z *= b.z;
    a.w *= b.w;
    return a;
}
inline __host__ __device__ void operator*=(uint4 &a, uint4 b)
{
    a = make_uint4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
}
inline __host__ __device__ uint4 operator*(uint4 a, uint b)
{
    a.x *= b;
    a.y *= b;
    a.z *= b;
    a.w *= b;
    return a;
}
inline __host__ __device__ uint4 operator*(uint b, uint4 a)
{
    a.x = b * a.x;
    a.y = b * a.y;
    a.z = b * a.z;
    a.w = b * a.w;
    return a;
}
inline __host__ __device__ void operator*=(uint4 &a, uint b)
{
    a = make_uint4(a.x * b, a.y * b, a.z * b, a.w * b);
}

////////////////////////////////////////////////////////////////////////////////
// divide
////////////////////////////////////////////////////////////////////////////////

// float2 division: component-wise. `v / s` divides every lane by the scalar;
// `s / v` divides the scalar by every lane.
inline __host__ __device__ float2 operator/(float2 a, float2 b)
{
    a.x /= b.x;
    a.y /= b.y;
    return a;
}
inline __host__ __device__ void operator/=(float2 &a, float2 b)
{
    a = make_float2(a.x / b.x, a.y / b.y);
}
inline __host__ __device__ float2 operator/(float2 a, float b)
{
    a.x /= b;
    a.y /= b;
    return a;
}
inline __host__ __device__ void operator/=(float2 &a, float b)
{
    a = make_float2(a.x / b, a.y / b);
}
inline __host__ __device__ float2 operator/(float b, float2 a)
{
    a.x = b / a.x;
    a.y = b / a.y;
    return a;
}

// float3 division: component-wise. `v / s` divides every lane by the scalar;
// `s / v` divides the scalar by every lane.
inline __host__ __device__ float3 operator/(float3 a, float3 b)
{
    a.x /= b.x;
    a.y /= b.y;
    a.z /= b.z;
    return a;
}
inline __host__ __device__ void operator/=(float3 &a, float3 b)
{
    a = make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
}
inline __host__ __device__ float3 operator/(float3 a, float b)
{
    a.x /= b;
    a.y /= b;
    a.z /= b;
    return a;
}
inline __host__ __device__ void operator/=(float3 &a, float b)
{
    a = make_float3(a.x / b, a.y / b, a.z / b);
}
inline __host__ __device__ float3 operator/(float b, float3 a)
{
    a.x = b / a.x;
    a.y = b / a.y;
    a.z = b / a.z;
    return a;
}

// float4 division: component-wise. `v / s` divides every lane by the scalar;
// `s / v` divides the scalar by every lane.
inline __host__ __device__ float4 operator/(float4 a, float4 b)
{
    a.x /= b.x;
    a.y /= b.y;
    a.z /= b.z;
    a.w /= b.w;
    return a;
}
inline __host__ __device__ void operator/=(float4 &a, float4 b)
{
    a = make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
}
inline __host__ __device__ float4 operator/(float4 a, float b)
{
    a.x /= b;
    a.y /= b;
    a.z /= b;
    a.w /= b;
    return a;
}
inline __host__ __device__ void operator/=(float4 &a, float b)
{
    a = make_float4(a.x / b, a.y / b, a.z / b, a.w / b);
}
inline __host__ __device__ float4 operator/(float b, float4 a)
{
    a.x = b / a.x;
    a.y = b / a.y;
    a.z = b / a.z;
    a.w = b / a.w;
    return a;
}

////////////////////////////////////////////////////////////////////////////////
// min
////////////////////////////////////////////////////////////////////////////////

// Component-wise minimum of two float2 vectors (scalar fminf per lane).
inline __host__ __device__ float2 fminf(float2 a, float2 b)
{
    float2 lo;
    lo.x = fminf(a.x, b.x);
    lo.y = fminf(a.y, b.y);
    return lo;
}
// Component-wise minimum of two float3 vectors (scalar fminf per lane).
inline __host__ __device__ float3 fminf(float3 a, float3 b)
{
    float3 lo;
    lo.x = fminf(a.x, b.x);
    lo.y = fminf(a.y, b.y);
    lo.z = fminf(a.z, b.z);
    return lo;
}
// Component-wise minimum of two float4 vectors (scalar fminf per lane).
inline __host__ __device__ float4 fminf(float4 a, float4 b)
{
    float4 lo;
    lo.x = fminf(a.x, b.x);
    lo.y = fminf(a.y, b.y);
    lo.z = fminf(a.z, b.z);
    lo.w = fminf(a.w, b.w);
    return lo;
}

// Component-wise minimum of two int2 vectors.
inline __host__ __device__ int2 min(int2 a, int2 b)
{
    int2 lo;
    lo.x = min(a.x, b.x);
    lo.y = min(a.y, b.y);
    return lo;
}
// Component-wise minimum of two int3 vectors.
inline __host__ __device__ int3 min(int3 a, int3 b)
{
    int3 lo;
    lo.x = min(a.x, b.x);
    lo.y = min(a.y, b.y);
    lo.z = min(a.z, b.z);
    return lo;
}
// Component-wise minimum of two int4 vectors.
inline __host__ __device__ int4 min(int4 a, int4 b)
{
    int4 lo;
    lo.x = min(a.x, b.x);
    lo.y = min(a.y, b.y);
    lo.z = min(a.z, b.z);
    lo.w = min(a.w, b.w);
    return lo;
}

// Component-wise minimum of two uint2 vectors.
inline __host__ __device__ uint2 min(uint2 a, uint2 b)
{
    uint2 lo;
    lo.x = min(a.x, b.x);
    lo.y = min(a.y, b.y);
    return lo;
}
// Component-wise minimum of two uint3 vectors.
inline __host__ __device__ uint3 min(uint3 a, uint3 b)
{
    uint3 lo;
    lo.x = min(a.x, b.x);
    lo.y = min(a.y, b.y);
    lo.z = min(a.z, b.z);
    return lo;
}
// Component-wise minimum of two uint4 vectors.
inline __host__ __device__ uint4 min(uint4 a, uint4 b)
{
    uint4 lo;
    lo.x = min(a.x, b.x);
    lo.y = min(a.y, b.y);
    lo.z = min(a.z, b.z);
    lo.w = min(a.w, b.w);
    return lo;
}

////////////////////////////////////////////////////////////////////////////////
// max
////////////////////////////////////////////////////////////////////////////////

// Component-wise maximum of two float2 vectors (scalar fmaxf per lane).
inline __host__ __device__ float2 fmaxf(float2 a, float2 b)
{
    float2 hi;
    hi.x = fmaxf(a.x, b.x);
    hi.y = fmaxf(a.y, b.y);
    return hi;
}
// Component-wise maximum of two float3 vectors (scalar fmaxf per lane).
inline __host__ __device__ float3 fmaxf(float3 a, float3 b)
{
    float3 hi;
    hi.x = fmaxf(a.x, b.x);
    hi.y = fmaxf(a.y, b.y);
    hi.z = fmaxf(a.z, b.z);
    return hi;
}
// Component-wise maximum of two float4 vectors (scalar fmaxf per lane).
inline __host__ __device__ float4 fmaxf(float4 a, float4 b)
{
    float4 hi;
    hi.x = fmaxf(a.x, b.x);
    hi.y = fmaxf(a.y, b.y);
    hi.z = fmaxf(a.z, b.z);
    hi.w = fmaxf(a.w, b.w);
    return hi;
}

// Component-wise maximum of two int2 vectors.
inline __host__ __device__ int2 max(int2 a, int2 b)
{
    int2 hi;
    hi.x = max(a.x, b.x);
    hi.y = max(a.y, b.y);
    return hi;
}
// Component-wise maximum of two int3 vectors.
inline __host__ __device__ int3 max(int3 a, int3 b)
{
    int3 hi;
    hi.x = max(a.x, b.x);
    hi.y = max(a.y, b.y);
    hi.z = max(a.z, b.z);
    return hi;
}
// Component-wise maximum of two int4 vectors.
inline __host__ __device__ int4 max(int4 a, int4 b)
{
    int4 hi;
    hi.x = max(a.x, b.x);
    hi.y = max(a.y, b.y);
    hi.z = max(a.z, b.z);
    hi.w = max(a.w, b.w);
    return hi;
}

// Component-wise maximum of two uint2 vectors.
inline __host__ __device__ uint2 max(uint2 a, uint2 b)
{
    uint2 hi;
    hi.x = max(a.x, b.x);
    hi.y = max(a.y, b.y);
    return hi;
}
// Component-wise maximum of two uint3 vectors.
inline __host__ __device__ uint3 max(uint3 a, uint3 b)
{
    uint3 hi;
    hi.x = max(a.x, b.x);
    hi.y = max(a.y, b.y);
    hi.z = max(a.z, b.z);
    return hi;
}
// Component-wise maximum of two uint4 vectors.
inline __host__ __device__ uint4 max(uint4 a, uint4 b)
{
    uint4 hi;
    hi.x = max(a.x, b.x);
    hi.y = max(a.y, b.y);
    hi.z = max(a.z, b.z);
    hi.w = max(a.w, b.w);
    return hi;
}

////////////////////////////////////////////////////////////////////////////////
// lerp
// - linear interpolation between a and b, based on value t in [0, 1] range
////////////////////////////////////////////////////////////////////////////////

// Scalar interpolation: returns a for t == 0 and moves towards b as t grows.
inline __device__ __host__ float lerp(float a, float b, float t)
{
    const float span = b - a;
    return a + t * span;
}
// float2 interpolation with a single scalar weight t shared by all lanes.
inline __device__ __host__ float2 lerp(float2 a, float2 b, float t)
{
    const float2 span = b - a;
    return a + t * span;
}
// float3 interpolation with a single scalar weight t shared by all lanes.
inline __device__ __host__ float3 lerp(float3 a, float3 b, float t)
{
    const float3 span = b - a;
    return a + t * span;
}
// float4 interpolation with a single scalar weight t shared by all lanes.
inline __device__ __host__ float4 lerp(float4 a, float4 b, float t)
{
    const float4 span = b - a;
    return a + t * span;
}

////////////////////////////////////////////////////////////////////////////////
// clamp
// - clamp the value v to be in the range [a, b]
////////////////////////////////////////////////////////////////////////////////

// Clamps f to [a, b]: the upper bound is applied first, then the lower bound.
inline __device__ __host__ float clamp(float f, float a, float b)
{
    const float capped = fminf(f, b);
    return fmaxf(a, capped);
}
// Integer clamp of f to [a, b]; upper bound first, then lower bound.
inline __device__ __host__ int clamp(int f, int a, int b)
{
    const int capped = min(f, b);
    return max(a, capped);
}
// Unsigned clamp of f to [a, b]; upper bound first, then lower bound.
inline __device__ __host__ uint clamp(uint f, uint a, uint b)
{
    const uint capped = min(f, b);
    return max(a, capped);
}

// Clamps every component of v to the shared scalar range [a, b].
inline __device__ __host__ float2 clamp(float2 v, float a, float b)
{
    const float cx = clamp(v.x, a, b);
    const float cy = clamp(v.y, a, b);
    return make_float2(cx, cy);
}
// Clamps every component of v to the matching per-component range [a, b].
inline __device__ __host__ float2 clamp(float2 v, float2 a, float2 b)
{
    const float cx = clamp(v.x, a.x, b.x);
    const float cy = clamp(v.y, a.y, b.y);
    return make_float2(cx, cy);
}
// Clamps every component of v to the shared scalar range [a, b].
inline __device__ __host__ float3 clamp(float3 v, float a, float b)
{
    const float cx = clamp(v.x, a, b);
    const float cy = clamp(v.y, a, b);
    const float cz = clamp(v.z, a, b);
    return make_float3(cx, cy, cz);
}
// Clamps every component of v to the matching per-component range [a, b].
inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b)
{
    const float cx = clamp(v.x, a.x, b.x);
    const float cy = clamp(v.y, a.y, b.y);
    const float cz = clamp(v.z, a.z, b.z);
    return make_float3(cx, cy, cz);
}
// Clamps every component of v to the shared scalar range [a, b].
inline __device__ __host__ float4 clamp(float4 v, float a, float b)
{
    const float cx = clamp(v.x, a, b);
    const float cy = clamp(v.y, a, b);
    const float cz = clamp(v.z, a, b);
    const float cw = clamp(v.w, a, b);
    return make_float4(cx, cy, cz, cw);
}
// Clamps every component of v to the matching per-component range [a, b].
inline __device__ __host__ float4 clamp(float4 v, float4 a, float4 b)
{
    const float cx = clamp(v.x, a.x, b.x);
    const float cy = clamp(v.y, a.y, b.y);
    const float cz = clamp(v.z, a.z, b.z);
    const float cw = clamp(v.w, a.w, b.w);
    return make_float4(cx, cy, cz, cw);
}

// Clamps every component of v to the shared scalar range [a, b].
inline __device__ __host__ int2 clamp(int2 v, int a, int b)
{
    const int cx = clamp(v.x, a, b);
    const int cy = clamp(v.y, a, b);
    return make_int2(cx, cy);
}
// Clamps every component of v to the matching per-component range [a, b].
inline __device__ __host__ int2 clamp(int2 v, int2 a, int2 b)
{
    const int cx = clamp(v.x, a.x, b.x);
    const int cy = clamp(v.y, a.y, b.y);
    return make_int2(cx, cy);
}
// Clamps every component of v to the shared scalar range [a, b].
inline __device__ __host__ int3 clamp(int3 v, int a, int b)
{
    const int cx = clamp(v.x, a, b);
    const int cy = clamp(v.y, a, b);
    const int cz = clamp(v.z, a, b);
    return make_int3(cx, cy, cz);
}
// Clamps every component of v to the matching per-component range [a, b].
inline __device__ __host__ int3 clamp(int3 v, int3 a, int3 b)
{
    const int cx = clamp(v.x, a.x, b.x);
    const int cy = clamp(v.y, a.y, b.y);
    const int cz = clamp(v.z, a.z, b.z);
    return make_int3(cx, cy, cz);
}
// Clamps every component of v to the shared scalar range [a, b].
inline __device__ __host__ int4 clamp(int4 v, int a, int b)
{
    const int cx = clamp(v.x, a, b);
    const int cy = clamp(v.y, a, b);
    const int cz = clamp(v.z, a, b);
    const int cw = clamp(v.w, a, b);
    return make_int4(cx, cy, cz, cw);
}
// Clamps every component of v to the matching per-component range [a, b].
inline __device__ __host__ int4 clamp(int4 v, int4 a, int4 b)
{
    const int cx = clamp(v.x, a.x, b.x);
    const int cy = clamp(v.y, a.y, b.y);
    const int cz = clamp(v.z, a.z, b.z);
    const int cw = clamp(v.w, a.w, b.w);
    return make_int4(cx, cy, cz, cw);
}

// Clamps every component of v to the shared scalar range [a, b].
inline __device__ __host__ uint2 clamp(uint2 v, uint a, uint b)
{
    const uint cx = clamp(v.x, a, b);
    const uint cy = clamp(v.y, a, b);
    return make_uint2(cx, cy);
}
// Clamps every component of v to the matching per-component range [a, b].
inline __device__ __host__ uint2 clamp(uint2 v, uint2 a, uint2 b)
{
    const uint cx = clamp(v.x, a.x, b.x);
    const uint cy = clamp(v.y, a.y, b.y);
    return make_uint2(cx, cy);
}
// Clamps every component of v to the shared scalar range [a, b].
inline __device__ __host__ uint3 clamp(uint3 v, uint a, uint b)
{
    const uint cx = clamp(v.x, a, b);
    const uint cy = clamp(v.y, a, b);
    const uint cz = clamp(v.z, a, b);
    return make_uint3(cx, cy, cz);
}
// Clamps every component of v to the matching per-component range [a, b].
inline __device__ __host__ uint3 clamp(uint3 v, uint3 a, uint3 b)
{
    const uint cx = clamp(v.x, a.x, b.x);
    const uint cy = clamp(v.y, a.y, b.y);
    const uint cz = clamp(v.z, a.z, b.z);
    return make_uint3(cx, cy, cz);
}
// Clamps every component of v to the shared scalar range [a, b].
inline __device__ __host__ uint4 clamp(uint4 v, uint a, uint b)
{
    const uint cx = clamp(v.x, a, b);
    const uint cy = clamp(v.y, a, b);
    const uint cz = clamp(v.z, a, b);
    const uint cw = clamp(v.w, a, b);
    return make_uint4(cx, cy, cz, cw);
}
// Clamps every component of v to the matching per-component range [a, b].
inline __device__ __host__ uint4 clamp(uint4 v, uint4 a, uint4 b)
{
    const uint cx = clamp(v.x, a.x, b.x);
    const uint cy = clamp(v.y, a.y, b.y);
    const uint cz = clamp(v.z, a.z, b.z);
    const uint cw = clamp(v.w, a.w, b.w);
    return make_uint4(cx, cy, cz, cw);
}

////////////////////////////////////////////////////////////////////////////////
// dot product
////////////////////////////////////////////////////////////////////////////////

// Inner product of two float2 vectors, accumulated in x, y order.
inline __host__ __device__ float dot(float2 a, float2 b)
{
    float acc = a.x * b.x;
    acc = acc + a.y * b.y;
    return acc;
}
// Inner product of two float3 vectors, accumulated in x, y, z order.
inline __host__ __device__ float dot(float3 a, float3 b)
{
    float acc = a.x * b.x;
    acc = acc + a.y * b.y;
    acc = acc + a.z * b.z;
    return acc;
}
// Inner product of two float4 vectors, accumulated in x, y, z, w order.
inline __host__ __device__ float dot(float4 a, float4 b)
{
    float acc = a.x * b.x;
    acc = acc + a.y * b.y;
    acc = acc + a.z * b.z;
    acc = acc + a.w * b.w;
    return acc;
}

// Inner product of two int2 vectors.
inline __host__ __device__ int dot(int2 a, int2 b)
{
    int acc = a.x * b.x;
    acc = acc + a.y * b.y;
    return acc;
}
// Inner product of two int3 vectors.
inline __host__ __device__ int dot(int3 a, int3 b)
{
    int acc = a.x * b.x;
    acc = acc + a.y * b.y;
    acc = acc + a.z * b.z;
    return acc;
}
// Inner product of two int4 vectors.
inline __host__ __device__ int dot(int4 a, int4 b)
{
    int acc = a.x * b.x;
    acc = acc + a.y * b.y;
    acc = acc + a.z * b.z;
    acc = acc + a.w * b.w;
    return acc;
}

// Inner product of two uint2 vectors.
inline __host__ __device__ uint dot(uint2 a, uint2 b)
{
    uint acc = a.x * b.x;
    acc = acc + a.y * b.y;
    return acc;
}
// Inner product of two uint3 vectors.
inline __host__ __device__ uint dot(uint3 a, uint3 b)
{
    uint acc = a.x * b.x;
    acc = acc + a.y * b.y;
    acc = acc + a.z * b.z;
    return acc;
}
// Inner product of two uint4 vectors.
inline __host__ __device__ uint dot(uint4 a, uint4 b)
{
    uint acc = a.x * b.x;
    acc = acc + a.y * b.y;
    acc = acc + a.z * b.z;
    acc = acc + a.w * b.w;
    return acc;
}

////////////////////////////////////////////////////////////////////////////////
// length
////////////////////////////////////////////////////////////////////////////////

// Euclidean length of a float2: square root of dot(v, v).
inline __host__ __device__ float length(float2 v)
{
    const float squared = dot(v, v);
    return sqrtf(squared);
}
// Euclidean length of a float3: square root of dot(v, v).
inline __host__ __device__ float length(float3 v)
{
    const float squared = dot(v, v);
    return sqrtf(squared);
}
// Euclidean length of a float4: square root of dot(v, v).
inline __host__ __device__ float length(float4 v)
{
    const float squared = dot(v, v);
    return sqrtf(squared);
}

////////////////////////////////////////////////////////////////////////////////
// normalize
////////////////////////////////////////////////////////////////////////////////

// Scales v by rsqrtf(dot(v, v)). No guard is applied for a zero-length input.
inline __host__ __device__ float2 normalize(float2 v)
{
    const float squared = dot(v, v);
    const float scale = rsqrtf(squared);
    return v * scale;
}
// Scales v by rsqrtf(dot(v, v)). No guard is applied for a zero-length input.
inline __host__ __device__ float3 normalize(float3 v)
{
    const float squared = dot(v, v);
    const float scale = rsqrtf(squared);
    return v * scale;
}
// Scales v by rsqrtf(dot(v, v)). No guard is applied for a zero-length input.
inline __host__ __device__ float4 normalize(float4 v)
{
    const float squared = dot(v, v);
    const float scale = rsqrtf(squared);
    return v * scale;
}

////////////////////////////////////////////////////////////////////////////////
// floor
////////////////////////////////////////////////////////////////////////////////

// Applies scalar floorf to each component of a float2.
inline __host__ __device__ float2 floorf(float2 v)
{
    const float fx = floorf(v.x);
    const float fy = floorf(v.y);
    return make_float2(fx, fy);
}
// Applies scalar floorf to each component of a float3.
inline __host__ __device__ float3 floorf(float3 v)
{
    const float fx = floorf(v.x);
    const float fy = floorf(v.y);
    const float fz = floorf(v.z);
    return make_float3(fx, fy, fz);
}
// Applies scalar floorf to each component of a float4.
inline __host__ __device__ float4 floorf(float4 v)
{
    const float fx = floorf(v.x);
    const float fy = floorf(v.y);
    const float fz = floorf(v.z);
    const float fw = floorf(v.w);
    return make_float4(fx, fy, fz, fw);
}

////////////////////////////////////////////////////////////////////////////////
// frac - returns the fractional portion of a scalar or each vector component
////////////////////////////////////////////////////////////////////////////////

// Fractional part of v, computed as v - floorf(v).
inline __host__ __device__ float fracf(float v)
{
    const float whole = floorf(v);
    return v - whole;
}
// Fractional part of each component of a float2.
inline __host__ __device__ float2 fracf(float2 v)
{
    const float rx = fracf(v.x);
    const float ry = fracf(v.y);
    return make_float2(rx, ry);
}
// Fractional part of each component of a float3.
inline __host__ __device__ float3 fracf(float3 v)
{
    const float rx = fracf(v.x);
    const float ry = fracf(v.y);
    const float rz = fracf(v.z);
    return make_float3(rx, ry, rz);
}
// Fractional part of each component of a float4.
inline __host__ __device__ float4 fracf(float4 v)
{
    const float rx = fracf(v.x);
    const float ry = fracf(v.y);
    const float rz = fracf(v.z);
    const float rw = fracf(v.w);
    return make_float4(rx, ry, rz, rw);
}

////////////////////////////////////////////////////////////////////////////////
// fmod
////////////////////////////////////////////////////////////////////////////////

// Applies scalar fmodf(a, b) to each pair of float2 components.
inline __host__ __device__ float2 fmodf(float2 a, float2 b)
{
    const float rx = fmodf(a.x, b.x);
    const float ry = fmodf(a.y, b.y);
    return make_float2(rx, ry);
}
// Applies scalar fmodf(a, b) to each pair of float3 components.
inline __host__ __device__ float3 fmodf(float3 a, float3 b)
{
    const float rx = fmodf(a.x, b.x);
    const float ry = fmodf(a.y, b.y);
    const float rz = fmodf(a.z, b.z);
    return make_float3(rx, ry, rz);
}
// Applies scalar fmodf(a, b) to each pair of float4 components.
inline __host__ __device__ float4 fmodf(float4 a, float4 b)
{
    const float rx = fmodf(a.x, b.x);
    const float ry = fmodf(a.y, b.y);
    const float rz = fmodf(a.z, b.z);
    const float rw = fmodf(a.w, b.w);
    return make_float4(rx, ry, rz, rw);
}

////////////////////////////////////////////////////////////////////////////////
// absolute value
////////////////////////////////////////////////////////////////////////////////

// Absolute value of each component of a float2.
inline __host__ __device__ float2 fabs(float2 v)
{
    const float ax = fabs(v.x);
    const float ay = fabs(v.y);
    return make_float2(ax, ay);
}
// Absolute value of each component of a float3.
inline __host__ __device__ float3 fabs(float3 v)
{
    const float ax = fabs(v.x);
    const float ay = fabs(v.y);
    const float az = fabs(v.z);
    return make_float3(ax, ay, az);
}
// Absolute value of each component of a float4.
inline __host__ __device__ float4 fabs(float4 v)
{
    const float ax = fabs(v.x);
    const float ay = fabs(v.y);
    const float az = fabs(v.z);
    const float aw = fabs(v.w);
    return make_float4(ax, ay, az, aw);
}

// Absolute value of each component of an int2.
inline __host__ __device__ int2 abs(int2 v)
{
    const int ax = abs(v.x);
    const int ay = abs(v.y);
    return make_int2(ax, ay);
}
// Absolute value of each component of an int3.
inline __host__ __device__ int3 abs(int3 v)
{
    const int ax = abs(v.x);
    const int ay = abs(v.y);
    const int az = abs(v.z);
    return make_int3(ax, ay, az);
}
// Absolute value of each component of an int4.
inline __host__ __device__ int4 abs(int4 v)
{
    const int ax = abs(v.x);
    const int ay = abs(v.y);
    const int az = abs(v.z);
    const int aw = abs(v.w);
    return make_int4(ax, ay, az, aw);
}

////////////////////////////////////////////////////////////////////////////////
// reflect
// - returns reflection of incident ray I around surface normal N
// - N should be normalized, reflected vector's length is equal to length of I
////////////////////////////////////////////////////////////////////////////////

// Reflects i about n using i - 2 * n * dot(n, i); n is not normalized here.
inline __host__ __device__ float3 reflect(float3 i, float3 n)
{
    const float3 doubled_normal = 2.0f * n;
    return i - doubled_normal * dot(n, i);
}

////////////////////////////////////////////////////////////////////////////////
// cross product
////////////////////////////////////////////////////////////////////////////////

// Cross product a x b of two float3 vectors.
inline __host__ __device__ float3 cross(float3 a, float3 b)
{
    const float cx = a.y * b.z - a.z * b.y;
    const float cy = a.z * b.x - a.x * b.z;
    const float cz = a.x * b.y - a.y * b.x;
    return make_float3(cx, cy, cz);
}

////////////////////////////////////////////////////////////////////////////////
// smoothstep
// - returns 0 if x < a
// - returns 1 if x > b
// - otherwise returns smooth interpolation between 0 and 1 based on x
////////////////////////////////////////////////////////////////////////////////

// Scalar smoothstep: normalizes x to [0, 1] over [a, b], then evaluates
// y * y * (3 - 2 * y). The case a == b is not guarded.
inline __device__ __host__ float smoothstep(float a, float b, float x)
{
    const float y = clamp((x - a) / (b - a), 0.0f, 1.0f);
    const float blend = 3.0f - (2.0f * y);
    return y * y * blend;
}
// Component-wise smoothstep for float2 edges and input.
inline __device__ __host__ float2 smoothstep(float2 a, float2 b, float2 x)
{
    const float2 y = clamp((x - a) / (b - a), 0.0f, 1.0f);
    const float2 blend = make_float2(3.0f) - (make_float2(2.0f) * y);
    return y * y * blend;
}
// Component-wise smoothstep for float3 edges and input.
inline __device__ __host__ float3 smoothstep(float3 a, float3 b, float3 x)
{
    const float3 y = clamp((x - a) / (b - a), 0.0f, 1.0f);
    const float3 blend = make_float3(3.0f) - (make_float3(2.0f) * y);
    return y * y * blend;
}
// Component-wise smoothstep for float4 edges and input.
inline __device__ __host__ float4 smoothstep(float4 a, float4 b, float4 x)
{
    const float4 y = clamp((x - a) / (b - a), 0.0f, 1.0f);
    const float4 blend = make_float4(3.0f) - (make_float4(2.0f) * y);
    return y * y * blend;
}

////////////////////////////////////////////////////////////////////////////////
// sign
////////////////////////////////////////////////////////////////////////////////
// Per-component sign via copysignf(1.0f, .): each lane is +1 or -1, taking
// the sign bit of the input (so zero inputs do not map to 0).
inline __device__ __host__ float3 sign(float3 a)
{
    const float sx = copysignf(1.0f, a.x);
    const float sy = copysignf(1.0f, a.y);
    const float sz = copysignf(1.0f, a.z);
    return make_float3(sx, sy, sz);
}

#endif

================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/nerfacc/cuda/csrc/intersection.cu
================================================
/*
 * Copyright (c) 2022 Ruilong Li, UC Berkeley.
 */

#include "include/helpers_cuda.h"

// Exchanges the values of a and b through a temporary copy.
template <typename scalar_t>
inline __host__ __device__ void _swap(scalar_t &a, scalar_t &b)
{
    const scalar_t previous_a = a;
    a = b;
    b = previous_a;
}

// Intersects a single ray with an axis-aligned box, one axis at a time.
//
// rays_o / rays_d point at the three origin / direction components of the ray,
// aabb at six values laid out as [xmin, ymin, zmin, xmax, ymax, zmax].
// On a hit, *near and *far receive the entry and exit ray parameters
// (*near may be negative; it is not clipped here). On a miss both are set to
// 1e10.
// NOTE(review): the divisions by rays_d[k] are not guarded, so a zero
// direction component relies on the floating-point behavior of division by
// zero — confirm this is acceptable for all dispatched scalar types.
template <typename scalar_t>
inline __host__ __device__ void _ray_aabb_intersect(
    const scalar_t *rays_o,
    const scalar_t *rays_d,
    const scalar_t *aabb,
    scalar_t *near,
    scalar_t *far)
{
    // aabb is [xmin, ymin, zmin, xmax, ymax, zmax]
    // x axis: ray parameters at the two x planes, ordered so tmin <= tmax.
    scalar_t tmin = (aabb[0] - rays_o[0]) / rays_d[0];
    scalar_t tmax = (aabb[3] - rays_o[0]) / rays_d[0];
    if (tmin > tmax)
        _swap(tmin, tmax);

    // y axis: same computation for the two y planes.
    scalar_t tymin = (aabb[1] - rays_o[1]) / rays_d[1];
    scalar_t tymax = (aabb[4] - rays_o[1]) / rays_d[1];
    if (tymin > tymax)
        _swap(tymin, tymax);

    // The x and y intervals do not overlap: report a miss.
    if (tmin > tymax || tymin > tmax)
    {
        *near = 1e10;
        *far = 1e10;
        return;
    }

    // Narrow [tmin, tmax] to the overlap of the x and y intervals.
    if (tymin > tmin)
        tmin = tymin;
    if (tymax < tmax)
        tmax = tymax;

    // z axis: same computation for the two z planes.
    scalar_t tzmin = (aabb[2] - rays_o[2]) / rays_d[2];
    scalar_t tzmax = (aabb[5] - rays_o[2]) / rays_d[2];
    if (tzmin > tzmax)
        _swap(tzmin, tzmax);

    // The running interval does not overlap the z interval: report a miss.
    if (tmin > tzmax || tzmin > tmax)
    {
        *near = 1e10;
        *far = 1e10;
        return;
    }

    // Narrow to the overlap of all three axis intervals.
    if (tzmin > tmin)
        tmin = tzmin;
    if (tzmax < tmax)
        tmax = tzmax;

    *near = tmin;
    *far = tmax;
    return;
}

// Kernel: each thread runs the ray/AABB test for one ray and writes its
// entry and exit distances; the entry distance is then clipped to be >= 0.
template <typename scalar_t>
__global__ void ray_aabb_intersect_kernel(
    const int N,
    const scalar_t *rays_o,
    const scalar_t *rays_d,
    const scalar_t *aabb,
    scalar_t *t_min,
    scalar_t *t_max)
{
    // aabb is [xmin, ymin, zmin, xmax, ymax, zmax]
    CUDA_GET_THREAD_ID(thread_id, N);

    // Output slots and input rows belonging to this thread's ray.
    scalar_t *near_out = t_min + thread_id;
    scalar_t *far_out = t_max + thread_id;
    const scalar_t *origin = rays_o + thread_id * 3;
    const scalar_t *direction = rays_d + thread_id * 3;

    _ray_aabb_intersect<scalar_t>(origin, direction, aabb, near_out, far_out);

    // Anything that is not strictly positive becomes zero.
    const scalar_t zero = static_cast<scalar_t>(0.f);
    if (!(*near_out > zero))
        *near_out = zero;
}

/**
 * @brief Ray AABB Test
 *
 * @param rays_o Ray origins. Tensor with shape [N, 3].
 * @param rays_d Normalized ray directions. Tensor with shape [N, 3]; must have
 *  the same number of rows as rays_o.
 * @param aabb Scene AABB [xmin, ymin, zmin, xmax, ymax, zmax]. Tensor with shape [6].
 * @return std::vector<torch::Tensor>
 *  Ray AABB intersection {t_min, t_max} with shape [N] respectively. Note the t_min is
 *  clipped to minimum zero. 1e10 is returned if no intersection.
 */
std::vector<torch::Tensor> ray_aabb_intersect(
    const torch::Tensor rays_o, const torch::Tensor rays_d, const torch::Tensor aabb)
{
    DEVICE_GUARD(rays_o);
    CHECK_INPUT(rays_o);
    CHECK_INPUT(rays_d);
    CHECK_INPUT(aabb);
    // Logical && short-circuits, so size(1) is only queried once the tensor is
    // known to be 2-D and a bad rank reports this message.
    TORCH_CHECK(
        rays_o.ndimension() == 2 && rays_o.size(1) == 3,
        "rays_o must have shape [N, 3]");
    TORCH_CHECK(
        rays_d.ndimension() == 2 && rays_d.size(1) == 3,
        "rays_d must have shape [N, 3]");
    TORCH_CHECK(
        aabb.ndimension() == 1 && aabb.size(0) == 6,
        "aabb must have shape [6]");
    // The kernel indexes rays_d with the row count of rays_o; a shorter rays_d
    // would be read out of bounds.
    TORCH_CHECK(
        rays_d.size(0) == rays_o.size(0),
        "rays_o and rays_d must contain the same number of rays");

    const int N = rays_o.size(0);

    const int threads = 256;
    const int blocks = CUDA_N_BLOCKS_NEEDED(N, threads);

    // Outputs share dtype and device with rays_o; every element is written by
    // the kernel, so no initialization is needed.
    torch::Tensor t_min = torch::empty({N}, rays_o.options());
    torch::Tensor t_max = torch::empty({N}, rays_o.options());

    AT_DISPATCH_FLOATING_TYPES_AND_HALF(
        rays_o.scalar_type(), "ray_aabb_intersect",
        ([&]
         { ray_aabb_intersect_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
               N,
               rays_o.data_ptr<scalar_t>(),
               rays_d.data_ptr<scalar_t>(),
               aabb.data_ptr<scalar_t>(),
               t_min.data_ptr<scalar_t>(),
               t_max.data_ptr<scalar_t>()); }));

    return {t_min, t_max};
}

================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/nerfacc/cuda/csrc/pack.cu
================================================
/*
 * Copyright (c) 2022 Ruilong Li, UC Berkeley.
 */

#include "include/helpers_cuda.h"

// Kernel: each thread handles one ray and writes that ray's index into every
// slot of ray_indices covered by its (start, count) entry in packed_info.
__global__ void unpack_info_kernel(
    // input
    const int n_rays,
    const int *packed_info,
    // output
    int64_t *ray_indices)
{
    CUDA_GET_THREAD_ID(i, n_rays);

    // Two ints per ray: first sample index, then number of samples.
    const int *ray_entry = packed_info + i * 2;
    const int base = ray_entry[0];
    const int steps = ray_entry[1];

    if (steps != 0)
    {
        int64_t *ray_slots = ray_indices + base;
        for (int j = 0; j < steps; ++j)
            ray_slots[j] = i;
    }
}

// Kernel: each thread handles one ray and sets the first `steps` entries of
// that ray's row in `masks` to true, where `steps` is the sample count stored
// in packed_info. Entries past `steps` are left untouched.
// NOTE(review): `steps` is not checked against n_samples; the caller must
// ensure no ray holds more than n_samples samples.
__global__ void unpack_info_to_mask_kernel(
    // input
    const int n_rays,
    const int *packed_info,
    const int n_samples,
    // output
    bool *masks) // [n_rays, n_samples]
{
    CUDA_GET_THREAD_ID(i, n_rays);

    // Only the per-ray sample count is needed; the start index is irrelevant
    // because each ray owns a full row of the mask.
    const int steps = packed_info[i * 2 + 1]; // point idx shift.
    if (steps == 0)
        return;

    // Widen before multiplying: n_rays * n_samples can exceed the int range.
    masks += static_cast<int64_t>(i) * n_samples;

    for (int j = 0; j < steps; ++j)
    {
        masks[j] = true;
    }
}

// Kernel: each thread handles one ray and copies that ray's packed samples
// (`steps` rows of `data_dim` values starting at row `base` of `data`) to the
// front of the ray's slab in `unpacked_data`. Remaining slab entries are left
// untouched.
// NOTE(review): `steps` is not checked against n_sampler_per_ray; the caller
// must ensure no ray holds more samples than the slab can take.
template <typename scalar_t>
__global__ void unpack_data_kernel(
    const uint32_t n_rays,
    const int *packed_info, // input ray & point indices.
    const int data_dim,
    const scalar_t *data,
    const int n_sampler_per_ray,
    scalar_t *unpacked_data) // (n_rays, n_sampler_per_ray, data_dim)
{
    CUDA_GET_THREAD_ID(i, n_rays);

    // locate
    const int base = packed_info[i * 2 + 0];  // point idx start.
    const int steps = packed_info[i * 2 + 1]; // point idx shift.
    if (steps == 0)
        return;

    // All offsets are computed in 64 bits: the element counts
    // n_samples * data_dim and n_rays * n_sampler_per_ray * data_dim can
    // exceed the range of a 32-bit integer.
    const int64_t row_len = data_dim;
    const scalar_t *src = data + static_cast<int64_t>(base) * row_len;
    scalar_t *dst = unpacked_data +
                    static_cast<int64_t>(i) * n_sampler_per_ray * row_len;

    for (int j = 0; j < steps; j++)
    {
        const scalar_t *src_row = src + j * row_len;
        scalar_t *dst_row = dst + j * row_len;
        for (int k = 0; k < data_dim; k++)
        {
            dst_row[k] = src_row[k];
        }
    }
    return;
}

// Expands packed per-ray (start, count) pairs into one ray index per sample.
//
// packed_info: int tensor of shape (n_rays, 2).
// n_samples:   length of the returned tensor.
// Returns an int64 tensor of shape (n_samples,) on the device of packed_info.
// The result is allocated uninitialized, so only slots covered by some ray's
// (start, count) range hold defined values.
torch::Tensor unpack_info(const torch::Tensor packed_info, const int n_samples)
{
    DEVICE_GUARD(packed_info);
    CHECK_INPUT(packed_info);
    // The kernel reads two ints per ray; reject any other layout up front
    // (same check unpack_data performs).
    TORCH_CHECK(
        packed_info.ndimension() == 2 && packed_info.size(1) == 2,
        "packed_info must have shape (n_rays, 2)");

    const int n_rays = packed_info.size(0);
    const int threads = 256;
    const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);

    torch::Tensor ray_indices = torch::empty(
        {n_samples}, packed_info.options().dtype(torch::kLong));

    unpack_info_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
        n_rays,
        packed_info.data_ptr<int>(),
        ray_indices.data_ptr<int64_t>());
    return ray_indices;
}


// Builds a boolean mask of shape (n_rays, n_samples) in which, for every ray,
// the leading entries up to that ray's packed sample count are true and the
// rest are false.
//
// packed_info: int tensor of shape (n_rays, 2).
// n_samples:   number of columns of the returned mask.
// NOTE(review): the kernel does not bound a ray's sample count by n_samples;
// callers must pass an n_samples at least as large as the biggest count.
torch::Tensor unpack_info_to_mask(
    const torch::Tensor packed_info, const int n_samples)
{
    DEVICE_GUARD(packed_info);
    CHECK_INPUT(packed_info);
    // The kernel reads two ints per ray; reject any other layout up front
    // (same check unpack_data performs).
    TORCH_CHECK(
        packed_info.ndimension() == 2 && packed_info.size(1) == 2,
        "packed_info must have shape (n_rays, 2)");

    const int n_rays = packed_info.size(0);
    const int threads = 256;
    const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);

    // Zero-initialized so entries the kernel does not touch read as false.
    torch::Tensor masks = torch::zeros(
        {n_rays, n_samples}, packed_info.options().dtype(torch::kBool));

    unpack_info_to_mask_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
        n_rays,
        packed_info.data_ptr<int>(),
        n_samples,
        masks.data_ptr<bool>());
    return masks;
}

// Scatters packed per-sample data into a dense per-ray tensor.
//
// packed_info:       int tensor of shape (n_rays, 2) holding (start, count).
// data:              2-D tensor of shape (n_samples, data_dim).
// n_samples_per_ray: size of the middle dimension of the result.
// Returns a tensor of shape (n_rays, n_samples_per_ray, data_dim) with the
// dtype and device of `data`; entries not covered by a ray's samples are zero.
torch::Tensor unpack_data(
    torch::Tensor packed_info,
    torch::Tensor data,
    int n_samples_per_ray)
{
    DEVICE_GUARD(packed_info);

    CHECK_INPUT(packed_info);
    CHECK_INPUT(data);

    // Logical && short-circuits, so size(1) is only queried on 2-D tensors.
    TORCH_CHECK(packed_info.ndimension() == 2 && packed_info.size(1) == 2);
    TORCH_CHECK(data.ndimension() == 2);

    const int n_rays = packed_info.size(0);
    const int data_dim = data.size(1);

    const int threads = 256;
    const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);

    // Zero-initialized so slots the kernel does not write stay zero.
    torch::Tensor unpacked_data = torch::zeros(
        {n_rays, n_samples_per_ray, data_dim}, data.options());

    AT_DISPATCH_ALL_TYPES(
        data.scalar_type(),
        "unpack_data",
        ([&]
         { unpack_data_kernel<scalar_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
               n_rays,
               // inputs
               packed_info.data_ptr<int>(),
               data_dim,
               data.data_ptr<scalar_t>(),
               n_samples_per_ray,
               // outputs
               unpacked_data.data_ptr<scalar_t>()); }));

    return unpacked_data;
}


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/nerfacc/cuda/csrc/pybind.cu
================================================
/*
 * Copyright (c) 2022 Ruilong Li, UC Berkeley.
 */

#include "include/helpers_cuda.h"
#include "include/helpers_math.h"
#include "include/helpers_contraction.h"


// Forward declarations of the operators registered in PYBIND11_MODULE below.
// None of them is defined in this file.

// Ray / axis-aligned-box test.
std::vector<torch::Tensor> ray_aabb_intersect(
    const torch::Tensor rays_o,
    const torch::Tensor rays_d,
    const torch::Tensor aabb);

// Ray marching through an occupancy grid.
std::vector<torch::Tensor> ray_marching(
    // rays
    const torch::Tensor rays_o,
    const torch::Tensor rays_d,
    const torch::Tensor t_min,
    const torch::Tensor t_max,
    // occupancy grid & contraction
    const torch::Tensor roi,
    const torch::Tensor grid_binary,
    const ContractionType type,
    // sampling
    const float step_size,
    const float cone_angle);

// Packed-info expansion helpers.
torch::Tensor unpack_info(
    const torch::Tensor packed_info, const int n_samples);

torch::Tensor unpack_info_to_mask(
    const torch::Tensor packed_info, const int n_samples);

// Occupancy grid lookup.
torch::Tensor grid_query(
    const torch::Tensor samples,
    // occupancy grid & contraction
    const torch::Tensor roi,
    const torch::Tensor grid_value,
    const ContractionType type);

// Space contraction and its inverse.
torch::Tensor contract(
    const torch::Tensor samples,
    // contraction
    const torch::Tensor roi,
    const ContractionType type);

torch::Tensor contract_inv(
    const torch::Tensor samples,
    // contraction
    const torch::Tensor roi,
    const ContractionType type);

// Per-ray resampling driven by sample weights.
std::vector<torch::Tensor> ray_resampling(
    torch::Tensor packed_info,
    torch::Tensor starts,
    torch::Tensor ends,
    torch::Tensor weights,
    const int steps);

// Packed-to-dense data scatter.
torch::Tensor unpack_data(
    torch::Tensor packed_info,
    torch::Tensor data,
    int n_samples_per_ray);

// cub implementations: parallel across samples
// Returns the value of the CUB_SUPPORTS_SCAN_BY_KEY() macro converted to bool.
// NOTE(review): the macro is defined outside this file; presumably it reports
// whether the *_cub implementations declared below are usable — confirm.
bool is_cub_available() {
    return (bool) CUB_SUPPORTS_SCAN_BY_KEY();
}
// Forward declarations of the rendering operators registered in
// PYBIND11_MODULE below. None of them is defined in this file.
// The *_cub variants take per-sample ray_indices.
torch::Tensor transmittance_from_sigma_forward_cub(
    torch::Tensor ray_indices,
    torch::Tensor starts,
    torch::Tensor ends,
    torch::Tensor sigmas);
torch::Tensor transmittance_from_sigma_backward_cub(
    torch::Tensor ray_indices,
    torch::Tensor starts,
    torch::Tensor ends,
    torch::Tensor transmittance,
    torch::Tensor transmittance_grad);
torch::Tensor transmittance_from_alpha_forward_cub(
    torch::Tensor ray_indices, torch::Tensor alphas);
torch::Tensor transmittance_from_alpha_backward_cub(
    torch::Tensor ray_indices,
    torch::Tensor alphas,
    torch::Tensor transmittance,
    torch::Tensor transmittance_grad);

// naive implementations: parallel across rays
// These variants take packed_info instead of ray_indices.
torch::Tensor transmittance_from_sigma_forward_naive(
    torch::Tensor packed_info,
    torch::Tensor starts,
    torch::Tensor ends,
    torch::Tensor sigmas);
torch::Tensor transmittance_from_sigma_backward_naive(
    torch::Tensor packed_info,
    torch::Tensor starts,
    torch::Tensor ends,
    torch::Tensor transmittance,
    torch::Tensor transmittance_grad);
torch::Tensor transmittance_from_alpha_forward_naive(
    torch::Tensor packed_info, 
    torch::Tensor alphas);
torch::Tensor transmittance_from_alpha_backward_naive(
    torch::Tensor packed_info,
    torch::Tensor alphas,
    torch::Tensor transmittance,
    torch::Tensor transmittance_grad);

// Rendering weights (forward and backward) from sigmas or alphas.
torch::Tensor weight_from_sigma_forward_naive(
    torch::Tensor packed_info,
    torch::Tensor starts,
    torch::Tensor ends,
    torch::Tensor sigmas);
torch::Tensor weight_from_sigma_backward_naive(
    torch::Tensor weights,
    torch::Tensor grad_weights,
    torch::Tensor packed_info,
    torch::Tensor starts,
    torch::Tensor ends,
    torch::Tensor sigmas);
torch::Tensor weight_from_alpha_forward_naive(
    torch::Tensor packed_info, 
    torch::Tensor alphas);
torch::Tensor weight_from_alpha_backward_naive(
    torch::Tensor weights,
    torch::Tensor grad_weights,
    torch::Tensor packed_info,
    torch::Tensor alphas);

// Patch-based variants of the alpha-driven weight / transmittance operators.
torch::Tensor weight_from_alpha_patch_based_forward_naive(
    torch::Tensor packed_info,
    torch::Tensor alphas);

torch::Tensor weight_from_alpha_patch_based_backward_naive(
    torch::Tensor weights,
    torch::Tensor grad_weights,
    torch::Tensor packed_info,
    torch::Tensor alphas);

std::vector<torch::Tensor> weight_and_transmittance_from_alpha_patch_based_forward_naive(
    torch::Tensor packed_info, // (n_patches, 2)
    torch::Tensor alphas // (n_samples, patches_size, 1)
    );

torch::Tensor weight_and_transmittance_from_alpha_patch_based_backward_naive(
    torch::Tensor weights,
    torch::Tensor grad_weights,  // (n_samples, patches_size, 1)
    torch::Tensor packed_info,
    torch::Tensor alphas);

torch::Tensor transmittance_from_alpha_patch_based_forward_naive(
    torch::Tensor packed_info, torch::Tensor alphas);

torch::Tensor transmittance_from_alpha_patch_based_backward_naive(
    torch::Tensor packed_info,
    torch::Tensor alphas,
    torch::Tensor transmittance,
    torch::Tensor transmittance_grad);


// Python module definition: exposes the ContractionType enum and registers
// every operator declared above under a Python name identical to its C++ name.
// Functions are passed both as `&f` and as plain `f`; both forms yield the
// same function pointer.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
    // contraction
    py::enum_<ContractionType>(m, "ContractionType")
        .value("AABB", ContractionType::AABB)
        .value("UN_BOUNDED_TANH", ContractionType::UN_BOUNDED_TANH)
        .value("UN_BOUNDED_SPHERE", ContractionType::UN_BOUNDED_SPHERE);
    m.def("contract", &contract);
    m.def("contract_inv", &contract_inv);

    // grid
    m.def("grid_query", &grid_query);

    // marching
    m.def("ray_aabb_intersect", &ray_aabb_intersect);
    m.def("ray_marching", &ray_marching);
    m.def("ray_resampling", &ray_resampling);

    // rendering
    m.def("is_cub_available", is_cub_available);
    m.def("transmittance_from_sigma_forward_cub", transmittance_from_sigma_forward_cub);
    m.def("transmittance_from_sigma_backward_cub", transmittance_from_sigma_backward_cub);
    m.def("transmittance_from_alpha_forward_cub", transmittance_from_alpha_forward_cub);
    m.def("transmittance_from_alpha_backward_cub", transmittance_from_alpha_backward_cub);
    
    m.def("transmittance_from_sigma_forward_naive", transmittance_from_sigma_forward_naive);
    m.def("transmittance_from_sigma_backward_naive", transmittance_from_sigma_backward_naive);
    m.def("transmittance_from_alpha_forward_naive", transmittance_from_alpha_forward_naive);
    m.def("transmittance_from_alpha_backward_naive", transmittance_from_alpha_backward_naive);

    m.def("weight_from_sigma_forward_naive", weight_from_sigma_forward_naive);
    m.def("weight_from_sigma_backward_naive", weight_from_sigma_backward_naive);
    m.def("weight_from_alpha_forward_naive", weight_from_alpha_forward_naive);
    m.def("weight_from_alpha_backward_naive", weight_from_alpha_backward_naive);
    // patch-based rendering variants
    m.def("weight_from_alpha_patch_based_forward_naive", weight_from_alpha_patch_based_forward_naive);
    m.def("weight_from_alpha_patch_based_backward_naive", weight_from_alpha_patch_based_backward_naive);
    m.def("weight_and_transmittance_from_alpha_patch_based_forward_naive", weight_and_transmittance_from_alpha_patch_based_forward_naive);
    m.def("weight_and_transmittance_from_alpha_patch_based_backward_naive", weight_and_transmittance_from_alpha_patch_based_backward_naive);
    m.def("transmittance_from_alpha_patch_based_forward_naive", transmittance_from_alpha_patch_based_forward_naive);
    m.def("transmittance_from_alpha_patch_based_backward_naive", transmittance_from_alpha_patch_based_backward_naive);
    // pack & unpack
    m.def("unpack_data", &unpack_data);
    m.def("unpack_info", &unpack_info);
    m.def("unpack_info_to_mask", &unpack_info_to_mask);
}

================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/nerfacc/cuda/csrc/ray_marching.cu
================================================
/*
 * Copyright (c) 2022 Ruilong Li, UC Berkeley.
 */

#include "include/helpers_cuda.h"
#include "include/helpers_math.h"
#include "include/helpers_contraction.h"

// Step length at ray parameter t: t * cone_angle, clamped to [dt_min, dt_max].
inline __device__ __host__ float calc_dt(
    const float t, const float cone_angle,
    const float dt_min, const float dt_max)
{
    const float proposed_dt = t * cone_angle;
    return clamp(proposed_dt, dt_min, dt_max);
}

// Flat index of the grid cell containing xyz_unit. The position is scaled by
// the grid resolution, converted to integer cell coordinates, clamped to the
// valid range, and flattened with x as the slowest and z as the fastest axis.
inline __device__ __host__ int grid_idx_at(
    const float3 xyz_unit, const int3 grid_res)
{
    // xyz should be always in [0, 1]^3.
    const float3 scaled = xyz_unit * make_float3(grid_res);
    const int3 last_cell = grid_res - 1;
    const int3 cell = clamp(make_int3(scaled), make_int3(0, 0, 0), last_cell);
    const int3 strides = make_int3(grid_res.y * grid_res.z, grid_res.z, 1);
    return dot(cell, strides);
}

// Looks up the grid value stored for the cell containing xyz.
// For the AABB contraction, positions outside [roi_min, roi_max] yield
// `false` (converted to scalar_t) without touching the grid; otherwise the
// position is contracted and the matching cell value is returned.
template <typename scalar_t>
inline __device__ __host__ scalar_t grid_occupied_at(
    const float3 xyz,
    const float3 roi_min, const float3 roi_max,
    ContractionType type,
    const int3 grid_res, const scalar_t *grid_value)
{
    if (type == ContractionType::AABB)
    {
        const bool outside_roi =
            xyz.x < roi_min.x || xyz.x > roi_max.x ||
            xyz.y < roi_min.y || xyz.y > roi_max.y ||
            xyz.z < roi_min.z || xyz.z > roi_max.z;
        if (outside_roi)
        {
            return false;
        }
    }
    const float3 contracted = apply_contraction(xyz, roi_min, roi_max, type);
    const int cell = grid_idx_at(contracted, grid_res);
    return grid_value[cell];
}

// dda like step
// Distance along the ray from xyz to the nearest upcoming cell boundary
// of the grid laid over [roi_min, roi_max]; never negative.
inline __device__ __host__ float distance_to_next_voxel(
    const float3 xyz, const float3 dir, const float3 inv_dir,
    const float3 roi_min, const float3 roi_max, const int3 grid_res)
{
    const float3 res = make_float3(grid_res);
    // Position expressed in cell units.
    const float3 cell_pos = roi_to_unit(xyz, roi_min, roi_max) * res;
    // Next boundary per axis, chosen on the side the ray is heading towards.
    const float3 boundary = floorf(cell_pos + 0.5f + 0.5f * sign(dir));
    // Per-axis ray distance to that boundary, converted back to world units.
    const float3 txyz = ((boundary - cell_pos) * inv_dir) / res * (roi_max - roi_min);
    const float nearest = min(min(txyz.x, txyz.y), txyz.z);
    return fmaxf(nearest, 0.0f);
}

// Advances the ray parameter from t in increments of dt_min until it reaches
// the next cell boundary or `far`, whichever comes first. At least one
// increment is always taken.
// NOTE(review): a non-positive dt_min would keep the loop from terminating
// whenever the target lies ahead of t.
inline __device__ __host__ float advance_to_next_voxel(
    const float t, const float dt_min,
    const float3 xyz, const float3 dir, const float3 inv_dir,
    const float3 roi_min, const float3 roi_max, const int3 grid_res, const float far)
{
    // Regular stepping (may be slower but matches non-empty space)
    const float to_boundary = distance_to_next_voxel(
        xyz, dir, inv_dir, roi_min, roi_max, grid_res);
    const float t_target = min(t + to_boundary, far);

    float stepped = t + dt_min;
    while (stepped < t_target)
    {
        stepped += dt_min;
    }
    return stepped;
}

// -------------------------------------------------------------------------------
// Raymarching
// -------------------------------------------------------------------------------

// Marches one ray (index `i`) through the occupancy grid.
//
// The kernel is launched twice with the same ray inputs:
//   * first round  (packed_info == nullptr): only counts the samples whose
//     midpoint lies in an occupied cell and stores the count in num_steps[i];
//   * second round (packed_info != nullptr): repeats the identical march and
//     writes every accepted sample's [t0, t1] interval and ray index, starting
//     at the offset stored in packed_info[i * 2].
// Both rounds run the same arithmetic, so the second round emits exactly the
// number of samples counted by the first one.
//
// NOTE(review): CUDA_GET_THREAD_ID is defined outside this chunk; it is
// presumed to bind `i` to the thread's global index and to return early when
// `i >= n_rays` -- confirm in helpers_cuda.h.
__global__ void ray_marching_kernel(
    // rays info
    const uint32_t n_rays,
    const float *rays_o, // shape (n_rays, 3)
    const float *rays_d, // shape (n_rays, 3)
    const float *t_min,  // shape (n_rays,)
    const float *t_max,  // shape (n_rays,)
    // occupancy grid & contraction
    const float *roi,
    const int3 grid_res,
    const bool *grid_binary, // shape (reso_x, reso_y, reso_z)
    const ContractionType type,
    // sampling
    const float step_size,
    const float cone_angle,
    const int *packed_info,
    // first round outputs
    int *num_steps,
    // second round outputs
    int64_t *ray_indices,
    float *t_starts,
    float *t_ends)
{
    CUDA_GET_THREAD_ID(i, n_rays);

    const bool is_first_round = (packed_info == nullptr);

    // locate this ray's inputs
    rays_o += i * 3;
    rays_d += i * 3;
    t_min += i;
    t_max += i;

    if (is_first_round)
    {
        num_steps += i;
    }
    else
    {
        // Offset of this ray's segment in the packed output arrays. The
        // per-ray sample count (packed_info[i * 2 + 1]) is not needed here
        // because the march below reproduces it.
        const int base = packed_info[i * 2 + 0];
        t_starts += base;
        t_ends += base;
        ray_indices += base;
    }

    const float3 origin = make_float3(rays_o[0], rays_o[1], rays_o[2]);
    const float3 dir = make_float3(rays_d[0], rays_d[1], rays_d[2]);
    const float3 inv_dir = 1.0f / dir;
    const float near = t_min[0], far = t_max[0];

    const float3 roi_min = make_float3(roi[0], roi[1], roi[2]);
    const float3 roi_max = make_float3(roi[3], roi[4], roi[5]);

    // TODO: compute dt_max from occ resolution.
    float dt_min = step_size;
    float dt_max = 1e10f;

    int j = 0; // number of accepted samples so far
    float t0 = near;
    float dt = calc_dt(t0, cone_angle, dt_min, dt_max);
    float t1 = t0 + dt;
    float t_mid = (t0 + t1) * 0.5f;

    while (t_mid < far)
    {
        // current center
        const float3 xyz = origin + t_mid * dir;
        if (grid_occupied_at(xyz, roi_min, roi_max, type, grid_res, grid_binary))
        {
            if (!is_first_round)
            {
                t_starts[j] = t0;
                t_ends[j] = t1;
                ray_indices[j] = i;
            }
            ++j;
            // march to next sample
            t0 = t1;
            t1 = t0 + calc_dt(t0, cone_angle, dt_min, dt_max);
            t_mid = (t0 + t1) * 0.5f;
        }
        else
        {
            // march to next sample
            switch (type)
            {
            case ContractionType::AABB:
                // no contraction: skip ahead to the next voxel and re-centre
                // the sample interval around the new midpoint
                t_mid = advance_to_next_voxel(
                    t_mid, dt_min, xyz, dir, inv_dir, roi_min, roi_max, grid_res, far);
                dt = calc_dt(t_mid, cone_angle, dt_min, dt_max);
                t0 = t_mid - dt * 0.5f;
                t1 = t_mid + dt * 0.5f;
                break;

            default:
                // any type of scene contraction does not work with DDA.
                t0 = t1;
                t1 = t0 + calc_dt(t0, cone_angle, dt_min, dt_max);
                t_mid = (t0 + t1) * 0.5f;
                break;
            }
        }
    }

    if (is_first_round)
    {
        *num_steps = j;
    }
    return;
}

// Host entry point for occupancy-grid ray marching.
//
// Runs ray_marching_kernel twice: once to count the samples per ray, then,
// after an exclusive prefix sum over those counts, once more to write the
// samples into packed output tensors.
//
// Returns {packed_info, ray_indices, t_starts, t_ends} where
//   packed_info : (n_rays, 2) int32, per ray (start offset, sample count)
//   ray_indices : (total_steps,) int64
//   t_starts    : (total_steps, 1), same dtype/device options as rays_o
//   t_ends      : (total_steps, 1), same dtype/device options as rays_o
std::vector<torch::Tensor> ray_marching(
    // rays
    const torch::Tensor rays_o,
    const torch::Tensor rays_d,
    const torch::Tensor t_min,
    const torch::Tensor t_max,
    // occupancy grid & contraction
    const torch::Tensor roi,
    const torch::Tensor grid_binary,
    const ContractionType type,
    // sampling
    const float step_size,
    const float cone_angle)
{
    DEVICE_GUARD(rays_o);

    CHECK_INPUT(rays_o);
    CHECK_INPUT(rays_d);
    CHECK_INPUT(t_min);
    CHECK_INPUT(t_max);
    CHECK_INPUT(roi);
    CHECK_INPUT(grid_binary);
    // Logical && so that size(1) is only queried once the rank is known to be 2.
    TORCH_CHECK(rays_o.ndimension() == 2 && rays_o.size(1) == 3);
    TORCH_CHECK(rays_d.ndimension() == 2 && rays_d.size(1) == 3);
    TORCH_CHECK(t_min.ndimension() == 1);
    TORCH_CHECK(t_max.ndimension() == 1);
    TORCH_CHECK(roi.ndimension() == 1 && roi.size(0) == 6);
    TORCH_CHECK(grid_binary.ndimension() == 3);

    const int n_rays = rays_o.size(0);
    // The kernel indexes all per-ray inputs with the same ray id.
    TORCH_CHECK(rays_d.size(0) == n_rays && t_min.size(0) == n_rays && t_max.size(0) == n_rays,
                "rays_d, t_min and t_max must have one entry per ray in rays_o");

    if (n_rays == 0)
    {
        // Nothing to march: avoid an empty-grid launch and indexing the last
        // element of an empty cumulative sum.
        return {torch::empty({0, 2}, rays_o.options().dtype(torch::kInt32)),
                torch::empty({0}, rays_o.options().dtype(torch::kLong)),
                torch::empty({0, 1}, rays_o.options()),
                torch::empty({0, 1}, rays_o.options())};
    }

    const int3 grid_res = make_int3(
        grid_binary.size(0), grid_binary.size(1), grid_binary.size(2));

    const int threads = 256;
    const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);

    // helper counter
    torch::Tensor num_steps = torch::empty(
        {n_rays}, rays_o.options().dtype(torch::kInt32));

    // first round: count number of samples per ray
    ray_marching_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
        // rays
        n_rays,
        rays_o.data_ptr<float>(),
        rays_d.data_ptr<float>(),
        t_min.data_ptr<float>(),
        t_max.data_ptr<float>(),
        // occupancy grid & contraction
        roi.data_ptr<float>(),
        grid_res,
        grid_binary.data_ptr<bool>(),
        type,
        // sampling
        step_size,
        cone_angle,
        nullptr, /* packed_info */
        // outputs
        num_steps.data_ptr<int>(),
        nullptr, /* ray_indices */
        nullptr, /* t_starts */
        nullptr /* t_ends */);

    // (inclusive cumsum - count) is the exclusive prefix sum, i.e. each ray's
    // start offset in the packed outputs.
    torch::Tensor cum_steps = num_steps.cumsum(0, torch::kInt32);
    torch::Tensor packed_info = torch::stack({cum_steps - num_steps, num_steps}, 1);

    // output samples starts and ends
    int total_steps = cum_steps[cum_steps.size(0) - 1].item<int>();
    torch::Tensor t_starts = torch::empty({total_steps, 1}, rays_o.options());
    torch::Tensor t_ends = torch::empty({total_steps, 1}, rays_o.options());
    torch::Tensor ray_indices = torch::empty({total_steps}, cum_steps.options().dtype(torch::kLong));

    // second round: write the samples
    ray_marching_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
        // rays
        n_rays,
        rays_o.data_ptr<float>(),
        rays_d.data_ptr<float>(),
        t_min.data_ptr<float>(),
        t_max.data_ptr<float>(),
        // occupancy grid & contraction
        roi.data_ptr<float>(),
        grid_res,
        grid_binary.data_ptr<bool>(),
        type,
        // sampling
        step_size,
        cone_angle,
        packed_info.data_ptr<int>(),
        // outputs
        nullptr, /* num_steps */
        ray_indices.data_ptr<int64_t>(),
        t_starts.data_ptr<float>(),
        t_ends.data_ptr<float>());

    return {packed_info, ray_indices, t_starts, t_ends};
}

// ----------------------------------------------------------------------------
// Query the occupancy grid
// ----------------------------------------------------------------------------

// Writes to occs[i] the occupancy-grid value found at the i-th sample point.
// NOTE(review): CUDA_GET_THREAD_ID is defined elsewhere; it is presumed to bind
// `i` to this thread's sample index and to exit for i >= n_samples -- confirm.
template <typename scalar_t>
__global__ void query_occ_kernel(
    // rays info
    const uint32_t n_samples,
    const float *samples, // shape (n_samples, 3)
    // occupancy grid & contraction
    const float *roi,
    const int3 grid_res,
    const scalar_t *grid_value, // shape (reso_x, reso_y, reso_z)
    const ContractionType type,
    // outputs
    scalar_t *occs)
{
    CUDA_GET_THREAD_ID(i, n_samples);

    // This thread's point: three consecutive floats.
    const float *point = samples + i * 3;
    const float3 position = make_float3(point[0], point[1], point[2]);

    // roi holds the lower corner followed by the upper corner.
    const float3 lower = make_float3(roi[0], roi[1], roi[2]);
    const float3 upper = make_float3(roi[3], roi[4], roi[5]);

    occs[i] = grid_occupied_at(position, lower, upper, type, grid_res, grid_value);
}

// Query the occupancy grid at a set of points.
//
// samples    : (n_samples, 3) float points
// roi        : (6,) float, lower corner followed by upper corner
// grid_value : 3-D grid of floating-point or bool values
// Returns a (n_samples,) tensor with the dtype/device options of grid_value.
torch::Tensor grid_query(
    const torch::Tensor samples,
    // occupancy grid & contraction
    const torch::Tensor roi,
    const torch::Tensor grid_value,
    const ContractionType type)
{
    DEVICE_GUARD(samples);
    CHECK_INPUT(samples);
    // roi and grid_value are read through raw pointers in the kernel, so they
    // get the same validation ray_marching() applies to its grid inputs.
    CHECK_INPUT(roi);
    CHECK_INPUT(grid_value);
    TORCH_CHECK(samples.ndimension() == 2 && samples.size(1) == 3);
    TORCH_CHECK(roi.ndimension() == 1 && roi.size(0) == 6);
    TORCH_CHECK(grid_value.ndimension() == 3);

    const int n_samples = samples.size(0);
    const int3 grid_res = make_int3(
        grid_value.size(0), grid_value.size(1), grid_value.size(2));

    torch::Tensor occs = torch::empty({n_samples}, grid_value.options());
    if (n_samples == 0)
    {
        // Nothing to query; skip the launch instead of using an empty grid.
        return occs;
    }

    const int threads = 256;
    const int blocks = CUDA_N_BLOCKS_NEEDED(n_samples, threads);

    AT_DISPATCH_FLOATING_TYPES_AND(
        at::ScalarType::Bool,
        occs.scalar_type(),
        "grid_query",
        ([&]
         { query_occ_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
               n_samples,
               samples.data_ptr<float>(),
               // grid
               roi.data_ptr<float>(),
               grid_res,
               grid_value.data_ptr<scalar_t>(),
               type,
               // outputs
               occs.data_ptr<scalar_t>()); }));

    return occs;
}


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/nerfacc/cuda/csrc/render_transmittance.cu
================================================
/*
 * Copyright (c) 2022 Ruilong Li, UC Berkeley.
 */

#include "include/helpers_cuda.h"

// Per-ray transmittance from densities: for each sample of ray `i`,
//   transmittance[j] = exp(-sum_{m<j} sigmas[m] * (ends[m] - starts[m])).
// packed_info holds, per ray, the offset of its first sample and its sample count.
__global__ void transmittance_from_sigma_forward_kernel(
    const uint32_t n_rays,
    // inputs
    const int *packed_info,
    const float *starts,
    const float *ends,
    const float *sigmas,
    // outputs
    float *transmittance)
{
    CUDA_GET_THREAD_ID(i, n_rays);

    const int offset = packed_info[i * 2 + 0];
    const int count = packed_info[i * 2 + 1];
    if (count == 0)
        return;

    // Views onto this ray's segment of the packed arrays.
    const float *seg_start = starts + offset;
    const float *seg_end = ends + offset;
    const float *seg_sigma = sigmas + offset;
    float *seg_trans = transmittance + offset;

    // Exclusive running sum of sigma * delta, i.e. the accumulated optical depth.
    float optical_depth = 0.0f;
    for (int s = 0; s < count; ++s)
    {
        seg_trans[s] = __expf(-optical_depth);
        optical_depth += seg_sigma[s] * (seg_end[s] - seg_start[s]);
    }
}

// Backward pass of the sigma -> transmittance mapping for ray `i`.
// Walking the ray's samples from last to first, sigmas_grad[j] is the interval
// length times the sum over later samples m > j of
// (-transmittance_grad[m] * transmittance[m]).
__global__ void transmittance_from_sigma_backward_kernel(
    const uint32_t n_rays,
    // inputs
    const int *packed_info,
    const float *starts,
    const float *ends,
    const float *transmittance,
    const float *transmittance_grad,
    // outputs
    float *sigmas_grad)
{
    CUDA_GET_THREAD_ID(i, n_rays);

    const int offset = packed_info[i * 2 + 0];
    const int count = packed_info[i * 2 + 1];
    if (count == 0)
        return;

    // Views onto this ray's segment of the packed arrays.
    const float *seg_trans = transmittance + offset;
    const float *seg_trans_grad = transmittance_grad + offset;
    const float *seg_start = starts + offset;
    const float *seg_end = ends + offset;
    float *seg_out = sigmas_grad + offset;

    // Exclusive suffix sum accumulated from the far end of the ray.
    float suffix = 0.0f;
    int s = count - 1;
    while (s >= 0)
    {
        seg_out[s] = suffix * (seg_end[s] - seg_start[s]);
        suffix += -seg_trans_grad[s] * seg_trans[s];
        --s;
    }
}

// Per-ray transmittance from opacities: for each sample of ray `i`,
//   transmittance[j] = prod_{m<j} (1 - alphas[m]).
__global__ void transmittance_from_alpha_forward_kernel(
    const uint32_t n_rays,
    // inputs
    const int *packed_info,
    const float *alphas,
    // outputs
    float *transmittance)
{
    CUDA_GET_THREAD_ID(i, n_rays);

    const int offset = packed_info[i * 2 + 0];
    const int count = packed_info[i * 2 + 1];
    if (count == 0)
        return;

    // Views onto this ray's segment of the packed arrays.
    const float *seg_alpha = alphas + offset;
    float *seg_trans = transmittance + offset;

    // Exclusive running product of (1 - alpha).
    float remaining = 1.0f;
    for (int s = 0; s < count; ++s)
    {
        seg_trans[s] = remaining;
        remaining *= (1.0f - seg_alpha[s]);
    }
}

// Backward pass of the alpha -> transmittance mapping for ray `i`.
// Walking the samples from last to first, alphas_grad[j] is the sum over later
// samples m > j of (-transmittance_grad[m] * transmittance[m]), divided by
// (1 - alphas[j]) with the divisor floored at 1e-10.
__global__ void transmittance_from_alpha_backward_kernel(
    const uint32_t n_rays,
    // inputs
    const int *packed_info,
    const float *alphas,
    const float *transmittance,
    const float *transmittance_grad,
    // outputs
    float *alphas_grad)
{
    CUDA_GET_THREAD_ID(i, n_rays);

    const int offset = packed_info[i * 2 + 0];
    const int count = packed_info[i * 2 + 1];
    if (count == 0)
        return;

    // Views onto this ray's segment of the packed arrays.
    const float *seg_alpha = alphas + offset;
    const float *seg_trans = transmittance + offset;
    const float *seg_trans_grad = transmittance_grad + offset;
    float *seg_out = alphas_grad + offset;

    // Exclusive suffix sum accumulated from the far end of the ray.
    float suffix = 0.0f;
    int s = count - 1;
    while (s >= 0)
    {
        seg_out[s] = suffix / fmax(1.0f - seg_alpha[s], 1e-10f);
        suffix += -seg_trans_grad[s] * seg_trans[s];
        --s;
    }
}

// Patch-based transmittance from opacities.
//
// Thread (i, k) handles ray `k` of patch `i`. alphas and transmittance are laid
// out as (n_samples, patch_size, 1), so sample `j` of ray `k` in a patch whose
// first sample is `base` lives at (base + j) * patch_size + k. For that ray:
//   transmittance[j] = prod_{m<j} (1 - alphas[m])   (same ray k throughout).
__global__ void transmittance_from_alpha_patch_based_forward_kernel(
    const uint32_t n_patches,
    const uint32_t patch_size,
    // inputs
    const int *packed_info,
    const float *alphas,
    // outputs
    float *transmittance)
{
    CUDA_GET_THREAD_ID_2D(i, k, n_patches, patch_size);  // i is the patch id, k is the ray id within the patch

    // locate
    const int base = packed_info[i * 2 + 0];  // get the base of the patch
    const int steps = packed_info[i * 2 + 1]; // get the steps of the patch
    if (steps == 0)
        return;

    alphas += base * patch_size;  // move the pointer to the base
    transmittance += base * patch_size;  // move the pointer to the base

    // accumulation
    float T = 1.0f;
    for (int j = 0; j < steps; ++j)
    {
        const uint32_t ray_id = j * patch_size + k;
        transmittance[ray_id] = T;
        // Read this ray's own alpha. Indexing with plain `j` would pick up
        // element j of the flattened patch block (a different sample/ray) and
        // disagree with the backward kernel and the patch-based weight kernels.
        T *= (1.0f - alphas[ray_id]);
    }
    return;
}

// Backward pass of the patch-based alpha -> transmittance mapping.
// Thread (i, k) handles ray `k` of patch `i`; sample `j` of that ray is stored
// at j * patch_size + k past the patch's first sample. Walking the samples from
// last to first, alphas_grad is the suffix sum of
// (-transmittance_grad * transmittance) over later samples of the same ray,
// divided by (1 - alpha) with the divisor floored at 1e-10.
__global__ void transmittance_from_alpha_patch_based_backward_kernel(
    const uint32_t n_patches,
    const uint32_t patch_size,
    // inputs
    const int *packed_info,
    const float *alphas,
    const float *transmittance,
    const float *transmittance_grad,
    // outputs
    float *alphas_grad)
{
    CUDA_GET_THREAD_ID_2D(i, k, n_patches, patch_size);  // i: patch id, k: ray id inside the patch

    const int first_sample = packed_info[i * 2 + 0];
    const int n_steps = packed_info[i * 2 + 1];
    if (n_steps == 0)
        return;

    // Advance every array to the start of this patch's block.
    alphas += first_sample * patch_size;
    transmittance += first_sample * patch_size;
    transmittance_grad += first_sample * patch_size;
    alphas_grad += first_sample * patch_size;

    // Exclusive suffix sum accumulated from the far end of the ray.
    float suffix = 0.0f;
    int j = n_steps - 1;
    while (j >= 0)
    {
        const uint32_t at = j * patch_size + k;
        alphas_grad[at] = suffix / fmax(1.0f - alphas[at], 1e-10f);
        suffix += -transmittance_grad[at] * transmittance[at];
        --j;
    }
}

// Host wrapper: per-ray transmittance from densities, one thread per ray.
//
// packed_info     : (n_rays, 2) int, per ray (first sample offset, sample count)
// starts/ends     : (n_samples, 1) float interval bounds
// sigmas          : (n_samples, 1) float densities
// Returns a tensor shaped like `sigmas`.
torch::Tensor transmittance_from_sigma_forward_naive(
    torch::Tensor packed_info,
    torch::Tensor starts,
    torch::Tensor ends,
    torch::Tensor sigmas)
{
    DEVICE_GUARD(packed_info);
    CHECK_INPUT(packed_info);
    CHECK_INPUT(starts);
    CHECK_INPUT(ends);
    CHECK_INPUT(sigmas);
    TORCH_CHECK(packed_info.ndimension() == 2);
    // Logical && so that size(1) is only queried once the rank is known to be 2.
    TORCH_CHECK(starts.ndimension() == 2 && starts.size(1) == 1);
    TORCH_CHECK(ends.ndimension() == 2 && ends.size(1) == 1);
    TORCH_CHECK(sigmas.ndimension() == 2 && sigmas.size(1) == 1);

    const uint32_t n_rays = packed_info.size(0);

    // outputs
    torch::Tensor transmittance = torch::empty_like(sigmas);
    if (n_rays == 0)
    {
        // No rays: skip the launch instead of using an empty grid.
        return transmittance;
    }

    const int threads = 256;
    const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);

    // parallel across rays
    transmittance_from_sigma_forward_kernel<<<
        blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
        n_rays,
        // inputs
        packed_info.data_ptr<int>(),
        starts.data_ptr<float>(),
        ends.data_ptr<float>(),
        sigmas.data_ptr<float>(),
        // outputs
        transmittance.data_ptr<float>());
    return transmittance;
}

// Host wrapper: gradient w.r.t. sigmas of the sigma -> transmittance mapping,
// one thread per ray.
//
// packed_info        : (n_rays, 2) int, per ray (first sample offset, sample count)
// starts/ends        : (n_samples, 1) float interval bounds
// transmittance      : (n_samples, 1) forward output
// transmittance_grad : (n_samples, 1) upstream gradient
// Returns a tensor shaped like `transmittance`.
torch::Tensor transmittance_from_sigma_backward_naive(
    torch::Tensor packed_info,
    torch::Tensor starts,
    torch::Tensor ends,
    torch::Tensor transmittance,
    torch::Tensor transmittance_grad)
{
    DEVICE_GUARD(packed_info);
    CHECK_INPUT(packed_info);
    CHECK_INPUT(starts);
    CHECK_INPUT(ends);
    CHECK_INPUT(transmittance);
    CHECK_INPUT(transmittance_grad);
    TORCH_CHECK(packed_info.ndimension() == 2);
    // Logical && so that size(1) is only queried once the rank is known to be 2.
    TORCH_CHECK(starts.ndimension() == 2 && starts.size(1) == 1);
    TORCH_CHECK(ends.ndimension() == 2 && ends.size(1) == 1);
    TORCH_CHECK(transmittance.ndimension() == 2 && transmittance.size(1) == 1);
    TORCH_CHECK(transmittance_grad.ndimension() == 2 && transmittance_grad.size(1) == 1);

    const uint32_t n_rays = packed_info.size(0);

    // outputs
    torch::Tensor sigmas_grad = torch::empty_like(transmittance);
    if (n_rays == 0)
    {
        // No rays: skip the launch instead of using an empty grid.
        return sigmas_grad;
    }

    const int threads = 256;
    const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);

    // parallel across rays
    transmittance_from_sigma_backward_kernel<<<
        blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
        n_rays,
        // inputs
        packed_info.data_ptr<int>(),
        starts.data_ptr<float>(),
        ends.data_ptr<float>(),
        transmittance.data_ptr<float>(),
        transmittance_grad.data_ptr<float>(),
        // outputs
        sigmas_grad.data_ptr<float>());
    return sigmas_grad;
}

// Host wrapper: per-ray transmittance from opacities, one thread per ray.
//
// packed_info : (n_rays, 2) int, per ray (first sample offset, sample count)
// alphas      : (n_samples, 1) float opacities
// Returns a tensor shaped like `alphas`.
torch::Tensor transmittance_from_alpha_forward_naive(
    torch::Tensor packed_info, torch::Tensor alphas)
{
    DEVICE_GUARD(packed_info);
    CHECK_INPUT(packed_info);
    CHECK_INPUT(alphas);
    // Logical && so that size(1) is only queried once the rank is known to be 2.
    TORCH_CHECK(alphas.ndimension() == 2 && alphas.size(1) == 1);
    TORCH_CHECK(packed_info.ndimension() == 2);

    const uint32_t n_rays = packed_info.size(0);

    // outputs
    torch::Tensor transmittance = torch::empty_like(alphas);
    if (n_rays == 0)
    {
        // No rays: skip the launch instead of using an empty grid.
        return transmittance;
    }

    const int threads = 256;
    const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);

    // parallel across rays
    transmittance_from_alpha_forward_kernel<<<
        blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
        n_rays,
        // inputs
        packed_info.data_ptr<int>(),
        alphas.data_ptr<float>(),
        // outputs
        transmittance.data_ptr<float>());
    return transmittance;
}

// Host wrapper: gradient w.r.t. alphas of the alpha -> transmittance mapping,
// one thread per ray.
//
// packed_info        : (n_rays, 2) int, per ray (first sample offset, sample count)
// alphas             : (n_samples, 1) float opacities
// transmittance      : (n_samples, 1) forward output
// transmittance_grad : (n_samples, 1) upstream gradient
// Returns a tensor shaped like `alphas`.
torch::Tensor transmittance_from_alpha_backward_naive(
    torch::Tensor packed_info,
    torch::Tensor alphas,
    torch::Tensor transmittance,
    torch::Tensor transmittance_grad)
{
    DEVICE_GUARD(packed_info);
    CHECK_INPUT(packed_info);
    // alphas is read through a raw pointer by the kernel, so it needs the same
    // validation as the other inputs.
    CHECK_INPUT(alphas);
    CHECK_INPUT(transmittance);
    CHECK_INPUT(transmittance_grad);
    TORCH_CHECK(packed_info.ndimension() == 2);
    // Logical && so that size(1) is only queried once the rank is known to be 2.
    TORCH_CHECK(alphas.ndimension() == 2 && alphas.size(1) == 1);
    TORCH_CHECK(transmittance.ndimension() == 2 && transmittance.size(1) == 1);
    TORCH_CHECK(transmittance_grad.ndimension() == 2 && transmittance_grad.size(1) == 1);

    const uint32_t n_rays = packed_info.size(0);

    // outputs
    torch::Tensor alphas_grad = torch::empty_like(alphas);
    if (n_rays == 0)
    {
        // No rays: skip the launch instead of using an empty grid.
        return alphas_grad;
    }

    const int threads = 256;
    const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);

    // parallel across rays
    transmittance_from_alpha_backward_kernel<<<
        blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
        n_rays,
        // inputs
        packed_info.data_ptr<int>(),
        alphas.data_ptr<float>(),
        transmittance.data_ptr<float>(),
        transmittance_grad.data_ptr<float>(),
        // outputs
        alphas_grad.data_ptr<float>());
    return alphas_grad;
}

// Host wrapper: patch-based transmittance from opacities.
//
// packed_info : (n_patches, 2) int, per patch (first sample offset, sample count)
// alphas      : (n_samples, patch_size, 1) float opacities
// Returns a tensor shaped like `alphas`.
//
// The launch uses 256-thread blocks: threads.y is patch_size rounded up to a
// power of two and threads.x = 256 / threads.y patches per block.
torch::Tensor transmittance_from_alpha_patch_based_forward_naive(
    torch::Tensor packed_info, torch::Tensor alphas)
{
    DEVICE_GUARD(packed_info);
    CHECK_INPUT(packed_info);
    CHECK_INPUT(alphas);
    TORCH_CHECK(packed_info.ndimension() == 2);
    // Logical && so that size(2) is only queried once the rank is known to be 3.
    TORCH_CHECK(alphas.ndimension() == 3 && alphas.size(2) == 1);

    const uint32_t n_patches = packed_info.size(0);
    const uint32_t patch_size  = alphas.size(1);

    // outputs
    torch::Tensor transmittance = torch::empty_like(alphas);
    if (n_patches == 0 || patch_size == 0)
    {
        // Nothing to compute; also keeps the block-size math below away from
        // a zero patch size.
        return transmittance;
    }

    // A larger patch would make threads.x == 0 and divide by zero when the
    // grid size is computed.
    TORCH_CHECK(patch_size <= 256,
                "patch_size must be at most 256, got ", patch_size);

    // round patch size up to the next power of two (integer arithmetic)
    uint32_t thread_for_a_patch = 1;
    while (thread_for_a_patch < patch_size)
    {
        thread_for_a_patch <<= 1;
    }
    const uint32_t thread_for_n_samples = 256 / thread_for_a_patch;

    const dim3 threads(thread_for_n_samples, thread_for_a_patch);
    const dim3 blocks((n_patches+threads.x-1)/threads.x, (patch_size+threads.y-1)/threads.y);

    // parallel across rays
    transmittance_from_alpha_patch_based_forward_kernel<<<
        blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
        n_patches,
        patch_size,
        // inputs
        packed_info.data_ptr<int>(),
        alphas.data_ptr<float>(),
        // outputs
        transmittance.data_ptr<float>());
    return transmittance;
}

// Host wrapper: gradient w.r.t. alphas of the patch-based alpha ->
// transmittance mapping.
//
// packed_info        : (n_patches, 2) int, per patch (first sample offset, sample count)
// alphas             : (n_samples, patch_size, 1) float opacities
// transmittance      : (n_samples, patch_size, 1) forward output
// transmittance_grad : (n_samples, patch_size, 1) upstream gradient
// Returns a tensor shaped like `alphas`.
//
// The launch uses 256-thread blocks: threads.y is patch_size rounded up to a
// power of two and threads.x = 256 / threads.y patches per block.
torch::Tensor transmittance_from_alpha_patch_based_backward_naive(
    torch::Tensor packed_info,
    torch::Tensor alphas,
    torch::Tensor transmittance,
    torch::Tensor transmittance_grad)
{
    DEVICE_GUARD(packed_info);
    CHECK_INPUT(packed_info);
    // alphas is read through a raw pointer by the kernel, so it needs the same
    // validation as the other inputs.
    CHECK_INPUT(alphas);
    CHECK_INPUT(transmittance);
    CHECK_INPUT(transmittance_grad);
    TORCH_CHECK(packed_info.ndimension() == 2);
    // Logical && so that size(2) is only queried once the rank is known to be 3.
    TORCH_CHECK(alphas.ndimension() == 3 && alphas.size(2) == 1);
    TORCH_CHECK(transmittance.ndimension() == 3 && transmittance.size(2) == 1);
    TORCH_CHECK(transmittance_grad.ndimension() == 3 && transmittance_grad.size(2) == 1);

    const uint32_t n_patches = packed_info.size(0);
    const uint32_t patch_size = alphas.size(1);

    // outputs
    torch::Tensor alphas_grad = torch::empty_like(alphas);
    if (n_patches == 0 || patch_size == 0)
    {
        // Nothing to compute; also keeps the block-size math below away from
        // a zero patch size.
        return alphas_grad;
    }

    // A larger patch would make threads.x == 0 and divide by zero when the
    // grid size is computed.
    TORCH_CHECK(patch_size <= 256,
                "patch_size must be at most 256, got ", patch_size);

    // round patch size up to the next power of two (integer arithmetic)
    uint32_t thread_for_a_patch = 1;
    while (thread_for_a_patch < patch_size)
    {
        thread_for_a_patch <<= 1;
    }
    const uint32_t thread_for_n_samples = 256 / thread_for_a_patch;

    const dim3 threads(thread_for_n_samples, thread_for_a_patch);
    const dim3 blocks((n_patches+threads.x-1)/threads.x, (patch_size+threads.y-1)/threads.y);

    // parallel across rays
    transmittance_from_alpha_patch_based_backward_kernel<<<
        blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
        n_patches,
        patch_size,
        // inputs
        packed_info.data_ptr<int>(),
        alphas.data_ptr<float>(),
        transmittance.data_ptr<float>(),
        transmittance_grad.data_ptr<float>(),
        // outputs
        alphas_grad.data_ptr<float>());
    return alphas_grad;
}

================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/nerfacc/cuda/csrc/render_transmittance_cub.cu
================================================
/*
 * Copyright (c) 2022 Ruilong Li, UC Berkeley.
 */
// CUB is supported in CUDA >= 11.0
// ExclusiveScanByKey is supported in CUB >= 1.15.0 (CUDA >= 11.6)
// See: https://github.com/NVIDIA/cub/tree/main#releases
#include "include/helpers_cuda.h"
#if CUB_SUPPORTS_SCAN_BY_KEY()
#include <cub/cub.cuh>
#endif

// Binary functor returning the product of its two arguments; used as the
// scan operator for the exclusive product-by-key scan.
struct Product
{
    template <typename T>
    __host__ __device__ __forceinline__ T operator()(const T &lhs, const T &rhs) const
    {
        return lhs * rhs;
    }
};

#if CUB_SUPPORTS_SCAN_BY_KEY()
// Exclusive prefix sum of `input` restarted at every change of `keys`
// (segments are runs of equal consecutive keys), written to `output`.
//
// The previous bound compared an int64_t against the int64_t maximum, which
// can never fail. NOTE(review): the bound below assumes the targeted CUB
// by-key scan takes an `int` item count, as PyTorch's own CUB wrappers assume
// -- confirm against the CUB version in use.
template <typename KeysInputIteratorT, typename ValuesInputIteratorT, typename ValuesOutputIteratorT>
inline void exclusive_sum_by_key(
    KeysInputIteratorT keys, ValuesInputIteratorT input, ValuesOutputIteratorT output, int64_t num_items)
{
    TORCH_CHECK(num_items <= std::numeric_limits<int>::max(),
                "cub ExclusiveSumByKey does not support more than INT_MAX elements");
    CUB_WRAPPER(cub::DeviceScan::ExclusiveSumByKey, keys, input, output,
                num_items, cub::Equality(), at::cuda::getCurrentCUDAStream());
}

// Exclusive prefix product of `input` (initial value 1) restarted at every
// change of `keys` (segments are runs of equal consecutive keys), written to
// `output`.
//
// The previous bound compared an int64_t against the int64_t maximum, which
// can never fail. NOTE(review): the bound below assumes the targeted CUB
// by-key scan takes an `int` item count, as PyTorch's own CUB wrappers assume
// -- confirm against the CUB version in use.
template <typename KeysInputIteratorT, typename ValuesInputIteratorT, typename ValuesOutputIteratorT>
inline void exclusive_prod_by_key(
    KeysInputIteratorT keys, ValuesInputIteratorT input, ValuesOutputIteratorT output, int64_t num_items)
{
    TORCH_CHECK(num_items <= std::numeric_limits<int>::max(),
                "cub ExclusiveScanByKey does not support more than INT_MAX elements");
    CUB_WRAPPER(cub::DeviceScan::ExclusiveScanByKey, keys, input, output, Product(), 1.0f,
                num_items, cub::Equality(), at::cuda::getCurrentCUDAStream());
}
#endif

// Transmittance from densities, parallel across samples via a CUB by-key scan:
//   transmittance = exp(-exclusive_sum_per_ray(sigmas * (ends - starts))).
//
// ray_indices : (n_samples,) int64 ray id of each sample, used as the scan key
// starts/ends : (n_samples, 1) float interval bounds
// sigmas      : (n_samples, 1) float densities
// Throws std::runtime_error when built without CUB scan-by-key support.
torch::Tensor transmittance_from_sigma_forward_cub(
    torch::Tensor ray_indices,
    torch::Tensor starts,
    torch::Tensor ends,
    torch::Tensor sigmas)
{
    DEVICE_GUARD(ray_indices);
    CHECK_INPUT(ray_indices);
    CHECK_INPUT(starts);
    CHECK_INPUT(ends);
    CHECK_INPUT(sigmas);
    TORCH_CHECK(ray_indices.ndimension() == 1);
    // Logical && so that size(1) is only queried once the rank is known to be 2.
    TORCH_CHECK(starts.ndimension() == 2 && starts.size(1) == 1);
    TORCH_CHECK(ends.ndimension() == 2 && ends.size(1) == 1);
    TORCH_CHECK(sigmas.ndimension() == 2 && sigmas.size(1) == 1);

    const uint32_t n_samples = sigmas.size(0);

    // parallel across samples
    torch::Tensor sigmas_dt = sigmas * (ends - starts);
    torch::Tensor sigmas_dt_cumsum = torch::empty_like(sigmas);
#if CUB_SUPPORTS_SCAN_BY_KEY()
    exclusive_sum_by_key(
        ray_indices.data_ptr<int64_t>(),
        sigmas_dt.data_ptr<float>(),
        sigmas_dt_cumsum.data_ptr<float>(),
        n_samples);
#else
    // The exception must actually be thrown; merely constructing it would let
    // the function return values derived from uninitialised memory.
    throw std::runtime_error("CUB functions are only supported in CUDA >= 11.6.");
#endif
    torch::Tensor transmittance = (-sigmas_dt_cumsum).exp();
    return transmittance;
}

// Gradient w.r.t. sigmas of the sigma -> transmittance mapping, parallel
// across samples. A by-key exclusive sum over reversed iterators yields, for
// each sample, the sum of (-transmittance_grad * transmittance) over the later
// samples of the same ray; that is then scaled by the interval length.
//
// ray_indices        : (n_samples,) int64 ray id of each sample (scan key)
// starts/ends        : (n_samples, 1) float interval bounds
// transmittance      : (n_samples, 1) forward output
// transmittance_grad : (n_samples, 1) upstream gradient
// Throws std::runtime_error when built without CUB scan-by-key support.
torch::Tensor transmittance_from_sigma_backward_cub(
    torch::Tensor ray_indices,
    torch::Tensor starts,
    torch::Tensor ends,
    torch::Tensor transmittance,
    torch::Tensor transmittance_grad)
{
    DEVICE_GUARD(ray_indices);
    CHECK_INPUT(ray_indices);
    CHECK_INPUT(starts);
    CHECK_INPUT(ends);
    CHECK_INPUT(transmittance);
    CHECK_INPUT(transmittance_grad);
    TORCH_CHECK(ray_indices.ndimension() == 1);
    // Logical && so that size(1) is only queried once the rank is known to be 2.
    TORCH_CHECK(starts.ndimension() == 2 && starts.size(1) == 1);
    TORCH_CHECK(ends.ndimension() == 2 && ends.size(1) == 1);
    TORCH_CHECK(transmittance.ndimension() == 2 && transmittance.size(1) == 1);
    TORCH_CHECK(transmittance_grad.ndimension() == 2 && transmittance_grad.size(1) == 1);

    const uint32_t n_samples = transmittance.size(0);

    // parallel across samples
    torch::Tensor sigmas_dt_cumsum_grad = -transmittance_grad * transmittance;
    torch::Tensor sigmas_dt_grad = torch::empty_like(transmittance_grad);
#if CUB_SUPPORTS_SCAN_BY_KEY()
    exclusive_sum_by_key(
        thrust::make_reverse_iterator(ray_indices.data_ptr<int64_t>() + n_samples),
        thrust::make_reverse_iterator(sigmas_dt_cumsum_grad.data_ptr<float>() + n_samples),
        thrust::make_reverse_iterator(sigmas_dt_grad.data_ptr<float>() + n_samples),
        n_samples);
#else
    // The exception must actually be thrown; merely constructing it would let
    // the function return values derived from uninitialised memory.
    throw std::runtime_error("CUB functions are only supported in CUDA >= 11.6.");
#endif
    torch::Tensor sigmas_grad = sigmas_dt_grad * (ends - starts);
    return sigmas_grad;
}

// Transmittance from opacities, parallel across samples via a CUB by-key scan:
//   transmittance = exclusive_product_per_ray(1 - alphas).
//
// ray_indices : (n_samples,) int64 ray id of each sample, used as the scan key
// alphas      : (n_samples, 1) float opacities
// Throws std::runtime_error when built without CUB scan-by-key support.
torch::Tensor transmittance_from_alpha_forward_cub(
    torch::Tensor ray_indices, torch::Tensor alphas)
{
    DEVICE_GUARD(ray_indices);
    CHECK_INPUT(ray_indices);
    CHECK_INPUT(alphas);
    // Logical && so that size(1) is only queried once the rank is known to be 2.
    TORCH_CHECK(alphas.ndimension() == 2 && alphas.size(1) == 1);
    TORCH_CHECK(ray_indices.ndimension() == 1);

    const uint32_t n_samples = alphas.size(0);

    // parallel across samples
    torch::Tensor transmittance = torch::empty_like(alphas);
#if CUB_SUPPORTS_SCAN_BY_KEY()
    // Hold the scan input in a named tensor so its storage clearly outlives
    // the raw pointer handed to the scan.
    torch::Tensor one_minus_alphas = 1.0f - alphas;
    exclusive_prod_by_key(
        ray_indices.data_ptr<int64_t>(),
        one_minus_alphas.data_ptr<float>(),
        transmittance.data_ptr<float>(),
        n_samples);
#else
    // The exception must actually be thrown; merely constructing it would let
    // the function return uninitialised memory.
    throw std::runtime_error("CUB functions are only supported in CUDA >= 11.6.");
#endif
    return transmittance;
}

// Gradient w.r.t. alphas of the alpha -> transmittance mapping, parallel
// across samples. A by-key exclusive sum over reversed iterators yields, for
// each sample, the sum of (-transmittance_grad * transmittance) over the later
// samples of the same ray; that is divided by (1 - alphas) floored at 1e-10.
//
// ray_indices        : (n_samples,) int64 ray id of each sample (scan key)
// alphas             : float opacities, combined elementwise with the
//                      (n_samples, 1) gradient
// transmittance      : (n_samples, 1) forward output
// transmittance_grad : (n_samples, 1) upstream gradient
// Throws std::runtime_error when built without CUB scan-by-key support.
torch::Tensor transmittance_from_alpha_backward_cub(
    torch::Tensor ray_indices,
    torch::Tensor alphas,
    torch::Tensor transmittance,
    torch::Tensor transmittance_grad)
{
    DEVICE_GUARD(ray_indices);
    CHECK_INPUT(ray_indices);
    // alphas takes part in the result below, so validate it like the others.
    CHECK_INPUT(alphas);
    CHECK_INPUT(transmittance);
    CHECK_INPUT(transmittance_grad);
    TORCH_CHECK(ray_indices.ndimension() == 1);
    // Logical && so that size(1) is only queried once the rank is known to be 2.
    TORCH_CHECK(transmittance.ndimension() == 2 && transmittance.size(1) == 1);
    TORCH_CHECK(transmittance_grad.ndimension() == 2 && transmittance_grad.size(1) == 1);

    const uint32_t n_samples = transmittance.size(0);

    // parallel across samples
    torch::Tensor sigmas_dt_cumsum_grad = -transmittance_grad * transmittance;
    torch::Tensor sigmas_dt_grad = torch::empty_like(transmittance_grad);
#if CUB_SUPPORTS_SCAN_BY_KEY()
    exclusive_sum_by_key(
        thrust::make_reverse_iterator(ray_indices.data_ptr<int64_t>() + n_samples),
        thrust::make_reverse_iterator(sigmas_dt_cumsum_grad.data_ptr<float>() + n_samples),
        thrust::make_reverse_iterator(sigmas_dt_grad.data_ptr<float>() + n_samples),
        n_samples);
#else
    // The exception must actually be thrown; merely constructing it would let
    // the function return values derived from uninitialised memory.
    throw std::runtime_error("CUB functions are only supported in CUDA >= 11.6.");
#endif
    torch::Tensor alphas_grad = sigmas_dt_grad / (1.0f - alphas).clamp_min(1e-10f);
    return alphas_grad;
}


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/nerfacc/cuda/csrc/render_weight.cu
================================================
/*
 * Copyright (c) 2022 Ruilong Li, UC Berkeley.
 */

#include "include/helpers_cuda.h"

// Per-ray rendering weights from densities. For each sample j of ray `i`:
//   alpha_j   = 1 - exp(-sigmas[j] * (ends[j] - starts[j]))
//   weights[j] = alpha_j * prod_{m<j} (1 - alpha_m)
// packed_info holds, per ray, the offset of its first sample and its sample count.
__global__ void weight_from_sigma_forward_kernel(
    const uint32_t n_rays,
    const int *packed_info,
    const float *starts,
    const float *ends,
    const float *sigmas,
    // outputs
    float *weights)
{
    CUDA_GET_THREAD_ID(i, n_rays);

    const int offset = packed_info[i * 2 + 0];
    const int count = packed_info[i * 2 + 1];
    if (count == 0)
        return;

    // Views onto this ray's segment of the packed arrays.
    const float *seg_start = starts + offset;
    const float *seg_end = ends + offset;
    const float *seg_sigma = sigmas + offset;
    float *seg_weight = weights + offset;

    // Transmittance reaching the current sample.
    float trans = 1.f;
    int s = 0;
    while (s < count)
    {
        const float dt = seg_end[s] - seg_start[s];
        const float alpha = 1.f - __expf(-seg_sigma[s] * dt);
        seg_weight[s] = alpha * trans;
        trans *= (1.f - alpha);
        ++s;
    }
}

// Backward pass of the sigma -> weight mapping for ray `i`.
// A first sweep sums grad_weights * weights over the whole ray. A second sweep
// walks the samples front to back, keeping the transmittance T reaching the
// sample and the part of that sum not yet consumed, and writes
//   grad_sigmas[j] = (grad_weights[j] * T - remaining_sum) * delta_j.
__global__ void weight_from_sigma_backward_kernel(
    const uint32_t n_rays,
    const int *packed_info,
    const float *starts,
    const float *ends,
    const float *sigmas,
    const float *weights,
    const float *grad_weights,
    // outputs
    float *grad_sigmas)
{
    CUDA_GET_THREAD_ID(i, n_rays);

    const int offset = packed_info[i * 2 + 0];
    const int count = packed_info[i * 2 + 1];
    if (count == 0)
        return;

    // Views onto this ray's segment of the packed arrays.
    const float *seg_start = starts + offset;
    const float *seg_end = ends + offset;
    const float *seg_sigma = sigmas + offset;
    const float *seg_weight = weights + offset;
    const float *seg_grad_w = grad_weights + offset;
    float *seg_grad_sigma = grad_sigmas + offset;

    // Sweep 1: total of grad_weights * weights along the ray.
    float remaining = 0;
    for (int s = 0; s < count; ++s)
    {
        remaining += seg_grad_w[s] * seg_weight[s];
    }

    // Sweep 2: emit the gradient while consuming the total front to back.
    float trans = 1.f;
    for (int s = 0; s < count; ++s)
    {
        const float dt = seg_end[s] - seg_start[s];
        const float alpha = 1.f - __expf(-seg_sigma[s] * dt);
        seg_grad_sigma[s] = (seg_grad_w[s] * trans - remaining) * dt;
        remaining -= seg_grad_w[s] * seg_weight[s];
        trans *= (1.f - alpha);
    }
}

// template <typename scalar_t>
// Patch-based rendering weights from opacities.
// Thread (i, k) handles ray `k` of patch `i`; sample `j` of that ray is stored
// at j * patch_size + k past the patch's first sample. For that ray:
//   weights[j] = alphas[j] * prod_{m<j} (1 - alphas[m]).
__global__ void weight_from_alpha_patch_based_forward_kernel(
    const uint32_t n_patches,
    const uint32_t patch_size,
    const int *packed_info, // (n_patches, 2)
    const float *alphas,    // (n_samples, patch_size, 1)
    // outputs
    float *weights)
{
    CUDA_GET_THREAD_ID_2D(i, k, n_patches, patch_size);  // i: patch id, k: ray id inside the patch

    const int first_sample = packed_info[i * 2 + 0];
    const int n_steps = packed_info[i * 2 + 1];
    if (n_steps == 0)
        return;

    // Advance both arrays to the start of this patch's block.
    alphas += first_sample * patch_size;
    weights += first_sample * patch_size;

    // Transmittance reaching the current sample.
    float trans = 1.f;
    int j = 0;
    while (j < n_steps)
    {
        const uint32_t at = j * patch_size + k;
        const float a = alphas[at];
        weights[at] = a * trans;
        trans *= (1.f - a);
        ++j;
    }
}

// Forward pass of alpha compositing for patch-packed samples that also exports the
// transmittance seen by every sample.
//
// Each thread handles one ray of one patch (patch index i, ray-in-patch index k).
// For step s it stores T (the product of (1 - alpha) over the earlier steps) in
// `transmittance` and alpha * T in `weights`, both at ((base + s) * patch_size + k).
// NOTE(review): CUDA_GET_THREAD_ID_2D is defined outside this chunk; it is expected to
// bind i and k and to exit for out-of-range threads -- confirm against its definition.
__global__ void weight_and_transmittance_from_alpha_patch_based_forward_kernel(
    const uint32_t n_patches,
    const uint32_t patch_size,
    const int *packed_info, // (n_patches, 2)
    const float *alphas,  // (n_samples, patch_size, 1)
    // outputs
    float *weights,
    float *transmittance  // written at the same indices as alphas
    ){
    CUDA_GET_THREAD_ID_2D(i, k, n_patches, patch_size);

    // per-patch bookkeeping: first sample row and number of sample rows
    const int first_row = packed_info[i * 2 + 0];
    const int n_steps = packed_info[i * 2 + 1];
    if (n_steps == 0)
        return;

    // views starting at this patch's first sample row
    const float *patch_alphas = alphas + first_row * patch_size;
    float *patch_weights = weights + first_row * patch_size;
    float *patch_trans = transmittance + first_row * patch_size;

    // front-to-back compositing along the ray
    float trans = 1.f;
    for (int step = 0; step < n_steps; ++step)
    {
        const uint32_t idx = step * patch_size + k;
        const float a = patch_alphas[idx];
        patch_trans[idx] = trans;      // transmittance *before* this sample absorbs light
        patch_weights[idx] = a * trans;
        trans *= (1.f - a);
    }
}

// Forward pass of alpha compositing for ray-packed samples.
//
// One thread per ray. packed_info stores two ints per ray: the index of the ray's
// first sample and its number of samples. For every sample the kernel writes
//   weights = alpha * T,   T = running product of (1 - alpha) over earlier samples.
// NOTE(review): CUDA_GET_THREAD_ID is defined outside this chunk; it is expected to
// bind i and to exit for threads with i >= n_rays -- confirm against its definition.
__global__ void weight_from_alpha_forward_kernel(
    const uint32_t n_rays,
    const int *packed_info,
    const float *alphas,
    // outputs
    float *weights)
{
    CUDA_GET_THREAD_ID(i, n_rays);

    // per-ray bookkeeping
    const int offset = packed_info[i * 2 + 0];
    const int n_steps = packed_info[i * 2 + 1];
    if (n_steps == 0)
        return;

    // views onto this ray's samples
    const float *ray_alphas = alphas + offset;
    float *ray_weights = weights + offset;

    // front-to-back compositing
    float trans = 1.f;
    for (int s = 0; s < n_steps; ++s)
    {
        const float a = ray_alphas[s];
        ray_weights[s] = a * trans;
        trans *= (1.f - a);
    }
}

// Backward pass matching weight_from_alpha_forward_kernel: turns grad_weights into
// grad_alphas for ray-packed samples, one thread per ray.
//
// With w_s = alpha_s * T_s the gradient of sample s is
//   (grad_w_s * T_s - sum_{t >= s} grad_w_t * w_t) / (1 - alpha_s),
// so the kernel first builds the full-ray sum and then peels one term off per step.
// NOTE(review): CUDA_GET_THREAD_ID is defined outside this chunk; it is expected to
// bind i and to exit for threads with i >= n_rays -- confirm against its definition.
__global__ void weight_from_alpha_backward_kernel(
    const uint32_t n_rays,
    const int *packed_info,
    const float *alphas,
    const float *weights,
    const float *grad_weights,
    // outputs
    float *grad_alphas)
{
    CUDA_GET_THREAD_ID(i, n_rays);

    // per-ray bookkeeping: first sample index and sample count
    const int offset = packed_info[i * 2 + 0];
    const int n_steps = packed_info[i * 2 + 1];
    if (n_steps == 0)
        return;

    // views onto this ray's samples
    const float *ray_alphas = alphas + offset;
    const float *ray_weights = weights + offset;
    const float *ray_grad_w = grad_weights + offset;
    float *ray_grad_a = grad_alphas + offset;

    // tail = sum of grad_w * w over the samples not yet visited (initially all of them)
    float tail = 0.f;
    for (int s = 0; s < n_steps; ++s)
    {
        tail += ray_grad_w[s] * ray_weights[s];
    }

    float trans = 1.f;
    for (int s = 0; s < n_steps; ++s)
    {
        const float a = ray_alphas[s];
        // the denominator is floored at 1e-10 so alpha == 1 does not divide by zero
        ray_grad_a[s] = (ray_grad_w[s] * trans - tail) / fmaxf(1.f - a, 1e-10f);
        tail -= ray_grad_w[s] * ray_weights[s];
        trans *= (1.f - a);
    }
}


// Forward pass of alpha compositing with per-sample importance re-weighting.
//
// One thread per ray. For every sample of the ray the kernel writes
//   weights = alpha * T / importance,
// where T is the running product of (1 - alpha) over the earlier samples.
// The division is unguarded, so importance values are assumed to be non-zero.
// NOTE(review): CUDA_GET_THREAD_ID is defined outside this chunk; it is expected to
// bind i and to exit for threads with i >= n_rays -- confirm against its definition.
__global__ void weight_from_alpha_importance_sampling_forward_kernel(
    const uint32_t n_rays,
    const int *packed_info,
    const float *alphas,
    const float *importance,
    // outputs
    float *weights)
{
    CUDA_GET_THREAD_ID(i, n_rays);

    // per-ray bookkeeping: first sample index and sample count
    const int offset = packed_info[i * 2 + 0];
    const int n_steps = packed_info[i * 2 + 1];
    if (n_steps == 0)
        return;

    // views onto this ray's samples
    const float *ray_alphas = alphas + offset;
    const float *ray_importance = importance + offset;
    float *ray_weights = weights + offset;

    // front-to-back compositing
    float trans = 1.f;
    for (int s = 0; s < n_steps; ++s)
    {
        const float a = ray_alphas[s];
        ray_weights[s] = a * trans / ray_importance[s];
        trans *= (1.f - a);
    }
}

// Backward pass matching weight_from_alpha_importance_sampling_forward_kernel:
// turns grad_weights into grad_alphas, one thread per ray.
//
// With w_s = alpha_s * T_s / importance_s the gradient of sample s is
//   (grad_w_s * T_s - importance_s * sum_{t >= s} grad_w_t * w_t)
//       / (importance_s * (1 - alpha_s)).
// NOTE(review): CUDA_GET_THREAD_ID is defined outside this chunk; it is expected to
// bind i and to exit for threads with i >= n_rays -- confirm against its definition.
__global__ void weight_from_alpha_importance_sampling_backward_kernel(
    const uint32_t n_rays,
    const int *packed_info,
    const float *alphas,
    const float *weights,
    const float *grad_weights,
    const float *importance,
    // outputs
    float *grad_alphas)
{
    CUDA_GET_THREAD_ID(i, n_rays);

    // per-ray bookkeeping: first sample index and sample count
    const int offset = packed_info[i * 2 + 0];
    const int n_steps = packed_info[i * 2 + 1];
    if (n_steps == 0)
        return;

    // views onto this ray's samples
    const float *ray_alphas = alphas + offset;
    const float *ray_weights = weights + offset;
    const float *ray_grad_w = grad_weights + offset;
    const float *ray_importance = importance + offset;
    float *ray_grad_a = grad_alphas + offset;

    // tail = sum of grad_w * w over the samples not yet visited (initially all of them)
    float tail = 0.f;
    for (int s = 0; s < n_steps; ++s)
    {
        tail += ray_grad_w[s] * ray_weights[s];
    }

    float trans = 1.f;
    for (int s = 0; s < n_steps; ++s)
    {
        const float a = ray_alphas[s];
        // (1 - alpha) is floored at 1e-10 so alpha == 1 does not divide by zero
        ray_grad_a[s] = (ray_grad_w[s] * trans - ray_importance[s] * tail) / (ray_importance[s] * fmaxf(1.f - a, 1e-10f));
        tail -= ray_grad_w[s] * ray_weights[s];
        trans *= (1.f - a);
    }
}


// Backward pass matching weight_from_alpha_patch_based_forward_kernel: turns
// grad_weights into grad_alphas for patch-packed samples.
//
// Each thread handles one ray of one patch (patch index i, ray-in-patch index k); the
// element for step s sits at ((base + s) * patch_size + k). With w_s = alpha_s * T_s
// the gradient of step s is
//   (grad_w_s * T_s - sum_{t >= s} grad_w_t * w_t) / (1 - alpha_s).
// NOTE(review): CUDA_GET_THREAD_ID_2D is defined outside this chunk; it is expected to
// bind i and k and to exit for out-of-range threads -- confirm against its definition.
__global__ void weight_from_alpha_patch_based_backward_kernel(
    const uint32_t n_patches,
    const uint32_t patch_size,
    const int *packed_info,
    const float *alphas,
    const float *weights,
    const float *grad_weights,
    // outputs
    float *grad_alphas)
{
    CUDA_GET_THREAD_ID_2D(i, k, n_patches, patch_size);

    // per-patch bookkeeping: first sample row and number of sample rows
    const int first_row = packed_info[i * 2 + 0];
    const int n_steps = packed_info[i * 2 + 1];
    if (n_steps == 0)
        return;

    // views starting at this patch's first sample row
    const float *patch_alphas = alphas + first_row * patch_size;
    const float *patch_weights = weights + first_row * patch_size;
    const float *patch_grad_w = grad_weights + first_row * patch_size;
    float *patch_grad_a = grad_alphas + first_row * patch_size;

    // tail = sum of grad_w * w over the steps not yet visited (initially all of them)
    float tail = 0.f;
    for (int step = 0; step < n_steps; ++step)
    {
        const uint32_t idx = step * patch_size + k;
        tail += patch_grad_w[idx] * patch_weights[idx];
    }

    float trans = 1.f;
    for (int step = 0; step < n_steps; ++step)
    {
        const uint32_t idx = step * patch_size + k;
        const float a = patch_alphas[idx];
        // the denominator is floored at 1e-10 so alpha == 1 does not divide by zero
        patch_grad_a[idx] = (patch_grad_w[idx] * trans - tail) / fmaxf(1.f - a, 1e-10f);
        tail -= patch_grad_w[idx] * patch_weights[idx];
        trans *= (1.f - a);
    }
}

// Backward pass used with the weight-and-transmittance patch-based forward kernel.
//
// Only grad_weights is consumed: this kernel has no input for a gradient flowing
// through the transmittance output, so grad_alphas reflects the weights alone.
// Each thread handles one ray of one patch (patch index i, ray-in-patch index k); the
// element for step s sits at ((base + s) * patch_size + k). With w_s = alpha_s * T_s
// the gradient of step s is
//   (grad_w_s * T_s - sum_{t >= s} grad_w_t * w_t) / (1 - alpha_s).
// NOTE(review): CUDA_GET_THREAD_ID_2D is defined outside this chunk; it is expected to
// bind i and k and to exit for out-of-range threads -- confirm against its definition.
__global__ void weight_and_transmittance_from_alpha_patch_based_backward_kernel(
    const uint32_t n_patches,
    const uint32_t patch_size,
    const int *packed_info,
    const float *alphas,
    const float *weights,
    const float *grad_weights,
    // outputs
    float *grad_alphas)
{
    CUDA_GET_THREAD_ID_2D(i, k, n_patches, patch_size);

    // per-patch bookkeeping: first sample row and number of sample rows
    const int first_row = packed_info[i * 2 + 0];
    const int n_steps = packed_info[i * 2 + 1];
    if (n_steps == 0)
        return;

    // views starting at this patch's first sample row
    const float *patch_alphas = alphas + first_row * patch_size;
    const float *patch_weights = weights + first_row * patch_size;
    const float *patch_grad_w = grad_weights + first_row * patch_size;
    float *patch_grad_a = grad_alphas + first_row * patch_size;

    // tail = sum of grad_w * w over the steps not yet visited (initially all of them)
    float tail = 0.f;
    for (int step = 0; step < n_steps; ++step)
    {
        const uint32_t idx = step * patch_size + k;
        tail += patch_grad_w[idx] * patch_weights[idx];
    }

    float trans = 1.f;
    for (int step = 0; step < n_steps; ++step)
    {
        const uint32_t idx = step * patch_size + k;
        const float a = patch_alphas[idx];
        // the denominator is floored at 1e-10 so alpha == 1 does not divide by zero
        patch_grad_a[idx] = (patch_grad_w[idx] * trans - tail) / fmaxf(1.f - a, 1e-10f);
        tail -= patch_grad_w[idx] * patch_weights[idx];
        trans *= (1.f - a);
    }
}

// Host launcher for weight_from_sigma_forward_kernel.
//
// Args:
//   packed_info: 2-D int tensor with one row per ray.
//   starts, ends, sigmas: per-sample float tensors of shape (n_samples, 1).
// Returns:
//   A tensor shaped like `sigmas` holding the rendering weights written by the kernel.
//   Entries the kernel never writes are left uninitialised (the output is allocated
//   with empty_like).
torch::Tensor weight_from_sigma_forward_naive(
    torch::Tensor packed_info,
    torch::Tensor starts,
    torch::Tensor ends,
    torch::Tensor sigmas)
{
    DEVICE_GUARD(packed_info);
    CHECK_INPUT(packed_info);
    CHECK_INPUT(starts);
    CHECK_INPUT(ends);
    CHECK_INPUT(sigmas);

    TORCH_CHECK(packed_info.ndimension() == 2);
    // Logical && short-circuits, so size(1) is only queried once the rank is known
    // to be 2 and a wrong rank fails this check instead of raising an index error.
    TORCH_CHECK(starts.ndimension() == 2 && starts.size(1) == 1);
    TORCH_CHECK(ends.ndimension() == 2 && ends.size(1) == 1);
    TORCH_CHECK(sigmas.ndimension() == 2 && sigmas.size(1) == 1);

    const uint32_t n_rays = packed_info.size(0);

    // outputs
    torch::Tensor weights = torch::empty_like(sigmas);

    // No rays: nothing to compute, so skip the kernel launch entirely.
    if (n_rays == 0)
        return weights;

    // one thread per ray, 256 threads per block
    const int threads = 256;
    const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);

    weight_from_sigma_forward_kernel<<<
        blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
        n_rays,
        // inputs
        packed_info.data_ptr<int>(),
        starts.data_ptr<float>(),
        ends.data_ptr<float>(),
        sigmas.data_ptr<float>(),
        // outputs
        weights.data_ptr<float>());
    return weights;
}

// Host launcher for weight_from_sigma_backward_kernel.
//
// Args:
//   weights: forward-pass weights, shape (n_samples, 1).
//   grad_weights: gradient w.r.t. those weights, shape (n_samples, 1).
//   packed_info: 2-D int tensor with one row per ray.
//   starts, ends, sigmas: the forward-pass inputs, each of shape (n_samples, 1).
// Returns:
//   A tensor shaped like `sigmas` holding the gradient w.r.t. sigmas. Entries the
//   kernel never writes are left uninitialised (the output is allocated with
//   empty_like).
torch::Tensor weight_from_sigma_backward_naive(
    torch::Tensor weights,
    torch::Tensor grad_weights,
    torch::Tensor packed_info,
    torch::Tensor starts,
    torch::Tensor ends,
    torch::Tensor sigmas)
{
    DEVICE_GUARD(packed_info);
    CHECK_INPUT(weights);
    CHECK_INPUT(grad_weights);
    CHECK_INPUT(packed_info);
    CHECK_INPUT(starts);
    CHECK_INPUT(ends);
    CHECK_INPUT(sigmas);

    TORCH_CHECK(packed_info.ndimension() == 2);
    // Logical && short-circuits, so size(1) is only queried once the rank is known
    // to be 2 and a wrong rank fails this check instead of raising an index error.
    TORCH_CHECK(starts.ndimension() == 2 && starts.size(1) == 1);
    TORCH_CHECK(ends.ndimension() == 2 && ends.size(1) == 1);
    TORCH_CHECK(sigmas.ndimension() == 2 && sigmas.size(1) == 1);
    TORCH_CHECK(weights.ndimension() == 2 && weights.size(1) == 1);
    TORCH_CHECK(grad_weights.ndimension() == 2 && grad_weights.size(1) == 1);

    const uint32_t n_rays = packed_info.size(0);

    // outputs
    torch::Tensor grad_sigmas = torch::empty_like(sigmas);

    // No rays: nothing to compute, so skip the kernel launch entirely.
    if (n_rays == 0)
        return grad_sigmas;

    // one thread per ray, 256 threads per block
    const int threads = 256;
    const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);

    weight_from_sigma_backward_kernel<<<
        blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
        n_rays,
        // inputs
        packed_info.data_ptr<int>(),
        starts.data_ptr<float>(),
        ends.data_ptr<float>(),
        sigmas.data_ptr<float>(),
        weights.data_ptr<float>(),
        grad_weights.data_ptr<float>(),
        // outputs
        grad_sigmas.data_ptr<float>());

    return grad_sigmas;
}

// Host launcher for weight_from_alpha_forward_kernel.
//
// Args:
//   packed_info: 2-D int tensor with one row per ray.
//   alphas: per-sample opacities, shape (n_samples, 1).
// Returns:
//   A tensor shaped like `alphas` holding the rendering weights written by the kernel.
//   Entries the kernel never writes are left uninitialised (the output is allocated
//   with empty_like).
torch::Tensor weight_from_alpha_forward_naive(
    torch::Tensor packed_info, torch::Tensor alphas)
{
    DEVICE_GUARD(packed_info);
    CHECK_INPUT(packed_info);
    CHECK_INPUT(alphas);
    TORCH_CHECK(packed_info.ndimension() == 2);
    // Logical && short-circuits, so size(1) is only queried once the rank is known
    // to be 2 and a wrong rank fails this check instead of raising an index error.
    TORCH_CHECK(alphas.ndimension() == 2 && alphas.size(1) == 1);

    const uint32_t n_rays = packed_info.size(0);

    // outputs
    torch::Tensor weights = torch::empty_like(alphas);

    // No rays: nothing to compute, so skip the kernel launch entirely.
    if (n_rays == 0)
        return weights;

    // one thread per ray, 256 threads per block
    const int threads = 256;
    const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);

    weight_from_alpha_forward_kernel<<<
        blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
        n_rays,
        // inputs
        packed_info.data_ptr<int>(),
        alphas.data_ptr<float>(),
        // outputs
        weights.data_ptr<float>());
    return weights;
}

// Host launcher for weight_from_alpha_patch_based_forward_kernel.
//
// Args:
//   packed_info: (n_patches, 2) int tensor.
//   alphas: (n_samples, patch_size, 1) float tensor of opacities.
// Returns:
//   A tensor shaped like `alphas` holding the rendering weights written by the kernel.
//   Entries the kernel never writes are left uninitialised (the output is allocated
//   with empty_like).
// Raises:
//   A TORCH_CHECK failure if the shapes are wrong or patch_size exceeds the
//   256-thread block budget used by the launch configuration.
torch::Tensor weight_from_alpha_patch_based_forward_naive(
    torch::Tensor packed_info, // (n_patches, 2)
    torch::Tensor alphas // (n_samples, patches_size, 1)
    )
{
    DEVICE_GUARD(packed_info);
    CHECK_INPUT(packed_info);
    CHECK_INPUT(alphas);
    TORCH_CHECK(packed_info.ndimension() == 2);
    // Logical && short-circuits, so size(2) is only queried once the rank is known
    // to be 3 and a wrong rank fails this check instead of raising an index error.
    TORCH_CHECK(alphas.ndimension() == 3 && alphas.size(2) == 1);

    const uint32_t n_patches = packed_info.size(0);
    const uint32_t patch_size  = alphas.size(1);

    // outputs
    torch::Tensor weights = torch::empty_like(alphas);

    // Nothing to launch: a zero-sized grid is an invalid launch configuration and a
    // zero patch size would divide by zero in the layout computation below.
    if (n_patches == 0 || patch_size == 0)
        return weights;

    // Each block holds at most 256 threads; a larger patch would leave zero threads
    // along x and make the block-count computation divide by zero.
    const uint32_t max_threads = 256;
    TORCH_CHECK(
        patch_size <= max_threads,
        "patch_size must be at most 256, got ", patch_size);

    // threads.y: patch size rounded up to the next power of two (integer arithmetic,
    // so there is no floating-point rounding involved); threads.x: whatever is left
    // of the 256-thread budget.
    uint32_t thread_for_a_patch = 1;
    while (thread_for_a_patch < patch_size)
        thread_for_a_patch <<= 1;
    const uint32_t thread_for_n_samples = max_threads / thread_for_a_patch;

    const dim3 threads(thread_for_n_samples, thread_for_a_patch);
    const dim3 blocks((n_patches+threads.x-1)/threads.x, (patch_size+threads.y-1)/threads.y);

    weight_from_alpha_patch_based_forward_kernel<<<
        blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
        n_patches,
        patch_size,
        // inputs
        packed_info.data_ptr<int>(),
        alphas.data_ptr<float>(),
        // outputs
        weights.data_ptr<float>());
    return weights;
}

// Host launcher for weight_from_alpha_backward_kernel.
//
// Args:
//   weights: forward-pass weights, shape (n_samples, 1).
//   grad_weights: gradient w.r.t. those weights, shape (n_samples, 1).
//   packed_info: 2-D int tensor with one row per ray.
//   alphas: the forward-pass opacities, shape (n_samples, 1).
// Returns:
//   A tensor shaped like `alphas` holding the gradient w.r.t. alphas. Entries the
//   kernel never writes are left uninitialised (the output is allocated with
//   empty_like).
torch::Tensor weight_from_alpha_backward_naive(
    torch::Tensor weights,
    torch::Tensor grad_weights,
    torch::Tensor packed_info,
    torch::Tensor alphas)
{
    DEVICE_GUARD(packed_info);
    CHECK_INPUT(packed_info);
    CHECK_INPUT(alphas);
    CHECK_INPUT(weights);
    CHECK_INPUT(grad_weights);
    TORCH_CHECK(packed_info.ndimension() == 2);
    // Logical && short-circuits, so size(1) is only queried once the rank is known
    // to be 2 and a wrong rank fails this check instead of raising an index error.
    TORCH_CHECK(alphas.ndimension() == 2 && alphas.size(1) == 1);
    TORCH_CHECK(weights.ndimension() == 2 && weights.size(1) == 1);
    TORCH_CHECK(grad_weights.ndimension() == 2 && grad_weights.size(1) == 1);

    const uint32_t n_rays = packed_info.size(0);

    // outputs
    torch::Tensor grad_alphas = torch::empty_like(alphas);

    // No rays: nothing to compute, so skip the kernel launch entirely.
    if (n_rays == 0)
        return grad_alphas;

    // one thread per ray, 256 threads per block
    const int threads = 256;
    const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);

    weight_from_alpha_backward_kernel<<<
        blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
        n_rays,
        // inputs
        packed_info.data_ptr<int>(),
        alphas.data_ptr<float>(),
        weights.data_ptr<float>(),
        grad_weights.data_ptr<float>(),
        // outputs
        grad_alphas.data_ptr<float>());
    return grad_alphas;
}

// Host launcher for weight_from_alpha_patch_based_backward_kernel.
//
// Args:
//   weights: forward-pass weights, shape (n_samples, patch_size, 1).
//   grad_weights: gradient w.r.t. those weights, shape (n_samples, patch_size, 1).
//   packed_info: 2-D int tensor with one row per patch.
//   alphas: the forward-pass opacities, shape (n_samples, patch_size, 1).
// Returns:
//   A tensor shaped like `alphas` holding the gradient w.r.t. alphas. Entries the
//   kernel never writes are left uninitialised (the output is allocated with
//   empty_like).
// Raises:
//   A TORCH_CHECK failure if the shapes are wrong or patch_size exceeds the
//   256-thread block budget used by the launch configuration.
torch::Tensor weight_from_alpha_patch_based_backward_naive(
    torch::Tensor weights,
    torch::Tensor grad_weights,  // (n_samples, patches_size, 1)
    torch::Tensor packed_info,
    torch::Tensor alphas)  // (n_samples, patches_size, 1)
{
    DEVICE_GUARD(packed_info);
    CHECK_INPUT(packed_info);
    CHECK_INPUT(alphas);
    CHECK_INPUT(weights);
    CHECK_INPUT(grad_weights);
    TORCH_CHECK(packed_info.ndimension() == 2);
    // Logical && short-circuits, so size(2) is only queried once the rank is known
    // to be 3 and a wrong rank fails this check instead of raising an index error.
    TORCH_CHECK(alphas.ndimension() == 3 && alphas.size(2) == 1);
    TORCH_CHECK(weights.ndimension() == 3 && weights.size(2) == 1);
    TORCH_CHECK(grad_weights.ndimension() == 3 && grad_weights.size(2) == 1);

    const uint32_t n_patches = packed_info.size(0);
    const uint32_t patch_size = alphas.size(1);

    // outputs
    torch::Tensor grad_alphas = torch::empty_like(alphas);

    // Nothing to launch: a zero-sized grid is an invalid launch configuration and a
    // zero patch size would divide by zero in the layout computation below.
    if (n_patches == 0 || patch_size == 0)
        return grad_alphas;

    // Each block holds at most 256 threads; a larger patch would leave zero threads
    // along x and make the block-count computation divide by zero.
    const uint32_t max_threads = 256;
    TORCH_CHECK(
        patch_size <= max_threads,
        "patch_size must be at most 256, got ", patch_size);

    // threads.y: patch size rounded up to the next power of two (integer arithmetic,
    // so there is no floating-point rounding involved); threads.x: whatever is left
    // of the 256-thread budget.
    uint32_t thread_for_a_patch = 1;
    while (thread_for_a_patch < patch_size)
        thread_for_a_patch <<= 1;
    const uint32_t thread_for_n_samples = max_threads / thread_for_a_patch;

    const dim3 threads(thread_for_n_samples, thread_for_a_patch);
    const dim3 blocks((n_patches+threads.x-1)/threads.x, (patch_size+threads.y-1)/threads.y);

    weight_from_alpha_patch_based_backward_kernel<<<
        blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
        n_patches,
        patch_size,
        // inputs
        packed_info.data_ptr<int>(),
        alphas.data_ptr<float>(),
        weights.data_ptr<float>(),
        grad_weights.data_ptr<float>(),
        // outputs
        grad_alphas.data_ptr<float>());
    return grad_alphas;
}


// Host launcher for weight_and_transmittance_from_alpha_patch_based_forward_kernel.
//
// Args:
//   packed_info: (n_patches, 2) int tensor.
//   alphas: (n_samples, patch_size, 1) float tensor of opacities.
// Returns:
//   {weights, transmittance}, both shaped like `alphas`. Entries the kernel never
//   writes are left uninitialised (both outputs are allocated with empty_like).
// Raises:
//   A TORCH_CHECK failure if the shapes are wrong or patch_size exceeds the
//   256-thread block budget used by the launch configuration.
std::vector<torch::Tensor> weight_and_transmittance_from_alpha_patch_based_forward_naive(
    torch::Tensor packed_info, // (n_patches, 2)
    torch::Tensor alphas // (n_samples, patches_size, 1)
    )
{
    DEVICE_GUARD(packed_info);
    CHECK_INPUT(packed_info);
    CHECK_INPUT(alphas);
    TORCH_CHECK(packed_info.ndimension() == 2);
    // Logical && short-circuits, so size(2) is only queried once the rank is known
    // to be 3 and a wrong rank fails this check instead of raising an index error.
    TORCH_CHECK(alphas.ndimension() == 3 && alphas.size(2) == 1);

    const uint32_t n_patches = packed_info.size(0);
    const uint32_t patch_size  = alphas.size(1);

    // outputs
    torch::Tensor weights = torch::empty_like(alphas);
    torch::Tensor transmittance = torch::empty_like(alphas);

    // Nothing to launch: a zero-sized grid is an invalid launch configuration and a
    // zero patch size would divide by zero in the layout computation below.
    if (n_patches == 0 || patch_size == 0)
        return {weights, transmittance};

    // Each block holds at most 256 threads; a larger patch would leave zero threads
    // along x and make the block-count computation divide by zero.
    const uint32_t max_threads = 256;
    TORCH_CHECK(
        patch_size <= max_threads,
        "patch_size must be at most 256, got ", patch_size);

    // threads.y: patch size rounded up to the next power of two (integer arithmetic,
    // so there is no floating-point rounding involved); threads.x: whatever is left
    // of the 256-thread budget.
    uint32_t thread_for_a_patch = 1;
    while (thread_for_a_patch < patch_size)
        thread_for_a_patch <<= 1;
    const uint32_t thread_for_n_samples = max_threads / thread_for_a_patch;

    const dim3 threads(thread_for_n_samples, thread_for_a_patch);
    const dim3 blocks((n_patches+threads.x-1)/threads.x, (patch_size+threads.y-1)/threads.y);

    weight_and_transmittance_from_alpha_patch_based_forward_kernel<<<
        blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
        n_patches,
        patch_size,
        // inputs
        packed_info.data_ptr<int>(),
        alphas.data_ptr<float>(),
        // outputs
        weights.data_ptr<float>(),
        transmittance.data_ptr<float>());
    return {weights, transmittance};
}

// Host launcher for weight_and_transmittance_from_alpha_patch_based_backward_kernel.
//
// Only the gradient of the weights is propagated; this entry point takes no
// gradient for the transmittance output of the matching forward function.
//
// Args:
//   weights: forward-pass weights, shape (n_samples, patch_size, 1).
//   grad_weights: gradient w.r.t. those weights, shape (n_samples, patch_size, 1).
//   packed_info: 2-D int tensor with one row per patch.
//   alphas: the forward-pass opacities, shape (n_samples, patch_size, 1).
// Returns:
//   A tensor shaped like `alphas` holding the gradient w.r.t. alphas. Entries the
//   kernel never writes are left uninitialised (the output is allocated with
//   empty_like).
// Raises:
//   A TORCH_CHECK failure if the shapes are wrong or patch_size exceeds the
//   256-thread block budget used by the launch configuration.
torch::Tensor weight_and_transmittance_from_alpha_patch_based_backward_naive(
    torch::Tensor weights,
    torch::Tensor grad_weights,  // (n_samples, patches_size, 1)
    torch::Tensor packed_info,
    torch::Tensor alphas)  // (n_samples, patches_size, 1)
{
    DEVICE_GUARD(packed_info);
    CHECK_INPUT(packed_info);
    CHECK_INPUT(alphas);
    CHECK_INPUT(weights);
    CHECK_INPUT(grad_weights);
    TORCH_CHECK(packed_info.ndimension() == 2);
    // Logical && short-circuits, so size(2) is only queried once the rank is known
    // to be 3 and a wrong rank fails this check instead of raising an index error.
    TORCH_CHECK(alphas.ndimension() == 3 && alphas.size(2) == 1);
    TORCH_CHECK(weights.ndimension() == 3 && weights.size(2) == 1);
    TORCH_CHECK(grad_weights.ndimension() == 3 && grad_weights.size(2) == 1);

    const uint32_t n_patches = packed_info.size(0);
    const uint32_t patch_size = alphas.size(1);

    // outputs
    torch::Tensor grad_alphas = torch::empty_like(alphas);

    // Nothing to launch: a zero-sized grid is an invalid launch configuration and a
    // zero patch size would divide by zero in the layout computation below.
    if (n_patches == 0 || patch_size == 0)
        return grad_alphas;

    // Each block holds at most 256 threads; a larger patch would leave zero threads
    // along x and make the block-count computation divide by zero.
    const uint32_t max_threads = 256;
    TORCH_CHECK(
        patch_size <= max_threads,
        "patch_size must be at most 256, got ", patch_size);

    // threads.y: patch size rounded up to the next power of two (integer arithmetic,
    // so there is no floating-point rounding involved); threads.x: whatever is left
    // of the 256-thread budget.
    uint32_t thread_for_a_patch = 1;
    while (thread_for_a_patch < patch_size)
        thread_for_a_patch <<= 1;
    const uint32_t thread_for_n_samples = max_threads / thread_for_a_patch;

    const dim3 threads(thread_for_n_samples, thread_for_a_patch);
    const dim3 blocks((n_patches+threads.x-1)/threads.x, (patch_size+threads.y-1)/threads.y);

    weight_and_transmittance_from_alpha_patch_based_backward_kernel<<<
        blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
        n_patches,
        patch_size,
        // inputs
        packed_info.data_ptr<int>(),
        alphas.data_ptr<float>(),
        weights.data_ptr<float>(),
        grad_weights.data_ptr<float>(),
        // outputs
        grad_alphas.data_ptr<float>());
    return grad_alphas;
}

// torch::Tensor weight_from_alpha_importance_sampling_forward_naive(
//     torch::Tensor packed_info, torch::Tensor alphas, torch::Tensor importance_pdfs)
// {
//     DEVICE_GUARD(packed_info);
//     CHECK_INPUT(packed_info);
//     CHECK_INPUT(alphas);
//     CHECK_INPUT(importance_pdfs);
//     TORCH_CHECK(packed_info.ndimension() == 2);
//     TORCH_CHECK(alphas.ndimension() == 2 & alphas.size(1) == 1);
//     TORCH_CHECK(importance_pdfs.ndimension() == 2 & importance_pdfs.size(1) == 1);
//
//     const uint32_t n_samples = alphas.size(0);
//     const uint32_t n_rays = packed_info.size(0);
//
//     const int threads = 256;
//     const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
//
//     // outputs
//     torch::Tensor weights = torch::empty_like(alphas);
//
//     weight_from_alpha_forward_kernel<<<
//         blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
//         n_rays,
//         // inputs
//         packed_info.data_ptr<int>(),
//         alphas.data_ptr<float>(),
//         importance_pdfs.data_ptr<float>(),
//         // outputs
//         weights.data_ptr<float>());
//     return weights;
// }
//
// torch::Tensor weight_from_alpha_importance_sampling_backward_naive(
//     torch::Tensor weights,
//     torch::Tensor grad_weights,
//     torch::Tensor packed_info,
//     torch::Tensor alphas,
//     torch::Tensor importance_pdfs)
// {
//     DEVICE_GUARD(packed_info);
//     CHECK_INPUT(packed_info);
//     CHECK_INPUT(alphas);
//     CHECK_INPUT(weights);
//     CHECK_INPUT(grad_weights);
//     CHECK_INPUT(importance_pdfs);
//     TORCH_CHECK(packed_info.ndimension() == 2);
//     TORCH_CHECK(alphas.ndimension() == 2 & alphas.size(1) == 1);
//     TORCH_CHECK(weights.ndimension() == 2 & weights.size(1) == 1);
//     TORCH_CHECK(importance_pdfs.ndimension() == 2 & importance_pdfs.size(1) == 1);
//     TORCH_CHECK(grad_weights.ndimension() == 2 & grad_weights.size(1) == 1);
//
//
//     const uint32_t n_samples = alphas.size(0);
//     const uint32_t n_rays = packed_info.size(0);
//
//     const int threads = 256;
//     const int blocks = CUDA_N_BLOCKS_NEEDED(n_rays, threads);
//
//     // outputs
//     torch::Tensor grad_alphas = torch::empty_like(alphas);
//
//     weight_from_alpha_backward_kernel<<<
//         blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
//         n_rays,
//         // inputs
//         packed_info.data_ptr<int>(),
//         alphas.data_ptr<float>(),
//         weights.data_ptr<float>(),
//         grad_weights.data_ptr<float>(),
//         importance_pdfs.data_ptr<float>(),
//         // outputs
//         grad_alphas.data_ptr<float>());
//     return grad_alphas;
// }

================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/nerfacc/grid.py
================================================
"""
Copyright (c) 2022 Ruilong Li @ UC Berkeley
"""

from typing import Callable, List, Union

import torch
import torch.nn as nn

import nerfacc.cuda as _C

from .contraction import ContractionType, contract_inv

# TODO: check torch.scatter_reduce_
# from torch_scatter import scatter_max


@torch.no_grad()
def query_grid(
    samples: torch.Tensor,
    grid_roi: torch.Tensor,
    grid_values: torch.Tensor,
    grid_type: ContractionType,
):
    """Look up grid values at the given sample coordinates.

    Gradients are never tracked through this lookup.

    Args:
        samples: (n_samples, 3) tensor of coordinates.
        grid_roi: (6,) region of interest of the grid. Usually acquired from
            the grid itself via `grid.roi_aabb`.
        grid_values: A 3D tensor of grid values in the shape of (resx, resy, resz).
        grid_type: Contraction type of the grid. Usually acquired from the
            grid itself via `grid.contraction_type`.

    Returns:
        (n_samples) values for those samples queried from the grid.
    """
    # Validate shapes and types before handing raw tensors to the extension.
    assert samples.dim() == 2 and samples.size(-1) == 3
    assert grid_roi.dim() == 1 and grid_roi.size(0) == 6
    assert grid_values.dim() == 3
    assert isinstance(grid_type, ContractionType)

    # The extension is always given contiguous tensors.
    kernel_args = (
        samples.contiguous(),
        grid_roi.contiguous(),
        grid_values.contiguous(),
        grid_type.to_cpp_version(),
    )
    return _C.grid_query(*kernel_args)


class Grid(nn.Module):
    """Abstract base class for grids.

    A grid caches, per voxel, whether that region of 3D space matters for the
    differentiable rendering process. The ray marching function
    (see :func:`nerfacc.ray_marching`) uses it to skip unimportant voxels.

    To work with :func:`nerfacc.ray_marching`, three attributes must exist:

        - :attr:`roi_aabb`: The axis-aligned bounding box of the region of interest.
        - :attr:`binary`: A 3D binarized tensor of shape {resx, resy, resz}, \
            with torch.bool data type.
        - :attr:`contraction_type`: The contraction type of the grid, indicating how \
            the 3D space is mapped to the grid.

    Subclasses provide them by setting `_roi_aabb`, `_binary` and
    `_contraction_type`; each property raises NotImplementedError until its
    backing attribute exists.
    """

    def __init__(self, *args, **kwargs):
        super().__init__()
        # Zero-element, non-persistent buffer kept only so `device` can report
        # where the module currently lives.
        self.register_buffer("_dummy", torch.empty(0), persistent=False)

    @property
    def device(self) -> torch.device:
        """The device of the module's internal placeholder buffer."""
        return self._dummy.device

    @property
    def roi_aabb(self) -> torch.Tensor:
        """The axis-aligned bounding box of the region of interest.

        It is a shape (6,) tensor in the format of {minx, miny, minz, maxx, maxy, maxz}.
        """
        if not hasattr(self, "_roi_aabb"):
            raise NotImplementedError("please set an attribute named _roi_aabb")
        return self._roi_aabb

    @property
    def binary(self) -> torch.Tensor:
        """A 3D binarized tensor with torch.bool data type.

        The tensor is of shape (resx, resy, resz), in which each boolean value
        represents whether the corresponding voxel should be kept or not.
        """
        if not hasattr(self, "_binary"):
            raise NotImplementedError("please set an attribute named _binary")
        return self._binary

    @property
    def contraction_type(self) -> ContractionType:
        """The contraction type of the grid.

        The contraction type is an indicator of how the 3D space is contracted
        to this voxel grid. See :class:`nerfacc.ContractionType` for more details.
        """
        if not hasattr(self, "_contraction_type"):
            raise NotImplementedError(
                "please set an attribute named _contraction_type"
            )
        return self._contraction_type


class OccupancyGrid(Grid):
    """Occupancy grid: whether each voxel area is occupied or not.

    Args:
        roi_aabb: The axis-aligned bounding box of the region of interest. Useful for mapping
            the 3D space to the grid.
        resolution: The resolution of the grid. If an integer is given, the grid is assumed to
            be a cube. Otherwise, a list or a tensor of shape (3,) is expected. Default: 128.
        contraction_type: The contraction type of the grid. See :class:`nerfacc.ContractionType`
            for more details. Default: :attr:`nerfacc.ContractionType.AABB`.
    """

    NUM_DIM: int = 3

    def __init__(
        self,
        roi_aabb: Union[List[int], torch.Tensor],
        resolution: Union[int, List[int], torch.Tensor] = 128,
        contraction_type: ContractionType = ContractionType.AABB,
    ) -> None:
        super().__init__()

        # Normalise `resolution` into an int32 tensor of shape (NUM_DIM,).
        res = resolution
        if isinstance(res, int):
            res = [res] * self.NUM_DIM
        if isinstance(res, (list, tuple)):
            res = torch.tensor(res, dtype=torch.int32)
        assert isinstance(res, torch.Tensor), f"Invalid type: {type(res)}"
        assert res.shape == (self.NUM_DIM,), f"Invalid shape: {res.shape}"

        # Normalise `roi_aabb` into a tensor of shape (2 * NUM_DIM,).
        aabb = roi_aabb
        if isinstance(aabb, (list, tuple)):
            aabb = torch.tensor(aabb, dtype=torch.float32)
        assert isinstance(aabb, torch.Tensor), f"Invalid type: {type(aabb)}"
        assert aabb.shape == torch.Size(
            [self.NUM_DIM * 2]
        ), f"Invalid shape: {aabb.shape}"

        # total number of voxels
        n_cells = int(res.prod().item())
        self.num_cells = n_cells

        # attributes required by the Grid interface
        self.register_buffer("_roi_aabb", aabb)
        self.register_buffer(
            "_binary", torch.zeros(res.tolist(), dtype=torch.bool)
        )
        self._contraction_type = contraction_type

        # helper state: grid resolution and the per-cell occupancy estimate
        self.register_buffer("resolution", res)
        self.register_buffer("occs", torch.zeros(n_cells))

        # Integer coordinates and flat indices of every cell; both can be
        # rebuilt from the resolution, so they are kept out of the state dict.
        self.register_buffer(
            "grid_coords",
            _meshgrid3d(res).reshape(n_cells, self.NUM_DIM),
            persistent=False,
        )
        self.register_buffer(
            "grid_indices", torch.arange(n_cells), persistent=False
        )

    @torch.no_grad()
    def _get_all_cells(self) -> torch.Tensor:
        """Return the flat indices of all cells of the grid."""
        return self.grid_indices

    @torch.no_grad()
    def _sample_uniform_and_occupied_cells(self, n: int) -> torch.Tensor:
        """Sample n uniformly random cells plus up to n occupied cells.

        When more than n cells are occupied, n of them are drawn at random
        (with replacement); otherwise every occupied cell is included.
        """
        device = self.device
        uniform = torch.randint(self.num_cells, (n,), device=device)
        occupied = torch.nonzero(self._binary.flatten())[:, 0]
        n_occupied = len(occupied)
        if n < n_occupied:
            pick = torch.randint(n_occupied, (n,), device=device)
            occupied = occupied[pick]
        return torch.cat([uniform, occupied], dim=0)

    @torch.no_grad()
    def _update(
        self,
        step: int,
        occ_eval_fn: Callable,
        occ_thre: float = 0.01,
        ema_decay: float = 0.95,
        warmup_steps: int = 256,
    ) -> None:
        """Update the occupancy field with an exponential-moving-average rule.

        Args:
            step: Current training step; every cell is evaluated while
                `step < warmup_steps`, a random subset afterwards.
            occ_eval_fn: Maps sample locations to occupancy values; its
                output has its last dimension squeezed before use.
            occ_thre: Upper bound of the binarization threshold.
            ema_decay: Decay applied to the stored occupancy before taking
                the maximum with the fresh estimate.
            warmup_steps: Number of initial steps that evaluate all cells.
        """
        # choose which cells to refresh
        if step < warmup_steps:
            indices = self._get_all_cells()
        else:
            indices = self._sample_uniform_and_occupied_cells(
                self.num_cells // 4
            )

        # one random point inside each chosen voxel, in normalized [0, 1]^3
        cell_coords = self.grid_coords[indices]
        jitter = torch.rand_like(cell_coords, dtype=torch.float32)
        x = (cell_coords + jitter) / self.resolution
        if self._contraction_type == ContractionType.UN_BOUNDED_SPHERE:
            # only the points inside the sphere are valid
            inside = (x - 0.5).norm(dim=1) < 0.5
            x, indices = x[inside], indices[inside]

        # voxel coordinates [0, 1]^3 -> world
        x = contract_inv(x, roi=self._roi_aabb, type=self._contraction_type)
        occ = occ_eval_fn(x).squeeze(-1)

        # EMA update: decay the stored value, keep the larger of old and new.
        # A scatter-max would be the exact way to handle repeated indices, but
        # empirically the result is almost the same.
        decayed = self.occs[indices] * ema_decay
        self.occs[indices] = torch.maximum(decayed, occ)

        # binarize against min(mean occupancy, occ_thre)
        threshold = torch.clamp(self.occs.mean(), max=occ_thre)
        self._binary = (self.occs > threshold).view(self._binary.shape)

    @torch.no_grad()
    def every_n_step(
        self,
        step: int,
        occ_eval_fn: Callable,
        occ_thre: float = 1e-2,
        ema_decay: float = 0.95,
        warmup_steps: int = 256,
        n: int = 16,
    ) -> None:
        """Update the grid every n steps during training.

        Args:
            step: Current training step.
            occ_eval_fn: A function that takes in sample locations :math:`(N, 3)` and
                returns the occupancy values :math:`(N, 1)` at those locations.
            occ_thre: Threshold used to binarize the occupancy grid. Default: 1e-2.
            ema_decay: The decay rate for EMA updates. Default: 0.95.
            warmup_steps: Sample all cells during the warmup stage. After the warmup
                stage the sampling strategy changes to 1/4 uniformly sampled cells
                together with up to 1/4 occupied cells. Default: 256.
            n: Update the grid every n steps. Default: 16.

        Raises:
            RuntimeError: If the module is not in training mode.
        """
        if not self.training:
            raise RuntimeError(
                "You should only call this function only during training. "
                "Please call _update() directly if you want to update the "
                "field during inference."
            )
        if step % n != 0:
            return
        self._update(
            step=step,
            occ_eval_fn=occ_eval_fn,
            occ_thre=occ_thre,
            ema_decay=ema_decay,
            warmup_steps=warmup_steps,
        )

    @torch.no_grad()
    def query_occ(self, samples: torch.Tensor) -> torch.Tensor:
        """Query the occupancy field at the given samples.

        The lookup is done against the binarized grid (:attr:`binary`).

        Args:
            samples: Samples in the world coordinates. (n_samples, 3)

        Returns:
            Occupancy values at the given samples. (n_samples,)
        """
        roi = self._roi_aabb
        occupancy = self.binary
        return query_grid(samples, roi, occupancy, self.contraction_type)


def _meshgrid3d(
    res: torch.Tensor, device: Union[torch.device, str] = "cpu"
) -> torch.Tensor:
    """Create 3D grid coordinates."""
    assert len(res) == 3
    res = res.tolist()
    return torch.stack(
        torch.meshgrid(
            [
                torch.arange(res[0], dtype=torch.long),
                torch.arange(res[1], dtype=torch.long),
                torch.arange(res[2], dtype=torch.long),
            ],
            indexing="ij",
        ),
        dim=-1,
    ).to(device)


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/nerfacc/intersection.py
================================================
"""
Copyright (c) 2022 Ruilong Li, UC Berkeley.
"""

from typing import Tuple

import torch
from torch import Tensor

import nerfacc.cuda as _C


@torch.no_grad()
def ray_aabb_intersect(
    rays_o: Tensor, rays_d: Tensor, aabb: Tensor
) -> Tuple[Tensor, Tensor]:
    """Intersect rays with an axis-aligned bounding box.

    Note:
        this function is not differentiable to any inputs.

    Args:
        rays_o: Ray origins of shape (n_rays, 3).
        rays_d: Normalized ray directions of shape (n_rays, 3).
        aabb: Scene bounding box {xmin, ymin, zmin, xmax, ymax, zmax}.
            Tensor with shape (6)

    Returns:
        Ray AABB intersection {t_min, t_max} with shape (n_rays) respectively.
        Note the t_min is clipped to minimum zero. 1e10 means no intersection.

    Raises:
        NotImplementedError: If any of the three inputs is not a CUDA tensor.

    Examples:

    .. code-block:: python

        aabb = torch.tensor([0.0, 0.0, 0.0, 1.0, 1.0, 1.0], device="cuda:0")
        rays_o = torch.rand((128, 3), device="cuda:0")
        rays_d = torch.randn((128, 3), device="cuda:0")
        rays_d = rays_d / rays_d.norm(dim=-1, keepdim=True)
        t_min, t_max = ray_aabb_intersect(rays_o, rays_d, aabb)

    """
    all_on_gpu = rays_o.is_cuda and rays_d.is_cuda and aabb.is_cuda
    if not all_on_gpu:
        raise NotImplementedError("Only support cuda inputs.")

    # The test itself runs in the compiled extension, which is handed
    # contiguous versions of all three tensors.
    near, far = _C.ray_aabb_intersect(
        rays_o.contiguous(), rays_d.contiguous(), aabb.contiguous()
    )
    return near, far


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/nerfacc/losses.py
================================================
from torch import Tensor

from .pack import unpack_data


def distortion(
    packed_info: Tensor, weights: Tensor, t_starts: Tensor, t_ends: Tensor
) -> Tensor:
    """Compute the per-ray distortion loss (Mip-NeRF 360 paper, Equ. 15).

    The packed samples are first unpacked into a per-ray layout, then the loss
    is the sum of a per-interval term (squared weight times interval width,
    scaled by 1/3) and a pairwise term (product of two weights times the
    distance between the two interval midpoints).

    Args:
        packed_info: Packed info for the samples. (n_rays, 2)
        weights: Weights for the samples. (all_samples,)
        t_starts: Per-sample start distance. Tensor with shape (all_samples, 1).
        t_ends: Per-sample end distance. Tensor with shape (all_samples, 1).

    Returns:
        Distortion loss. (n_rays,)
    """
    # (all_samples, 1) -> (n_rays, n_samples)
    ray_w = unpack_data(packed_info, weights[..., None]).squeeze(-1)
    seg_begin = unpack_data(packed_info, t_starts).squeeze(-1)
    seg_end = unpack_data(packed_info, t_ends).squeeze(-1)

    seg_width = seg_end - seg_begin
    seg_center = (seg_begin + seg_end) / 2

    # Per-interval term: each sample against itself.
    within_term = (1 / 3) * (seg_width * ray_w.pow(2)).sum(-1)

    # Pairwise term: every pair of samples on the same ray, built by
    # broadcasting (n_rays, n_samples, 1) against (n_rays, 1, n_samples).
    pair_weight = ray_w.unsqueeze(-1) * ray_w.unsqueeze(-2)
    pair_gap = (seg_center.unsqueeze(-1) - seg_center.unsqueeze(-2)).abs()
    between_term = (pair_weight * pair_gap).sum((-1, -2))

    return within_term + between_term


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/nerfacc/pack.py
================================================
"""
Copyright (c) 2022 Ruilong Li, UC Berkeley.
"""
from typing import Optional, Tuple

import torch
from torch import Tensor

import nerfacc.cuda as _C


def pack_data(data: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
    """Flatten per-ray data into packed form, keeping only the masked samples.

    Args:
        data: Tensor with shape (n_rays, n_samples, D).
        mask: Boolean tensor with shape (n_rays, n_samples).

    Returns:
        Tuple of Tensors including packed data (all_samples, D),
        and packed_info (n_rays, 2) which stores the start index of the sample,
        and the number of samples kept for each ray (both as int32).

    Examples:

    .. code-block:: python

        data = torch.rand((10, 3, 4), device="cuda:0")
        mask = torch.rand((10, 3), device="cuda:0") > 0.5
        packed_data, packed_info = pack_data(data, mask)
        print(packed_data.shape, packed_info.shape)

    """
    assert data.dim() == 3, "data must be with shape of (n_rays, n_samples, D)."
    assert (
        mask.shape == data.shape[:2]
    ), "mask must be with shape of (n_rays, n_samples)."
    assert mask.dtype == torch.bool, "mask must be a boolean tensor."

    kept_per_ray = mask.sum(dim=-1, dtype=torch.int32)
    # Exclusive prefix sum: where each ray's first kept sample lands in the
    # packed layout.
    first_kept = kept_per_ray.cumsum(dim=0, dtype=torch.int32) - kept_per_ray
    return data[mask], torch.stack([first_kept, kept_per_ray], dim=-1)


@torch.no_grad()
def pack_info(ray_indices: Tensor, n_rays: Optional[int] = None) -> Tensor:
    """Pack `ray_indices` to `packed_info`. Useful for converting per sample data to per ray data.

    Only plain tensor operations are used, so the input may live on any device;
    the result is created on the same device as `ray_indices`.

    Note:
        this function is not differentiable to any inputs.

    Args:
        ray_indices: Ray index of each sample. LongTensor with shape (n_sample).
        n_rays: Total number of rays. If None, it is inferred as
            `ray_indices.max() + 1`, or 0 when there are no samples at all.
            When given, it must be larger than `ray_indices.max()`; this is not
            checked here.

    Returns:
        packed_info: Stores information on which samples belong to the same ray.
            IntTensor with shape (n_rays, 2): column 0 is the cumulative number
            of samples of all preceding rays (the start offset of the ray),
            column 1 is the number of samples of the ray.
    """
    assert (
        ray_indices.dim() == 1
    ), "ray_indices must be a 1D tensor with shape (n_samples)."

    if n_rays is None:
        # `max()` is undefined on an empty tensor, so "no samples" is treated
        # as "no rays" instead of crashing.
        if ray_indices.numel() > 0:
            n_rays = int(ray_indices.max()) + 1
        else:
            n_rays = 0

    # Count the samples of every ray by scattering a one per sample.
    ones = torch.ones_like(ray_indices, dtype=torch.int)
    num_steps = torch.zeros(
        (n_rays,), device=ray_indices.device, dtype=torch.int
    )
    num_steps.scatter_add_(0, ray_indices, ones)

    # Exclusive prefix sum of the counts gives the per-ray start offsets.
    cum_steps = num_steps.cumsum(dim=0, dtype=torch.int)
    return torch.stack([cum_steps - num_steps, num_steps], dim=-1)


@torch.no_grad()
def unpack_info(packed_info: Tensor, n_samples: int) -> Tensor:
    """Expand `packed_info` back into one ray index per sample.

    Useful for converting per ray data to per sample data.

    Note:
        this function is not differentiable to any inputs.

    Args:
        packed_info: Stores information on which samples belong to the same ray.
            IntTensor with shape (n_rays, 2).
        n_samples: Total number of samples.

    Returns:
        Ray index of each sample. LongTensor with shape (n_sample).

    Raises:
        NotImplementedError: If `packed_info` is not a CUDA tensor.

    Examples:

    .. code-block:: python

        rays_o = torch.rand((128, 3), device="cuda:0")
        rays_d = torch.randn((128, 3), device="cuda:0")
        rays_d = rays_d / rays_d.norm(dim=-1, keepdim=True)
        # Ray marching with near far plane.
        ray_indices, t_starts, t_ends = ray_marching(
            rays_o, rays_d, near_plane=0.1, far_plane=1.0, render_step_size=1e-3
        )
        # Per-sample info -> per-ray info -> per-sample info.
        packed_info = pack_info(ray_indices, n_rays=128)
        unpacked = unpack_info(packed_info, t_starts.shape[0])
        print(unpacked.shape, unpacked.dtype)

    """
    assert (
        packed_info.dim() == 2 and packed_info.shape[-1] == 2
    ), "packed_info must be a 2D tensor with shape (n_rays, 2)."

    if not packed_info.is_cuda:
        raise NotImplementedError("Only support cuda inputs.")

    # The expansion is done by the compiled extension on a contiguous tensor.
    return _C.unpack_info(packed_info.contiguous(), n_samples)


def unpack_data(
    packed_info: Tensor,
    data: Tensor,
    n_samples: Optional[int] = None,
) -> Tensor:
    """Spread packed data (all_samples, D) into per-ray data (n_rays, n_samples, D).

    Args:
        packed_info (Tensor): Stores information on which samples belong to the
            same ray. Tensor with shape (n_rays, 2).
        data: Packed data to unpack. Tensor with shape (n_samples, D).
        n_samples (int): Optional Number of samples per ray. If not provided, it
            is taken as the largest per-ray sample count stored in the second
            column of `packed_info`.

    Returns:
        Unpacked data (n_rays, n_samples, D).

    Examples:

    .. code-block:: python

        rays_o = torch.rand((128, 3), device="cuda:0")
        rays_d = torch.randn((128, 3), device="cuda:0")
        rays_d = rays_d / rays_d.norm(dim=-1, keepdim=True)

        # Ray marching with aabb.
        scene_aabb = torch.tensor([0.0, 0.0, 0.0, 1.0, 1.0, 1.0], device="cuda:0")
        ray_indices, t_starts, t_ends = ray_marching(
            rays_o, rays_d, scene_aabb=scene_aabb, render_step_size=1e-2
        )
        print(t_starts.shape)  # torch.Size([all_samples, 1])

        packed_info = pack_info(ray_indices, n_rays=128)
        t_starts = unpack_data(packed_info, t_starts, n_samples=1024)
        print(t_starts.shape)  # torch.Size([128, 1024, 1])
    """
    info_is_valid = packed_info.dim() == 2 and packed_info.shape[-1] == 2
    assert (
        info_is_valid
    ), "packed_info must be a 2D tensor with shape (n_rays, 2)."
    assert (
        data.dim() == 2
    ), "data must be a 2D tensor with shape (n_samples, D)."

    samples_per_ray = n_samples
    if samples_per_ray is None:
        # Widest ray decides the size of the per-ray sample axis.
        samples_per_ray = packed_info[:, 1].max().item()

    # Routed through the autograd function so gradients can flow to `data`.
    return _UnpackData.apply(packed_info, data, samples_per_ray)


class _UnpackData(torch.autograd.Function):
    """Autograd function unpacking (all_samples, D) data to (n_rays, n_samples, D).

    The forward pass is computed by the compiled extension; the backward pass
    gathers the incoming gradient back into the packed layout. Only `data`
    receives a gradient.
    """

    @staticmethod
    def forward(ctx, packed_info: Tensor, data: Tensor, n_samples: int):
        # `data` is expected in the packed layout (all_samples, D).
        info = packed_info.contiguous()
        packed = data.contiguous()
        # State for the backward pass is only kept when a gradient with
        # respect to `data` (input index 1) is actually required.
        if ctx.needs_input_grad[1]:
            ctx.n_samples = n_samples
            ctx.save_for_backward(info)
        return _C.unpack_data(info, packed, n_samples)

    @staticmethod
    def backward(ctx, grad: Tensor):
        # `grad` arrives in the unpacked layout (n_rays, n_samples, D).
        (info,) = ctx.saved_tensors
        valid = _C.unpack_info_to_mask(info, ctx.n_samples)
        # Select the gradient entries flagged by the mask. One return slot per
        # forward input: packed_info, data, n_samples.
        return None, grad[valid].contiguous(), None


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/nerfacc/ray_marching.py
================================================
from typing import Callable, Optional, Tuple

import torch

import nerfacc.cuda as _C

from .contraction import ContractionType
from .grid import Grid
from .intersection import ray_aabb_intersect
from .vol_rendering import render_visibility


@torch.no_grad()
def ray_marching(
    # rays
    rays_o: torch.Tensor,
    rays_d: torch.Tensor,
    t_min: Optional[torch.Tensor] = None,
    t_max: Optional[torch.Tensor] = None,
    # bounding box of the scene
    scene_aabb: Optional[torch.Tensor] = None,
    # binarized grid for skipping empty space
    grid: Optional[Grid] = None,
    # sigma/alpha function for skipping invisible space
    sigma_fn: Optional[Callable] = None,
    alpha_fn: Optional[Callable] = None,
    early_stop_eps: float = 1e-4,
    alpha_thre: float = 0.0,
    # rendering options
    near_plane: Optional[float] = None,
    far_plane: Optional[float] = None,
    render_step_size: float = 1e-3,
    stratified: bool = False,
    cone_angle: float = 0.0,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """March along rays and emit sample intervals, with space skipping.

    Note:
        How the per-ray marching range is resolved:

        1. If both `t_min` and `t_max` are given, they are used as is.
        2. Otherwise, if `scene_aabb` is given, both are computed with
           :func:`ray_aabb_intersect`.
        3. Otherwise `t_min` is set to 0.0 and `t_max` to 1e10 (the case of
           an unbounded scene).
        4. `t_min` is always clipped with `near_plane` and `t_max` with
           `far_plane` when those are given.

        When only one of `t_min` / `t_max` is passed, rule 2 or 3 applies and
        the passed value is replaced.

    Warning:
        This function is not differentiable to any inputs.

    Args:
        rays_o: Ray origins of shape (n_rays, 3).
        rays_d: Normalized ray directions of shape (n_rays, 3).
        t_min: Optional. Per-ray minimum distance. Tensor with shape (n_rays).
        t_max: Optional. Per-ray maximum distance. Tensor with shape (n_rays).
        scene_aabb: Optional. Scene bounding box for computing t_min and t_max.
            A tensor with shape (6,) {xmin, ymin, zmin, xmax, ymax, zmax}.
            `scene_aabb` will be ignored if both `t_min` and `t_max` are provided.
        grid: Optional. Grid that indicates where to skip during marching.
            See :class:`nerfacc.Grid` for details. Without a grid, a single
            always-occupied cell covering [-1e10, 1e10] on every axis is used,
            so nothing is skipped.
        sigma_fn: Optional. If provided, the marching will skip the invisible
            space by evaluating the density along the ray with `sigma_fn`. It
            should be a function that takes in samples {t_starts (N, 1),
            t_ends (N, 1), ray indices (N,)} and returns the post-activation
            density values (N, 1).
            You should only provide either `sigma_fn` or `alpha_fn`.
        alpha_fn: Optional. Same as `sigma_fn`, except that the function
            returns the post-activation opacity values (N, 1).
            You should only provide either `sigma_fn` or `alpha_fn`.
        early_stop_eps: Early stop threshold for skipping invisible space.
            Default: 1e-4.
        alpha_thre: Alpha threshold for skipping empty space. Default: 0.0.
        near_plane: Optional. Near plane distance. If provided, it will be used
            to clip t_min.
        far_plane: Optional. Far plane distance. If provided, it will be used
            to clip t_max.
        render_step_size: Step size for marching. Default: 1e-3.
        stratified: Whether to use stratified sampling. Default: False.
        cone_angle: Cone angle for linearly-increased step size. 0. means
            constant step size. Default: 0.0.

    Returns:
        A tuple of tensors.

            - **ray_indices**: Ray index of each sample. Tensor with shape (n_samples).
            - **t_starts**: Per-sample start distance. Tensor with shape (n_samples, 1).
            - **t_ends**: Per-sample end distance. Tensor with shape (n_samples, 1).

    Raises:
        NotImplementedError: If `rays_o` is not a CUDA tensor.
        ValueError: If both `alpha_fn` and `sigma_fn` are provided.

    Examples:

    .. code-block:: python

        import torch
        from nerfacc import OccupancyGrid, ray_marching

        device = "cuda:0"
        batch_size = 128
        rays_o = torch.rand((batch_size, 3), device=device)
        rays_d = torch.randn((batch_size, 3), device=device)
        rays_d = rays_d / rays_d.norm(dim=-1, keepdim=True)

        # Ray marching with near far plane.
        ray_indices, t_starts, t_ends = ray_marching(
            rays_o, rays_d, near_plane=0.1, far_plane=1.0, render_step_size=1e-3
        )

        # Ray marching with aabb and skip areas based on occupancy grid.
        scene_aabb = torch.tensor([0.0, 0.0, 0.0, 1.0, 1.0, 1.0], device=device)
        grid = OccupancyGrid(roi_aabb=[0.0, 0.0, 0.0, 0.5, 0.5, 0.5]).to(device)
        ray_indices, t_starts, t_ends = ray_marching(
            rays_o, rays_d, scene_aabb=scene_aabb, grid=grid, render_step_size=1e-3
        )

        # Convert t_starts and t_ends to sample locations.
        t_mid = (t_starts + t_ends) / 2.0
        sample_locs = rays_o[ray_indices] + t_mid * rays_d[ray_indices]

    """
    if not rays_o.is_cuda:
        raise NotImplementedError("Only support cuda inputs.")
    if alpha_fn is not None and sigma_fn is not None:
        raise ValueError(
            "Only one of `alpha_fn` and `sigma_fn` should be provided."
        )

    # Resolve the per-ray marching range (see the Note in the docstring).
    range_given = t_min is not None and t_max is not None
    if not range_given:
        if scene_aabb is None:
            # Unbounded scene.
            t_min = torch.zeros_like(rays_o[..., 0])
            t_max = torch.ones_like(rays_o[..., 0]) * 1e10
        else:
            t_min, t_max = ray_aabb_intersect(rays_o, rays_d, scene_aabb)
    if near_plane is not None:
        t_min = t_min.clamp(min=near_plane)
    if far_plane is not None:
        t_max = t_max.clamp(max=far_plane)

    # Stratified sampling (prevents overfitting during training): shift every
    # ray's start by a random fraction of one step.
    if stratified:
        t_min = t_min + torch.rand_like(t_min) * render_step_size

    # Pick the occupancy information used for skipping.
    if grid is None:
        # No grid: one always-occupied cell spanning a huge box.
        grid_roi_aabb = torch.tensor(
            [-1e10, -1e10, -1e10, 1e10, 1e10, 1e10],
            dtype=torch.float32,
            device=rays_o.device,
        )
        grid_binary = torch.ones(
            [1, 1, 1], dtype=torch.bool, device=rays_o.device
        )
        contraction_type = ContractionType.AABB.to_cpp_version()
    else:
        grid_roi_aabb = grid.roi_aabb
        grid_binary = grid.binary
        contraction_type = grid.contraction_type.to_cpp_version()

    # Marching with grid-based skipping, done by the compiled extension.
    packed_info, ray_indices, t_starts, t_ends = _C.ray_marching(
        # rays
        rays_o.contiguous(),
        rays_d.contiguous(),
        t_min.contiguous(),
        t_max.contiguous(),
        # contraction and grid
        grid_roi_aabb.contiguous(),
        grid_binary.contiguous(),
        contraction_type,
        # sampling
        render_step_size,
        cone_angle,
    )

    # Without a density/opacity callback there is nothing more to filter.
    if sigma_fn is None and alpha_fn is None:
        return ray_indices, t_starts, t_ends

    # Query sigma/alpha without gradients and turn it into per-sample opacity.
    if sigma_fn is not None:
        sigmas = sigma_fn(t_starts, t_ends, ray_indices)
        assert (
            sigmas.shape == t_starts.shape
        ), "sigmas must have shape of (N, 1)! Got {}".format(sigmas.shape)
        alphas = 1.0 - torch.exp(-sigmas * (t_ends - t_starts))
    else:
        alphas = alpha_fn(t_starts, t_ends, ray_indices)
        assert (
            alphas.shape == t_starts.shape
        ), "alphas must have shape of (N, 1)! Got {}".format(alphas.shape)

    # Compute visibility of the samples, and filter out invisible samples.
    visible = render_visibility(
        alphas,
        ray_indices=ray_indices,
        packed_info=packed_info,
        early_stop_eps=early_stop_eps,
        alpha_thre=alpha_thre,
        n_rays=rays_o.shape[0],
    )
    return ray_indices[visible], t_starts[visible], t_ends[visible]


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/nerfacc/sampling.py
================================================
import math
from typing import Callable, Optional, Tuple, Union, overload

import torch

import nerfacc.cuda as _C

from .cdf import ray_resampling
from .grid import Grid
from .pack import pack_info, unpack_info
from .vol_rendering import (
    render_transmittance_from_alpha,
    render_weight_from_density,
)


@overload
def sample_along_rays(
    rays_o: torch.Tensor,  # [n_rays, 3]
    rays_d: torch.Tensor,  # [n_rays, 3]
    t_min: torch.Tensor,  # [n_rays,]
    t_max: torch.Tensor,  # [n_rays,]
    step_size: float,
    cone_angle: float = 0.0,
    grid: Optional[Grid] = None,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Sample along rays with per-ray min max."""
    ...


@overload
def sample_along_rays(
    rays_o: torch.Tensor,  # [n_rays, 3]
    rays_d: torch.Tensor,  # [n_rays, 3]
    t_min: float,
    t_max: float,
    step_size: float,
    cone_angle: float = 0.0,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Sample along rays with near far plane."""
    ...


@torch.no_grad()
def sample_along_rays(
    rays_o: torch.Tensor,  # [n_rays, 3]
    rays_d: torch.Tensor,  # [n_rays, 3]
    t_min: Union[float, torch.Tensor],  # [n_rays,]
    t_max: Union[float, torch.Tensor],  # [n_rays,]
    step_size: float,
    cone_angle: float = 0.0,
    grid: Optional[Grid] = None,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Sample intervals along rays.

    Two modes are supported, selected by the type of `t_min` / `t_max`:

    - Both are Python numbers (near / far plane): every ray receives the same
      `floor((t_max - t_min) / step_size)` back-to-back intervals of width
      `step_size` starting at `t_min`. `rays_d`, `cone_angle` and `grid` are
      not used in this mode.
    - Otherwise they are treated as per-ray tensors and the marching is
      delegated to the compiled extension, with grid-based skipping when
      `grid` is given.

    Args:
        rays_o: Ray origins of shape (n_rays, 3).
        rays_d: Ray directions of shape (n_rays, 3).
        t_min: Near plane as a Python number, or per-ray minimum distance as a
            tensor of shape (n_rays,).
        t_max: Far plane as a Python number, or per-ray maximum distance as a
            tensor of shape (n_rays,).
        step_size: Width of a sample interval.
        cone_angle: Forwarded to the extension in the per-ray mode.
            Default: 0.0.
        grid: Optional grid used for skipping in the per-ray mode.

    Returns:
        A tuple `(ray_indices, t_starts, t_ends)`: the ray index of each
        sample, and the start and end distance of each sample interval. In
        the near/far mode the shapes are (n_samples,), (n_samples, 1) and
        (n_samples, 1).
    """
    # Python ints are accepted wherever `float` is annotated (as the near/far
    # overload does), so e.g. `t_min=0` must select the scalar branch too
    # instead of falling through to the tensor branch and failing there.
    scalar_types = (int, float)
    if isinstance(t_min, scalar_types) and isinstance(t_max, scalar_types):
        near, far = float(t_min), float(t_max)
        n_rays = rays_o.shape[0]
        device = rays_o.device
        num_steps = math.floor((far - near) / step_size)
        # Same interval starts for every ray, flattened ray by ray.
        t_starts = (
            (near + torch.arange(0, num_steps, device=device) * step_size)
            .expand(n_rays, -1)
            .reshape(-1, 1)
        )
        t_ends = t_starts + step_size
        ray_indices = torch.arange(0, n_rays, device=device).repeat_interleave(
            num_steps, dim=0
        )
    else:
        if grid is None:
            # NOTE(review): this call passes 4 arguments while
            # `ray_marching.py` calls `_C.ray_marching` with 9 (rays, grid and
            # contraction included) -- confirm which signature the compiled
            # extension actually exposes.
            packed_info, ray_indices, t_starts, t_ends = _C.ray_marching(
                # rays
                t_min.contiguous(),
                t_max.contiguous(),
                # sampling
                step_size,
                cone_angle,
            )
        else:
            (
                packed_info,
                ray_indices,
                t_starts,
                t_ends,
            ) = _C.ray_marching_with_grid(
                # rays
                rays_o.contiguous(),
                rays_d.contiguous(),
                t_min.contiguous(),
                t_max.contiguous(),
                # contraction and grid
                grid.roi_aabb.contiguous(),
                grid.binary.contiguous(),
                grid.contraction_type.to_cpp_version(),
                # sampling
                step_size,
                cone_angle,
            )
    return ray_indices, t_starts, t_ends


@torch.no_grad()
def proposal_sampling_with_filter(
    t_starts: torch.Tensor,  # [n_samples, 1]
    t_ends: torch.Tensor,  # [n_samples, 1]
    ray_indices: torch.Tensor,  # [n_samples,]
    n_rays: Optional[int] = None,
    # compute density of samples: {t_starts, t_ends, ray_indices} -> density
    sigma_fn: Optional[Callable] = None,
    # proposal density fns: {t_starts, t_ends, ray_indices} -> density
    proposal_sigma_fns: Tuple[Callable, ...] = (),
    proposal_n_samples: Tuple[int, ...] = (),
    proposal_require_grads: bool = False,
    # acceleration options
    early_stop_eps: float = 1e-4,
    alpha_thre: float = 0.0,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Heuristic marching with proposal fns.

    For every proposal function, in order: evaluate its density on the current
    samples, optionally drop samples that are too transparent or already
    occluded, then resample each ray according to the resulting weights. After
    all proposals, `sigma_fn` (if given) is used for one last filtering pass.

    Args:
        t_starts: Per-sample start distance. Tensor with shape (n_samples, 1).
        t_ends: Per-sample end distance. Tensor with shape (n_samples, 1).
        ray_indices: Ray index of each sample. Tensor with shape (n_samples,).
        n_rays: Total number of rays. If None, it is inferred as
            `ray_indices.max() + 1`, or 0 when there are no samples.
        sigma_fn: Optional function {t_starts, t_ends, ray_indices} -> density
            (N, 1), used only for the final filtering pass.
        proposal_sigma_fns: Proposal density functions with the same call
            convention as `sigma_fn`, applied in order.
        proposal_n_samples: Number of samples to resample after each proposal
            function; must have the same length as `proposal_sigma_fns`.
        proposal_require_grads: If True, every proposal function is evaluated
            a second time with gradients enabled on the filtered samples, and
            the result is recorded in the returned `proposal_samples`.
        early_stop_eps: Samples whose transmittance is below this value are
            dropped. Default: 1e-4.
        alpha_thre: Samples whose opacity is below this value are dropped.
            Default: 0.0.

    Returns:
        A tuple `(ray_indices, t_starts, t_ends, proposal_samples)`. The first
        three describe the final samples. `proposal_samples` is a list with
        one `(packed_info, t_starts, t_ends, weights)` tuple per proposal
        function when `proposal_require_grads` is True, and empty otherwise.
    """
    assert len(proposal_sigma_fns) == len(proposal_n_samples), (
        "proposal_sigma_fns and proposal_n_samples must have the same length, "
        f"but got {len(proposal_sigma_fns)} and {len(proposal_n_samples)}."
    )
    if n_rays is None:
        # Keep `n_rays` a plain int (as the annotation promises) and avoid
        # calling `max()` on an empty tensor, which would raise.
        if ray_indices.numel() > 0:
            n_rays = int(ray_indices.max()) + 1
        else:
            n_rays = 0

    # compute density from proposal fns
    proposal_samples = []
    for proposal_fn, n_samples in zip(proposal_sigma_fns, proposal_n_samples):

        # compute weights for resampling
        sigmas = proposal_fn(t_starts, t_ends, ray_indices)
        assert (
            sigmas.shape == t_starts.shape
        ), "sigmas must have shape of (N, 1)! Got {}".format(sigmas.shape)
        alphas = 1.0 - torch.exp(-sigmas * (t_ends - t_starts))
        transmittance = render_transmittance_from_alpha(
            alphas, ray_indices=ray_indices, n_rays=n_rays
        )
        weights = alphas * transmittance

        # Compute visibility for filtering
        if alpha_thre > 0 or early_stop_eps > 0:
            vis = (alphas >= alpha_thre) & (transmittance >= early_stop_eps)
            vis = vis.squeeze(-1)
            ray_indices, t_starts, t_ends, weights = (
                ray_indices[vis],
                t_starts[vis],
                t_ends[vis],
                weights[vis],
            )
        packed_info = pack_info(ray_indices, n_rays=n_rays)

        # Rerun the proposal function **with** gradients on filtered samples.
        # The enclosing function runs under no_grad, hence the explicit
        # enable_grad scope.
        if proposal_require_grads:
            with torch.enable_grad():
                sigmas = proposal_fn(t_starts, t_ends, ray_indices)
                weights = render_weight_from_density(
                    t_starts, t_ends, sigmas, ray_indices=ray_indices
                )
                proposal_samples.append(
                    (packed_info, t_starts, t_ends, weights)
                )

        # resampling on filtered samples
        packed_info, t_starts, t_ends = ray_resampling(
            packed_info, t_starts, t_ends, weights, n_samples=n_samples
        )
        ray_indices = unpack_info(packed_info, t_starts.shape[0])

    # last round filtering with sigma_fn
    if (alpha_thre > 0 or early_stop_eps > 0) and (sigma_fn is not None):
        sigmas = sigma_fn(t_starts, t_ends, ray_indices)
        assert (
            sigmas.shape == t_starts.shape
        ), "sigmas must have shape of (N, 1)! Got {}".format(sigmas.shape)
        alphas = 1.0 - torch.exp(-sigmas * (t_ends - t_starts))
        transmittance = render_transmittance_from_alpha(
            alphas, ray_indices=ray_indices, n_rays=n_rays
        )
        vis = (alphas >= alpha_thre) & (transmittance >= early_stop_eps)
        vis = vis.squeeze(-1)
        ray_indices, t_starts, t_ends = (
            ray_indices[vis],
            t_starts[vis],
            t_ends[vis],
        )

    return ray_indices, t_starts, t_ends, proposal_samples


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/nerfacc/version.py
================================================
"""
Copyright (c) 2022 Ruilong Li, UC Berkeley.
"""

__version__ = "0.3.5"


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/nerfacc/vol_rendering.py
================================================
"""
Copyright (c) 2022 Ruilong Li, UC Berkeley.
"""

from typing import Callable, Optional, Tuple

import torch
from torch import Tensor

import nerfacc.cuda as _C

from .pack import pack_info


def rendering(
    # ray marching results
    t_starts: torch.Tensor,
    t_ends: torch.Tensor,
    ray_indices: torch.Tensor,
    n_rays: int,
    # radiance field
    rgb_sigma_fn: Optional[Callable] = None,
    rgb_alpha_fn: Optional[Callable] = None,
    # rendering options
    render_bkgd: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Render rays through the radiance field given by `rgb_sigma_fn` or `rgb_alpha_fn`.

    This function is differentiable to the outputs of the radiance field
    callback so it can be used for gradient-based optimization.

    Note:
        Either `rgb_sigma_fn` or `rgb_alpha_fn` should be provided. If both
        are given, `rgb_sigma_fn` is used.

    Warning:
        This function is not differentiable to `t_starts`, `t_ends` and `ray_indices`.

    Args:
        t_starts: Per-sample start distance. Tensor with shape (n_samples, 1).
        t_ends: Per-sample end distance. Tensor with shape (n_samples, 1).
        ray_indices: Ray index of each sample. IntTensor with shape (n_samples).
        n_rays: Total number of rays. This will decide the shape of the outputs.
        rgb_sigma_fn: A function that takes in samples {t_starts (N, 1),
            t_ends (N, 1), ray indices (N,)} and returns the post-activation
            rgb (N, 3) and density values (N, 1).
        rgb_alpha_fn: A function that takes in samples {t_starts (N, 1),
            t_ends (N, 1), ray indices (N,)} and returns the post-activation
            rgb (N, 3) and opacity values (N, 1).
        render_bkgd: Optional. Background color. Tensor with shape (3,).

    Returns:
        Ray colors (n_rays, 3), opacities (n_rays, 1) and depths (n_rays, 1).

    Raises:
        ValueError: If neither `rgb_sigma_fn` nor `rgb_alpha_fn` is given.

    Examples:

    .. code-block:: python

        rays_o = torch.rand((128, 3), device="cuda:0")
        rays_d = torch.randn((128, 3), device="cuda:0")
        rays_d = rays_d / rays_d.norm(dim=-1, keepdim=True)
        ray_indices, t_starts, t_ends = ray_marching(
            rays_o, rays_d, near_plane=0.1, far_plane=1.0, render_step_size=1e-3)

        def rgb_sigma_fn(t_starts, t_ends, ray_indices):
            # This is a dummy function that returns random values.
            rgbs = torch.rand((t_starts.shape[0], 3), device="cuda:0")
            sigmas = torch.rand((t_starts.shape[0], 1), device="cuda:0")
            return rgbs, sigmas

        colors, opacities, depths = rendering(
            t_starts, t_ends, ray_indices, n_rays=128, rgb_sigma_fn=rgb_sigma_fn)
        # torch.Size([128, 3]) torch.Size([128, 1]) torch.Size([128, 1])
        print(colors.shape, opacities.shape, depths.shape)

    """
    if rgb_sigma_fn is None and rgb_alpha_fn is None:
        raise ValueError(
            "At least one of `rgb_sigma_fn` and `rgb_alpha_fn` should be specified."
        )

    # Query the radiance field (with gradients) and turn its density or
    # opacity output into per-sample rendering weights.
    if rgb_sigma_fn is not None:
        rgbs, sigmas = rgb_sigma_fn(t_starts, t_ends, ray_indices)
        assert rgbs.shape[-1] == 3, "rgbs must have 3 channels, got {}".format(
            rgbs.shape
        )
        assert (
            sigmas.shape == t_starts.shape
        ), "sigmas must have shape of (N, 1)! Got {}".format(sigmas.shape)
        weights = render_weight_from_density(
            t_starts,
            t_ends,
            sigmas,
            ray_indices=ray_indices,
            n_rays=n_rays,
        )
    else:
        rgbs, alphas = rgb_alpha_fn(t_starts, t_ends, ray_indices)
        assert rgbs.shape[-1] == 3, "rgbs must have 3 channels, got {}".format(
            rgbs.shape
        )
        assert (
            alphas.shape == t_starts.shape
        ), "alphas must have shape of (N, 1)! Got {}".format(alphas.shape)
        weights = render_weight_from_alpha(
            alphas,
            ray_indices=ray_indices,
            n_rays=n_rays,
        )

    # Accumulate along the rays: weighted rgb -> color, the weights
    # themselves -> opacity, weighted interval midpoint -> depth.
    colors = accumulate_along_rays(
        weights, ray_indices, values=rgbs, n_rays=n_rays
    )
    opacities = accumulate_along_rays(
        weights, ray_indices, values=None, n_rays=n_rays
    )
    midpoints = (t_starts + t_ends) / 2.0
    depths = accumulate_along_rays(
        weights, ray_indices, values=midpoints, n_rays=n_rays
    )

    # Background composition: fill the remaining transparency with the
    # background color.
    if render_bkgd is not None:
        colors = colors + render_bkgd * (1.0 - opacities)

    return colors, opacities, depths


def accumulate_along_rays(
    weights: Tensor,
    ray_indices: Tensor,
    values: Optional[Tensor] = None,
    n_rays: Optional[int] = None,
) -> Tensor:
    """Accumulate volumetric values along the ray.

    Every sample contributes `weights * values` (or just `weights` when
    `values` is None) to the output row selected by its entry in
    `ray_indices`; contributions to the same row are summed.

    Note:
        This function is only differentiable to `weights` and `values`.

    Args:
        weights: Volumetric rendering weights for those samples. Tensor with
            shape (n_samples, 1).
        ray_indices: Ray index of each sample. LongTensor with shape (n_samples,).
        values: The values to be accumulated. Tensor with shape (n_samples, D).
            If None, the accumulated values are just weights. Default is None.
        n_rays: Total number of rays. This decides the shape of the output. If
            None, it is inferred from `ray_indices.max() + 1`. If specified it
            must be larger than `ray_indices.max()`. It is required when there
            are no samples at all. Default is None.

    Returns:
        Accumulated values with shape (n_rays, D), on the device and with the
        dtype of `weights * values`. If `values` is not given the accumulated
        weights are returned, in which case D == 1.

    Examples:

    .. code-block:: python

        # Rendering: accumulate rgbs, opacities, and depths along the rays.
        colors = accumulate_along_rays(weights, ray_indices, values=rgbs, n_rays=n_rays)
        opacities = accumulate_along_rays(weights, ray_indices, values=None, n_rays=n_rays)
        depths = accumulate_along_rays(
            weights,
            ray_indices,
            values=(t_starts + t_ends) / 2.0,
            n_rays=n_rays,
        )
        # (n_rays, 3), (n_rays, 1), (n_rays, 1)
        print(colors.shape, opacities.shape, depths.shape)

    """
    assert ray_indices.dim() == 1 and weights.dim() == 2
    if values is not None:
        assert (
            values.dim() == 2 and values.shape[0] == weights.shape[0]
        ), "Invalid shapes: {} vs {}".format(values.shape, weights.shape)
        src = weights * values
    else:
        src = weights

    if ray_indices.numel() == 0:
        # Without samples the ray count cannot be inferred from the indices.
        assert n_rays is not None
        # Keep the dtype of `src` so empty and non-empty batches agree.
        return torch.zeros(
            (n_rays, src.shape[-1]), device=src.device, dtype=src.dtype
        )

    if n_rays is None:
        n_rays = int(ray_indices.max()) + 1
    # `n_rays > ray_indices.max()` is the caller's responsibility; it is not
    # checked here, so an out-of-range index surfaces as a scatter error.

    # scatter_add_ needs an index with the same shape as `src`.
    index = ray_indices[:, None].expand(-1, src.shape[-1])
    outputs = torch.zeros(
        (n_rays, src.shape[-1]), device=src.device, dtype=src.dtype
    )
    outputs.scatter_add_(0, index, src)
    return outputs

def accumulate_along_rays_importance(
    weights: Tensor,
    ray_indices: Tensor,
    values: Optional[Tensor] = None,
    n_rays: Optional[int] = None,
) -> Tensor:
    """Accumulate volumetric values along the ray.

    Performs the same scatter-add accumulation as `accumulate_along_rays`:
    every sample contributes `weights * values` (or just `weights` when
    `values` is None) to the output row selected by its ray index.

    Note:
        This function is only differentiable to `weights` and `values`.

    Args:
        weights: Volumetric rendering weights for those samples. Tensor with
            shape (n_samples, 1).
        ray_indices: Ray index of each sample. LongTensor with shape (n_samples,).
        values: The values to be accumulated. Tensor with shape (n_samples, D).
            If None, the accumulated values are just weights. Default is None.
        n_rays: Total number of rays. This decides the shape of the output. If
            None, it is inferred from `ray_indices.max() + 1`. If specified it
            must be larger than `ray_indices.max()`. It is required when there
            are no samples at all. Default is None.

    Returns:
        Accumulated values with shape (n_rays, D), on the device and with the
        dtype of `weights * values`. If `values` is not given the accumulated
        weights are returned, in which case D == 1.

    Examples:

    .. code-block:: python

        opacities = accumulate_along_rays_importance(
            weights, ray_indices, values=None, n_rays=n_rays
        )
        # (n_rays, 1)
        print(opacities.shape)

    """
    assert ray_indices.dim() == 1 and weights.dim() == 2
    if values is not None:
        assert (
            values.dim() == 2 and values.shape[0] == weights.shape[0]
        ), "Invalid shapes: {} vs {}".format(values.shape, weights.shape)
        src = weights * values
    else:
        src = weights

    if ray_indices.numel() == 0:
        # Without samples the ray count cannot be inferred from the indices.
        assert n_rays is not None
        # Keep the dtype of `src` so empty and non-empty batches agree.
        return torch.zeros(
            (n_rays, src.shape[-1]), device=src.device, dtype=src.dtype
        )

    if n_rays is None:
        n_rays = int(ray_indices.max()) + 1
    # `n_rays > ray_indices.max()` is the caller's responsibility; it is not
    # checked here, so an out-of-range index surfaces as a scatter error.

    # scatter_add_ needs an index with the same shape as `src`.
    index = ray_indices[:, None].expand(-1, src.shape[-1])
    outputs = torch.zeros(
        (n_rays, src.shape[-1]), device=src.device, dtype=src.dtype
    )
    outputs.scatter_add_(0, index, src)
    return outputs


def accumulate_along_rays_patch_based(
    weights: Tensor,
    ray_indices: Tensor,
    values: Optional[Tensor] = None,
    n_patches: Optional[int] = None,
) -> Tensor:
    """Accumulate volumetric values along the rays of each patch.

    Patch-based variant of the per-ray accumulation: inputs carry an extra
    patch axis, and every sample contributes `weights * values` (or just
    `weights` when `values` is None) to the output slice selected by its entry
    in `ray_indices`.

    Note:
        This function is only differentiable to `weights` and `values`.

    Args:
        weights: Volumetric rendering weights for those samples. Tensor with
            shape (n_samples, patch_size, 1).
        ray_indices: Patch index of each sample. LongTensor with shape
            (n_samples,).
        values: The values to be accumulated. 3-D tensor whose first dimension
            is n_samples and which broadcasts against `weights`, e.g.
            (n_samples, patch_size, D). If None, the accumulated values are
            just weights. Default is None.
        n_patches: Total number of patches. This decides the first dimension of
            the output. If None, it is inferred from `ray_indices.max() + 1`.
            If specified it must be larger than `ray_indices.max()`. It is
            required when there are no samples at all. Default is None.

    Returns:
        Accumulated values with shape (n_patches, patch_size, D), on the device
        and with the dtype of `weights * values`. If `values` is not given the
        accumulated weights are returned, in which case D == 1.

    Examples:

    .. code-block:: python

        opacities = accumulate_along_rays_patch_based(
            weights, ray_indices, values=None, n_patches=n_patches
        )
        # (n_patches, patch_size, 1)
        print(opacities.shape)

    """
    assert ray_indices.dim() == 1 and weights.dim() == 3  # (num_samples, patch_size, 1)
    if values is not None:
        assert (
            values.dim() == 3 and values.shape[0] == weights.shape[0]
        ), "Invalid shapes: {} vs {}".format(values.shape, weights.shape)
        src = weights * values
    else:
        src = weights

    out_tail = (src.shape[1], src.shape[-1])

    if ray_indices.numel() == 0:
        # Without samples the patch count cannot be inferred from the indices.
        assert n_patches is not None
        # Keep the dtype of `src` so empty and non-empty batches agree.
        return torch.zeros(
            (n_patches,) + out_tail, device=src.device, dtype=src.dtype
        )

    if n_patches is None:
        n_patches = int(ray_indices.max()) + 1
    # `n_patches > ray_indices.max()` is the caller's responsibility; it is not
    # checked here, so an out-of-range index surfaces as a scatter error.

    # scatter_add_ needs an index with the same shape as `src`.
    index = ray_indices[:, None, None].expand(-1, *out_tail)
    outputs = torch.zeros(
        (n_patches,) + out_tail, device=src.device, dtype=src.dtype
    )
    outputs.scatter_add_(0, index, src)
    return outputs


def render_transmittance_from_density(
    t_starts: Tensor,
    t_ends: Tensor,
    sigmas: Tensor,
    *,
    packed_info: Optional[torch.Tensor] = None,
    ray_indices: Optional[torch.Tensor] = None,
    n_rays: Optional[int] = None,
) -> Tensor:
    r"""Compute transmittance :math:`T_i` from density :math:`\sigma_i`.

    .. math::
        T_i = exp(-\sum_{j=1}^{i-1}\sigma_j\delta_j)

    Note:
        Either `ray_indices` or `packed_info` should be provided. If `ray_indices` is
        provided, CUB acceleration will be used if available (CUDA >= 11.6). Otherwise,
        we will use the naive implementation with `packed_info`.

    Args:
        t_starts: Where the frustum-shape sample starts along a ray. Tensor with
            shape (n_samples, 1).
        t_ends: Where the frustum-shape sample ends along a ray. Tensor with
            shape (n_samples, 1).
        sigmas: The density values of the samples. Tensor with shape (n_samples, 1).
        packed_info: Optional. Stores information on which samples belong to the
            same ray. See :func:`nerfacc.ray_marching` for details. LongTensor
            with shape (n_rays, 2).
        ray_indices: Optional. Ray index of each sample. LongTensor with shape
            (n_sample).
        n_rays: Optional. Number of rays. Only used when `ray_indices` is given
            but the CUB path is not taken, in which case `ray_indices` is
            converted to `packed_info` through `pack_info`.

    Returns:
        The rendering transmittance. Tensor with shape (n_sample, 1).

    Examples:

    .. code-block:: python

        >>> t_starts = torch.tensor(
        >>>     [[0.0], [1.0], [2.0], [3.0], [4.0], [5.0], [6.0]], device="cuda")
        >>> t_ends = torch.tensor(
        >>>     [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0], [7.0]], device="cuda")
        >>> sigmas = torch.tensor(
        >>>     [[0.4], [0.8], [0.1], [0.8], [0.1], [0.0], [0.9]], device="cuda")
        >>> ray_indices = torch.tensor([0, 0, 0, 1, 1, 2, 2], device="cuda")
        >>> transmittance = render_transmittance_from_density(
        >>>     t_starts, t_ends, sigmas, ray_indices=ray_indices)
        [[1.00], [0.67], [0.30], [1.00], [0.45], [1.00], [1.00]]

    """
    has_ray_indices = ray_indices is not None
    assert (
        has_ray_indices or packed_info is not None
    ), "Either ray_indices or packed_info should be provided."

    # Fast path: per-sample ray indices with the CUB kernels.
    if has_ray_indices and _C.is_cub_available():
        return _RenderingTransmittanceFromDensityCUB.apply(
            ray_indices, t_starts, t_ends, sigmas
        )

    # Fallback: the naive kernels work on packed (per-ray) sample ranges.
    if packed_info is None:
        packed_info = pack_info(ray_indices, n_rays=n_rays)
    return _RenderingTransmittanceFromDensityNaive.apply(
        packed_info, t_starts, t_ends, sigmas
    )


def render_transmittance_from_alpha(
    alphas: Tensor,
    *,
    packed_info: Optional[torch.Tensor] = None,
    ray_indices: Optional[torch.Tensor] = None,
    n_rays: Optional[int] = None,
) -> Tensor:
    r"""Compute transmittance :math:`T_i` from alpha :math:`\alpha_i`.

    .. math::
        T_i = \prod_{j=1}^{i-1}(1-\alpha_j)

    Note:
        Either `ray_indices` or `packed_info` should be provided. If `ray_indices` is
        provided, CUB acceleration will be used if available (CUDA >= 11.6). Otherwise,
        we will use the naive implementation with `packed_info`.

    Args:
        alphas: The opacity values of the samples. Tensor with shape (n_samples, 1).
        packed_info: Optional. Stores information on which samples belong to the
            same ray. See :func:`nerfacc.ray_marching` for details. LongTensor
            with shape (n_rays, 2).
        ray_indices: Optional. Ray index of each sample. LongTensor with shape
            (n_sample).
        n_rays: Optional. Number of rays. Only used when `ray_indices` is given
            but the CUB path is not taken, in which case `ray_indices` is
            converted to `packed_info` through `pack_info`.

    Returns:
        The rendering transmittance. Tensor with shape (n_sample, 1).

    Examples:

    .. code-block:: python

        >>> alphas = torch.tensor(
        >>>     [[0.4], [0.8], [0.1], [0.8], [0.1], [0.0], [0.9]], device="cuda")
        >>> ray_indices = torch.tensor([0, 0, 0, 1, 1, 2, 2], device="cuda")
        >>> transmittance = render_transmittance_from_alpha(alphas, ray_indices=ray_indices)
        tensor([[1.0], [0.6], [0.12], [1.0], [0.2], [1.0], [1.0]])

    """
    has_ray_indices = ray_indices is not None
    assert (
        has_ray_indices or packed_info is not None
    ), "Either ray_indices or packed_info should be provided."

    # Fast path: per-sample ray indices with the CUB kernels.
    if has_ray_indices and _C.is_cub_available():
        return _RenderingTransmittanceFromAlphaCUB.apply(ray_indices, alphas)

    # Fallback: the naive kernels work on packed (per-ray) sample ranges.
    if packed_info is None:
        packed_info = pack_info(ray_indices, n_rays=n_rays)
    return _RenderingTransmittanceFromAlphaNaive.apply(packed_info, alphas)


def render_weight_from_density(
    t_starts: Tensor,
    t_ends: Tensor,
    sigmas: Tensor,
    *,
    packed_info: Optional[torch.Tensor] = None,
    ray_indices: Optional[torch.Tensor] = None,
    n_rays: Optional[int] = None,
) -> torch.Tensor:
    r"""Compute rendering weights :math:`w_i` from density :math:`\sigma_i` and interval :math:`\delta_i`.

    .. math::
        w_i = T_i(1 - exp(-\sigma_i\delta_i)), \quad\textrm{where}\quad T_i = exp(-\sum_{j=1}^{i-1}\sigma_j\delta_j)

    Note:
        Either `ray_indices` or `packed_info` should be provided. If `ray_indices` is
        provided, CUB acceleration will be used if available (CUDA >= 11.6). Otherwise,
        we will use the naive implementation with `packed_info`.

    Args:
        t_starts: Where the frustum-shape sample starts along a ray. Tensor with
            shape (n_samples, 1).
        t_ends: Where the frustum-shape sample ends along a ray. Tensor with
            shape (n_samples, 1).
        sigmas: The density values of the samples. Tensor with shape (n_samples, 1).
        packed_info: Optional. Stores information on which samples belong to the
            same ray. See :func:`nerfacc.ray_marching` for details. LongTensor
            with shape (n_rays, 2).
        ray_indices: Optional. Ray index of each sample. LongTensor with shape
            (n_sample).
        n_rays: Optional. Number of rays. Only used when `ray_indices` is given
            but the CUB path is not taken, in which case `ray_indices` is
            converted to `packed_info` through `pack_info`.

    Returns:
        The rendering weights. Tensor with shape (n_sample, 1).

    Examples:

    .. code-block:: python

        >>> t_starts = torch.tensor(
        >>>     [[0.0], [1.0], [2.0], [3.0], [4.0], [5.0], [6.0]], device="cuda")
        >>> t_ends = torch.tensor(
        >>>     [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0], [7.0]], device="cuda")
        >>> sigmas = torch.tensor(
        >>>     [[0.4], [0.8], [0.1], [0.8], [0.1], [0.0], [0.9]], device="cuda")
        >>> ray_indices = torch.tensor([0, 0, 0, 1, 1, 2, 2], device="cuda")
        >>> weights = render_weight_from_density(
        >>>     t_starts, t_ends, sigmas, ray_indices=ray_indices)
        [[0.33], [0.37], [0.03], [0.55], [0.04], [0.00], [0.59]]

    """
    has_ray_indices = ray_indices is not None
    assert (
        has_ray_indices or packed_info is not None
    ), "Either ray_indices or packed_info should be provided."

    if has_ray_indices and _C.is_cub_available():
        # CUB path: the kernel yields T_i; the per-sample opacity
        # 1 - exp(-sigma_i * delta_i) is formed here and multiplied in.
        accumulated = _RenderingTransmittanceFromDensityCUB.apply(
            ray_indices, t_starts, t_ends, sigmas
        )
        opacity = 1.0 - torch.exp(-sigmas * (t_ends - t_starts))
        return accumulated * opacity

    # Naive path: a single kernel produces the weights from packed ray ranges.
    if packed_info is None:
        packed_info = pack_info(ray_indices, n_rays=n_rays)
    return _RenderingWeightFromDensityNaive.apply(
        packed_info, t_starts, t_ends, sigmas
    )


def render_weight_from_alpha_patch_based(
    alphas: Tensor,
    ray_indices: Tensor,
    *,
    n_rays: Optional[int] = None,
) -> torch.Tensor:
    r"""Compute patch-based rendering weights :math:`w_i` from opacity :math:`\alpha_i`.

    .. math::
        w_i = T_i\alpha_i, \quad\textrm{where}\quad T_i = \prod_{j=1}^{i-1}(1-\alpha_j)

    Note:
        This variant always converts `ray_indices` to `packed_info` with
        `pack_info` and runs the naive patch-based implementation; there is no
        CUB path here.

    Args:
        alphas: The opacity values of the samples. Presumably shaped
            (n_samples, patch_size, 1) like the other patch-based helpers --
            TODO confirm against the extension kernel.
        ray_indices: Ray index of each sample. LongTensor with shape (n_sample).
        n_rays: Optional. Number of rays, forwarded to `pack_info`.

    Returns:
        The rendering weights produced by the patch-based kernel.

    Examples:

    .. code-block:: python

        >>> weights = render_weight_from_alpha_patch_based(
        >>>     alphas, ray_indices, n_rays=n_rays)

    """
    return _RenderingWeightFromAlphaPatchBasedNaive.apply(
        pack_info(ray_indices, n_rays=n_rays), alphas
    )


def render_weight_and_transmittance_from_alpha_patch_based(
        alphas: Tensor,
        ray_indices: Tensor,
        *,
        n_rays: Optional[int] = None,
) -> "Tuple[torch.Tensor, torch.Tensor]":
    r"""Compute patch-based rendering weights and transmittance from opacity.

    .. math::
        w_i = T_i\alpha_i, \quad\textrm{where}\quad T_i = \prod_{j=1}^{i-1}(1-\alpha_j)

    Note:
        This variant always converts `ray_indices` to `packed_info` with
        `pack_info` and runs the naive patch-based implementation; there is no
        CUB path here.

    Args:
        alphas: The opacity values of the samples. Presumably shaped
            (n_samples, patch_size, 1) like the other patch-based helpers --
            TODO confirm against the extension kernel.
        ray_indices: Ray index of each sample. LongTensor with shape (n_sample).
        n_rays: Optional. Number of rays, forwarded to `pack_info`.

    Returns:
        A `(weights, transmittance)` tuple as produced by the patch-based
        kernel.

    Examples:

    .. code-block:: python

        >>> weights, transmittance = (
        >>>     render_weight_and_transmittance_from_alpha_patch_based(
        >>>         alphas, ray_indices, n_rays=n_rays))

    """
    # The return annotation is a string so it is not evaluated at definition
    # time; the function returns two tensors, not one.
    packed_info = pack_info(ray_indices, n_rays=n_rays)
    weights, transmittance = _RenderingWeightAndTransmittanceFromAlphaPatchBasedNaive.apply(packed_info, alphas)
    return weights, transmittance

def render_weight_from_alpha(
    alphas: Tensor,
    *,
    packed_info: Optional[torch.Tensor] = None,
    ray_indices: Optional[torch.Tensor] = None,
    n_rays: Optional[int] = None,
) -> torch.Tensor:
    r"""Compute rendering weights :math:`w_i` from opacity :math:`\alpha_i`.

    .. math::
        w_i = T_i\alpha_i, \quad\textrm{where}\quad T_i = \prod_{j=1}^{i-1}(1-\alpha_j)

    Note:
        Either `ray_indices` or `packed_info` should be provided. If `ray_indices` is
        provided, CUB acceleration will be used if available (CUDA >= 11.6). Otherwise,
        we will use the naive implementation with `packed_info`.

    Args:
        alphas: The opacity values of the samples. Tensor with shape (n_samples, 1).
        packed_info: Optional. Stores information on which samples belong to the
            same ray. See :func:`nerfacc.ray_marching` for details. LongTensor
            with shape (n_rays, 2).
        ray_indices: Optional. Ray index of each sample. LongTensor with shape
            (n_sample).
        n_rays: Optional. Number of rays. Only used when `ray_indices` is given
            but the CUB path is not taken, in which case `ray_indices` is
            converted to `packed_info` through `pack_info`.

    Returns:
        The rendering weights. Tensor with shape (n_sample, 1).

    Examples:

    .. code-block:: python

        >>> alphas = torch.tensor(
        >>>     [[0.4], [0.8], [0.1], [0.8], [0.1], [0.0], [0.9]], device="cuda")
        >>> ray_indices = torch.tensor([0, 0, 0, 1, 1, 2, 2], device="cuda")
        >>> weights = render_weight_from_alpha(alphas, ray_indices=ray_indices)
        tensor([[0.4], [0.48], [0.012], [0.8], [0.02], [0.0], [0.9]])

    """
    has_ray_indices = ray_indices is not None
    assert (
        has_ray_indices or packed_info is not None
    ), "Either ray_indices or packed_info should be provided."

    if has_ray_indices and _C.is_cub_available():
        # CUB path: the kernel yields T_i, which is multiplied by alpha_i here.
        accumulated = _RenderingTransmittanceFromAlphaCUB.apply(
            ray_indices, alphas
        )
        return accumulated * alphas

    # Naive path: a single kernel produces the weights from packed ray ranges.
    if packed_info is None:
        packed_info = pack_info(ray_indices, n_rays=n_rays)
    return _RenderingWeightFromAlphaNaive.apply(packed_info, alphas)


@torch.no_grad()
def render_visibility(
    alphas: torch.Tensor,
    *,
    ray_indices: Optional[torch.Tensor] = None,
    packed_info: Optional[torch.Tensor] = None,
    n_rays: Optional[int] = None,
    early_stop_eps: float = 1e-4,
    alpha_thre: float = 0.0,
) -> torch.Tensor:
    """Filter out transparent and occluded samples.

    The transmittance is first computed from the sample opacity and used to
    filter out occluded samples; the opacity itself is used to filter out
    transparent samples. The result is a boolean tensor marking the samples
    with `transmittance >= early_stop_eps` and, when `alpha_thre > 0`, also
    `opacity >= alpha_thre`. Runs with gradients disabled.

    Note:
        Either `ray_indices` or `packed_info` should be provided. If `ray_indices` is
        provided, CUB acceleration will be used if available (CUDA >= 11.6). Otherwise,
        we will use the naive implementation with `packed_info`.

    Args:
        alphas: The opacity values of the samples. Tensor with shape (n_samples, 1).
        packed_info: Optional. Stores information on which samples belong to the
            same ray. See :func:`nerfacc.ray_marching` for details. LongTensor
            with shape (n_rays, 2).
        ray_indices: Optional. Ray index of each sample. LongTensor with shape
            (n_sample).
        n_rays: Optional. Number of rays. Only used when `ray_indices` is given
            but the CUB path is not taken, in which case `ray_indices` is
            converted to `packed_info` through `pack_info`.
        early_stop_eps: The early stopping threshold on transmittance.
        alpha_thre: The threshold on opacity. The opacity test is skipped
            unless this is greater than zero.

    Returns:
        The visibility of each sample, with the trailing axis squeezed away:
        a boolean tensor with shape (n_samples,) for (n_samples, 1) inputs.

    Examples:

    .. code-block:: python

        >>> alphas = torch.tensor(
        >>>     [[0.4], [0.8], [0.1], [0.8], [0.1], [0.0], [0.9]], device="cuda")
        >>> ray_indices = torch.tensor([0, 0, 0, 1, 1, 2, 2], device="cuda")
        >>> transmittance = render_transmittance_from_alpha(alphas, ray_indices=ray_indices)
        tensor([[1.0], [0.6], [0.12], [1.0], [0.2], [1.0], [1.0]])
        >>> visibility = render_visibility(
        >>>     alphas, ray_indices=ray_indices, early_stop_eps=0.3, alpha_thre=0.2)
        tensor([True,  True, False,  True, False, False,  True])

    """
    has_ray_indices = ray_indices is not None
    assert (
        has_ray_indices or packed_info is not None
    ), "Either ray_indices or packed_info should be provided."

    use_cub = has_ray_indices and _C.is_cub_available()
    if use_cub:
        accumulated = _RenderingTransmittanceFromAlphaCUB.apply(
            ray_indices, alphas
        )
    else:
        if packed_info is None:
            packed_info = pack_info(ray_indices, n_rays=n_rays)
        accumulated = _RenderingTransmittanceFromAlphaNaive.apply(
            packed_info, alphas
        )

    # Occlusion test first, then the optional transparency test.
    visible = accumulated >= early_stop_eps
    if alpha_thre > 0:
        visible = visible & (alphas >= alpha_thre)
    return visible.squeeze(-1)


@torch.no_grad()
def render_visibility_patch_based(
        alphas: torch.Tensor,
        *,
        ray_indices: Optional[torch.Tensor] = None,
        packed_info: Optional[torch.Tensor] = None,
        n_patches: Optional[int] = None,
        early_stop_eps: float = 1e-4,
        alpha_thre: float = 0.0,
) -> torch.Tensor:
    """Filter out transparent and occluded samples (patch-based variant).

    The transmittance is first computed from the sample opacity. A sample is
    kept when the transmittance of at least one entry along dimension 1 is
    `>= early_stop_eps`; when `alpha_thre > 0` the result is additionally
    and-ed with `alphas >= alpha_thre`. Runs with gradients disabled.

    Note:
        Either `ray_indices` or `packed_info` should be provided. If
        `ray_indices` is provided and CUB is available, the CUB transmittance
        kernel is used; otherwise the naive patch-based implementation with
        `packed_info` is used.

    Args:
        alphas: The opacity values of the samples. Presumably shaped
            (n_samples, patch_size, 1) -- TODO confirm against the extension
            kernel.
        packed_info: Optional. Stores information on which samples belong to
            the same ray. See :func:`nerfacc.ray_marching` for details.
        ray_indices: Optional. Index of each sample. LongTensor with shape
            (n_sample).
        n_patches: Optional. Number of patches, forwarded to `pack_info` as
            `n_rays` when `ray_indices` has to be converted to `packed_info`.
        early_stop_eps: The early stopping threshold on transmittance.
        alpha_thre: The threshold on opacity. The opacity test is skipped
            unless this is greater than zero.

    Returns:
        Boolean visibility tensor. Size-1 axes after the first are squeezed
        away, but the sample axis (dimension 0) is always kept, so a
        single-sample input yields a tensor whose first dimension is 1.

    Examples:

    .. code-block:: python

        >>> visibility = render_visibility_patch_based(
        >>>     alphas, ray_indices=ray_indices, n_patches=n_patches,
        >>>     early_stop_eps=0.3, alpha_thre=0.2)

    """
    assert (
        ray_indices is not None or packed_info is not None
    ), "Either ray_indices or packed_info should be provided."
    if ray_indices is not None and _C.is_cub_available():
        # NOTE(review): this branch reuses the non-patch CUB kernel; confirm
        # that it accepts patch-shaped `alphas`.
        transmittance = _RenderingTransmittanceFromAlphaCUB.apply(
            ray_indices, alphas
        )
    else:
        if packed_info is None:
            packed_info = pack_info(ray_indices, n_rays=n_patches)
        transmittance = _RenderingTransmittanceFromAlphaPatchBasedNaive.apply(
            packed_info, alphas
        )
    # A sample survives if any entry along dimension 1 is still unoccluded.
    visibility = torch.any(transmittance >= early_stop_eps, dim=1, keepdim=True)
    if alpha_thre > 0:
        # NOTE(review): this broadcasts back to the shape of `alphas`, so the
        # result keeps dimension 1 when it is larger than 1 -- confirm that a
        # per-entry mask (rather than a per-sample one) is intended here.
        visibility = visibility & (alphas >= alpha_thre)
    # Squeeze only the trailing axes. A bare `squeeze()` would also drop the
    # sample axis whenever there is exactly one sample.
    for axis in range(visibility.dim() - 1, 0, -1):
        visibility = visibility.squeeze(axis)
    return visibility

class _RenderingTransmittanceFromDensityCUB(torch.autograd.Function):
    """Rendering transmittance from density with CUB implementation.

    Only `sigmas` receives a gradient; `ray_indices`, `t_starts` and `t_ends`
    always get `None` from `backward`.
    """

    @staticmethod
    def forward(ctx, ray_indices, t_starts, t_ends, sigmas):
        """Run the CUB forward kernel on contiguous copies of the inputs."""
        ray_indices = ray_indices.contiguous()
        t_starts = t_starts.contiguous()
        t_ends = t_ends.contiguous()
        sigmas = sigmas.contiguous()
        transmittance = _C.transmittance_from_sigma_forward_cub(
            ray_indices, t_starts, t_ends, sigmas
        )
        # Tensors are only kept alive when a gradient for `sigmas` is needed.
        if ctx.needs_input_grad[3]:
            ctx.save_for_backward(ray_indices, t_starts, t_ends, transmittance)
        return transmittance

    @staticmethod
    def backward(ctx, transmittance_grads):
        """Return the gradient w.r.t. `sigmas`; other inputs get `None`."""
        if not ctx.needs_input_grad[3]:
            # Reached when only `t_starts`/`t_ends` require grad: forward
            # saved nothing, so unpacking `ctx.saved_tensors` would fail.
            return None, None, None, None
        transmittance_grads = transmittance_grads.contiguous()
        ray_indices, t_starts, t_ends, transmittance = ctx.saved_tensors
        grad_sigmas = _C.transmittance_from_sigma_backward_cub(
            ray_indices, t_starts, t_ends, transmittance, transmittance_grads
        )
        return None, None, None, grad_sigmas


class _RenderingTransmittanceFromDensityNaive(torch.autograd.Function):
    """Rendering transmittance from density with naive forloop.

    Only `sigmas` receives a gradient; `packed_info`, `t_starts` and `t_ends`
    always get `None` from `backward`.
    """

    @staticmethod
    def forward(ctx, packed_info, t_starts, t_ends, sigmas):
        """Run the naive forward kernel on contiguous copies of the inputs."""
        packed_info = packed_info.contiguous()
        t_starts = t_starts.contiguous()
        t_ends = t_ends.contiguous()
        sigmas = sigmas.contiguous()
        transmittance = _C.transmittance_from_sigma_forward_naive(
            packed_info, t_starts, t_ends, sigmas
        )
        # Tensors are only kept alive when a gradient for `sigmas` is needed.
        if ctx.needs_input_grad[3]:
            ctx.save_for_backward(packed_info, t_starts, t_ends, transmittance)
        return transmittance

    @staticmethod
    def backward(ctx, transmittance_grads):
        """Return the gradient w.r.t. `sigmas`; other inputs get `None`."""
        if not ctx.needs_input_grad[3]:
            # Reached when only `t_starts`/`t_ends` require grad: forward
            # saved nothing, so unpacking `ctx.saved_tensors` would fail.
            return None, None, None, None
        transmittance_grads = transmittance_grads.contiguous()
        packed_info, t_starts, t_ends, transmittance = ctx.saved_tensors
        grad_sigmas = _C.transmittance_from_sigma_backward_naive(
            packed_info, t_starts, t_ends, transmittance, transmittance_grads
        )
        return None, None, None, grad_sigmas


class _RenderingTransmittanceFromAlphaCUB(torch.autograd.Function):
    """Autograd wrapper around the CUB transmittance-from-opacity kernels.

    Only `alphas` receives a gradient; `ray_indices` gets `None`.
    """

    @staticmethod
    def forward(ctx, ray_indices, alphas):
        """Run the CUB forward kernel on contiguous copies of the inputs."""
        indices_c = ray_indices.contiguous()
        alphas_c = alphas.contiguous()
        result = _C.transmittance_from_alpha_forward_cub(indices_c, alphas_c)
        # Tensors are only kept alive when a gradient for `alphas` is needed.
        if ctx.needs_input_grad[1]:
            ctx.save_for_backward(indices_c, result, alphas_c)
        return result

    @staticmethod
    def backward(ctx, transmittance_grads):
        """Return `(None, grad_alphas)` computed by the CUB backward kernel."""
        upstream = transmittance_grads.contiguous()
        saved_indices, saved_transmittance, saved_alphas = ctx.saved_tensors
        return None, _C.transmittance_from_alpha_backward_cub(
            saved_indices, saved_alphas, saved_transmittance, upstream
        )


class _RenderingTransmittanceFromAlphaNaive(torch.autograd.Function):
    """Autograd wrapper around the naive opacity -> transmittance kernels.

    ``forward`` calls ``_C.transmittance_from_alpha_forward_naive`` and
    ``backward`` calls ``_C.transmittance_from_alpha_backward_naive``. Only
    ``alphas`` receives a gradient.
    """

    @staticmethod
    def forward(ctx, packed_info, alphas):
        packed_info, alphas = packed_info.contiguous(), alphas.contiguous()
        transmittance = _C.transmittance_from_alpha_forward_naive(
            packed_info, alphas
        )
        # Tensors are stashed only when a gradient w.r.t. alphas is required.
        if ctx.needs_input_grad[1]:
            ctx.save_for_backward(packed_info, transmittance, alphas)
        return transmittance

    @staticmethod
    def backward(ctx, transmittance_grads):
        packed_info, transmittance, alphas = ctx.saved_tensors
        grad_alphas = _C.transmittance_from_alpha_backward_naive(
            packed_info,
            alphas,
            transmittance,
            transmittance_grads.contiguous(),
        )
        # One entry per forward input; packed_info has no gradient.
        return None, grad_alphas

class _RenderingTransmittanceFromAlphaPatchBasedNaive(torch.autograd.Function):
    """Rendering transmittance from opacity (patch-based) with naive forloop.

    ``forward`` calls ``_C.transmittance_from_alpha_patch_based_forward_naive``.
    Only ``alphas`` receives a gradient.
    """

    @staticmethod
    def forward(ctx, packed_info, alphas):
        packed_info, alphas = packed_info.contiguous(), alphas.contiguous()
        transmittance = _C.transmittance_from_alpha_patch_based_forward_naive(
            packed_info, alphas
        )
        # Tensors are stashed only when a gradient w.r.t. alphas is required.
        if ctx.needs_input_grad[1]:
            ctx.save_for_backward(packed_info, transmittance, alphas)
        return transmittance

    @staticmethod
    def backward(ctx, grad_transmittance):
        packed_info, transmittance, alphas = ctx.saved_tensors
        # NOTE(review): this reuses the *weight_and_transmittance* backward
        # kernel, but with the (packed_info, alphas, transmittance, grad)
        # argument order used by the plain transmittance backward kernels.
        # Confirm against the C++ bindings that this is the intended kernel.
        grad_alphas = _C.weight_and_transmittance_from_alpha_patch_based_backward_naive(
            packed_info,
            alphas,
            transmittance,
            grad_transmittance.contiguous(),
        )
        # One entry per forward input; packed_info has no gradient.
        return None, grad_alphas

class _RenderingWeightFromDensityNaive(torch.autograd.Function):
    """Autograd wrapper around the naive density -> weight kernels.

    ``forward`` calls ``_C.weight_from_sigma_forward_naive`` and ``backward``
    calls ``_C.weight_from_sigma_backward_naive``. Only ``sigmas`` receives a
    gradient.
    """

    @staticmethod
    def forward(ctx, packed_info, t_starts, t_ends, sigmas):
        # The kernels are handed contiguous tensors only.
        inputs = tuple(
            tensor.contiguous()
            for tensor in (packed_info, t_starts, t_ends, sigmas)
        )
        weights = _C.weight_from_sigma_forward_naive(*inputs)
        # Tensors are stashed only when a gradient w.r.t. sigmas is required.
        if ctx.needs_input_grad[3]:
            ctx.save_for_backward(*inputs, weights)
        return weights

    @staticmethod
    def backward(ctx, grad_weights):
        packed_info, t_starts, t_ends, sigmas, weights = ctx.saved_tensors
        grad_sigmas = _C.weight_from_sigma_backward_naive(
            weights,
            grad_weights.contiguous(),
            packed_info,
            t_starts,
            t_ends,
            sigmas,
        )
        # One entry per forward input; only sigmas is differentiable.
        return None, None, None, grad_sigmas


class _RenderingWeightFromAlphaNaive(torch.autograd.Function):
    """Autograd wrapper around the naive opacity -> weight kernels.

    ``forward`` calls ``_C.weight_from_alpha_forward_naive`` and ``backward``
    calls ``_C.weight_from_alpha_backward_naive``. Only ``alphas`` receives a
    gradient.
    """

    @staticmethod
    def forward(ctx, packed_info, alphas):
        packed_info, alphas = packed_info.contiguous(), alphas.contiguous()
        weights = _C.weight_from_alpha_forward_naive(packed_info, alphas)
        # Tensors are stashed only when a gradient w.r.t. alphas is required.
        if ctx.needs_input_grad[1]:
            ctx.save_for_backward(packed_info, alphas, weights)
        return weights

    @staticmethod
    def backward(ctx, grad_weights):
        packed_info, alphas, weights = ctx.saved_tensors
        grad_alphas = _C.weight_from_alpha_backward_naive(
            weights, grad_weights.contiguous(), packed_info, alphas
        )
        # One entry per forward input; packed_info has no gradient.
        return None, grad_alphas

class _RenderingWeightFromAlphaPatchBasedNaive(torch.autograd.Function):
    """Autograd wrapper around the naive patch-based opacity -> weight kernels.

    ``forward`` calls ``_C.weight_from_alpha_patch_based_forward_naive`` and
    ``backward`` calls ``_C.weight_from_alpha_patch_based_backward_naive``.
    Only ``alphas`` receives a gradient.
    """

    @staticmethod
    def forward(ctx, packed_info, alphas):
        packed_info, alphas = packed_info.contiguous(), alphas.contiguous()
        weights = _C.weight_from_alpha_patch_based_forward_naive(
            packed_info, alphas
        )
        # Tensors are stashed only when a gradient w.r.t. alphas is required.
        if ctx.needs_input_grad[1]:
            ctx.save_for_backward(packed_info, alphas, weights)
        return weights

    @staticmethod
    def backward(ctx, grad_weights):
        packed_info, alphas, weights = ctx.saved_tensors
        grad_alphas = _C.weight_from_alpha_patch_based_backward_naive(
            weights, grad_weights.contiguous(), packed_info, alphas
        )
        # One entry per forward input; packed_info has no gradient.
        return None, grad_alphas


class _RenderingWeightAndTransmittanceFromAlphaPatchBasedNaive(torch.autograd.Function):
    """Rendering weight and transmittance from opacity (patch-based), naive forloop.

    ``forward`` returns the ``(weights, transmittance)`` pair produced by
    ``_C.weight_and_transmittance_from_alpha_patch_based_forward_naive``.
    Only ``alphas`` receives a gradient.
    """

    @staticmethod
    def forward(ctx, packed_info, alphas):
        packed_info, alphas = packed_info.contiguous(), alphas.contiguous()
        outputs = _C.weight_and_transmittance_from_alpha_patch_based_forward_naive(
            packed_info, alphas
        )
        weights, transmittance = outputs
        # Tensors are stashed only when a gradient w.r.t. alphas is required.
        if ctx.needs_input_grad[1]:
            ctx.save_for_backward(packed_info, alphas, weights)
        return weights, transmittance

    @staticmethod
    def backward(ctx, grad_weights, grad_transmittance):
        packed_info, alphas, weights = ctx.saved_tensors
        # NOTE(review): grad_transmittance is never used, so only the gradient
        # flowing through `weights` reaches `alphas`. Confirm this is intended.
        grad_alphas = _C.weight_and_transmittance_from_alpha_patch_based_backward_naive(
            weights, grad_weights.contiguous(), packed_info, alphas
        )
        # One entry per forward input; packed_info has no gradient.
        return None, grad_alphas


class _RenderingWeightFromAlphaImportanceSamplingNaive(torch.autograd.Function):
    """Rendering weight from opacity and importance pdfs with naive forloop.

    ``forward`` calls ``_C.weight_from_alpha_importance_sampling_forward_naive``.
    Only ``alphas`` receives a gradient; ``packed_info`` and
    ``importance_pdfs`` are treated as non-differentiable.
    """

    @staticmethod
    def forward(ctx, packed_info, alphas, importance_pdfs):
        packed_info = packed_info.contiguous()
        alphas = alphas.contiguous()
        importance_pdfs = importance_pdfs.contiguous()
        weights = _C.weight_from_alpha_importance_sampling_forward_naive(
            packed_info, alphas, importance_pdfs
        )
        # Tensors are stashed only when a gradient w.r.t. alphas is required.
        if ctx.needs_input_grad[1]:
            ctx.save_for_backward(packed_info, alphas, importance_pdfs, weights)
        return weights

    @staticmethod
    def backward(ctx, grad_weights):
        grad_weights = grad_weights.contiguous()
        packed_info, alphas, importance_pdfs, weights = ctx.saved_tensors
        # NOTE(review): this calls the plain `weight_from_alpha_backward_naive`
        # kernel with an extra `importance_pdfs` argument, whereas
        # `_RenderingWeightFromAlphaNaive` calls the same kernel with four
        # arguments. Confirm against the C++ bindings that a five-argument
        # overload (or a dedicated importance-sampling kernel) exists.
        grad_alphas = _C.weight_from_alpha_backward_naive(
            weights, grad_weights, packed_info, alphas, importance_pdfs
        )
        # autograd requires one gradient per forward input
        # (packed_info, alphas, importance_pdfs); returning only two raises
        # "returned an incorrect number of gradients".
        return None, grad_alphas, None

================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/scripts/run_aws_listing.py
================================================
import argparse
import os

from boto3 import client

# Command-line configuration: AWS credentials plus the bucket/region to index.
parser = argparse.ArgumentParser()
parser.add_argument("--access_key_id", type=str, required=True)
parser.add_argument("--secret_access_key", type=str, required=True)
parser.add_argument("--bucket", type=str, required=True)
parser.add_argument("--region", type=str, required=True)
args = parser.parse_args()

# Public base URL of the bucket; it already ends with "/".
URL = f"https://{args.bucket}.s3.{args.region}.amazonaws.com/"

s3 = client(
    "s3",
    aws_access_key_id=args.access_key_id,
    aws_secret_access_key=args.secret_access_key,
)

# Maps "<subdir>" -> names of the files stored directly under "whl/<subdir>/".
# Everything is collected in one paginated pass over "whl/":
#   * a single list_objects_v2 response is truncated, so a paginator is needed
#     to see every key;
#   * grouping by the key's own directory component avoids the prefix clash of
#     a per-directory query (Prefix="whl/pt11" also matches "whl/pt110/...").
subdirectories = {}
paginator = s3.get_paginator("list_objects_v2")
for page in paginator.paginate(Bucket=args.bucket, Prefix="whl/"):
    # "Contents" is absent from the response when nothing matches the prefix.
    for data in page.get("Contents", []):
        splits = data["Key"].split("/")
        # Only keys of the exact form "whl/<subdir>/<file>" are indexed.
        if len(splits) != 3:
            continue
        _, subdir, filename = splits
        files = subdirectories.setdefault(subdir, [])
        # A key ending in "/" (folder placeholder) registers the directory but
        # is not itself a downloadable file.
        if filename:
            files.append(filename)

# Write one HTML index per subdirectory and upload it as "whl/<subdir>.html".
for subdir, files in subdirectories.items():
    # URLs always use "/" separators, so they are built by string formatting
    # rather than os.path.join (which is platform dependent).
    lines = "".join(
        f"<a href='{URL}whl/{subdir}/{filename}'>{filename}</a>\n<br>\n"
        for filename in files
    )

    html = f"<html>\n<head></head>\n<body>\n{lines}\n</body>\n</html>\n"
    html_file = f"/tmp/{subdir}.html"
    with open(html_file, "w") as f:
        f.write(html)

    s3.upload_file(
        html_file,
        args.bucket,
        f"whl/{subdir}.html",
        ExtraArgs={"ContentType": "text/html"},
    )


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/scripts/run_dev_checks.py
================================================
#!/usr/bin/env python
"""Simple yaml debugger"""
import subprocess

import yaml
from rich.console import Console
from rich.style import Style

# Shared rich console used for all output of this script.
console = Console(width=120)

# Names of the workflow steps that are executed locally; every other step of
# the github actions file is skipped.
LOCAL_TESTS = [
    "Run license checks",
    "Run isort",
    "Run Black",
    "Python Pylint",
    "Test with pytest",
]


def run_command(command: str) -> bool:
    """Run a shell command and report whether it succeeded.

    An error message is printed to the console when the command exits with a
    non-zero status.

    Args:
        command: command line to run through the shell.

    Returns:
        True if the command exited with status 0, False otherwise.
    """
    succeeded = subprocess.call(command, shell=True) == 0
    if not succeeded:
        console.print(f"[bold red]Error: `{command}` failed.")
    return succeeded


def run_github_actions_file(filename: str):
    """Run a github actions file locally.

    The steps of the ``build`` job whose name is listed in ``LOCAL_TESTS`` are
    executed, followed by ``pytest`` and a documentation build. A summary
    banner is printed at the end.

    Args:
        filename: Which yml github actions file to run.
    """
    with open(filename, "rb") as f:
        my_dict = yaml.safe_load(f)
    steps = my_dict["jobs"]["build"]["steps"]

    # Becomes False at the first failing command. Because of the short-circuit
    # `success and run_command(...)` below, commands after a failure are
    # announced but no longer executed.
    success = True

    for step in steps:
        if "name" in step and step["name"] in LOCAL_TESTS:
            # Collapse the multi-line script into a single shell line and drop
            # "--check" so the tools are run without that flag locally.
            compressed = step["run"].replace("\n", ";").replace("\\", "")
            compressed = compressed.replace("--check", "")
            curr_command = f"{compressed}"

            console.line()
            console.rule(f"[bold green]Running: {curr_command}")
            success = success and run_command(curr_command)
        else:
            # A workflow step may define neither "name" nor "uses" (a bare
            # "run" step), so fall back gracefully instead of raising KeyError.
            if "name" in step:
                skip_name = step["name"]
            else:
                skip_name = step.get("uses", step.get("run", "<unnamed step>"))
            console.print(f"Skipping {skip_name}")

    # Code Testing
    console.line()
    console.rule("[bold green]Running pytest")
    success = success and run_command("pytest")

    # Add checks for building documentation
    console.line()
    console.rule("[bold green]Building Documentation")
    success = success and run_command(
        "cd docs/; make clean; make html SPHINXOPTS='-W;'"
    )

    if success:
        console.line()
        console.rule(characters="=")
        console.print(
            "[bold green]:TADA: :TADA: :TADA: ALL CHECKS PASSED :TADA: :TADA: :TADA:",
            justify="center",
        )
        console.rule(characters="=")
    else:
        console.line()
        console.rule(characters="=", style=Style(color="red"))
        console.print(
            "[bold red]:skull: :skull: :skull: ERRORS FOUND :skull: :skull: :skull:",
            justify="center",
        )
        console.rule(characters="=", style=Style(color="red"))


if __name__ == "__main__":
    # The workflow path is relative, so the script reads it from the current
    # working directory.
    run_github_actions_file(filename=".github/workflows/code_checks.yml")


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/scripts/run_profiler.py
================================================
from typing import Callable

import torch
import tqdm

import nerfacc

# timing
# https://github.com/pytorch/pytorch/commit/d2784c233bfc57a1d836d961694bcc8ec4ed45e4


class Profiler:
    """Callable that profiles a function with ``torch.profiler``.

    The function is first called ``warmup`` times without profiling and then
    ``repeat`` times under the profiler.
    """

    def __init__(self, warmup=10, repeat=1000):
        self.warmup, self.repeat = warmup, repeat

    def __call__(self, func: Callable):
        """Profile ``func`` and return ``(cpu time, cuda time, cuda memory)``.

        The two times are the summed self times of all recorded events divided
        by ``repeat``; the memory figure is the largest per-event self CUDA
        memory usage.
        """
        # Warm-up runs, finished before profiling starts.
        for _ in range(self.warmup):
            func()
        torch.cuda.synchronize()

        # Profiled runs.
        activities = [
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA,
        ]
        with torch.profiler.profile(
            activities=activities, profile_memory=True
        ) as prof:
            for _ in range(self.repeat):
                func()
            torch.cuda.synchronize()

        # Aggregate the recorded events.
        events = prof.key_averages()
        cpu_time = sum(e.self_cpu_time_total for e in events) / self.repeat
        cuda_time = sum(e.self_cuda_time_total for e in events) / self.repeat
        cuda_memory = max(e.self_cuda_memory_usage for e in events)
        return (
            cpu_time,  # in us
            cuda_time,  # in us
            cuda_memory,  # in bytes
        )


def main():
    """Benchmark two weight-from-density rendering paths on ``cuda:0``.

    Each path is run once, then 100 times under a progress bar, and finally
    handed to ``Profiler``; the averaged timings and memory are printed.
    """
    device = "cuda:0"
    torch.manual_seed(42)
    profiler = Profiler(warmup=10, repeat=100)

    def benchmark(fn):
        # Same sequence for every benchmarked callable: one initial call,
        # a visible 100-iteration loop, then the profiled measurement.
        fn()
        torch.cuda.synchronize()
        for _ in tqdm.tqdm(range(100)):
            fn()
            torch.cuda.synchronize()
        cpu_t, cuda_t, cuda_bytes = profiler(fn)
        print(
            f"{cpu_t:.2f} us, {cuda_t:.2f} us, {cuda_bytes / 1024 / 1024:.2f} MB"
        )

    # # contract
    # print("* contract")
    # x = torch.rand([1024, 3], device=device)
    # roi = torch.tensor([0, 0, 0, 1, 1, 1], dtype=torch.float32, device=device)
    # fn = lambda: nerfacc.contract(
    #     x, roi=roi, type=nerfacc.ContractionType.UN_BOUNDED_TANH
    # )
    # cpu_t, cuda_t, cuda_bytes = profiler(fn)
    # print(f"{cpu_t:.2f} us, {cuda_t:.2f} us, {cuda_bytes / 1024 / 1024:.2f} MB")

    # rendering
    print("* rendering")
    batch_size = 81920
    rays_o = torch.rand((batch_size, 3), device=device)
    directions = torch.randn((batch_size, 3), device=device)
    rays_d = directions / directions.norm(dim=-1, keepdim=True)

    ray_indices, t_starts, t_ends = nerfacc.ray_marching(
        rays_o,
        rays_d,
        near_plane=0.1,
        far_plane=1.0,
        render_step_size=1e-1,
    )
    sigmas = torch.randn_like(t_starts, requires_grad=True)

    # NOTE(review): the argument order here (ray_indices first) differs from
    # how render_weight_from_density is called in tests/test_rendering.py
    # (t_starts, t_ends, sigmas, ray_indices=...). Confirm against the API.
    def public_api_step():
        weights = nerfacc.render_weight_from_density(
            ray_indices, t_starts, t_ends, sigmas
        )
        weights.sum().backward()

    benchmark(public_api_step)

    packed_info = nerfacc.pack_info(ray_indices, n_rays=batch_size)

    # NOTE(review): verify that `nerfacc.vol_rendering._RenderingDensity`
    # exists in this nerfacc version and accepts these five arguments.
    def autograd_function_step():
        weights = nerfacc.vol_rendering._RenderingDensity.apply(
            packed_info, t_starts, t_ends, sigmas, 0
        )
        weights.sum().backward()

    benchmark(autograd_function_step)


if __name__ == "__main__":
    # Run the benchmarks when the file is executed as a script.
    main()


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/setup.cfg
================================================
[isort]
multi_line_output = 3
line_length = 80
include_trailing_comma = true
skip=./examples/pycolmap

================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/setup.py
================================================
import glob
import os
import os.path as osp
import platform
import sys

from setuptools import find_packages, setup

# `__version__` is expected to be (re)assigned by executing nerfacc/version.py
# in this module's namespace. The file is opened in a `with` block so the
# handle is closed deterministically instead of being leaked.
__version__ = None
with open("nerfacc/version.py", "r") as _version_file:
    exec(_version_file.read())

URL = "https://github.com/KAIR-BAIR/nerfacc"

# Build switches read from the environment; each is enabled only by the exact
# value "1".
BUILD_NO_CUDA = os.getenv("BUILD_NO_CUDA", "0") == "1"
WITH_SYMBOLS = os.getenv("WITH_SYMBOLS", "0") == "1"


def get_ext():
    """Return the ``build_ext`` command class used to compile the extension.

    The class is ``BuildExtension`` configured without the Python ABI suffix
    and with ninja disabled.
    """
    # torch is imported inside the function, not at module import time.
    from torch.utils import cpp_extension

    build_options = {"no_python_abi_suffix": True, "use_ninja": False}
    return cpp_extension.BuildExtension.with_options(**build_options)


def get_extensions():
    """Describe the ``nerfacc.csrc`` CUDA extension to build.

    Compiler and linker flags are derived from the platform, from torch's
    parallel backend information and from the ``NVCC_FLAGS`` /
    ``WITH_SYMBOLS`` environment settings.

    Returns:
        A one-element list containing the configured ``CUDAExtension``.
    """
    import torch
    from torch.__config__ import parallel_info
    from torch.utils.cpp_extension import CUDAExtension

    extensions_dir = osp.join("nerfacc", "cuda", "csrc")
    sources = glob.glob(osp.join(extensions_dir, "*.cu"))
    # remove generated 'hip' files, in case of rebuilds
    sources = [path for path in sources if "hip" not in path]

    undef_macros = []
    define_macros = []

    if sys.platform == "win32":
        define_macros += [("nerfacc_EXPORTS", None)]

    extra_compile_args = {"cxx": ["-O3"]}
    if os.name != "nt":  # Not on Windows:
        extra_compile_args["cxx"] += ["-Wno-sign-compare"]
    # "-s" is passed to the linker unless WITH_SYMBOLS is set.
    extra_link_args = [] if WITH_SYMBOLS else ["-s"]

    info = parallel_info()
    if (
        "backend: OpenMP" in info
        and "OpenMP not found" not in info
        and sys.platform != "darwin"
    ):
        extra_compile_args["cxx"] += ["-DAT_PARALLEL_OPENMP"]
        if sys.platform == "win32":
            extra_compile_args["cxx"] += ["/openmp"]
        else:
            extra_compile_args["cxx"] += ["-fopenmp"]
    else:
        print("Compiling without OpenMP...")

    # Compile for mac arm64
    if sys.platform == "darwin" and platform.machine() == "arm64":
        extra_compile_args["cxx"] += ["-arch", "arm64"]
        extra_link_args += ["-arch", "arm64"]

    # split() without an argument ignores repeated, leading and trailing
    # whitespace, so NVCC_FLAGS="-a  -b " no longer yields empty arguments.
    nvcc_flags = os.getenv("NVCC_FLAGS", "").split()
    nvcc_flags += ["-O3"]
    if torch.version.hip:
        # USE_ROCM was added to later versions of PyTorch.
        # Define here to support older PyTorch versions as well:
        define_macros += [("USE_ROCM", None)]
        undef_macros += ["__HIP_NO_HALF_CONVERSIONS__"]
    else:
        nvcc_flags += ["--expt-relaxed-constexpr"]
    extra_compile_args["nvcc"] = nvcc_flags

    extension = CUDAExtension(
        "nerfacc.csrc",
        sources,
        include_dirs=[osp.join(extensions_dir, "include")],
        define_macros=define_macros,
        undef_macros=undef_macros,
        extra_compile_args=extra_compile_args,
        extra_link_args=extra_link_args,
    )

    return [extension]


# work-around hipify abs paths
include_package_data = True
# if torch.cuda.is_available() and torch.version.hip:
#     include_package_data = False

# Package definition. The CUDA extension and its build command are only
# registered when BUILD_NO_CUDA is not set, so a pure-Python install is possible.
setup(
    name="nerfacc",
    version=__version__,
    description="A General NeRF Acceleration Toolbox",
    author="Ruilong",
    author_email="ruilongli94@gmail.com",
    url=URL,
    download_url=f"{URL}/archive/{__version__}.tar.gz",
    keywords=[],
    python_requires=">=3.7",
    install_requires=["rich>=12", "torch"],
    extras_require={
        # dev dependencies. Install them by `pip install nerfacc[dev]`
        "dev": [
            "black[jupyter]==22.3.0",
            "isort==5.10.1",
            "pylint==2.13.4",
            "pytest==7.1.2",
            "pytest-xdist==2.5.0",
            "typeguard>=2.13.3",
            "pyyaml==6.0",
            "build",
            "twine",
        ],
    },
    ext_modules=get_extensions() if not BUILD_NO_CUDA else [],
    cmdclass={"build_ext": get_ext()} if not BUILD_NO_CUDA else {},
    packages=find_packages(),
    include_package_data=include_package_data,
)


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/tests/test_contraction.py
================================================
import pytest
import torch

import nerfacc.cuda as _C
from nerfacc import ContractionType, contract, contract_inv

# Device on which all test tensors are allocated.
device = "cuda:0"
# Number of points generated per test.
batch_size = 32
# Absolute tolerance for the allclose comparisons.
eps = 1e-6


# `is_available` must be *called*: the bare function object is always truthy,
# which made the skip condition always False.
@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device")
def test_ContractionType():
    """Each Python ContractionType converts to the matching C++ enum value."""
    ctype = ContractionType.AABB.to_cpp_version()
    assert ctype == _C.ContractionTypeGetter(0)
    ctype = ContractionType.UN_BOUNDED_TANH.to_cpp_version()
    assert ctype == _C.ContractionTypeGetter(1)
    ctype = ContractionType.UN_BOUNDED_SPHERE.to_cpp_version()
    assert ctype == _C.ContractionTypeGetter(2)


# `is_available` must be *called*: the bare function object is always truthy,
# which made the skip condition always False.
@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device")
def test_identity():
    """AABB contraction with the unit-cube roi leaves points unchanged."""
    x = torch.rand([batch_size, 3], device=device)
    roi = torch.tensor([0, 0, 0, 1, 1, 1], dtype=torch.float32, device=device)
    x_out = contract(x, roi=roi, type=ContractionType.AABB)
    assert torch.allclose(x_out, x, atol=eps)
    # The inverse must map the contracted points back to the input.
    x_inv = contract_inv(x_out, roi=roi, type=ContractionType.AABB)
    assert torch.allclose(x_inv, x, atol=eps)


# `is_available` must be *called*: the bare function object is always truthy,
# which made the skip condition always False.
@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device")
def test_aabb():
    """AABB contraction with roi [-1, 1]^3 maps x to x * 0.5 + 0.5."""
    x = torch.rand([batch_size, 3], device=device)
    roi = torch.tensor(
        [-1, -1, -1, 1, 1, 1], dtype=torch.float32, device=device
    )
    x_out = contract(x, roi=roi, type=ContractionType.AABB)
    x_out_tgt = x * 0.5 + 0.5
    assert torch.allclose(x_out, x_out_tgt, atol=eps)
    # The inverse must map the contracted points back to the input.
    x_inv = contract_inv(x_out, roi=roi, type=ContractionType.AABB)
    assert torch.allclose(x_inv, x, atol=eps)


# `is_available` must be *called*: the bare function object is always truthy,
# which made the skip condition always False.
@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device")
def test_tanh():
    """UN_BOUNDED_TANH contraction matches the closed-form tanh mapping."""
    x = torch.randn([batch_size, 3], device=device)
    roi = torch.tensor(
        [-0.2, -0.3, -0.4, 0.7, 0.8, 0.6], dtype=torch.float32, device=device
    )
    x_out = contract(x, roi=roi, type=ContractionType.UN_BOUNDED_TANH)
    # Reference: normalise by the roi, centre at 0, squash with tanh and
    # rescale to [0, 1].
    x_out_tgt = (
        torch.tanh((x - roi[:3]) / (roi[3:] - roi[:3]) - 0.5) * 0.5 + 0.5
    )
    assert torch.allclose(x_out, x_out_tgt, atol=eps)
    # The inverse must map the contracted points back to the input.
    x_inv = contract_inv(x_out, roi=roi, type=ContractionType.UN_BOUNDED_TANH)
    assert torch.allclose(x_inv, x, atol=eps)


# `is_available` must be *called*: the bare function object is always truthy,
# which made the skip condition always False.
@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device")
def test_sphere():
    """UN_BOUNDED_SPHERE contraction lands inside the radius-0.5 ball at 0.5."""
    x = torch.randn([batch_size, 3], device=device)
    roi = torch.tensor(
        [-0.2, -0.3, -0.4, 0.7, 0.8, 0.6], dtype=torch.float32, device=device
    )
    x_out = contract(x, roi=roi, type=ContractionType.UN_BOUNDED_SPHERE)
    assert ((x_out - 0.5).norm(dim=-1) < 0.5).all()
    # The inverse must map the contracted points back to the input.
    x_inv = contract_inv(x_out, roi=roi, type=ContractionType.UN_BOUNDED_SPHERE)
    assert torch.allclose(x_inv, x, atol=eps)


if __name__ == "__main__":
    # Allow running the tests directly as a script, without the pytest runner.
    test_ContractionType()
    test_identity()
    test_aabb()
    test_tanh()
    test_sphere()


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/tests/test_grid.py
================================================
import pytest
import torch

from nerfacc import OccupancyGrid

# Device on which the grid and all test tensors are placed.
device = "cuda:0"


# Helper, not a test: a skipif mark has no effect here, so none is applied.
def occ_eval_fn(x: torch.Tensor) -> torch.Tensor:
    """Pseudo occupancy function: (N, 3) -> (N, 1).

    Returns 1.0 for points strictly inside the ball of radius 0.5 centred at
    (0.5, 0.5, 0.5) and 0.0 elsewhere.
    """
    return ((x - 0.5).norm(dim=-1, keepdim=True) < 0.5).float()


# `is_available` must be *called*: the bare function object is always truthy,
# which made the skip condition always False.
@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device")
def test_occ_grid():
    """An updated OccupancyGrid exposes a (6,) roi and a 128^3 binary grid."""
    roi_aabb = [0, 0, 0, 1, 1, 1]
    occ_grid = OccupancyGrid(roi_aabb=roi_aabb, resolution=128).to(device)
    occ_grid.every_n_step(0, occ_eval_fn, occ_thre=0.1)
    assert occ_grid.roi_aabb.shape == (6,)
    assert occ_grid.binary.shape == (128, 128, 128)


# `is_available` must be *called*: the bare function object is always truthy,
# which made the skip condition always False.
@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device")
def test_query_grid():
    """Querying the grid with (100, 3) samples returns a (100,) result."""
    roi_aabb = [0, 0, 0, 1, 1, 1]
    occ_grid = OccupancyGrid(roi_aabb=roi_aabb, resolution=128).to(device)
    occ_grid.every_n_step(0, occ_eval_fn, occ_thre=0.1)
    samples = torch.rand((100, 3), device=device)
    occs = occ_grid.query_occ(samples)
    assert occs.shape == (100,)


if __name__ == "__main__":
    # Allow running the tests directly as a script, without the pytest runner.
    test_occ_grid()
    test_query_grid()


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/tests/test_intersection.py
================================================
import pytest
import torch

from nerfacc import ray_aabb_intersect

# Device on which all test tensors are allocated.
device = "cuda:0"
# Number of rays generated by the test.
batch_size = 32
# NOTE: `eps` is not referenced by the test in this file.
eps = 1e-6


# `is_available` must be *called*: the bare function object is always truthy,
# which made the skip condition always False.
@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device")
def test_intersection():
    """Rays starting inside the unit cube enter at t=0 and stay inside up to t_max."""
    rays_o = torch.rand([batch_size, 3], device=device)
    rays_d = torch.randn([batch_size, 3], device=device)
    aabb = torch.tensor([0, 0, 0, 1, 1, 1], dtype=torch.float32, device=device)
    t_min, t_max = ray_aabb_intersect(rays_o, rays_d, aabb)
    # Origins are drawn from [0, 1)^3, i.e. already inside the box.
    assert (t_min == 0).all()
    # Any point sampled between t_min and t_max must lie within the box.
    t = torch.rand_like(t_min) * (t_max - t_min) + t_min
    x = rays_o + t.unsqueeze(-1) * rays_d
    assert (x >= 0).all() and (x <= 1).all()


if __name__ == "__main__":
    # Allow running the test directly as a script, without the pytest runner.
    test_intersection()


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/tests/test_loss.py
================================================
import pytest
import torch

from nerfacc import pack_info, ray_marching
from nerfacc.losses import distortion

# Device on which all test tensors are allocated.
device = "cuda:0"
# Number of rays generated by the test.
batch_size = 32
# NOTE: `eps` is not referenced by the test in this file.
eps = 1e-6


# `is_available` must be *called*: the bare function object is always truthy,
# which made the skip condition always False.
@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device")
def test_distortion():
    """The distortion loss yields one value per ray."""
    rays_o = torch.rand((batch_size, 3), device=device)
    rays_d = torch.randn((batch_size, 3), device=device)
    # Normalise the ray directions to unit length.
    rays_d = rays_d / rays_d.norm(dim=-1, keepdim=True)

    ray_indices, t_starts, t_ends = ray_marching(
        rays_o,
        rays_d,
        near_plane=0.1,
        far_plane=1.0,
        render_step_size=1e-3,
    )
    packed_info = pack_info(ray_indices, n_rays=batch_size)
    # Random per-sample weights; only the output shape is checked.
    weights = torch.rand((t_starts.shape[0],), device=device)
    loss = distortion(packed_info, weights, t_starts, t_ends)
    assert loss.shape == (batch_size,)


if __name__ == "__main__":
    # Allow running the test directly as a script, without the pytest runner.
    test_distortion()


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/tests/test_pack.py
================================================
import pytest
import torch

from nerfacc import pack_data, pack_info, unpack_data, unpack_info

# Device on which all test tensors are allocated.
device = "cuda:0"
# NOTE: `batch_size` and `eps` are not referenced by the tests in this file.
batch_size = 32
eps = 1e-6


# `is_available` must be *called*: the bare function object is always truthy,
# which made the skip condition always False.
@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device")
def test_pack_data():
    """pack_data followed by unpack_data keeps the masked entries and their grads."""
    n_rays = 2
    n_samples = 3
    data = torch.rand((n_rays, n_samples, 2), device=device, requires_grad=True)
    mask = torch.rand((n_rays, n_samples), device=device) > 0.5
    packed_data, packed_info = pack_data(data, mask)
    unpacked_data = unpack_data(packed_info, packed_data, n_samples)
    unpacked_data.sum().backward()
    # Every selected entry contributes exactly once to the sum.
    assert (data.grad[mask] == 1).all()
    # Per-ray sums are compared, so the position of entries within a ray is
    # not checked.
    assert torch.allclose(
        unpacked_data.sum(dim=1), (data * mask[..., None]).sum(dim=1)
    )


# `is_available` must be *called*: the bare function object is always truthy,
# which made the skip condition always False.
@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device")
def test_unpack_info():
    """unpack_info and pack_info are inverses on a hand-written example."""
    packed_info = torch.tensor(
        [[0, 1], [1, 0], [1, 4]], dtype=torch.int32, device=device
    )
    # Expected per-sample ray ids: one sample on ray 0, none on ray 1 and
    # four on ray 2.
    ray_indices_tgt = torch.tensor(
        [0, 2, 2, 2, 2], dtype=torch.int64, device=device
    )
    ray_indices = unpack_info(packed_info, n_samples=5)
    packed_info_2 = pack_info(ray_indices, n_rays=packed_info.shape[0])
    assert torch.allclose(packed_info.int(), packed_info_2.int())
    assert torch.allclose(ray_indices, ray_indices_tgt)


if __name__ == "__main__":
    # Allow running the tests directly as a script, without the pytest runner.
    test_pack_data()
    test_unpack_info()


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/tests/test_ray_marching.py
================================================
import pytest
import torch

from nerfacc import OccupancyGrid, ray_marching, unpack_info

# Device on which all test tensors are allocated.
device = "cuda:0"
# Number of rays generated per test.
batch_size = 128


# `is_available` must be *called*: the bare function object is always truthy,
# which made the skip condition always False.
@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device")
def test_marching_with_near_far():
    """Smoke test: ray_marching with only near/far planes runs without error."""
    rays_o = torch.rand((batch_size, 3), device=device)
    rays_d = torch.randn((batch_size, 3), device=device)
    # Normalise the ray directions to unit length.
    rays_d = rays_d / rays_d.norm(dim=-1, keepdim=True)

    # The outputs are intentionally not inspected.
    ray_indices, t_starts, t_ends = ray_marching(
        rays_o,
        rays_d,
        near_plane=0.1,
        far_plane=1.0,
        render_step_size=1e-3,
    )


# `is_available` must be *called*: the bare function object is always truthy,
# which made the skip condition always False.
@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device")
def test_marching_with_grid():
    """Samples marched through a fully occupied grid stay inside its roi."""
    rays_o = torch.rand((batch_size, 3), device=device)
    rays_d = torch.randn((batch_size, 3), device=device)
    # Normalise the ray directions to unit length.
    rays_d = rays_d / rays_d.norm(dim=-1, keepdim=True)
    grid = OccupancyGrid(roi_aabb=[0, 0, 0, 1, 1, 1]).to(device)
    # Mark every cell as occupied.
    grid._binary[:] = True

    ray_indices, t_starts, t_ends = ray_marching(
        rays_o,
        rays_d,
        grid=grid,
        near_plane=0.0,
        far_plane=1.0,
        render_step_size=1e-2,
    )
    # Midpoint of each marched interval, in world coordinates.
    samples = (
        rays_o[ray_indices] + rays_d[ray_indices] * (t_starts + t_ends) / 2.0
    )
    assert (samples <= grid.roi_aabb[3:].unsqueeze(0)).all()
    assert (samples >= grid.roi_aabb[:3].unsqueeze(0)).all()


if __name__ == "__main__":
    # Allow running the tests directly as a script, without the pytest runner.
    test_marching_with_near_far()
    test_marching_with_grid()


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/tests/test_rendering.py
================================================
import pytest
import torch

from nerfacc import (
    accumulate_along_rays,
    render_transmittance_from_density,
    render_visibility,
    render_weight_from_alpha,
    render_weight_from_density,
    rendering,
)

# Device on which most test tensors are allocated.
device = "cuda:0"
# NOTE(review): `batch_size` and `eps` do not appear to be referenced by the
# tests visible in this file - confirm before removing.
batch_size = 32
eps = 1e-6


# `is_available` must be *called*: the bare function object is always truthy,
# which made the skip condition always False.
@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device")
def test_render_visibility():
    """render_visibility applies early_stop_eps and alpha_thre as expected."""
    ray_indices = torch.tensor(
        [0, 2, 2, 2, 2], dtype=torch.int64, device=device
    )  # (n_samples,)
    alphas = torch.tensor(
        [0.4, 0.3, 0.8, 0.8, 0.5], dtype=torch.float32, device=device
    ).unsqueeze(
        -1
    )  # (n_samples, 1)

    # transmittance: [1.0, 1.0, 0.7, 0.14, 0.028]
    vis = render_visibility(
        alphas, ray_indices=ray_indices, early_stop_eps=0.03, alpha_thre=0.0
    )
    vis_tgt = torch.tensor(
        [True, True, True, True, False], dtype=torch.bool, device=device
    )
    assert torch.allclose(vis, vis_tgt)

    # transmittance: [1.0, 1.0, 1.0, 0.2, 0.04]
    vis = render_visibility(
        alphas, ray_indices=ray_indices, early_stop_eps=0.05, alpha_thre=0.35
    )
    vis_tgt = torch.tensor(
        [True, False, True, True, False], dtype=torch.bool, device=device
    )
    assert torch.allclose(vis, vis_tgt)


# `is_available` must be *called*: the bare function object is always truthy,
# which made the skip condition always False.
@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device")
def test_render_weight_from_alpha():
    """Weights equal transmittance * alpha on a hand-computed example."""
    ray_indices = torch.tensor(
        [0, 2, 2, 2, 2], dtype=torch.int64, device=device
    )  # (n_samples,)
    alphas = torch.tensor(
        [0.4, 0.3, 0.8, 0.8, 0.5], dtype=torch.float32, device=device
    ).unsqueeze(
        -1
    )  # (n_samples, 1)

    # transmittance: [1.0, 1.0, 0.7, 0.14, 0.028]
    weights = render_weight_from_alpha(
        alphas, ray_indices=ray_indices, n_rays=3
    )
    weights_tgt = torch.tensor(
        [1.0 * 0.4, 1.0 * 0.3, 0.7 * 0.8, 0.14 * 0.8, 0.028 * 0.5],
        dtype=torch.float32,
        device=device,
    ).unsqueeze(-1)
    assert torch.allclose(weights, weights_tgt)


# `is_available` must be *called*: the bare function object is always truthy,
# which made the skip condition always False.
@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device")
def test_render_weight_from_density():
    """Density-based weights match alpha-based weights for equivalent alphas."""
    ray_indices = torch.tensor(
        [0, 2, 2, 2, 2], dtype=torch.int64, device=device
    )  # (n_samples,)
    sigmas = torch.rand(
        (ray_indices.shape[0], 1), device=device
    )  # (n_samples, 1)
    t_starts = torch.rand_like(sigmas)
    # Adding 1.0 guarantees t_ends > t_starts.
    t_ends = torch.rand_like(sigmas) + 1.0
    # Opacity implied by each sample's density and interval length.
    alphas = 1.0 - torch.exp(-sigmas * (t_ends - t_starts))

    weights = render_weight_from_density(
        t_starts, t_ends, sigmas, ray_indices=ray_indices, n_rays=3
    )
    weights_tgt = render_weight_from_alpha(
        alphas, ray_indices=ray_indices, n_rays=3
    )
    assert torch.allclose(weights, weights_tgt)


# `is_available` must be *called*: the bare function object is always truthy,
# which made the skip condition always False.
@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device")
def test_accumulate_along_rays():
    """Per-ray accumulation is the weighted sum of that ray's sample values."""
    ray_indices = torch.tensor(
        [0, 2, 2, 2, 2], dtype=torch.int64, device=device
    )  # (n_samples,)
    weights = torch.tensor(
        [0.4, 0.3, 0.8, 0.8, 0.5], dtype=torch.float32, device=device
    ).unsqueeze(-1)
    values = torch.rand((5, 2), device=device)  # (n_samples, 2)

    ray_values = accumulate_along_rays(
        weights, ray_indices, values=values, n_rays=3
    )
    assert ray_values.shape == (3, 2)
    # Ray 0 owns only the first sample.
    assert torch.allclose(ray_values[0, :], weights[0, :] * values[0, :])
    # Ray 1 has no samples.
    assert (ray_values[1, :] == 0).all()
    # Ray 2 owns the remaining four samples.
    assert torch.allclose(
        ray_values[2, :], (weights[1:, :] * values[1:]).sum(dim=0)
    )


# `is_available` must be *called*: the bare function object is always truthy,
# which made the skip condition always False.
@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device")
def test_rendering():
    """Smoke test: rendering() runs with a toy rgb/sigma function."""

    def rgb_sigma_fn(t_starts, t_ends, ray_indices):
        # Toy radiance field: rgb is t_starts repeated three times, sigma is
        # t_starts itself.
        return torch.hstack([t_starts] * 3), t_starts

    ray_indices = torch.tensor(
        [0, 2, 2, 2, 2], dtype=torch.int64, device=device
    )  # (n_samples,)
    sigmas = torch.rand(
        (ray_indices.shape[0], 1), device=device
    )  # (n_samples, 1)
    t_starts = torch.rand_like(sigmas)
    t_ends = torch.rand_like(sigmas) + 1.0

    # Only checks that three outputs are returned; their values are ignored.
    _, _, _ = rendering(
        t_starts,
        t_ends,
        ray_indices=ray_indices,
        n_rays=3,
        rgb_sigma_fn=rgb_sigma_fn,
    )


@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device")
def test_grads():
    """Check weights and density gradients against fixed reference values.

    The same five samples are rendered through six code paths (transmittance
    or weights, from density or alpha, addressed by ``ray_indices`` or by
    ``packed_info``). Every path must reproduce the reference weights and
    the reference gradient of ``weights.sum()`` w.r.t. the densities.
    """
    ray_indices = torch.tensor(
        [0, 2, 2, 2, 2], dtype=torch.int64, device=device
    )  # (n_samples,)
    # Packed description of the same ray layout.
    # NOTE(review): rows look like (first sample index, sample count) per
    # ray -- confirm against the packed_info definition.
    packed_info = torch.tensor(
        [[0, 1], [1, 0], [1, 4]], dtype=torch.int32, device=device
    )
    sigmas = torch.tensor(
        [[0.4], [0.8], [0.1], [0.8], [0.1]], device=device
    )
    sigmas.requires_grad = True
    t_starts = torch.rand_like(sigmas)
    # Every interval has unit length, so the references do not depend on
    # the random starts.
    t_ends = t_starts + 1.0

    weights_ref = torch.tensor(
        [[0.3297], [0.5507], [0.0428], [0.2239], [0.0174]], device=device
    )
    sigmas_grad_ref = torch.tensor(
        [[0.6703], [0.1653], [0.1653], [0.1653], [0.1653]], device=device
    )

    def compute_alphas():
        # Rebuilt for every case: backward() frees the autograd graph.
        return 1.0 - torch.exp(-sigmas * (t_ends - t_starts))

    def check(weights):
        # Back-propagate, snapshot the gradient, then reset it so the next
        # case starts from a zero gradient.
        weights.sum().backward()
        sigmas_grad = sigmas.grad.clone()
        sigmas.grad.zero_()
        assert torch.allclose(weights_ref, weights, atol=1e-4)
        assert torch.allclose(sigmas_grad_ref, sigmas_grad, atol=1e-4)

    # naive impl.: transmittance from density, addressed by ray_indices
    trans = render_transmittance_from_density(
        t_starts, t_ends, sigmas, ray_indices=ray_indices, n_rays=3
    )
    check(trans * compute_alphas())

    # naive impl.: transmittance from density, addressed by packed_info
    trans = render_transmittance_from_density(
        t_starts, t_ends, sigmas, packed_info=packed_info, n_rays=3
    )
    check(trans * compute_alphas())

    # weights from density, addressed by ray_indices
    check(
        render_weight_from_density(
            t_starts, t_ends, sigmas, ray_indices=ray_indices, n_rays=3
        )
    )

    # weights from density, addressed by packed_info
    check(
        render_weight_from_density(
            t_starts, t_ends, sigmas, packed_info=packed_info, n_rays=3
        )
    )

    # weights from alpha, addressed by ray_indices
    check(
        render_weight_from_alpha(
            compute_alphas(), ray_indices=ray_indices, n_rays=3
        )
    )

    # weights from alpha, addressed by packed_info
    check(
        render_weight_from_alpha(
            compute_alphas(), packed_info=packed_info, n_rays=3
        )
    )


if __name__ == "__main__":
    # Run every test of this module in order when executed as a script,
    # bypassing the pytest runner.
    for _run_test in (
        test_render_visibility,
        test_render_weight_from_alpha,
        test_render_weight_from_density,
        test_accumulate_along_rays,
        test_rendering,
        test_grads,
    ):
        _run_test()


================================================
FILE: third_parties/nerfacc-0.3.5/nerfacc-0.3.5/tests/test_resampling.py
================================================
import pytest
import torch

from nerfacc import pack_info, ray_marching, ray_resampling

# Device on which the tensors of this test module are allocated.
device = "cuda:0"
# Number of rays generated for the resampling test.
batch_size = 128


@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device")
def test_resampling():
    """Check that resampling yields a fixed number of samples per ray.

    Random rays are marched between the near and far planes, random weights
    are attached to the resulting samples, and the samples are resampled to
    32 per ray. The output intervals must have ``batch_size * 32`` rows.
    """
    rays_o = torch.rand((batch_size, 3), device=device)
    rays_d = torch.randn((batch_size, 3), device=device)
    # Normalise the directions to unit length.
    rays_d = rays_d / rays_d.norm(dim=-1, keepdim=True)

    ray_indices, t_starts, t_ends = ray_marching(
        rays_o,
        rays_d,
        near_plane=0.1,
        far_plane=1.0,
        render_step_size=1e-3,
    )
    packed_info = pack_info(ray_indices, n_rays=batch_size)
    weights = torch.rand((t_starts.shape[0],), device=device)
    packed_info, t_starts, t_ends = ray_resampling(
        packed_info, t_starts, t_ends, weights, n_samples=32
    )
    assert t_starts.shape == t_ends.shape == (batch_size * 32, 1)


if __name__ == "__main__":
    # Allow running the test directly as a script, without pytest.
    test_resampling()


================================================
FILE: utilities/utils.py
================================================
import numpy as np
import cv2
from PIL import Image, ImageChops
import os
import time
import torch
from PIL import Image, ImageDraw, ImageFont

# Timestamp of module import (local time), formatted so that it can be used
# inside file and directory names.
exp_time = time.strftime('%Y_%m_%d_%H_%M_%S', time.localtime())
# Prefer the GPU whenever PyTorch can see one.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

def crop_a_set_of_images(*image_path):
    """Crop several image files, in place, to one shared bounding box.

    For every file, the content box is found by comparing the image with a
    solid canvas filled with the colour of its top-left pixel. The union of
    all content boxes is then applied to every image and each file is
    overwritten with its cropped version.

    Args:
        *image_path: Paths of the image files to crop. Every file is
            overwritten.

    Returns:
        None. When no path is given, or when no image contains anything
        that differs from its background, no file is modified.
    """
    from PIL import ImageChops, Image

    if not image_path:
        return

    imgs = []
    bboxes = []
    for im_path in image_path:
        im = Image.open(im_path)
        # The top-left pixel is taken as the background colour.
        bg = Image.new(im.mode, im.size, im.getpixel((0, 0)))
        diff = ImageChops.difference(im, bg)
        # (diff + diff) / 2.0 - 5: drops differences of 5 levels or less so
        # that faint noise does not enlarge the box.
        diff = ImageChops.add(diff, diff, 2.0, -5)
        bbox = diff.getbbox()

        imgs.append(im)
        # getbbox() returns None for an image that is entirely background;
        # such an image contributes nothing to the shared box.
        if bbox is not None:
            bboxes.append(bbox)

    if not bboxes:
        return

    bbox_aggre = np.asarray(bboxes)
    bbox_min = np.min(bbox_aggre, 0)
    bbox_max = np.max(bbox_aggre, 0)
    # Union of the boxes: smallest left/upper, largest right/lower. Converted
    # to plain ints before being handed to PIL.
    bbox_common = (
        int(bbox_min[0]),
        int(bbox_min[1]),
        int(bbox_max[2]),
        int(bbox_max[3]),
    )
    for im_path, img in zip(image_path, imgs):
        img.crop(bbox_common).save(im_path)


def crop_image_based_on_ref_image(ref_img_path, *img_path):
    """Crop image files, in place, to the content box of a reference image.

    The box is computed on the reference image only, by comparing it with a
    solid canvas filled with the colour of its top-left pixel. The result of
    ``getbbox()`` is passed unchanged to ``crop`` for every target image,
    and each target file is overwritten.

    Args:
        ref_img_path: Path of the image that defines the crop box. This file
            itself is not written.
        *img_path: Paths of the image files to crop and overwrite.
    """
    from PIL import ImageChops, Image

    reference = Image.open(ref_img_path)
    # The top-left pixel is taken as the background colour.
    corner_colour = reference.getpixel((0, 0))
    background = Image.new(reference.mode, reference.size, corner_colour)
    delta = ImageChops.difference(reference, background)
    # (delta + delta) / 2.0 - 5: drops differences of 5 levels or less.
    bbox = ImageChops.add(delta, delta, 2.0, -5).getbbox()

    for target_path in img_path:
        target = Image.open(target_path)
        target = target.crop(bbox)
        target.save(target_path)


def angular_error_map(N1, N2):
    """Return the angle, in degrees, between vectors along the last axis.

    The dot product over the last axis is used directly as the cosine of
    the angle, so both inputs are expected to hold unit-length vectors. The
    cosine is clamped to [-1, 1] before ``arccos`` is applied.

    Args:
        N1: Array-like of vectors stored along the last axis.
        N2: Array-like broadcastable against ``N1``.

    Returns:
        Angles in degrees, with the last axis of the inputs reduced away.
    """
    cos_angle = np.clip(np.sum(np.multiply(N1, N2), axis=-1), -1., 1.)
    angle_rad = np.arccos(cos_angle)
    return np.rad2deg(angle_rad)


def crop_mask(mask):
    """Return the bounding box of the mask area that differs from its corner.

    The mask is converted to a PIL image and compared with a solid canvas
    filled with the value of its top-left pixel; the bounding box of the
    differing pixels is returned.

    Args:
        mask: NumPy array. Arrays that are not already ``uint8`` are cast to
            ``uint8`` and scaled by 255.

    Returns:
        The ``(left, upper, right, lower)`` tuple produced by PIL's
        ``getbbox()``, or ``None`` when every pixel equals the top-left one.
    """
    # Compare dtypes by value: an identity test against the scalar type
    # ``np.uint8`` never matches a dtype instance.
    if mask.dtype != np.uint8:
        mask = mask.astype(np.uint8) * 255
    im = Image.fromarray(mask)
    # The top-left pixel is taken as the background value.
    bg = Image.new(im.mode, im.size, im.getpixel((0, 0)))
    diff = ImageChops.difference(im, bg)
    # (diff + diff) / 2.0 + 0 leaves the difference unchanged, so any
    # non-zero difference counts as content.
    diff = ImageChops.add(diff, diff, 2.0, 0)
    bbox = diff.getbbox()
    return bbox


def crop_image_by_mask(img, mask):
    """Crop ``img`` to the bounding box of ``mask`` as found by ``crop_mask``.

    Args:
        img: Image indexed as ``img[row, column, ...]``.
        mask: Mask handed to ``crop_mask`` to obtain the crop box.

    Returns:
        A copy of the cropped region. When the mask yields no bounding box,
        or when ``img`` cannot be sliced in two dimensions, an uncropped
        copy of ``img`` is returned instead.
    """
    bbox = crop_mask(mask)
    if bbox is None:
        # The mask is uniform, so there is nothing to crop to.
        return img.copy()

    left, upper, right, lower = bbox
    try:
        # Slice first so that only the cropped region is copied.
        return img[upper:lower, left:right].copy()
    except (TypeError, IndexError):
        # Best effort: fall back to the whole image when it does not support
        # two-dimensional slicing.
        return img.copy()


def save_video(vpath, images, fps):
    """Write a sequence of frames to a video file with the 'mp4v' codec.

    Args:
        vpath: Output path of the video file.
        images: Non-empty sequence of frames. The first frame must be a
            three-dimensional array; its height and width define the frame
            size of the video.
        fps: Frame rate passed to the video writer.
    """
    height, width, _ = images[0].shape
    fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
    # OpenCV expects the frame size as (width, height).
    video = cv2.VideoWriter(vpath, fourcc, fps, (width, height))
    try:
        for image in images:
            video.write(image)
    finally:
        # Always release the writer so the file is finalised and the handle
        # is freed, even when writing a frame fails.
        video.release()


def toRGBA(img, mask):
    """Convert an RGB image to RGBA, using ``mask`` as the alpha channel.

    Args:
        img: RGB image accepted by ``cv2.cvtColor``.
        mask: Array matching the image's height and width. Non-zero entries
            become alpha 255, zero entries become alpha 0.

    Returns:
        The four-channel image.
    """
    rgba = cv2.cvtColor(img, cv2.COLOR_RGB2RGBA)
    opaque = mask.astype(bool)
    rgba[:, :, 3] = np.where(opaque, 255, 0).astype(np.uint8)
    return rgba