Repository: DecaYale/RNNPose
Branch: main
Commit: ff223f3cb6bf
Files: 126
Total size: 12.7 MB

Directory structure:
gitextract_p7a82okx/

├── .gitignore
├── LICENSE.md
├── README.md
├── builder/
│   ├── __init__.py
│   ├── dataset_builder.py
│   ├── input_reader_builder.py
│   ├── losses_builder.py
│   ├── lr_scheduler_builder.py
│   ├── optimizer_builder.py
│   └── rnnpose_builder.py
├── config/
│   ├── default.py
│   └── linemod/
│       ├── copy.sh
│       ├── copy_occ.sh
│       ├── template_fw0.5.yml
│       └── template_fw0.5_occ.yml
├── data/
│   ├── __init__.py
│   ├── dataset.py
│   ├── linemod/
│   │   └── linemod_config.py
│   ├── linemod_dataset.py
│   ├── preprocess.py
│   ├── transforms.py
│   └── ycb/
│       └── basic.py
├── doc/
│   └── prepare_data.md
├── docker/
│   ├── Dockerfile
│   └── freeze.yml
├── geometry/
│   ├── __init__.py
│   ├── cholesky.py
│   ├── diff_render.py
│   ├── diff_render_optim.py
│   ├── einsum.py
│   ├── intrinsics.py
│   ├── projective_ops.py
│   ├── se3.py
│   └── transformation.py
├── model/
│   ├── CFNet.py
│   ├── HybridNet.py
│   ├── PoseRefiner.py
│   ├── RNNPose.py
│   ├── descriptor2D.py
│   ├── descriptor3D.py
│   └── losses.py
├── scripts/
│   ├── compile_3rdparty.sh
│   ├── eval.sh
│   ├── eval_lmocc.sh
│   ├── run_dataformatter.sh
│   ├── run_datainfo_generation.sh
│   └── train.sh
├── thirdparty/
│   ├── kpconv/
│   │   ├── __init__.py
│   │   ├── cpp_wrappers/
│   │   │   ├── compile_wrappers.sh
│   │   │   ├── cpp_neighbors/
│   │   │   │   ├── build.bat
│   │   │   │   ├── neighbors/
│   │   │   │   │   ├── neighbors.cpp
│   │   │   │   │   └── neighbors.h
│   │   │   │   ├── setup.py
│   │   │   │   └── wrapper.cpp
│   │   │   ├── cpp_subsampling/
│   │   │   │   ├── build.bat
│   │   │   │   ├── grid_subsampling/
│   │   │   │   │   ├── grid_subsampling.cpp
│   │   │   │   │   └── grid_subsampling.h
│   │   │   │   ├── setup.py
│   │   │   │   └── wrapper.cpp
│   │   │   └── cpp_utils/
│   │   │       ├── cloud/
│   │   │       │   ├── cloud.cpp
│   │   │       │   └── cloud.h
│   │   │       └── nanoflann/
│   │   │           └── nanoflann.hpp
│   │   ├── kernels/
│   │   │   ├── dispositions/
│   │   │   │   └── k_015_center_3D.ply
│   │   │   └── kernel_points.py
│   │   ├── kpconv_blocks.py
│   │   └── lib/
│   │       ├── __init__.py
│   │       ├── ply.py
│   │       ├── timer.py
│   │       └── utils.py
│   ├── nn/
│   │   ├── _ext.c
│   │   ├── nn_utils.py
│   │   ├── setup.py
│   │   └── src/
│   │       ├── ext.h
│   │       └── nearest_neighborhood.cu
│   ├── raft/
│   │   ├── corr.py
│   │   ├── extractor.py
│   │   ├── update.py
│   │   └── utils/
│   │       ├── __init__.py
│   │       ├── augmentor.py
│   │       ├── flow_viz.py
│   │       ├── frame_utils.py
│   │       └── utils.py
│   └── vsd/
│       └── inout.py
├── tools/
│   ├── eval.py
│   ├── generate_data_info_deepim_0_orig.py
│   ├── generate_data_info_deepim_1_syn.py
│   ├── generate_data_info_deepim_2_posecnnval.py
│   ├── generate_data_info_v2_deepim.py
│   ├── train.py
│   └── transform_data_format.py
├── torchplus/
│   ├── __init__.py
│   ├── metrics.py
│   ├── nn/
│   │   ├── __init__.py
│   │   ├── functional.py
│   │   └── modules/
│   │       ├── __init__.py
│   │       ├── common.py
│   │       └── normalization.py
│   ├── ops/
│   │   ├── __init__.py
│   │   └── array_ops.py
│   ├── tools.py
│   └── train/
│       ├── __init__.py
│       ├── checkpoint.py
│       ├── common.py
│       ├── fastai_optim.py
│       ├── learning_schedules.py
│       ├── learning_schedules_fastai.py
│       └── optim.py
├── utils/
│   ├── __init__.py
│   ├── config_io.py
│   ├── distributed_utils.py
│   ├── eval_metric.py
│   ├── furthest_point_sample.py
│   ├── geometric.py
│   ├── img_utils.py
│   ├── log_tool.py
│   ├── pose_utils.py
│   ├── pose_utils_np.py
│   ├── progress_bar.py
│   ├── rand_utils.py
│   ├── singleton.py
│   ├── timer.py
│   ├── util.py
│   └── visualize.py
└── weights/
    ├── gru_update.pth
    ├── img_fea_enc.pth
    └── superpoint_v1.pth

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
**/*.old
**/*.bak

.DS_Store
# Created by https://www.toptal.com/developers/gitignore/api/vscode,python,jupyternotebooks
# Edit at https://www.toptal.com/developers/gitignore?templates=vscode,python,jupyternotebooks

### JupyterNotebooks ###
# gitignore template for Jupyter Notebooks
# website: http://jupyter.org/

.ipynb_checkpoints
*/.ipynb_checkpoints/*

# IPython
profile_default/
ipython_config.py

# Remove previous ipynb_checkpoints
#   git rm -r .ipynb_checkpoints/

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
#lib/
#lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
pytestdebug.log

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/
doc/_build/

# PyBuilder
target/

# Jupyter Notebook

# IPython

# pyenv
.python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
pythonenv*

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# profiling data
.prof

### vscode ###
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
*.code-workspace

# End of https://www.toptal.com/developers/gitignore/api/vscode,python,jupyternotebooks


================================================
FILE: LICENSE.md
================================================
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: README.md
================================================
# RNNPose: Recurrent 6-DoF Object Pose Refinement with Robust Correspondence Field Estimation and Pose Optimization

[Yan Xu](https://decayale.github.io/), [Kwan-Yee Lin](https://kwanyeelin.github.io/), [Guofeng Zhang](http://www.cad.zju.edu.cn/home/gfzhang/), [Xiaogang Wang](https://www.ee.cuhk.edu.hk/en-gb/people/academic-staff/professors/prof-xiaogang-wang), [Hongsheng Li](http://www.ee.cuhk.edu.hk/~hsli/). 

*Conference on Computer Vision and Pattern Recognition (CVPR), 2022.*

[[Paper]](https://scholar.google.com/scholar?hl=zh-CN&as_sdt=0%2C5&q=RNNPose%3A+Recurrent+6-DoF+Object+Pose+Refinement+with+Robust+Correspondence+Field+Estimation+and+Pose+Optimization&btnG=)


## 1. Framework 
The basic pipeline of our proposed RNNPose.  (a) Before refinement, a reference image is rendered according to the object initial pose (shown in a fused view).
(b) Our RNN-based framework recurrently refines the object pose based on the estimated correspondence field between the reference and target images. The pose is optimized to be consistent with the reliable correspondence estimations highlighted by the similarity score map (built from learned 3D-2D descriptors) via differentiable LM optimization.  (c) The output refined pose.  

<!-- ![image info](./demo/framework.png) -->
<p align="center">
<img src="./demo/idea.png" alt="alt text" width="450"/>
</p>

## 2. Pose Estimation with Occlusions and Erroneous Pose Initializations


### Estimated Poses and Intermediate System Outputs from Different Recurrent Iterations. 

<p align="center">
 <img src="demo/ape_short_small.gif" alt="animated" height=400/><img src="demo/driller_short_small.gif" alt="animated" height=400/>
</p>


### Pose Estimates with Erroneous Pose Initializations
Visualization of our pose estimations (first row) on Occlusion LINEMOD dataset and the similarity score maps (second row) for downweighting unreliable correspondences during pose optimization. 
For pose visualization, the white boxes represent the erroneous initial poses, the red boxes are estimated by our algorithm and the ground-truth boxes are in blue. Here, the initial poses for pose refinement are originally from PVNet but added with significant disturbances for robustness testing. 
<center class="half">
  <img src="./demo/est_vis.png" height=200 > 
</center>


## 3. Installation 
### Install the Docker 
A dockerfile is provided to help with the environment setup. 
You need to install [docker](https://docs.docker.com/get-docker/) and [nvidia-docker2](https://github.com/NVIDIA/nvidia-docker) first and then set up the docker image and start up a container with the following commands: 

```
cd RNNPose/docker
sudo docker build -t rnnpose .    
sudo docker run  -it  --runtime=nvidia --ipc=host  --volume="HOST_VOLUME_YOU_WANT_TO_MAP:DOCKER_VOLUME"  -e DISPLAY=$DISPLAY -e QT_X11_NO_MITSHM=1  rnnpose bash

```
If you are not familiar with [docker](https://docs.docker.com/get-docker/), you could also install the dependencies manually following the provided dockerfile.  

### Compile the Dependencies
```
cd RNNPose/scripts
bash compile_3rdparty.sh
```


## 4. Data Preparation
We follow [DeepIM](https://github.com/liyi14/mx-DeepIM) and [PVNet](https://github.com/zju3dv/pvnet-rendering) to preprocess the training data for LINEMOD. 
You could follow the steps [here](doc/prepare_data.md) for data preparation. 


## 5. Test with the Pretrained Models
We train our model with the mixture of the real data and the synthetic data on LINEMOD dataset. 
<!-- and evaluate the trained models on the test set of LINEMOD and LINEMOD OCCLUSION datasets.  -->
The trained models on the LINEMOD dataset have been uploaded to the [OneDrive](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155139432_link_cuhk_edu_hk/ESPTVyUryHdGl65fRAxN51gBBayJJb9NpCqWA-tY2CFKJQ?e=R9bcLW). 
You can download them 
and put them into the directory *weight/* for testing. 


An example bash script is provided below for reference. 

```
export PYTHONPATH="$PROJECT_ROOT_PATH:$PYTHONPATH"
export PYTHONPATH="$PROJECT_ROOT_PATH/thirdparty:$PYTHONPATH"

seq=cat
gpu=1
start_gpu_id=0
mkdir $model_dir

train_file=/home/yxu/Projects/Works/RNNPose_release/tools/eval.py
config_path=/mnt/workspace/Works/RNNPose_release/config/linemod/"$seq"_fw0.5.yml
pretrain=$PROJECT_ROOT_PATH/weights/trained_models/"$seq".tckpt

python -u $train_file multi_proc_train  \
        --config_path $config_path \
        --model_dir $model_dir/results \
        --use_dist True \
        --dist_port 10000 \
        --gpus_per_node $gpu \
        --optim_eval True \
        --use_apex True \
        --world_size $gpu \
        --start_gpu_id $start_gpu_id \
        --pretrained_path $pretrain 

```

Note that you need to specify the PROJECT_ROOT_PATH, i.e. the absolute directory of the project folder *RNNPose* and modify the respective data paths in the configuration files to the locations of downloaded data before executing the commands. You could also refer to the commands below for evaluation with our provide scripts.

### Evaluation on LINEMOD
```
bash scripts/eval.sh 
```

### Evaluation on LINEMOD OCCLUSION
```
bash scripts/eval_lmocc.sh

```

## Training from Scratch
An example training script is provided. 
```
bash scripts/train.sh 
```


## 6. Citation
If you find our code useful, please cite our paper. 
```
@inproceedings{xu2022rnnpose,
  title={RNNPose: Recurrent 6-DoF Object Pose Refinement with Robust Correspondence Field Estimation and Pose Optimization},
  author={Xu, Yan and Kwan-Yee Lin and Zhang, Guofeng and Wang, Xiaogang and  Li, Hongsheng},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  year={2022}
}

@article{xu2024rnnpose,
  title={Rnnpose: 6-dof object pose estimation via recurrent correspondence field estimation and pose optimization},
  author={Xu, Yan and Lin, Kwan-Yee and Zhang, Guofeng and Wang, Xiaogang and Li, Hongsheng},
  journal={IEEE Transactions on Pattern Analysis and Machine Intelligence},
  year={2024},
  publisher={IEEE}
```


## 7. Acknowledgement

The skeleton of this code is borrowed from [RSLO](https://github.com/DecaYale/RSLO). We also would like to thank the public codebases [PVNet](https://github.com/zju3dv/pvnet), [RAFT](https://github.com/princeton-vl/RAFT), [SuperGlue](https://github.com/magicleap/SuperGluePretrainedNetwork) and [DeepV2D](https://github.com/princeton-vl/DeepV2D). 

<!-- ## TODO List and ETA
- [x] Inference code and pretrained models (25/12/2021)
- [ ] Training code
- [ ] Code cleaning and improvement -->


================================================
FILE: builder/__init__.py
================================================


================================================
FILE: builder/dataset_builder.py
================================================

from data.dataset import get_dataset_class
import numpy as np
from functools import partial
from data.preprocess import preprocess, preprocess_deepim, patch_crop


def build(input_reader_config,
          training,
          ):

    prep_cfg = input_reader_config.preprocess
    dataset_cfg = input_reader_config.dataset
    cfg = input_reader_config
    
    dataset_cls = get_dataset_class(dataset_cfg.dataset_class_name)

    if 0:# 'DeepIM' in dataset_cfg.dataset_class_name:
        # patch_cropper = partial(patch_crop, margin_ratio=0.2, output_size=256 )
        patch_cropper = None 
        prep_func = partial(preprocess_deepim, 
                        max_points=dataset_cfg.max_points,
                        correspondence_radius=prep_cfg.correspondence_radius_threshold,
                        patch_cropper=patch_cropper,
                        image_scale=prep_cfg.get('image_scale', 1),
        ) 

    else:
        prep_func = partial(preprocess, 
                        max_points=dataset_cfg.max_points,
                        correspondence_radius=prep_cfg.correspondence_radius_threshold,
                        image_scale=prep_cfg.get('image_scale', 1),
                        crop_param=prep_cfg.get('crop_param', None),
                        kp_3d_param=prep_cfg.get('kp_3d_param', {"kp_num":30} ),
                        use_coords_as_3d_feat=prep_cfg.get('use_coords_as_3d_feat', False)
        ) 
   

    dataset = dataset_cls(
        info_path=dataset_cfg.info_path,
        root_path=dataset_cfg.root_path,
        model_point_dim=dataset_cfg.model_point_dim,
        is_train=training,
        prep_func=prep_func,
        seq_names=dataset_cfg.get('seq_names', None),
        cfg=dataset_cfg
    )

    return dataset


================================================
FILE: builder/input_reader_builder.py
================================================

from torch.utils.data import Dataset

from builder import dataset_builder


class DatasetWrapper(Dataset):
    """ convert our dataset to Dataset class in pytorch.
    """

    def __init__(self, dataset):
        self._dataset = dataset

    def __len__(self):
        return len(self._dataset)

    def __getitem__(self, idx):
        return self._dataset[idx]

    @property
    def dataset(self):
        return self._dataset


def build(input_reader_config,
          training,
          ) -> DatasetWrapper:

    dataset = dataset_builder.build(
        input_reader_config,
        training,
    )
    dataset = DatasetWrapper(dataset)
    return dataset


================================================
FILE: builder/losses_builder.py
================================================

from model import losses

def build(loss_config):

    criterions = {}
  
    criterions["metric_loss"] =losses.MetricLoss(configs=loss_config.metric_loss,)

    criterions["pose_loss"] = losses.PointAlignmentLoss(loss_weight=1)
    
    return criterions


================================================
FILE: builder/lr_scheduler_builder.py
================================================

from torchplus.train import learning_schedules_fastai as lsf
import torch
import numpy as np 

def build(optimizer_config, optimizer, total_step):

    optimizer_type = list(optimizer_config.keys())[0]

    if optimizer_type == 'rms_prop_optimizer':
        config = optimizer_config.rms_prop_optimizer
        lr_scheduler = _create_learning_rate_scheduler(
            config.learning_rate, optimizer, total_step=total_step)

    if optimizer_type == 'momentum_optimizer':
        config = optimizer_config.momentum_optimizer
        lr_scheduler = _create_learning_rate_scheduler(
            config.learning_rate, optimizer, total_step=total_step)

    if optimizer_type == 'adam_optimizer':
        config = optimizer_config.adam_optimizer
        lr_scheduler = _create_learning_rate_scheduler(
            config.learning_rate, optimizer, total_step=total_step)

    return lr_scheduler


def _create_learning_rate_scheduler(learning_rate_config, optimizer, total_step):
    """Create optimizer learning rate scheduler based on config.

    Args:
      learning_rate_config: A LearningRate proto message.

    Returns:
      A learning rate.

    Raises:
      ValueError: when using an unsupported input data type.
    """
    lr_scheduler = None
    # learning_rate_type = learning_rate_config.WhichOneof('learning_rate')
    learning_rate_type = list(learning_rate_config.keys())[0]

    if learning_rate_type == 'multi_phase':
        config = learning_rate_config.multi_phase
        lr_phases = []
        mom_phases = []
        for phase_cfg in config.phases:
            lr_phases.append((phase_cfg.start, phase_cfg.lambda_func))
            mom_phases.append(
                (phase_cfg.start, phase_cfg.momentum_lambda_func))
        lr_scheduler = lsf.LRSchedulerStep(
            optimizer, total_step, lr_phases, mom_phases)


    if learning_rate_type == 'one_cycle':
        config = learning_rate_config.one_cycle

        if len(config.lr_maxs)>1:
          assert(len(config.lr_maxs)==4 )    
          lr_max=[]
          # for i in range(len(config.lr_maxs)):
          #   lr_max += [config.lr_maxs[i]]*optimizer.param_segs[i] 

          lr_max = np.array(list(config.lr_maxs) )
        else:
          lr_max = config.lr_max

        lr_scheduler = lsf.OneCycle(
            optimizer, total_step, lr_max, list(config.moms), config.div_factor, config.pct_start)
    if learning_rate_type == 'exponential_decay':
        config = learning_rate_config.exponential_decay
        lr_scheduler = lsf.ExponentialDecay(
            optimizer, total_step, config.initial_learning_rate, config.decay_length, config.decay_factor, config.staircase)
    if learning_rate_type == 'exponential_decay_warmup':
        config = learning_rate_config.exponential_decay_warmup
        lr_scheduler = lsf.ExponentialDecayWarmup(
            optimizer, total_step, config.initial_learning_rate, config.decay_length, config.decay_factor,   config.div_factor,
            config.pct_start, config.staircase)
    if learning_rate_type == 'manual_stepping':
        config = learning_rate_config.manual_stepping
        lr_scheduler = lsf.ManualStepping(
            optimizer, total_step, list(config.boundaries), list(config.rates))

    if lr_scheduler is None:
        raise ValueError('Learning_rate %s not supported.' %
                         learning_rate_type)

    return lr_scheduler


================================================
FILE: builder/optimizer_builder.py
================================================
from torchplus.train import learning_schedules
from torchplus.train import optim
import torch
from torch import nn
from torchplus.train.fastai_optim import OptimWrapper, FastAIMixedOptim
from functools import partial


def children(m: nn.Module):
    "Get children of `m`."
    return list(m.children())


def num_children(m: nn.Module) -> int:
    "Get number of children modules in `m`."
    return len(children(m))

# return a list of smallest modules dy


def flatten_model(m):
    if m is None:
        return []
    return sum(
        map(flatten_model, m.children()), []) if num_children(m) else [m]


# def get_layer_groups(m): return [nn.Sequential(*flatten_model(m))]
def get_layer_groups(m): return [nn.ModuleList(flatten_model(m))]

def get_voxeLO_net_layer_groups(net):
    vfe_grp = get_layer_groups(net)#[0]

    other_grp = get_layer_groups(nn.Sequential(net._rotation_loss,
                    net._translation_loss,
                    net._pyramid_rotation_loss,
                        net._pyramid_translation_loss,
                     net._consistency_loss, 
                     ))

    return [vfe_grp, mfe_grp, op_grp,other_grp]


def get_voxeLO_net_layer_groups(net):
    vfe_grp = get_layer_groups(net.voxel_feature_extractor)#[0]
    mfe_grp = get_layer_groups(net.middle_feature_extractor)#[0]
    op_grp = get_layer_groups(net.odom_predictor)#[0]

    # other_grp = get_layer_groups(net._rotation_loss) +  \
    #     get_layer_groups(net._translation_loss) \
    #         + get_layer_groups(net._pyramid_rotation_loss) \
    #             + get_layer_groups(net._pyramid_translation_loss) \
    #                 + get_layer_groups(net._consistency_loss)\
    other_grp = get_layer_groups(nn.Sequential(net._rotation_loss,
                    net._translation_loss,
                    net._pyramid_rotation_loss,
                        net._pyramid_translation_loss,
                     net._consistency_loss, 
                     ))

    return [vfe_grp, mfe_grp, op_grp,other_grp]


def build(optimizer_config, net, name=None, mixed=False, loss_scale=512.0):

    optimizer_type = list(optimizer_config.keys())[0]
    print("Optimizer:", optimizer_type)
    
    optimizer=None

    if optimizer_type == 'rms_prop_optimizer':
        config=optimizer_config.rms_prop_optimizer
        optimizer_func=partial(
            torch.optim.RMSprop,
            alpha=config.decay,
            momentum=config.momentum_optimizer_value,
            eps=config.epsilon)

    if optimizer_type == 'momentum_optimizer':
        config=optimizer_config.momentum_optimizer
        optimizer_func=partial(
            torch.optim.SGD,
            momentum=config.momentum_optimizer_value,
            eps=config.epsilon)

    if optimizer_type == 'adam_optimizer':
        config=optimizer_config.adam_optimizer
        if optimizer_config.fixed_weight_decay:
            optimizer_func=partial(
                torch.optim.Adam, betas=(0.9, 0.99), amsgrad=config.amsgrad)
        else:
            # regular adam
            optimizer_func=partial(
                torch.optim.Adam, amsgrad=config.amsgrad)

    optimizer=OptimWrapper.create(
        optimizer_func,
        3e-3,
        get_layer_groups(net),
        # get_voxeLO_net_layer_groups(net),
        wd=config.weight_decay,
        true_wd=optimizer_config.fixed_weight_decay,
        bn_wd=True)
    print(hasattr(optimizer, "_amp_stash"), '_amp_stash')
    if optimizer is None:
        raise ValueError('Optimizer %s not supported.' % optimizer_type)

    if optimizer_config.use_moving_average:
        raise ValueError('torch don\'t support moving average')
    if name is None:
        # assign a name to optimizer for checkpoint system
        optimizer.name=optimizer_type
    else:
        optimizer.name=name
    return optimizer


================================================
FILE: builder/rnnpose_builder.py
================================================
from builder import losses_builder
from model.RNNPose import get_posenet_class
import model.RNNPose


def build(model_cfg,
          measure_time=False, testing=False):
    """build second pytorch instance.
    """

    criterions=losses_builder.build(model_cfg.loss)

    net = get_posenet_class(model_cfg.network_class_name)(
        criterions=criterions,
        opt=model_cfg)
    return net


================================================
FILE: config/default.py
================================================
from yacs.config import CfgNode as CN
from utils.singleton import Singleton
import os

def _merge_a_into_b(a, b):
    """Merge config dictionary a into config dictionary b, clobbering the
    options in b whenever they are also specified in a.
    """
    # if type(a) is not dict:
    if not isinstance(a, (dict)):
        return

    for k, v in a.items():
        # a must specify keys that are in b
        if not k in b:
            raise KeyError('{} is not a valid config key'.format(k))

        # the types must match, too
        old_type = type(b[k])
        if old_type is not type(v):
            if isinstance(b[k], np.ndarray):
                v = np.array(v, dtype=b[k].dtype)
            else:
                raise ValueError(('Type mismatch ({} vs. {}) '
                                'for config key: {}').format(type(b[k]),
                                                            type(v), k))

        # recursively merge dicts
        # if type(v) is dict:
        if isinstance(v, (dict)):
            try:
                _merge_a_into_b(a[k], b[k])
            except:
                print('Error under config key: {}'.format(k))
                raise
        else:
            b[k] = v


class Config(metaclass=Singleton):
    def __init__(self):
        ##############  ↓  Basic   ↓  ##############
        self.ROOT_CN = CN()
        self.ROOT_CN.BASIC = CN()
        self.ROOT_CN.BASIC.input_size=[480,640] #h,w
        self.ROOT_CN.BASIC.crop_size=[320,320] #h,w
        self.ROOT_CN.BASIC.zoom_crop_size=[320,320] #h,w
        self.ROOT_CN.BASIC.render_image_size=[320,320]#h,w
        self.ROOT_CN.BASIC.patch_num=64#h,w

        ##############  ↓  LM OPTIM   ↓  ##############
        self.ROOT_CN.LM=CN()
        self.ROOT_CN.LM.LM_LMBDA= 0.0001
        self.ROOT_CN.LM.EP_LMBDA=100

        ##############  ↓  data   ↓  ##############
        self.ROOT_CN.DATA=CN()
        self.ROOT_CN.DATA.OBJ_ROOT="" #h,w
        self.ROOT_CN.DATA.VOC_ROOT=f"{os.path.dirname(os.path.abspath(__file__)) }/../EXPDATA/" #h,w

    def __get_item__(self, key):
        return self.ROOT_CN.__getitem__(key)
    
    def merge(self, config_dict, sub_key=None):

        if sub_key is not None:
            _merge_a_into_b(config_dict, self.ROOT_CN[sub_key])
        else:
            _merge_a_into_b(config_dict, self.ROOT_CN)

##############  ↓  Model  ↓  ##############
# _CN.model = CN()
# _CN.model.input_size=[480,640]
# _CN.model.crop_size=[320,320] 
# def get_cfg_defaults():

def get_cfg(Node=None):
    """Get a yacs CfgNode object with default values for my_project."""
    # Return a clone so that the defaults will not be altered
    # This is for the "local variable" use pattern
    # return _CN.clone()
    if Node is None:
        return  Config()
    else:
        return Config().__get_item__(Node)


================================================
FILE: config/linemod/copy.sh
================================================
declare -a arr=("glue" "ape" "cat" "phone" "eggbox" "benchvise" "lamp" "camera" "can" "driller" "duck" "holepuncher" "iron"  )

#create training scripts
for seq in "${arr[@]}"
do
   echo "$seq"
   cat template_fw0.5.yml > "$seq"_fw0.5.yml
   sed -i "s/SEQ_NAME/$seq/g" "$seq"_fw0.5.yml
done

arraylength=${#arr[@]}


================================================
FILE: config/linemod/copy_occ.sh
================================================
declare -a arr=("glue" "ape" "cat" "phone" "eggbox" "benchvise" "lamp" "camera" "can" "driller" "duck" "holepuncher" "iron"  )

#create training scripts
for seq in "${arr[@]}"
do
   echo "$seq"
   cat template_fw0.5_occ.yml > "$seq"_fw0.5_occ.yml
   sed -i "s/SEQ_NAME/$seq/g" "$seq"_fw0.5_occ.yml
done


================================================
FILE: config/linemod/template_fw0.5.yml
================================================
vars:
  input_h: &input_h
    320 
  input_w: &input_w
    320 
  batch_size: &batch_size
    1
  descriptor_dim: &descriptor_dim
    32 
  correspondence_radius_threshold: &correspondence_radius_threshold
    0.01 #0.04 
  seq_name: &seq_name
    ["SEQ_NAME"]
BASIC:
  zoom_crop_size: [240,240]
model:
  input_h: *input_h
  input_w: *input_w
  batch_size: *batch_size
  seq_len: 2 


  network_class_name: RNNPose 
  descriptor_net:
    module_class_name: HybridFeaNet 
    
    keypoints_detector_2d:
      input_dim: 3
      descriptor_dim: *descriptor_dim 
      remove_borders: 4
      normalize_output: True 

    keypoints_detector_3d:
      #KPCONV configurations
      num_layers: 4
      KP_extent: 2.0
      batch_norm_momentum: 0.02
      use_batch_norm: true
      in_points_dim: 3
      fixed_kernel_points: 'center' #['center', 'verticals', 'none']
      KP_influence: 'linear'
      aggregation_mode: 'sum' #['closest', 'sum']
      modulated: false 
      first_subsampling_dl:  0.025
      conv_radius: 2.5
      deform_radius: 5
      in_features_dim: 1 #3
      first_feats_dim: 128
      num_kernel_points: 15
      final_feats_dim: *descriptor_dim #256 #32
      normalize_output: True 
      gnn_feats_dim: 128 #256
    context_fea_extractor_3d:
      #KPCONV configurations
      num_layers: 4
      KP_extent: 2.0
      batch_norm_momentum: 0.02
      use_batch_norm: true
      in_points_dim: 3
      fixed_kernel_points: 'center' #['center', 'verticals', 'none']
      KP_influence: 'linear'
      aggregation_mode: 'sum' #['closest', 'sum']
      modulated: false 
      first_subsampling_dl:  0.025
      conv_radius: 2.5
      deform_radius: 5
      in_features_dim: 1 #3
      first_feats_dim: 128
      num_kernel_points: 15
      final_feats_dim: 256 #*descriptor_dim #256 #32
      normalize_output: False 
      gnn_feats_dim: 128 #256
  motion_net:
    IS_CALIBRATED: True
    RESCALE_IMAGES: False
    ITER_COUNT: 4 
    RENDER_ITER_COUNT: 3 #2 #1 #3
    TRAIN_RESIDUAL_WEIGHT: 0 #0.1 
    TRAIN_FLOW_WEIGHT: 0.5 #0.1 #1 
    TRAIN_REPROJ_WEIGHT: 0 
    OPTIM_ITER_COUNT: 1
    FLOW_NET: 'raft' 
    SYN_OBSERVED: False
    ONLINE_CROP: True
    raft:
      small: False #True
      fea_net: "default"
      mixed_precision: True
      # pretrained_model: "/mnt/workspace/datasets/weights/models/raft-small.pth"
      pretrained_model: "/mnt/workspace/datasets/weights/models/raft-chairs.pth"
      input_dim: 3 
      iters: 1 
      
  loss:
    metric_loss:
      type: "normal" 
      pos_radius: *correspondence_radius_threshold # the radius used to find the positive correspondences
      safe_radius: 0.02 #0.13
      pos_margin: 0.1
      neg_margin: 1.4
      max_points: 256
      matchability_radius: 0.06
      weight: 0.001
    saliency_loss:
      loss_weight: 1
      reg_weight: 0.01
    geometric_loss:
      loss_weight: 1
      reg_weight: 0.5 #0.1 


train_config:

  optimizer: 
      adam_optimizer: 
        learning_rate: 
          one_cycle: 
            lr_maxs: []
            lr_max: 0.0001 # 
            moms: [0.95, 0.85]
            div_factor: 10.0
            pct_start: 0.01 #0.05
        amsgrad: false
        weight_decay: 0.0001 
      fixed_weight_decay: true
      use_moving_average: false

  steps: 200000 
  steps_per_eval:  10000
  loss_scale_factor: -1
  clear_metrics_every_epoch: true

train_input_reader:
  dataset:
    dataset_class_name: "LinemodDeepIMSynRealV2" 
    info_path: ["/home/RNNPose/Projects/Works/RNNPose_release/EXPDATA/data_info/deepim/linemod_orig_deepim.info.train",  "/home/RNNPose/Projects/Works/RNNPose_release/EXPDATA/data_info/deepim/linemod_syn_deepim.info.train", 
        "/home/RNNPose/Projects/Works/RNNPose_release/EXPDATA/data_info/linemod_fusesformatted_all10k_deepim.info.train",
    ] 
    root_path: ["/home/RNNPose/Projects/Works/RNNPose_release/EXPDATA/LM6d_converted/LM6d_refine",  "/home/RNNPose/Projects/Works/RNNPose_release/EXPDATA/LM6d_converted/LM6d_refine_syn",
     "/home/RNNPose/Projects/Works/RNNPose_release/EXPDATA/LINEMOD/fuse_formatted/"
     ]

    model_point_dim: 3
    max_points: 20000
    seq_names: *seq_name 

  batch_size: *batch_size 
  preprocess: 
    correspondence_radius_threshold: *correspondence_radius_threshold
    num_workers: 3 
    image_scale: 1
    crop_param:
      rand_crop: false
      margin_ratio: 0.85 
      output_size: *input_h 
      crop_with_init_pose: True
    

eval_input_reader:
  dataset:
    dataset_class_name: "LinemodDeepIMSynRealV2"
    info_path: ["/home/RNNPose/Projects/Works/RNNPose_release/EXPDATA/data_info/linemod_posecnn.info.eval" ] 
    root_path: ["/home/RNNPose/Projects/Works/RNNPose_release/EXPDATA/LM6d_converted/LM6d_refine" ]
    model_point_dim: 3
    max_points: 20000
    seq_names: *seq_name 
  batch_size: *batch_size 
  preprocess:
    correspondence_radius_threshold: *correspondence_radius_threshold
    num_workers: 3 
    image_scale: 1
    crop_param:
      rand_crop: false
      margin_ratio: 0.85 #0.5 
      output_size: *input_h #
      crop_with_init_pose: True
      

================================================
FILE: config/linemod/template_fw0.5_occ.yml
================================================
vars:
  input_h: &input_h
    320 
  input_w: &input_w
    320 
  batch_size: &batch_size
    1
  descriptor_dim: &descriptor_dim
    32 
  correspondence_radius_threshold: &correspondence_radius_threshold
    0.01 #0.04 
  seq_name: &seq_name
    ["SEQ_NAME"]
BASIC:
  zoom_crop_size: [240,240]
model:
  input_h: *input_h
  input_w: *input_w
  batch_size: *batch_size
  seq_len: 2 


  network_class_name: RNNPose 
  descriptor_net:
    module_class_name: HybridFeaNet 
    
    keypoints_detector_2d:
      input_dim: 3
      descriptor_dim: *descriptor_dim 
      remove_borders: 4
      normalize_output: True 

    keypoints_detector_3d:
      #KPCONV configurations
      num_layers: 4
      KP_extent: 2.0
      batch_norm_momentum: 0.02
      use_batch_norm: true
      in_points_dim: 3
      fixed_kernel_points: 'center' #['center', 'verticals', 'none']
      KP_influence: 'linear'
      aggregation_mode: 'sum' #['closest', 'sum']
      modulated: false 
      first_subsampling_dl:  0.025
      conv_radius: 2.5
      deform_radius: 5
      in_features_dim: 1 #3
      first_feats_dim: 128
      num_kernel_points: 15
      final_feats_dim: *descriptor_dim #256 #32
      normalize_output: True 
      gnn_feats_dim: 128 #256
    context_fea_extractor_3d:
      #KPCONV configurations
      num_layers: 4
      KP_extent: 2.0
      batch_norm_momentum: 0.02
      use_batch_norm: true
      in_points_dim: 3
      fixed_kernel_points: 'center' #['center', 'verticals', 'none']
      KP_influence: 'linear'
      aggregation_mode: 'sum' #['closest', 'sum']
      modulated: false 
      first_subsampling_dl:  0.025
      conv_radius: 2.5
      deform_radius: 5
      in_features_dim: 1 #3
      first_feats_dim: 128
      num_kernel_points: 15
      final_feats_dim: 256 #*descriptor_dim #256 #32
      normalize_output: False 
      gnn_feats_dim: 128 #256
  motion_net:
    IS_CALIBRATED: True
    RESCALE_IMAGES: False
    ITER_COUNT: 4 
    RENDER_ITER_COUNT: 3 #2 #1 #3
    TRAIN_RESIDUAL_WEIGHT: 0 #0.1 
    TRAIN_FLOW_WEIGHT: 0.5 #0.1 #1 
    TRAIN_REPROJ_WEIGHT: 0 
    OPTIM_ITER_COUNT: 1
    FLOW_NET: 'raft' 
    SYN_OBSERVED: False
    ONLINE_CROP: True
    raft:
      small: False #True
      fea_net: "default"
      mixed_precision: True
      # pretrained_model: "/mnt/workspace/datasets/weights/models/raft-small.pth"
      pretrained_model: "/mnt/workspace/datasets/weights/models/raft-chairs.pth"
      input_dim: 3 
      iters: 1 
      
  loss:
    metric_loss:
      type: "normal" 
      pos_radius: *correspondence_radius_threshold # the radius used to find the positive correspondences
      safe_radius: 0.02 #0.13
      pos_margin: 0.1
      neg_margin: 1.4
      max_points: 256
      matchability_radius: 0.06
      weight: 0.001
    saliency_loss:
      loss_weight: 1
      reg_weight: 0.01
    geometric_loss:
      loss_weight: 1
      reg_weight: 0.5 #0.1 


train_config:

  optimizer: 
      adam_optimizer: 
        learning_rate: 
          one_cycle: 
            lr_maxs: []
            lr_max: 0.0001 # 
            moms: [0.95, 0.85]
            div_factor: 10.0
            pct_start: 0.01 #0.05
        amsgrad: false
        weight_decay: 0.0001 
      fixed_weight_decay: true
      use_moving_average: false

  steps: 200000 
  steps_per_eval:  10000
  loss_scale_factor: -1
  clear_metrics_every_epoch: true

train_input_reader:
  dataset:
    dataset_class_name: "LinemodDeepIMSynRealV2" 
    info_path: ["/home/RNNPose/Projects/Works/RNNPose_release/EXPDATA/data_info/deepim/linemod_orig_deepim.info.train",  "/home/RNNPose/Projects/Works/RNNPose_release/EXPDATA/data_info/deepim/linemod_syn_deepim.info.train", 
        "/home/RNNPose/Projects/Works/RNNPose_release/EXPDATA/data_info/linemod_fusesformatted_all10k_deepim.info.train",
    ] 
    root_path: ["/home/RNNPose/Projects/Works/RNNPose_release/EXPDATA/LM6d_converted/LM6d_refine",  "/home/RNNPose/Projects/Works/RNNPose_release/EXPDATA/LM6d_converted/LM6d_refine_syn",
     "/home/RNNPose/Projects/Works/RNNPose_release/EXPDATA/LINEMOD/fuse_formatted/"
     ]

    model_point_dim: 3
    max_points: 20000
    seq_names: *seq_name 

  batch_size: *batch_size 
  preprocess: 
    correspondence_radius_threshold: *correspondence_radius_threshold
    num_workers: 3 
    image_scale: 1
    crop_param:
      rand_crop: false
      margin_ratio: 0.85 
      output_size: *input_h 
      crop_with_init_pose: True
    

eval_input_reader:
  dataset:
    dataset_class_name: "LinemodDeepIMSynRealV2"
    info_path:  ["/home/RNNPose/Projects/Works/RNNPose_release/EXPDATA/data_info/deepim/linemod_bop_lmocc_pvnetdr.info.eval"]
    root_path:  ["/home/RNNPose/Projects/Works/RNNPose_release/EXPDATA/lmo"] 
    init_post_type: "PVNET_LINEMOD_OCC"
    model_point_dim: 3
    max_points: 20000
    seq_names: *seq_name 
  batch_size: *batch_size 
  preprocess:
    correspondence_radius_threshold: *correspondence_radius_threshold
    num_workers: 3 
    image_scale: 1
    crop_param:
      rand_crop: false
      margin_ratio: 0.85 #0.5 
      output_size: *input_h #
      crop_with_init_pose: True
      
      
================================================
FILE: data/__init__.py
================================================
from . import dataset
from . import linemod_dataset

================================================
FILE: data/dataset.py
================================================
import pathlib
import pickle
import time
from functools import partial

import numpy as np


REGISTERED_DATASET_CLASSES = {}


def register_dataset(cls, name=None):
    global REGISTERED_DATASET_CLASSES
    if name is None:
        name = cls.__name__
    assert name not in REGISTERED_DATASET_CLASSES, f"exist class: {REGISTERED_DATASET_CLASSES}"
    REGISTERED_DATASET_CLASSES[name] = cls
    return cls


def get_dataset_class(name):
    global REGISTERED_DATASET_CLASSES
    assert name in REGISTERED_DATASET_CLASSES, f"available class: {REGISTERED_DATASET_CLASSES}"
    return REGISTERED_DATASET_CLASSES[name]


class Dataset(object):
    NumPointFeatures = -1

    def __getitem__(self, index):
        raise NotImplementedError

    def __len__(self):
        raise NotImplementedError

    def _read_data(self, query):

        raise NotImplementedError

    def evaluation(self, dt_annos, output_dir):
        """Dataset must provide a evaluation function to evaluate model."""
        raise NotImplementedError


================================================
FILE: data/linemod/linemod_config.py
================================================
import numpy as np
diameters = {
    'cat': 15.2633,
    'ape': 9.74298,
    'benchvise': 28.6908,
    'bowl': 17.1185,
    'cam': 17.1593,
    'camera': 17.1593,
    'can': 19.3416,
    'cup': 12.5961,
    'driller': 25.9425,
    'duck': 10.7131,
    'eggbox': 17.6364,
    'glue': 16.4857,
    'holepuncher': 14.8204,
    'iron': 30.3153,
    'lamp': 28.5155,
    'phone': 20.8394
}

linemod_cls_names = ['ape', 'cam', 'cat', 'duck', 'glue', 'iron', 'phone', 'benchvise', 'can', 'driller', 'eggbox', 'holepuncher', 'lamp']

linemod_K = np.array([[572.4114, 0., 325.2611],
                  [0., 573.57043, 242.04899],
                  [0., 0., 1.]])


blender_K = np.array([[700., 0., 320.],
                    [0., 700., 240.],
                    [0., 0., 1.]])

================================================
FILE: data/linemod_dataset.py
================================================
import numpy as np 
import random
import os 
from data.dataset import Dataset, register_dataset
import pickle
import PIL
import cv2
import torch
import time
import scipy

from utils.geometric import range_to_depth, render_pointcloud
from .transforms import make_transforms 
from thirdparty.kpconv.lib.utils import square_distance
from utils.geometric import rotation_angle
# from utils.visualize import *
from transforms3d.quaternions import mat2quat, quat2mat, qmult
from transforms3d.euler import mat2euler, euler2mat, euler2quat, quat2euler
import math
from config.default import get_cfg

CURRENT_DIR=os.path.dirname(os.path.abspath(__file__))

try:
    from pytorch3d.io import load_obj, load_ply
except:
    print("Warning: error occurs when importing pytorch3d ")
    pass


def se3_q2m(se3_q):
    assert se3_q.size == 7
    se3_mx = np.zeros((3, 4))
    # quat = se3_q[0:4] / LA.norm(se3_q[0:4])
    quat = se3_q[:4]
    R = quat2mat(quat)
    se3_mx[:, :3] = R
    se3_mx[:, 3] = se3_q[4:]
    return se3_mx

def info_convertor(info,):
    """
        [Transform the original kitti info file]
    """

    seqs = info.keys() #['cat']#
    seq_lengths = [len(info[i]) for i in seqs]
    data = []
    for seq in seqs:
        print(seq)
        data.append(info[seq])

    new_infos = {
        "seqs": list(seqs),
        "seq_lengths": seq_lengths,
        "data": data
    }
    return new_infos

def resize(im, target_size, max_size, stride=0, interpolation=cv2.INTER_LINEAR):
    """
    only resize input image to target size and return scale
    :param im: BGR image input by opencv
    :param target_size: one dimensional size (the short side)
    :param max_size: one dimensional max size (the long side)
    :param stride: if given, pad the image to designated stride
    :param interpolation: if given, using given interpolation method to resize image
    :return:
    """
    im_shape = im.shape
    im_size_min = np.min(im_shape[0:2])
    im_size_max = np.max(im_shape[0:2])
    im_scale = float(target_size) / float(im_size_min)
    # prevent bigger axis from being more than max_size:
    if np.round(im_scale * im_size_max) > max_size:
        im_scale = float(max_size) / float(im_size_max)
    im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale, interpolation=interpolation)

    if stride == 0:
        return im, im_scale
    else:
        # pad to product of stride
        im_height = int(np.ceil(im.shape[0] / float(stride)) * stride)
        im_width = int(np.ceil(im.shape[1] / float(stride)) * stride)
        im_channel = im.shape[2]
        padded_im = np.zeros((im_height, im_width, im_channel))
        padded_im[: im.shape[0], : im.shape[1], :] = im
        return padded_im, im_scale
def sample_poses(pose_tgt):
    SYN_STD_ROTATION = 15
    SYN_STD_TRANSLATION = 0.01
    ANGLE_MAX=45
    pose_src = pose_tgt.copy()
    num = pose_tgt.shape[0]
    for i in range(num):
        euler = mat2euler(pose_tgt[i, :3, :3])
        euler += SYN_STD_ROTATION * np.random.randn(3) * math.pi / 180.0
        pose_src[i, :3, :3] = euler2mat(euler[0], euler[1], euler[2])

        pose_src[i, 0, 3] = pose_tgt[i, 0, 3]+ SYN_STD_TRANSLATION * np.random.randn(1)
        pose_src[i, 1, 3] = pose_tgt[i, 1, 3] + SYN_STD_TRANSLATION * np.random.randn(1)
        pose_src[i, 2, 3] = pose_tgt[i, 2, 3]  + 5 * SYN_STD_TRANSLATION * np.random.randn(1)

        r_dist = np.arccos((np.trace(pose_src[i, :3,:3].transpose(-1,-2) @ pose_tgt[i, :3,:3]) - 1 )/2)/math.pi*180

        while r_dist > ANGLE_MAX:#or not (16 < center_x < (640 - 16) and 16 < center_y < (480 - 16)):
            # print("r_dist > ANGLE_MAX, resampling...")
            print("Too large angular differences. Resample the pose...")
            euler = mat2euler(pose_tgt[i, :3, :3])
            euler += SYN_STD_ROTATION * np.random.randn(3) * math.pi / 180.0
            pose_src[i, :3, :3] = euler2mat(euler[0], euler[1], euler[2])

            pose_src[i, 0, 3] = pose_tgt[i, 0, 3]+ SYN_STD_TRANSLATION * np.random.randn(1)
            pose_src[i, 1, 3] = pose_tgt[i, 1, 3] + SYN_STD_TRANSLATION * np.random.randn(1)
            pose_src[i, 2, 3] = pose_tgt[i, 2, 3]  + 5 * SYN_STD_TRANSLATION * np.random.randn(1)

            r_dist = np.arccos((np.trace(pose_src[i, :3,:3].transpose(-1,-2) @ pose_tgt[i, :3,:3]) - 1 )/2)*math.pi/180
    return pose_src.squeeze()


@register_dataset
class LinemodDeepIMSynRealV2(Dataset):
    # use deepim 3d model for geometric feature extraction, mingle the synthetic and real data  
    def __init__(self, root_path,
                 info_path, model_point_dim,
                 is_train,
                 prep_func=None,
                 seq_names=None, 
                 cfg={}
                 ):
        super().__init__()

        assert info_path is not None
        assert isinstance(root_path, (tuple, list)) and isinstance(info_path, (tuple, list))
        assert len(root_path) == len(info_path)
        print("Info:",info_path)
        # assert split in ['train', 'val', 'test']
        self.is_train = is_train
        self.VOC_ROOT = get_cfg('DATA').VOC_ROOT#"/DATA/yxu/LINEMOD_DEEPIM/"

        infos=[]
        for ipath in info_path:
            with open(ipath, 'rb') as f:
                info = pickle.load(f)

                if seq_names is not None:
                    for k in list(info.keys()):
                        if k not in seq_names:
                            del info[k]
                infos.append( info_convertor(info) )

        #merge multiple infos 
        self.infos = infos[0]
        self.infos['dataset_idx'] = [0]*len(self.infos['seqs'])
        for i, info in enumerate(infos[1:]):
            for k in self.infos:
                if k == 'dataset_idx':
                    self.infos[k].extend([i+1]*len(info['seqs']))
                else:
                    self.infos[k].extend(info[k])


        self.root_paths = root_path
        self.model_point_dim = model_point_dim
        # self.max_points=max_points#30000
        self.prep_func=prep_func
        # self.rgb_transformer = None #make_transforms(None, is_train=is_train)
        self.rgb_transformer = make_transforms(None, is_train=is_train)
        print("dataset size:",self.__len__())

        self.init_pose_type = cfg.get("init_post_type", "POSECNN_LINEMOD" ) 
        # self.init_pose_type = cfg.get("init_post_type", "PVNET_LINEMOD_OCC" ) 
#         self.init_pose_type = cfg.get("init_post_type", "PVNET_LINEMOD" ) 
        print("INIT_POSE_TYPE:", self.init_pose_type)
        #Load posecnn results
        if not self.is_train:
            with open(f"{CURRENT_DIR}/../EXPDATA/init_poses/linemod_posecnn_results.pkl", 'rb') as f:
                self.pose_cnn_results_test_posecnn=pickle.load(f)
            try:
                if self.init_pose_type == "POSECNN_LINEMOD":
                    #load posecnn results 
                    self.pose_cnn_results_test=self.pose_cnn_results_test_posecnn
                elif self.init_pose_type =="PVNET_LINEMOD":
                    self.pose_cnn_results_test=np.load(f"{CURRENT_DIR}/../EXPDATA/init_poses/pvnet/pvnet_linemod_test.npy", allow_pickle=True).flat[0]
                elif self.init_pose_type =="PVNET_LINEMOD_OCC":
                    self.pose_cnn_results_test=np.load(f"{CURRENT_DIR}/../EXPDATA/init_poses/pvnet/pvnet_linemodocc_test.npy", allow_pickle=True).flat[0]
                else: 
                    raise NotImplementedError
            except:
                print("Loading posecnn results failed!")
                self.pose_cnn_results_test=None
            try:
                # self.blender_to_bop_pose=np.load(f"{CURRENT_DIR}/../EXPDATA/init_poses/metricpose/blender2bop_RT.npy", allow_pickle=True).flat[0]
                self.blender_to_bop_pose=np.load(f"{CURRENT_DIR}/../EXPDATA/init_poses/pose_conversion/blender2bop_RT.npy", allow_pickle=True).flat[0]
            except:
                print("Loading pose conversion matrix failed!")
                self.blender_to_bop_pose=None 
                
        else:
            self.pose_cnn_results_test=None
            self.blender_to_bop_pose=None
        
    def load_random_background(self, im_observed, mask):
        VOC_root = os.path.join(self.VOC_ROOT, "VOCdevkit/VOC2012")
        VOC_image_set_dir = os.path.join(VOC_root, "ImageSets/Main")
        VOC_bg_list_path = os.path.join(VOC_image_set_dir, "diningtable_trainval.txt")
        with open(VOC_bg_list_path, "r") as f:
            VOC_bg_list = [
                line.strip("\r\n").split()[0] for line in f.readlines() if line.strip("\r\n").split()[1] == "1"
            ]
        height, width, channel = im_observed.shape
        target_size = min(height, width)
        max_size = max(height, width)
        observed_hw_ratio = float(height) / float(width)

        k = random.randint(0, len(VOC_bg_list) - 1)
        bg_idx = VOC_bg_list[k]
        bg_path = os.path.join(VOC_root, "JPEGImages/{}.jpg".format(bg_idx))
        bg_image = cv2.imread(bg_path, cv2.IMREAD_COLOR)[...,::-1] #RGB
        bg_h, bg_w, bg_c = bg_image.shape
        bg_image_resize = np.zeros((height, width, channel), dtype="uint8")
        if (float(height) / float(width) < 1 and float(bg_h) / float(bg_w) < 1) or (
            float(height) / float(width) >= 1 and float(bg_h) / float(bg_w) >= 1
        ):
            if bg_h >= bg_w:
                bg_h_new = int(np.ceil(bg_w * observed_hw_ratio))
                if bg_h_new < bg_h:
                    bg_image_crop = bg_image[0:bg_h_new, 0:bg_w, :]
                else:
                    bg_image_crop = bg_image
            else:
                bg_w_new = int(np.ceil(bg_h / observed_hw_ratio))
                if bg_w_new < bg_w:
                    bg_image_crop = bg_image[0:bg_h, 0:bg_w_new, :]
                else:
                    bg_image_crop = bg_image
        else:
            if bg_h >= bg_w:
                bg_h_new = int(np.ceil(bg_w * observed_hw_ratio))
                bg_image_crop = bg_image[0:bg_h_new, 0:bg_w, :]
            else:  # bg_h < bg_w
                bg_w_new = int(np.ceil(bg_h / observed_hw_ratio))
                print(bg_w_new)
                bg_image_crop = bg_image[0:bg_h, 0:bg_w_new, :]

        bg_image_resize_0, _ = resize(bg_image_crop, target_size, max_size)
        h, w, c = bg_image_resize_0.shape
        bg_image_resize[0:h, 0:w, :] = bg_image_resize_0

        # add background to image_observed
        res_image = bg_image_resize.copy()
        res_image[mask>0]=im_observed[mask>0]

        # im_observed = res_image
        return res_image

    def _read_data(self, idx):
        """
        info structure:
        {
            'cat':[
                {
                "index": idx,
                "model_path": str,
                "rgb_path": str,
                "depth_path": str,
                "RT": np.array([3,4]),
                "K":  np.array([3,3]),
                },
                {
                "index": idx,
                "model_path": str,
                "rgb_path": str,
                "depth_path": str,
                "RT": np.array([3,4]),
                "K":  np.array([3,3]),
                }
            ...
            ],
            'dog':[

            ]
            ...
        }

        """

        if isinstance(idx, (tuple, list)):
            idx, seed = idx
        else:
            seed = None

        seq_lengths = np.array(self.infos['seq_lengths'])
        seq_lengths_cum = np.cumsum(seq_lengths)
        seq_lengths_cum = np.insert(seq_lengths_cum, 0, 0)  # insert a dummy 0
        seq_idx = np.nonzero(seq_lengths_cum > idx)[0][0]-1

        frame_idx = idx - seq_lengths_cum[seq_idx]

        info = self.infos["data"][seq_idx]
        dataset_idx = self.infos["dataset_idx"][seq_idx]
        

        model_points_path = os.path.join(f'{os.path.dirname(__file__)}/../EXPDATA/LM6d_converted/models/{self.infos["seqs"][seq_idx]}/textured.obj' ) # TODO: need check

        rgb_path = os.path.join(self.root_paths[dataset_idx], info[frame_idx]['rgb_observed_path']) 
        depth_path = os.path.join(self.root_paths[dataset_idx], info[frame_idx]['depth_gt_observed_path']) 

        if info[frame_idx].get('rgb_noisy_rendered', None) is not None:
            rgb_noisy_rendered_path = os.path.join(self.root_paths[dataset_idx], info[frame_idx]['rgb_noisy_rendered']) 
        else:
            rgb_noisy_rendered_path = None
        if info[frame_idx].get('depth_noisy_rendered', None) is not None:
            depth_noisy_rendered_path = os.path.join(self.root_paths[dataset_idx], info[frame_idx]['depth_noisy_rendered']) 
        else:
            depth_noisy_rendered_path = None

        if info[frame_idx].get('pose_noisy_rendered', None) is not None:
            rendered_RT = info[frame_idx]['pose_noisy_rendered'].astype(np.float32)
        # else:
        elif self.is_train:
            rendered_RT = sample_poses( info[frame_idx]['gt_pose'].astype(np.float32)[None] )

        K = info[frame_idx]['K'].astype(np.float32)
        RT = info[frame_idx]['gt_pose'].astype(np.float32) #[R,t]

        # evaluation 
        if not self.is_train:
            if self.pose_cnn_results_test is not None:
                class_name=self.infos["seqs"][seq_idx]

                if self.init_pose_type == "PVNET_LINEMOD":
                    try:
                        posecnn_RT = self.pose_cnn_results_test[class_name][frame_idx] # if self.pose_cnn_results_test is not None else np.zeros_like(RT)
                        #Transformations are needed as the pvnet has a different coordinate system. 
                        posecnn_RT[:3,:3] =  posecnn_RT[:3,:3]@self.blender_to_bop_pose[class_name][:3,:3].T
                        posecnn_RT[:3,3:] =  -posecnn_RT[:3,:3] @self.blender_to_bop_pose[class_name][:3,3:]  + posecnn_RT[:3,3:] 
                    except:
                        print("Warning: frame_idx is out of the range of self.pose_cnn_results_test!", flush=True)
                        posecnn_RT= se3_q2m(self.pose_cnn_results_test_posecnn[class_name][frame_idx]['pose']) #np.zeros_like(RT)
                elif self.init_pose_type =="POSECNN_LINEMOD":
                    posecnn_RT= se3_q2m(self.pose_cnn_results_test_posecnn[class_name][frame_idx]['pose']) 
                elif self.init_pose_type == "PVNET_LINEMOD_OCC":
                    try:
                        posecnn_RT = self.pose_cnn_results_test[class_name][frame_idx].copy()# if self.pose_cnn_results_test is not None else np.zeros_like(RT)
                        #Transformations are needed as the pvnet has a different coordinate system. 
                        posecnn_RT[:3,:3] =  posecnn_RT[:3,:3]@self.blender_to_bop_pose[class_name][:3,:3].T
                        posecnn_RT[:3,3:] =  -posecnn_RT[:3,:3] @self.blender_to_bop_pose[class_name][:3,3:]  + posecnn_RT[:3,3:] 
                    except:
                        # print(frame_idx)
                        raise
                else:
                    raise NotImplementedError 
                
                rendered_RT = posecnn_RT
            else:
                print("Warning: fail to load cnn poses!", flush=True)
                posecnn_RT = np.zeros_like(RT)
        else:
            posecnn_RT = np.zeros_like(RT)
        
        #add noise--for testing purpose only, should always be disabled in normal cases 
#         rot_std=0; trans_std=0.04; ang_max=1000;
#         print(f"Add pose noises rot_std={rot_std}, trans_std={trans_std}", flush=True)
#         rendered_RT=sample_poses(rendered_RT[None], rot_std=rot_std, trans_std=trans_std, ang_max=ang_max) 

        # Regularize the matrix to be a valid rotation
        rendered_RT[:3,:3] = rendered_RT[:3,:3]@ np.linalg.inv(scipy.linalg.sqrtm(rendered_RT[:3,:3].T@rendered_RT[:3,:3]))
        
        # model_points = np.fromfile(
        #     str(model_points_path), dtype=np.float32, count=-1).reshape([-1, self.model_point_dim]) # N x model_point_dim
        model_points, _,_ = load_obj(str(model_points_path) )
        model_points = model_points.numpy()
        
        visb = model_points[:,-1:]  # N x model_point_dim

        model_point_features=np.ones_like(model_points[:,:1]).astype(np.float32)


        rgb =  np.asarray(PIL.Image.open(rgb_path))

        if depth_path.endswith('.npy'):
            depth = np.load(depth_path) # blender 
        else:
            depth = cv2.imread(depth_path, -1) /1000.

        if self.is_train and "LM6d_refine_syn" in self.root_paths[dataset_idx]: #synthetic data
            rgb = self.load_random_background(rgb, mask=(depth>0)[...,None].repeat(rgb.shape[-1], axis=-1) )


        rgb_rendered =  np.asarray(PIL.Image.open(rgb_noisy_rendered_path)) if rgb_noisy_rendered_path is not None else None
        depth_rendered = np.asarray(PIL.Image.open(depth_noisy_rendered_path))/1000 if depth_noisy_rendered_path is not None else None #TODO: need check

        ren_mask = render_pointcloud(model_points, rendered_RT[None],K=K[None], render_image_size=rgb.shape[:2] ).squeeze()>0
        # depth = range_to_depth(depth<1, depth*2, K)

        return {
            "class_name":  self.infos["seqs"][seq_idx], 
            "idx": idx,
            "model_points": model_points,
            "visibility": visb,
            "model_point_features":model_point_features,
            "image": rgb,
            "depth": depth,
            "mask": depth>0,
            "rendered_image": rgb_rendered,
            "rendered_depth": depth_rendered,
            "K": K,
            "RT": RT,
            "rendered_RT": rendered_RT.astype(np.float32),
            "ren_mask":ren_mask,
            "POSECNN_RT": posecnn_RT.astype(np.float32), #for test, TODO
            "scale": 1 # model_scale * scale = depth_scale
        }


    def __getitem__(self, idx):

        data=self._read_data(idx) 
        try:
            data_p=self.prep_func(data, rand_rgb_transformer=self.rgb_transformer, find_2d3d_correspondence=self.is_train )
        except Exception as e: 
            if e.args[0] in ["Too few correspondences are found!"] :
                if isinstance(idx, (tuple, list)):
                    # idx, seed = idx
                    idx = [(idx[0]+1)%self.__len__(), idx[1]]
                else:
                    idx = (idx+1) %self.__len__()
                data_p= self.__getitem__(idx )
            else:
                raise ValueError

        return data_p

    def __len__(self):
        return np.sum(self.infos['seq_lengths'])


================================================
FILE: data/preprocess.py
================================================
import open3d as o3d
import copy
import os

import pathlib
import pickle
import time
from collections import defaultdict
from functools import partial

import cv2
import numpy as np
import quaternion
from skimage import io as imgio
from utils.timer import simple_timer

import matplotlib.pyplot as plt
from collections.abc import Iterable
import torch
import torch.nn.functional as F
import quaternion

from functools import partial
import thirdparty.kpconv.cpp_wrappers.cpp_subsampling.grid_subsampling as cpp_subsampling
import thirdparty.kpconv.cpp_wrappers.cpp_neighbors.radius_neighbors as cpp_neighbors
from thirdparty.kpconv.lib.timer import Timer
from utils.geometric import range_to_depth, mask_depth_to_point_cloud
from utils.furthest_point_sample import fragmentation_fps
from utils.rand_utils import truncated_normal


def merge_batch(batch_list):
    # [batch][key][seq]->example[key][seq][batch]
    # Or [batch][key]->example[key][batch]
    example_merged = defaultdict(list)
    for example in batch_list:  # batch dim
        for k, v in example.items():  # key dim
            # assert isinstance(v, list)
            if isinstance(v, list):
                seq_len = len(v)
                if k not in example_merged:
                    example_merged[k] = [[] for i in range(seq_len)]
                for i, vi in enumerate(v):  # seq dim
                    example_merged[k][i].append(vi)

            else:
                example_merged[k].append(v)

    ret = {}
    for key, elems in example_merged.items():
        if key in ['model_points', "original_model_points", 'visibility']:
            # concat the points of lenghts (N1,N2...) to a longer one with length (N1+N2+...)
            ret[key] = np.concatenate(elems, axis=0)
            # record the point numbers for original batches
            ret['batched_model_point_lengths'] = np.array(
                [len(p) for p in elems], dtype=np.int32)
        elif key in ['rand_model_points', ]:
            # concat the points of lenghts (N1,N2...) to a longer one with length (N1+N2+...)
            ret[key] = np.concatenate(elems, axis=0)
            # record the point numbers for original batches
            ret['batched_rand_model_point_lengths'] = np.array(
                [len(p) for p in elems], dtype=np.int32)
        elif key in ['model_point_features']:
            ret[key] = np.concatenate(elems, axis=0)

        # ['odometry/tq','odometry/RT','odometry/invRT' ]:
        elif key in ['image', 'depth', 'K', 'RT', 'original_RT' ,'rand_RT', 'correspondences_2d3d', 'scale',  'POSECNN_RT','rendered_image', 'rendered_depth', 'rendered_RT', '3d_keypoint_inds', '3d_keypoints', 'mask', 'ren_mask']: #'depth_coords2d','lifted_points', 
            try:
                ret[key] = np.stack(elems, axis=0)
            except:
                print(key, flush=True)
                raise
        elif key == 'metrics':
            ret[key] = elems
        else:
            ret[key] = []
            for e in elems:
                ret[key].append(e)

    return ret


def get_correspondences(src_pcd, tgt_pcd, search_voxel_size, K=None, trans=None):
    if trans is not None:
        src_pcd.transform(trans)
    pcd_tree = o3d.geometry.KDTreeFlann(tgt_pcd)

    correspondences = []
    for i, point in enumerate(src_pcd.points):
        [count, idx, _] = pcd_tree.search_radius_vector_3d(
            point, search_voxel_size)
        if K is not None:
            idx = idx[:K]
        for j in idx:
            correspondences.append([i, j])

    correspondences = np.array(correspondences)
    # correspondences = torch.from_numpy(correspondences)
    return correspondences


def to_pcd(xyz):
    pcd = o3d.geometry.PointCloud()
    pcd.points = o3d.utility.Vector3dVector(xyz)
    return pcd


def to_tsfm(rot, trans):
    tsfm = np.eye(4)
    tsfm[:3, :3] = rot
    tsfm[:3, 3] = trans.flatten()
    return tsfm


def CameraIntrinsicUpdate(old_K, aug_param):
    '''
    old_K: array of shape (N,3,3), the old camera intrinsic parameters
    aug_pram: dict, the data augmentation parameters
    '''
    aug_type = aug_param['aug_type']
    assert aug_type in ['crop', 'scale', 'flip']

    new_K = np.copy(old_K)
    if aug_type == 'crop':
        cx, cy = aug_param['crop/left_top_corner']  # x,y
        new_K[..., 0, 2] = new_K[..., 0, 2] - cx
        new_K[..., 1, 2] = new_K[..., 1, 2] - cy
    elif aug_type == 'scale':
        s_x, s_y = aug_param['scale/scale']
        new_K[..., 0, 0] = s_x * new_K[..., 0, 0]
        new_K[..., 1, 1] = s_y * new_K[..., 1, 1]

        new_K[..., 0, 2] = s_x * new_K[..., 0, 2]
        new_K[..., 1, 2] = s_y * new_K[..., 1, 2]
    elif aug_type == 'flip':
        w = aug_param['flip/width']
        # h = aug_param['flip/heigh']
        new_K[..., 0, 2] = w - new_K[..., 0, 2]  # px' = w-px
        # new_K[...,1,2] = h- new_K[...,1,2]
        new_K[..., 0, 0] = - new_K[..., 0, 0]  # fx' = -fx

    return new_K


def crop_transform(images, depths, Ks, crop_param, ):
    assert(len(images) == len(depths) == len(Ks))

    crop_type = crop_param["crop_type"]
    assert(crop_type in ["fixed", "center", "random"])

    crop_size = crop_param["crop_size"]
    iheight, iwidth = images[0].shape[:2]

    if crop_type == "fixed":
        lt_corner = crop_param["lt_corner"]
        op = transforms.Crop(
            lt_corner[0], lt_corner[1], crop_size[0], crop_size[1])
    elif crop_type == "center":
        op = transforms.CenterCrop(crop_size)

        ci, cj, _, _ = op.get_params(images[0], crop_size)
        lt_corner = ci, cj

    elif crop_type == "random":
        op = transforms.RandomCrop((iheight, iwidth), crop_size)

        lt_corner = op.i, op.j

    for i, _ in enumerate(images):
        images[i] = op(images[i])
        depths[i] = op(depths[i])

        Ks[i] = CameraIntrinsicUpdate(Ks[i],
                                      {"aug_type": "crop", "crop/left_top_corner": (lt_corner[1], lt_corner[0])})

    return images, depths, Ks


# def patch_crop(image, depth, mask, K_old, margin_ratio=0.2, output_size=128, offset_ratio=(0,0),bbox=None, mask_depth=True):
def patch_crop(image, depth, mask, K_old, margin_ratio=0.2, output_size=128, offset_ratio=(0,0),bbox=None, mask_depth=False):
    '''
        image: HxWx3
        mask: HxW
        K_old: 3x3
        offset: (offset_h, offset_w)
    '''

    H, W, _ = image.shape
    
    mask = mask.astype('uint8')*255
    if bbox is None:
        _x, _y, _w, _h = cv2.boundingRect(mask)
    else:
        _x, _y, _w, _h = bbox[1], bbox[0], bbox[3]-bbox[1], bbox[2]-bbox[0]

    # center = [_x+_w/2, _y+_h/2]
    center = [_x+_w/2+offset_ratio[1]*_w, _y+_h/2+offset_ratio[0]*_h ]

    L = int(max(_w, _h) * (1+2*margin_ratio))
    
    if L<0:
        #TODO
        print(mask.sum(), depth.sum(), '!!!', flush=True)
        L=128

    x = max(0, int(center[0] - L/2))
    y = max(0, int(center[1] - L/2))

    crop = image[y:y+L, x:x+L]
    # only keep the ROI depth

    if mask_depth:
        depth[mask < 1] = 0 # removed by dy at 0810
    depth_crop = depth[y:y+L, x:x+L]
    mask_crop = mask[y:y+L, x:x+L]

    
    # w=h=int ((1+2*margin_ratio)*L) # actual crop size
    w = h = L  # actual crop size
    # automatically handle the "out of range" problem
    patch = np.zeros([h, w, 3], dtype=image.dtype)
    # depth_patch = np.ones([h, w], dtype=depth.dtype)
    depth_patch = np.zeros([h, w], dtype=depth.dtype)
    mask_patch = np.zeros([h, w], dtype=depth.dtype)

    try:
        xp = 0
        yp = 0
        patch[xp: xp+crop.shape[0], yp:yp+crop.shape[1]] = crop
        depth_patch[xp: xp+crop.shape[0], yp:yp+crop.shape[1]] = depth_crop
        mask_patch[xp: xp+crop.shape[0], yp:yp+crop.shape[1]] = mask_crop
    except:
        import pdb
        pdb.set_trace()
    patch = cv2.resize(patch, (output_size, output_size),
                       interpolation=cv2.INTER_LINEAR)
    depth_patch = cv2.resize(
        depth_patch, (output_size, output_size), interpolation=cv2.INTER_NEAREST)
    mask_patch = cv2.resize(
        mask_patch, (output_size, output_size), interpolation=cv2.INTER_NEAREST)

    # update the intrinsic parameters
    K_new = np.zeros_like(K_old)
    scale = output_size/L
    K_new[0, 2] = (K_old[0, 2]-x)*scale
    K_new[1, 2] = (K_old[1, 2]-y)*scale
    K_new[0, 0] = K_old[0, 0]*scale
    K_new[1, 1] = K_old[1, 1]*scale
    K_new[2, 2] = 1

    # return patch, depth_patch, K_new
    return patch, depth_patch, mask_patch, K_new


def preprocess_deepim(
    input_dict,
    max_points,
    correspondence_radius,
    normalize_model=True,
    rand_transform_model=False,  # False,#True,
    rand_rgb_transformer=None,
    image_scale=None,
    patch_cropper=None,  # func patch_crop(...)
    
):
    output_dict = copy.deepcopy(input_dict)

    ################################### process 3D point clouds ###################################

    if (output_dict['model_points'].shape[0] > max_points):
        # if(output_dict['model_points'].shape[0] > 20000):
        idx = np.random.permutation(
            output_dict['model_points'].shape[0])[:max_points]
        print(idx, output_dict['model_points'].shape, flush=True)
        output_dict['model_points'] = output_dict['model_points'][idx]
        output_dict['model_point_features'] = output_dict['model_point_features'][idx]

    output_dict['original_RT'] = copy.deepcopy(output_dict['RT'])
    if normalize_model:
        points = output_dict['model_points']
        mean = points.mean(axis=0)
        scope = points.max(axis=0)-points.min(axis=0)
        points_normalize = (points-mean)/scope.max()
        # points_normalize.tofile(bin_save_path)
        # modify the extrinsic parameters
        output_dict['RT'][:, 3:] = output_dict['RT'][:, :3] @ mean[:,
                                                                   None] + output_dict['RT'][:, 3:]  # 3x3 @ 3x1 + 3x1
        # input_dict['RT'][:,:3] *=scope.max()
        output_dict['scale'] = scope.max()
        output_dict['original_model_points'] = output_dict['model_points']
        output_dict['model_points'] = points_normalize


    if rand_transform_model:
        points = output_dict['model_points']
        rand_quat = np.random.randn(1, 4)
        rand_quat = rand_quat/np.linalg.norm(rand_quat, axis=-1)
        rand_rot = quaternion.as_rotation_matrix(
            quaternion.from_float_array(rand_quat)).squeeze()  # 3x3
        output_dict['rand_model_points'] = (
            rand_rot@ points.T).T.astype(np.float32)
        output_dict['rand_RT'] = copy.deepcopy(output_dict['RT'])
        # output_dict['RT'][:,:3]@rand_rot.T
        output_dict['rand_RT'][:, :3] = rand_rot
        output_dict['rand_RT'][:, 3] = 0

    ################################### process 2D images ###################################
    # carve out image patches
    if patch_cropper is not None:
        ref_mask = output_dict['depth'] > 0
        output_dict['image'], output_dict['depth'], output_dict['K'] = patch_cropper(
            output_dict['image'], output_dict['depth'],  ref_mask, output_dict['K'])

        output_dict['rendered_image'], output_dict['rendered_depth'], _ = patch_cropper(
            output_dict['rendered_image'], output_dict['rendered_depth'],  ref_mask, output_dict['K'].copy() )

    # rescale image
    if image_scale is not None:
        output_dict['image'] = cv2.resize(output_dict['image'],
                                          (output_dict['image'].shape[1]*image_scale,
                                           output_dict['image'].shape[0]*image_scale),
                                          interpolation=cv2.INTER_AREA)
        output_dict['depth'] = cv2.resize(output_dict['depth'],
                                          (output_dict['depth'].shape[1]*image_scale,
                                           output_dict['depth'].shape[0]*image_scale),
                                          interpolation=cv2.INTER_NEAREST)
        output_dict['K'][:2] = output_dict['K'][:2]*image_scale

    # lift depth
    depth = output_dict['depth'].squeeze()  # H,W
    depth_pts, depth_coords2d = mask_depth_to_point_cloud(
        depth != 0, depth, output_dict['K'])
    depth_pts = (output_dict['RT'][:, :3].T@(depth_pts.T - output_dict['RT']
                                             [:, 3:])).T / output_dict['scale']  # transformed to the model frame

    # find 2d-3d correspondences
    tsfm = np.eye(4)
    tsfm[:3] = output_dict['RT']
    model_pcd = output_dict['model_points']

    correspondences_2d3d = get_correspondences(
        to_pcd(depth_pts), to_pcd(model_pcd),  correspondence_radius, K=5)
    if len(correspondences_2d3d.shape) < 2 or len(correspondences_2d3d) < 10:
        print(depth_pts.shape, model_pcd.shape)
        print("correspondences_2d3d.shape:",
              correspondences_2d3d.shape, flush=True)
        # raise ValueError("Too few correspondences are found!")
        raise Exception("Too few correspondences are found!")

    output_dict['depth_coords2d'] = depth_coords2d
    output_dict['lifted_points'] = depth_pts
    # output_dict['correspondences_2d3d'] = np.zeros(1)#correspondences_2d3d
    output_dict['correspondences_2d3d'] = correspondences_2d3d

    if rand_rgb_transformer is not None:
        output_dict['image'], _, _ = rand_rgb_transformer(output_dict['image'])
    # TO TENSOR
    output_dict['image'] = (output_dict['image'].astype(
        np.float32)/255.0).transpose([2, 0, 1])  # .mean(axis=0, keepdims=True) # 1,H,W
    output_dict['depth'] = output_dict['depth'].astype(np.float32)[
        None]  # 1,H,W

    return output_dict

def preprocess(
    input_dict,
    max_points,
    correspondence_radius,
    normalize_model=True,
    rand_transform_model=False, 
    rand_rgb_transformer=None,
    image_scale=None,
    crop_param=None,
    kp_3d_param=None,
    use_coords_as_3d_feat=False,
    find_2d3d_correspondence=True,
    
):
    output_dict = copy.deepcopy(input_dict)

    ################################### process 3D point clouds ###################################
    if use_coords_as_3d_feat:
        output_dict['model_point_features'] = output_dict['model_points'][:,:3]

    if (output_dict['model_points'].shape[0] > max_points):
        # if(output_dict['model_points'].shape[0] > 20000):
        idx = np.random.permutation(
            output_dict['model_points'].shape[0])[:max_points]
        print(idx, output_dict['model_points'].shape, flush=True)
        output_dict['model_points'] = output_dict['model_points'][idx]
        output_dict['model_point_features'] = output_dict['model_point_features'][idx]

    output_dict['original_RT'] = copy.deepcopy(output_dict['RT'])
    output_dict['original_model_points'] = output_dict['model_points']
    if normalize_model:
        points = output_dict['model_points']
        mean = points.mean(axis=0)
        scope = points.max(axis=0)-points.min(axis=0)
        points_normalize = (points-mean)/scope.max()
        # modify the extrinsic parameters
        output_dict['RT'][:, 3:] = output_dict['RT'][:, :3] @ mean[:,
                                                                   None] + output_dict['RT'][:, 3:]  # 3x3 @ 3x1 + 3x1
        output_dict['scale'] = scope.max()
        output_dict['model_points'] = points_normalize


    if rand_transform_model:
        points = output_dict['model_points']
        rand_quat = np.random.randn(1, 4)
        rand_quat = rand_quat/np.linalg.norm(rand_quat, axis=-1)
        rand_rot = quaternion.as_rotation_matrix(
            quaternion.from_float_array(rand_quat)).squeeze()  # 3x3
        output_dict['rand_model_points'] = (
            rand_rot@ points.T).T.astype(np.float32)
        output_dict['rand_RT'] = copy.deepcopy(output_dict['RT'])
        output_dict['rand_RT'][:, :3] = rand_rot
        output_dict['rand_RT'][:, 3] = 0

    ################################### process 2D images ###################################
    #crop image
    if crop_param is not None:# and output_dict['mask'].sum()>0:
        #without random cropping
        if not crop_param.rand_crop: 
            if crop_param.get("crop_with_init_pose", False):
                # bbox= output_dict.get('bbox', None)
                bbox=None
                output_dict['image'], output_dict['depth'], output_dict['mask'], output_dict['K'] = patch_crop(output_dict['image'], output_dict['depth'], mask=output_dict['ren_mask'],
                                K_old=output_dict['K'], margin_ratio=crop_param.margin_ratio, output_size=crop_param.output_size,  bbox=bbox
                                                )
            elif crop_param.get("crop_with_rand_bbox_shift", True): 
                bbox= output_dict.get('bbox', None)
                # offset_ratio= [truncated_normal(0,0.5,-1,1)*crop_param.max_rand_offset_ratio, truncated_normal(0,0.5,-1,1)*crop_param.max_rand_offset_ratio] 
                offset_ratio= [truncated_normal(0,0.33,-1,1)*1, truncated_normal(0,0.33,-1,1)*1] 
                output_dict['image'], output_dict['depth'], output_dict['mask'], output_dict['K'] = patch_crop(output_dict['image'], output_dict['depth'], mask=output_dict['mask'],
                                                    K_old=output_dict['K'], margin_ratio=crop_param.margin_ratio, output_size=crop_param.output_size, offset_ratio=offset_ratio, bbox=output_dict.get('bbox', None) 
                                                )
            else:
                bbox= output_dict.get('bbox', None)
                output_dict['image'], output_dict['depth'], output_dict['mask'], output_dict['K'] = patch_crop(output_dict['image'], output_dict['depth'], mask=output_dict['mask'],
                                                    K_old=output_dict['K'], margin_ratio=crop_param.margin_ratio, output_size=crop_param.output_size, bbox=output_dict.get('bbox', None) 
                                                )
        else:
            margin_ratio= truncated_normal(0.5, 0.5, 0, 1) *crop_param.max_rand_margin_ratio
            offset_ratio= [truncated_normal(0,0.5,-1,1)*crop_param.max_rand_offset_ratio, truncated_normal(0,0.5,-1,1)*crop_param.max_rand_offset_ratio] 
            output_dict['image'], output_dict['depth'], output_dict['mask'], output_dict['K'] = patch_crop(output_dict['image'], output_dict['depth'], mask=output_dict['mask'],
                                                K_old=output_dict['K'], margin_ratio=margin_ratio, output_size=crop_param.output_size, offset_ratio=offset_ratio,  bbox=output_dict.get('bbox', None) 
                                              )
            
    # rescale image
    if image_scale is not None:
        output_dict['image'] = cv2.resize(output_dict['image'],
                                          (output_dict['image'].shape[1]*image_scale,
                                           output_dict['image'].shape[0]*image_scale),
                                          interpolation=cv2.INTER_AREA)
        output_dict['depth'] = cv2.resize(output_dict['depth'],
                                          (output_dict['depth'].shape[1]*image_scale,
                                           output_dict['depth'].shape[0]*image_scale),
                                          interpolation=cv2.INTER_NEAREST)
        output_dict['K'][:2] = output_dict['K'][:2]*image_scale

    # lift depths
    depth = output_dict['depth'].squeeze()  # H,W
    depth_pts, depth_coords2d = mask_depth_to_point_cloud(
        depth != 0, depth, output_dict['K'])

    depth_pts = (output_dict['RT'][:, :3].T@(depth_pts.T - output_dict['RT']
                                             [:, 3:])).T / output_dict['scale']  # transformed to the model frame

    # find 2d-3d correspondences
    if find_2d3d_correspondence:
        tsfm = np.eye(4)
        tsfm[:3] = output_dict['RT']
        model_pcd = output_dict['model_points']
        correspondences_2d3d = get_correspondences(
            to_pcd(depth_pts), to_pcd(model_pcd),  correspondence_radius, K=5)
        if len(correspondences_2d3d.shape) < 2 or len(correspondences_2d3d) < 10:# or ( "mask" in output_dict and output_dict['mask'].sum()<10 ) :
            print(depth_pts.shape, model_pcd.shape)
            print("correspondences_2d3d.shape:",
                  correspondences_2d3d.shape, flush=True)
            raise Exception("Too few correspondences are found!")

        output_dict['depth_coords2d'] = depth_coords2d
        output_dict['lifted_points'] = depth_pts
        output_dict['correspondences_2d3d'] = correspondences_2d3d
    else:
        output_dict['depth_coords2d'] = depth_coords2d
        output_dict['lifted_points'] = depth_pts
        output_dict['correspondences_2d3d'] = np.zeros([10,2], dtype=np.int64) 


    if rand_rgb_transformer is not None:
        output_dict['image'], _, _ = rand_rgb_transformer(output_dict['image'])
    # TO TENSORs
    output_dict['image'] = (output_dict['image'].astype(
        np.float32)/255.0).transpose([2, 0, 1])  # .mean(axis=0, keepdims=True) # 1,H,W
    output_dict['depth'] = output_dict['depth'].astype(np.float32)[
        None]  # 1,H,W

    return output_dict

def batch_grid_subsampling_kpconv(points, batches_len, features=None, labels=None, sampleDl=0.1, max_p=0, verbose=0, random_grid_orient=True):
    """
    CPP wrapper for a grid subsampling (method = barycenter for points and features)
    """
    if (features is None) and (labels is None):
        s_points, s_len = cpp_subsampling.subsample_batch(points,
                                                          batches_len,
                                                          sampleDl=sampleDl,
                                                          max_p=max_p,
                                                          verbose=verbose)
        return torch.from_numpy(s_points), torch.from_numpy(s_len)

    elif (labels is None):
        s_points, s_len, s_features = cpp_subsampling.subsample_batch(points,
                                                                      batches_len,
                                                                      features=features,
                                                                      sampleDl=sampleDl,
                                                                      max_p=max_p,
                                                                      verbose=verbose)
        return torch.from_numpy(s_points), torch.from_numpy(s_len), torch.from_numpy(s_features)

    elif (features is None):
        s_points, s_len, s_labels = cpp_subsampling.subsample_batch(points,
                                                                    batches_len,
                                                                    classes=labels,
                                                                    sampleDl=sampleDl,
                                                                    max_p=max_p,
                                                                    verbose=verbose)
        return torch.from_numpy(s_points), torch.from_numpy(s_len), torch.from_numpy(s_labels)

    else:
        s_points, s_len, s_features, s_labels = cpp_subsampling.subsample_batch(points,
                                                                                batches_len,
                                                                                features=features,
                                                                                classes=labels,
                                                                                sampleDl=sampleDl,
                                                                                max_p=max_p,
                                                                                verbose=verbose)
        return torch.from_numpy(s_points), torch.from_numpy(s_len), torch.from_numpy(s_features), torch.from_numpy(s_labels)


def batch_neighbors_kpconv(queries, supports, q_batches, s_batches, radius, max_neighbors):
    """
    Computes neighbors for a batch of queries and supports, apply radius search
    :param queries: (N1, 3) the query points
    :param supports: (N2, 3) the support points
    :param q_batches: (B) the list of lengths of batch elements in queries
    :param s_batches: (B)the list of lengths of batch elements in supports
    :param radius: float32
    :return: neighbors indices
    """

    neighbors = cpp_neighbors.batch_query(
        queries, supports, q_batches, s_batches, radius=radius)
    # print("neighbors.shape" , neighbors.shape, queries.shape,flush=True)
    if max_neighbors > 0:
        return torch.from_numpy(neighbors[:, :max_neighbors])
    else:
        return torch.from_numpy(neighbors)


def collate_fn_descriptor(list_data, config, neighborhood_limits):
    ret = merge_batch(list_data)

    batched_points = torch.from_numpy(ret['model_points'])
    batched_lengths = torch.from_numpy(ret['batched_model_point_lengths'])
    batched_features = torch.from_numpy(ret['model_point_features'])

    if ret.get('rand_model_points', None) is not None:
        batched_rand_points = torch.from_numpy(ret['rand_model_points'])
        batched_rand_lengths = torch.from_numpy(
            ret['batched_rand_model_point_lengths'])

        batched_points = torch.cat(
            [batched_points, batched_rand_points], dim=0)
        batched_lengths = torch.cat(
            [batched_lengths, batched_rand_lengths], dim=0)
        batched_features = torch.cat(
            [batched_features, batched_features], dim=0)

    # Starting radius of convolutions
    r_normal = config.first_subsampling_dl * config.conv_radius
    # Starting layer
    layer_blocks = []
    layer = 0

    # Lists of inputs
    input_points = []
    input_neighbors = []
    input_pools = []
    input_upsamples = []
    input_batches_len = []
    timer = Timer()
    for block_i, block in enumerate(config.architecture):
        # Stop when meeting a global pooling or upsampling
        if 'global' in block or 'upsample' in block:
            break

        # Get all blocks of the layer
        if not ('pool' in block or 'strided' in block):
            layer_blocks += [block]
            if block_i < len(config.architecture) - 1 and not ('upsample' in config.architecture[block_i + 1]):
                continue

        # Convolution neighbors indices
        # *****************************

        if layer_blocks:
            # Convolutions are done in this layer, compute the neighbors with the good radius
            if np.any(['deformable' in blck for blck in layer_blocks[:-1]]):
                r = r_normal * config.deform_radius / config.conv_radius
            else:
                r = r_normal
            conv_i = batch_neighbors_kpconv(
                batched_points, batched_points, batched_lengths, batched_lengths, r, neighborhood_limits[layer])

        else:
            # This layer only perform pooling, no neighbors required
            conv_i = torch.zeros((0, 1), dtype=torch.int64)

        # Pooling neighbors indices
        # *************************

        if 'pool' in block or 'strided' in block:

            # New subsampling length
            dl = 2 * r_normal / config.conv_radius

            # Subsampled points
            pool_p, pool_b = batch_grid_subsampling_kpconv(
                batched_points, batched_lengths, sampleDl=dl)

            # Radius of pooled neighbors
            if 'deformable' in block:
                r = r_normal * config.deform_radius / config.conv_radius
            else:
                r = r_normal

            # Subsample indices
            pool_i = batch_neighbors_kpconv(
                pool_p, batched_points, pool_b, batched_lengths, r, neighborhood_limits[layer])

            # Upsample indices (with the radius of the next layer to keep wanted density)
            up_i = batch_neighbors_kpconv(
                batched_points, pool_p, batched_lengths, pool_b, 2 * r, neighborhood_limits[layer])

        else:
            # No pooling in the end of this layer, no pooling indices required
            pool_i = torch.zeros((0, 1), dtype=torch.int64)
            pool_p = torch.zeros((0, 3), dtype=torch.float32)
            pool_b = torch.zeros((0,), dtype=torch.int64)
            up_i = torch.zeros((0, 1), dtype=torch.int64)

        # Updating input lists
        input_points += [batched_points.float()]
        input_neighbors += [conv_i.long()]
        input_pools += [pool_i.long()]
        input_upsamples += [up_i.long()]
        input_batches_len += [batched_lengths]

        # New points for next layer
        batched_points = pool_p
        batched_lengths = pool_b

        # Update radius and reset blocks
        r_normal *= 2
        layer += 1
        layer_blocks = []

    ###############
    # Return inputs
    ###############
    dict_inputs = {
        "idx": ret["idx"],
        'model_points': input_points,
        'visibility': torch.from_numpy(ret['visibility']),
        'neighbors': input_neighbors,
        'pools': input_pools,
        'upsamples': input_upsamples,
        'model_point_features': batched_features.float(),
        'stack_lengths': input_batches_len,
        'image': torch.from_numpy(ret['image']),
        'depth': torch.from_numpy(ret['depth']),
        'mask': torch.from_numpy(ret['mask']),
        'ren_mask': torch.from_numpy(ret['ren_mask']),
        'K': torch.from_numpy(ret['K']),
        'RT': torch.from_numpy(ret['RT']),
        'original_RT': torch.from_numpy(ret['original_RT']),
        'POSECNN_RT': torch.from_numpy(ret.get('POSECNN_RT', np.zeros_like(ret['RT']) ) ),
        'rand_RT': torch.from_numpy(ret.get('rand_RT', np.zeros_like(ret['RT']))),
        # "lifted_points": torch.from_numpy(ret['lifted_points']),
        "lifted_points": [torch.from_numpy(d) for d in ret['lifted_points'] ] ,
        # 'depth_coords2d': torch.from_numpy(ret['depth_coords2d']),
        'depth_coords2d': [torch.from_numpy(d) for d in ret['depth_coords2d']],
        "correspondences_2d3d": torch.from_numpy(ret['correspondences_2d3d']),
        "original_model_points": torch.from_numpy(ret['original_model_points']),
        "class_name": ret['class_name'],
        "3d_keypoint_inds": torch.from_numpy(ret['3d_keypoint_inds']),
        "3d_keypoints": torch.from_numpy(ret['3d_keypoints'] ) 
    }

    return dict_inputs


def collate_fn_descriptor_deepim(list_data, config, neighborhood_limits):
    ret = merge_batch(list_data)

    batched_points = torch.from_numpy(ret['model_points'])
    batched_lengths = torch.from_numpy(ret['batched_model_point_lengths'])
    batched_features = torch.from_numpy(ret['model_point_features'])
    

    if ret.get('rand_model_points', None) is not None:
        # torch.from_numpy(np.concatenate(batched_points_list, axis=0))
        batched_rand_points = torch.from_numpy(ret['rand_model_points'])
        # torch.from_numpy(np.concatenate(batched_points_list, axis=0))
        batched_rand_lengths = torch.from_numpy(
            ret['batched_rand_model_point_lengths'])

        batched_points = torch.cat(
            [batched_points, batched_rand_points], dim=0)
        batched_lengths = torch.cat(
            [batched_lengths, batched_rand_lengths], dim=0)
        batched_features = torch.cat(
            [batched_features, batched_features], dim=0)

    # Starting radius of convolutions
    r_normal = config.first_subsampling_dl * config.conv_radius
    # Starting layer
    layer_blocks = []
    layer = 0

    # Lists of inputs
    input_points = []
    input_neighbors = []
    input_pools = []
    input_upsamples = []
    input_batches_len = []
    timer = Timer()
    for block_i, block in enumerate(config.architecture):
        # timer.tic()

        # Stop when meeting a global pooling or upsampling
        if 'global' in block or 'upsample' in block:
            break

        # Get all blocks of the layer
        if not ('pool' in block or 'strided' in block):
            layer_blocks += [block]
            if block_i < len(config.architecture) - 1 and not ('upsample' in config.architecture[block_i + 1]):
                continue

        # Convolution neighbors indices
        # *****************************

        if layer_blocks:
            # Convolutions are done in this layer, compute the neighbors with the good radius
            if np.any(['deformable' in blck for blck in layer_blocks[:-1]]):
                r = r_normal * config.deform_radius / config.conv_radius
            else:
                r = r_normal
            conv_i = batch_neighbors_kpconv(
                batched_points, batched_points, batched_lengths, batched_lengths, r, neighborhood_limits[layer])

        else:
            # This layer only perform pooling, no neighbors required
            conv_i = torch.zeros((0, 1), dtype=torch.int64)

        # Pooling neighbors indices
        # *************************

        # If end of layer is a pooling operation
        if 'pool' in block or 'strided' in block:

            # New subsampling length
            dl = 2 * r_normal / config.conv_radius

            # Subsampled points
            pool_p, pool_b = batch_grid_subsampling_kpconv(
                batched_points, batched_lengths, sampleDl=dl)

            # Radius of pooled neighbors
            if 'deformable' in block:
                r = r_normal * config.deform_radius / config.conv_radius
            else:
                r = r_normal

            # Subsample indices
            pool_i = batch_neighbors_kpconv(
                pool_p, batched_points, pool_b, batched_lengths, r, neighborhood_limits[layer])

            # Upsample indices (with the radius of the next layer to keep wanted density)
            up_i = batch_neighbors_kpconv(
                batched_points, pool_p, batched_lengths, pool_b, 2 * r, neighborhood_limits[layer])

        else:
            # No pooling in the end of this layer, no pooling indices required
            pool_i = torch.zeros((0, 1), dtype=torch.int64)
            pool_p = torch.zeros((0, 3), dtype=torch.float32)
            pool_b = torch.zeros((0,), dtype=torch.int64)
            up_i = torch.zeros((0, 1), dtype=torch.int64)

        # Updating input lists
        input_points += [batched_points.float()]
        input_neighbors += [conv_i.long()]
        input_pools += [pool_i.long()]
        input_upsamples += [up_i.long()]
        input_batches_len += [batched_lengths]

        # New points for next layer
        batched_points = pool_p
        batched_lengths = pool_b

        # Update radius and reset blocks
        r_normal *= 2
        layer += 1
        layer_blocks = []

        # timer.toc()
    ###############
    # Return inputs
    ###############
    dict_inputs = {
        "idx": ret["idx"],
        'model_points': input_points,
        'visibility': torch.from_numpy(ret['visibility']),
        'neighbors': input_neighbors,
        'pools': input_pools,
        'upsamples': input_upsamples,
        'model_point_features': batched_features.float(),
        'stack_lengths': input_batches_len,
        'image': torch.from_numpy(ret['image']),
        'depth': torch.from_numpy(ret['depth']),
        "ren_mask": torch.from_numpy(ret['ren_mask']),
        'K': torch.from_numpy(ret['K']),
        'RT': torch.from_numpy(ret['RT']),
        'original_RT': torch.from_numpy(ret['original_RT']),
        'rendered_RT': torch.from_numpy(ret['rendered_RT']) if ret.get('rendered_RT', None) is not None else None ,
        'POSECNN_RT': torch.from_numpy(ret.get('POSECNN_RT', np.zeros_like(ret['RT']) ) ),
        # TODO
        'rand_RT': torch.from_numpy(ret.get('rand_RT', np.zeros_like(ret['RT']))),
        # "lifted_points": torch.from_numpy(ret['lifted_points']),
        "lifted_points": [torch.from_numpy(d) for d in ret['lifted_points'] ] ,
        # 'depth_coords2d': torch.from_numpy(ret['depth_coords2d']),
        'depth_coords2d': [torch.from_numpy(d) for d in ret['depth_coords2d']],
        "correspondences_2d3d": torch.from_numpy(ret['correspondences_2d3d']),
        "original_model_points": torch.from_numpy(ret['original_model_points']),
        "class_name": ret['class_name'],
    }

    return dict_inputs


def calibrate_neighbors(dataset, config, collate_fn, keep_ratio=0.8, samples_threshold=2000):
    timer = Timer()
    last_display = timer.total_time

    # From config parameter, compute higher bound of neighbors number in a neighborhood
    hist_n = int(np.ceil(4 / 3 * np.pi * (config.deform_radius + 1) ** 3))
    neighb_hists = np.zeros((config.num_layers, hist_n), dtype=np.int32)

    # Get histogram of neighborhood sizes i in 1 epoch max.
    for i in range(len(dataset)):
        timer.tic()

        batched_input = collate_fn(
            [dataset[i]], config, neighborhood_limits=[hist_n] * 5)
        # update histogram
        counts = [torch.sum(neighb_mat < neighb_mat.shape[0], dim=1).numpy()
                  for neighb_mat in batched_input['neighbors']]
        
        hists = [np.bincount(c, minlength=hist_n)[:hist_n] for c in counts]
        neighb_hists += np.vstack(hists)
        timer.toc()

        if timer.total_time - last_display > 0.1:
            last_display = timer.total_time
            print(f"Calib Neighbors {i:08d}: timings {timer.total_time:4.2f}s")

        if np.min(np.sum(neighb_hists, axis=1)) > samples_threshold:
            break

    cumsum = np.cumsum(neighb_hists.T, axis=0)
    percentiles = np.sum(cumsum < (keep_ratio * cumsum[hist_n - 1, :]), axis=0)

    neighborhood_limits = percentiles
    print('\n')

    return neighborhood_limits


def get_dataloader(dataset, kpconv_config, batch_size=1, num_workers=4, shuffle=True, sampler=None, neighborhood_limits=None):
    if neighborhood_limits is None:
        # neighborhood_limits = calibrate_neighbors(dataset, dataset.config, collate_fn=collate_fn_descriptor)
        neighborhood_limits = calibrate_neighbors(
            dataset, kpconv_config, collate_fn=collate_fn_descriptor)
    print("neighborhood:", neighborhood_limits)
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        # https://discuss.pytorch.org/t/supplying-arguments-to-collate-fn/25754/4
        collate_fn=partial(collate_fn_descriptor, config=kpconv_config,
                           neighborhood_limits=neighborhood_limits),
        sampler=sampler,
        drop_last=False
    )
    return dataloader, neighborhood_limits

def get_dataloader_deepim(dataset, kpconv_config, batch_size=1, num_workers=4, shuffle=True, sampler=None, neighborhood_limits=None):
    if neighborhood_limits is None:
        neighborhood_limits = calibrate_neighbors(
            dataset, kpconv_config, collate_fn=collate_fn_descriptor_deepim)
    print("Neighborhood:", neighborhood_limits)
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        # https://discuss.pytorch.org/t/supplying-arguments-to-collate-fn/25754/4
        collate_fn=partial(collate_fn_descriptor_deepim, config=kpconv_config,
                           neighborhood_limits=neighborhood_limits),
        sampler=sampler,
        drop_last=False
    )
    return dataloader, neighborhood_limits

if __name__ == '__main__':
    pass


================================================
FILE: data/transforms.py
================================================
import numpy as np
import random
import torch
import torchvision
from torchvision.transforms import functional as F
import cv2
from PIL import Image


class Compose(object):

    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, img, kpts=None, mask=None):
        for t in self.transforms:
            img, kpts, mask = t(img, kpts, mask)
        return img, kpts, mask

    def __repr__(self):
        format_string = self.__class__.__name__ + "("
        for t in self.transforms:
            format_string += "\n"
            format_string += "    {0}".format(t)
        format_string += "\n)"
        return format_string


class ToTensor(object):

    def __call__(self, img, kpts, mask):
        return np.asarray(img).astype(np.float32) / 255., kpts, mask


class Normalize(object):

    def __init__(self, mean, std, to_bgr=True):
        self.mean = mean
        self.std = std
        self.to_bgr = to_bgr

    def __call__(self, img, kpts, mask):
        img -= self.mean
        img /= self.std
        if self.to_bgr:
            img = img.transpose(2, 0, 1).astype(np.float32)
        return img, kpts, mask


class ColorJitter(object):

    def __init__(self,
                 brightness=None,
                 contrast=None,
                 saturation=None,
                 hue=None,
                 ):
        self.color_jitter = torchvision.transforms.ColorJitter(
            brightness=brightness,
            contrast=contrast,
            saturation=saturation,
            hue=hue,)

    def __call__(self, image, kpts, mask):
        image = np.asarray(self.color_jitter(Image.fromarray(np.ascontiguousarray(image, np.uint8))))
        return image, kpts, mask


class RandomBlur(object):

    def __init__(self, prob=0.5):
        self.prob = prob

    def __call__(self, image, kpts, mask):
        if random.random() < self.prob:
            sigma = np.random.choice([3, 5, 7, 9])
            image = cv2.GaussianBlur(image, (sigma, sigma), 0)
        return image, kpts, mask


def make_transforms(cfg, is_train):
    if is_train is True:
        transform = Compose(
            [
                RandomBlur(0.5),
                ColorJitter(0.1, 0.1, 0.05, 0.05),
                # ToTensor(),
                # Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
            ]
        )
    else:
        transform = Compose(
            [
                # ToTensor(),
                # Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
            ]
        )

    return transform


================================================
FILE: data/ycb/basic.py
================================================
import mmcv 
bop_ycb_idx2class={
        1: '002_master_chef_can', 
        2: '003_cracker_box',
        3: '004_sugar_box', 
        4: '005_tomato_soup_can',
        5: '006_mustard_bottle',
        6: '007_tuna_fish_can',
        7: '008_pudding_box', 
        8: '009_gelatin_box',
        9: '010_potted_meat_can', 
        10: '011_banana', 
        11: '019_pitcher_base', 
        12: '021_bleach_cleanser', 
        13: '024_bowl', 
        14: '025_mug', 
        15: '035_power_drill',
        16: '036_wood_block',
        17: '037_scissors', 
        18: '040_large_marker',
        19: '051_large_clamp',
        20: '052_extra_large_clamp',
        21: '061_foam_brick', 
    }
bop_ycb_class2idx = dict([[bop_ycb_idx2class[k],k ] for k in bop_ycb_idx2class.keys() ])


================================================
FILE: doc/prepare_data.md
================================================
# Data Preparation Tips
All the related data for data preparation can be downloaded [here](https://mycuhk-my.sharepoint.com/:f:/g/personal/1155139432_link_cuhk_edu_hk/EoXnZ96Tuy9PpYlZCvDN8vUBPdP1lP-PWQWiZH2KtIQoaQ?e=lpE472). You could download them first and then follow the instructions below for data preparation. 


## Download Datasets 
First, the following dataset need to be downloaded and extracted to the folder *EXPDATA/* 

[LINEMOD](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155139432_link_cuhk_edu_hk/EYFaYrk0kcdBgC6WMtLJqP0B9Ar0_Nff9qhI2Cs95qDbdA?e=yYxexC)

[LINEMOD_OCC_TEST](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155139432_link_cuhk_edu_hk/EUKcRnwyy9RGu2ASwA3QDXsBnMRrFP-U4X4Eqq-g_MhmIQ?e=hv6H2s)

## Synthetic Data Generation

The preprocessed data following [DeepIM](https://github.com/liyi14/mx-DeepIM) and [PVNet](https://github.com/zju3dv/pvnet-rendering) can be downloaded from [LM6d_converted](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155139432_link_cuhk_edu_hk/EYFaYrk0kcdBgC6WMtLJqP0B9Ar0_Nff9qhI2Cs95qDbdA) and [raw_data](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155139432_link_cuhk_edu_hk/ESSFXi_7qs1AgNmty7_9y4AB8ffFsGJWOC3ikgD5BIeXHQ?e=qOmvds). 
After downloading, you should put the downloaded files into the folder *EXPDATA/* (lying in the repository's root directory). 
To create occluded objects during training, we follow [PVNet](https://github.com/zju3dv/pvnet-rendering) to randomly create occlusions. 
You could run the following scripts to transform the data format for our dataloader. 
```
    bash scripts/run_dataformatter.sh
```
The command above will automatically save the formatted data into *EXPDATA/*. 

## Download the Object CAD Models
You also need to download the [object models](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155139432_link_cuhk_edu_hk/EQScZuLrkPNPmN4eO3kePaUBjOe92EvbKb7kGJk2vKz-bA?e=8McAdh) and put the extracted folder *models* into *./EXPDATA/LM6d_converted/. 

## Download Background Images
[Pascal VOC 2012](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar) need to be downloaded to folder *EXPDATA/*. These images will be necessary for the random background generation for training. 

## Download Initial Poses 
The initial poses estimated by PoseCNN and PVNet can be downloaded from [here](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155139432_link_cuhk_edu_hk/EQh5y0M_zHVMnbVszjEviCUBNAX_22MFN26Msa48XlJ5MQ?e=rfhT7k). 
The initial pose folder also should be put into the folder  *EXPDATA/*

## Generate the Information Files
Run the following script to generate the info files, which is put into the folder *EXPDATA/data_info/*

```
bash scripts/run_datainfo_generation.sh
```


After the the data preparation, the expected directory structure should be 


```
./EXPDATA
    |──LM6d_converted 
    |        |──LM6d_refine 
    |        |──LM6d_refine_syn
    |        └──models
    |──LINEMOD
    |        └──fuse_formatted
    |──lmo
    |──VOCdevkit
    |──raw_data
    |──init_poses
    └──data_info
```


================================================
FILE: docker/Dockerfile
================================================
FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04

RUN apt-key del 7fa2af80
RUN apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
RUN rm /etc/apt/sources.list.d/cuda.list
RUN rm /etc/apt/sources.list.d/nvidia-ml.list

# Dependencies for glvnd and X11.
RUN apt-get update 
RUN  apt-get install -y -qq --no-install-recommends \
    libglvnd0 \
    libgl1 \
    libglx0 \
    libegl1 \
    libxext6 \
    libx11-6 \
  && rm -rf /var/lib/apt/lists/*
# Env vars for the nvidia-container-runtime.
ENV NVIDIA_VISIBLE_DEVICES all
ENV NVIDIA_DRIVER_CAPABILITIES graphics,utility,compute

#env vars for cuda
ENV CUDA_HOME /usr/local/cuda

#install miniconda
RUN apt-get update --fix-missing && \
    apt-get install -y wget bzip2 ca-certificates curl git && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

RUN wget --quiet https://mirrors.tuna.tsinghua.edu.cn/anaconda/miniconda/Miniconda3-py37_4.9.2-Linux-x86_64.sh -O ~/miniconda.sh && \
    /bin/bash ~/miniconda.sh -b -p /opt/miniconda3 && \
    rm ~/miniconda.sh && \
    /opt/miniconda3/bin/conda clean -tipsy && \
    ln -s /opt/miniconda3/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \
    echo ". /opt/miniconda3/etc/profile.d/conda.sh" >> ~/.bashrc && \
    echo "conda activate base" >> ~/.bashrc && \
    echo "conda deactivate && conda activate py37" >> ~/.bashrc

#https://blog.csdn.net/Mao_Jonah/article/details/89502380
COPY freeze.yml freeze.yml
RUN /opt/miniconda3/bin/conda env create -n py37 -f freeze.yml

# WORKDIR /tmp/
# COPY config.jupyter.tar config.jupyter.tar
# RUN tar -xvf config.jupyter.tar -C /root/

#install apex
ENV TORCH_CUDA_ARCH_LIST "6.0 6.2 7.0 7.2"
# make sure we don't overwrite some existing directory called "apex"
WORKDIR /tmp/unique_for_apex
# uninstall Apex if present, twice to make absolutely sure :)
RUN /opt/miniconda3/envs/py37/bin/pip3 uninstall -y apex || :
RUN /opt/miniconda3/envs/py37/bin/pip3 uninstall -y apex || :
# SHA is something the user can touch to force recreation of this Docker layer,
# and therefore force cloning of the latest version of Apex
RUN SHA=ToUcHMe git clone https://github.com/NVIDIA/apex.git
WORKDIR /tmp/unique_for_apex/apex
RUN /opt/miniconda3/envs/py37/bin/pip3 install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
#install pytorch3d 
# RUN /opt/miniconda3/envs/py37/bin/pip install pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/py37_cu102_pyt171/download.html
# RUN /opt/miniconda3/envs/py37/bin/pip install "git+https://github.com/facebookresearch/pytorch3d.git"
# RUN /opt/miniconda3/bin/conda install pytorch3d==0.5.0 -c pytorch3d -n py37


#other pkgs
RUN apt-get update \
  && apt-get install -y -qq --no-install-recommends \
  cmake build-essential vim xvfb unzip tmux psmisc  \
  libx11-dev libassimp-dev \
  mesa-common-dev freeglut3-dev \
  rsync \
  && apt-get clean \
  && rm -rf /var/lib/apt/lists/*

#create some directories
RUN mkdir -p /home/RNNPose

EXPOSE 8887 8888 8889 10000 10001 10002 
WORKDIR /home/RNNPose


================================================
FILE: docker/freeze.yml
================================================
name: py37_tmp
channels:
  - pytorch
  - pytorch3d
  - open3d-admin
  - bottler
  - iopath
  - fvcore
  - conda-forge
  - defaults
dependencies:
  - pytorch3d=0.5.0
  - _libgcc_mutex=0.1=main
  - _openmp_mutex=4.5=1_gnu
  - anyio=2.2.0=py37h06a4308_1
  - argon2-cffi=20.1.0=py37h27cfd23_1
  - async_generator=1.10=py37h28b3542_0
  - attrs=21.2.0=pyhd3eb1b0_0
  - babel=2.9.1=pyhd3eb1b0_0
  - backcall=0.2.0=pyhd3eb1b0_0
  - blas=1.0=mkl
  - bleach=3.3.0=pyhd3eb1b0_0
  - brotlipy=0.7.0=py37h27cfd23_1003
  - ca-certificates=2021.5.30=ha878542_0
  - certifi=2021.5.30=py37h89c1867_0
  - cffi=1.14.5=py37h261ae71_0
  - charset-normalizer=2.0.4=pyhd3eb1b0_0
  - cryptography=3.4.7=py37hd23ed53_0
  - cudatoolkit=10.2.89=h8f6ccaa_8
  - cycler=0.10.0=py37_0
  - dbus=1.13.18=hb2f20db_0
  - defusedxml=0.7.1=pyhd3eb1b0_0
  - entrypoints=0.3=py37_0
  - expat=2.4.1=h2531618_2
  - fontconfig=2.13.1=h6c09931_0
  - freetype=2.10.4=h5ab3b9f_0
  - fvcore=0.1.5.post20210825=py37
  - glib=2.68.2=h36276a3_0
  - gst-plugins-base=1.14.0=h8213a91_2
  - gstreamer=1.14.0=h28cd5cc_2
  - icu=58.2=he6710b0_3
  - idna=3.2=pyhd3eb1b0_0
  - importlib-metadata=3.10.0=py37h06a4308_0
  - importlib_metadata=3.10.0=hd3eb1b0_0
  - intel-openmp=2021.2.0=h06a4308_610
  - iopath=0.1.9=py37
  - ipykernel=5.3.4=py37h5ca1d4c_0
  - ipython=7.22.0=py37hb070fc8_0
  - ipython_genutils=0.2.0=pyhd3eb1b0_1
  - ipywidgets=7.6.3=pyhd3eb1b0_1
  - jedi=0.17.0=py37_0
  - jinja2=3.0.0=pyhd3eb1b0_0
  - joblib=1.0.1=pyhd3eb1b0_0
  - jpeg=9b=h024ee3a_2
  - json5=0.9.6=pyhd3eb1b0_0
  - jsonschema=3.2.0=py_2
  - jupyter=1.0.0=py37h89c1867_6
  - jupyter_client=6.1.12=pyhd3eb1b0_0
  - jupyter_console=6.4.0=pyhd8ed1ab_0
  - jupyter_core=4.7.1=py37h06a4308_0
  - jupyter_server=1.4.1=py37h06a4308_0
  - jupyterlab=3.0.16=pyhd8ed1ab_0
  - jupyterlab_pygments=0.1.2=py_0
  - jupyterlab_server=2.7.1=pyhd3eb1b0_0
  - jupyterlab_widgets=1.0.0=pyhd3eb1b0_1
  - kiwisolver=1.3.1=py37h2531618_0
  - kornia=0.5.3=pyhd8ed1ab_0
  - lcms2=2.12=h3be6417_0
  - ld_impl_linux-64=2.35.1=h7274673_9
  - libffi=3.3=he6710b0_2
  - libgcc-ng=9.3.0=h5101ec6_17
  - libgfortran-ng=7.5.0=ha8ba4b0_17
  - libgfortran4=7.5.0=ha8ba4b0_17
  - libgomp=9.3.0=h5101ec6_17
  - libpng=1.6.37=hbc83047_0
  - libsodium=1.0.18=h7b6447c_0
  - libstdcxx-ng=9.3.0=hd4cf53a_17
  - libtiff=4.2.0=h85742a9_0
  - libuuid=1.0.3=h1bed415_2
  - libuv=1.40.0=h7b6447c_0
  - libwebp-base=1.2.0=h27cfd23_0
  - libxcb=1.14=h7b6447c_0
  - libxml2=2.9.10=hb55368b_3
  - lz4-c=1.9.3=h2531618_0
  - markupsafe=2.0.1=py37h27cfd23_0
  - matplotlib=3.3.4=py37h06a4308_0
  - matplotlib-base=3.3.4=py37h62a2d02_0
  - mistune=0.8.4=py37h14c3975_1001
  - mkl=2021.2.0=h06a4308_296
  - mkl-service=2.3.0=py37h27cfd23_1
  - mkl_fft=1.3.0=py37h42c9631_2
  - mkl_random=1.2.1=py37ha9443f7_2
  - nbclassic=0.2.6=pyhd3eb1b0_0
  - nbclient=0.5.3=pyhd3eb1b0_0
  - nbconvert=6.0.7=py37_0
  - nbformat=5.1.3=pyhd3eb1b0_0
  - ncurses=6.2=he6710b0_1
  - nest-asyncio=1.5.1=pyhd3eb1b0_0
  - ninja=1.10.2=hff7bd54_1
  - notebook=6.4.0=py37h06a4308_0
  - numpy=1.20.2=py37h2d18471_0
  - numpy-base=1.20.2=py37hfae3a4d_0
  - nvidiacub=1.10.0=0
  - olefile=0.46=py37_0
  - open3d=0.13.0=py37_0
  - openssl=1.1.1k=h7f98852_0
  - packaging=20.9=pyhd3eb1b0_0
  - pandas=1.2.4=py37h2531618_0
  - pandoc=2.12=h06a4308_0
  - pandocfilters=1.4.3=py37h06a4308_1
  - parso=0.8.2=pyhd3eb1b0_0
  - pcre=8.44=he6710b0_0
  - pexpect=4.8.0=pyhd3eb1b0_3
  - pickleshare=0.7.5=pyhd3eb1b0_1003
  - pillow=8.2.0=py37he98fc37_0
  - pip=21.1.2=py37h06a4308_0
  - plyfile=0.7.4=pyhd8ed1ab_0
  - portalocker=2.3.0=py37h06a4308_0
  - prometheus_client=0.11.0=pyhd3eb1b0_0
  - prompt-toolkit=3.0.17=pyh06a4308_0
  - prompt_toolkit=3.0.17=hd3eb1b0_0
  - ptyprocess=0.7.0=pyhd3eb1b0_2
  - pycparser=2.20=py_2
  - pygments=2.9.0=pyhd3eb1b0_0
  - pyopenssl=20.0.1=pyhd3eb1b0_1
  - pyparsing=2.4.7=pyhd3eb1b0_0
  - pyqt=5.9.2=py37h05f1152_2
  - pyrsistent=0.17.3=py37h7b6447c_0
  - pysocks=1.7.1=py37_1
  - python=3.7.10=h12debd9_4
  - python-dateutil=2.8.1=pyhd3eb1b0_0
  - python_abi=3.7=1_cp37m
  - pytorch=1.7.1=py3.7_cuda10.2.89_cudnn7.6.5_0
  - pytz=2021.1=pyhd3eb1b0_0
  - pyzmq=20.0.0=py37h2531618_1
  - qt=5.9.7=h5867ecd_1
  - qtconsole=5.1.1=pyhd8ed1ab_0
  - qtpy=1.10.0=pyhd8ed1ab_0
  - readline=8.1=h27cfd23_0
  - requests=2.26.0=pyhd3eb1b0_0
  - scikit-learn=0.24.2=py37ha9443f7_0
  - scipy=1.6.2=py37had2a1c9_1
  - send2trash=1.5.0=pyhd3eb1b0_1
  - setuptools=52.0.0=py37h06a4308_0
  - sip=4.19.8=py37hf484d3e_0
  - six=1.16.0=pyhd3eb1b0_0
  - sniffio=1.2.0=py37h06a4308_1
  - sqlite=3.35.4=hdfb4753_0
  - tabulate=0.8.9=py37h06a4308_0
  - terminado=0.9.4=py37h06a4308_0
  - testpath=0.4.4=pyhd3eb1b0_0
  - threadpoolctl=2.1.0=pyh5ca1d4c_0
  - tk=8.6.10=hbc83047_0
  - torchvision=0.8.2=py37_cu102
  - tornado=6.1=py37h27cfd23_0
  - traitlets=5.0.5=pyhd3eb1b0_0
  - typing_extensions=3.7.4.3=pyha847dfd_0
  - wcwidth=0.2.5=py_0
  - webencodings=0.5.1=py37_1
  - wheel=0.36.2=pyhd3eb1b0_0
  - widgetsnbextension=3.5.1=py37_0
  - xz=5.2.5=h7b6447c_0
  - yacs=0.1.6=py_0
  - yaml=0.2.5=h7b6447c_0
  - zeromq=4.3.4=h2531618_0
  - zipp=3.4.1=pyhd3eb1b0_0
  - zlib=1.2.11=h7b6447c_3
  - zstd=1.4.9=haebb681_0
  - pip:
    - absl-py==0.13.0
    - addict==2.4.0
    - anykeystore==0.2
    - cachetools==4.2.2
    - cryptacular==1.5.5
    - cython==0.29.24
    - decorator==4.4.2
    - easydict==1.9
    - einops==0.3.0
    - fire==0.4.0
    - flow-vis==0.1
    - freetype-py==2.2.0
    - future==0.18.2
    - glumpy==1.2.0
    - google-auth==1.31.0
    - google-auth-oauthlib==0.4.4
    - greenlet==1.1.0
    - grpcio==1.38.0
    - hupper==1.10.3
    - imageio==2.9.0
    - llvmlite==0.36.0
    - loguru==0.5.3
    - markdown==3.3.4
    - networkx==2.5.1
    - numba==0.53.1
    - numpy-quaternion==2021.6.9.13.34.11
    - oauthlib==3.1.1
    - opencv-python==4.5.2.54
    - pastedeploy==2.1.1
    - pbkdf2==1.3
    - plaster==1.0
    - plaster-pastedeploy==0.7
    - protobuf==3.17.3
    - pyasn1==0.4.8
    - pyasn1-modules==0.2.8
    - pyassimp==4.1.3
    - pyglet==1.5.17
    - pyopengl==3.1.5
    - pyopengl-accelerate==3.1.5
    - pyramid==2.0
    - pyramid-mailer==0.15.1
    - python3-openid==3.2.0
    - pywavelets==1.1.1
    - pyyaml==5.4.1
    - repoze-sendmail==4.4.1
    - requests-oauthlib==1.3.0
    - rsa==4.7.2
    - scikit-image==0.18.1
    - sqlalchemy==1.4.18
    - tensorboard==2.5.0
    - tensorboard-data-server==0.6.1
    - tensorboard-plugin-wit==1.8.0
    - tensorboardx==2.2
    - termcolor==1.1.0
    - tifffile==2021.6.14
    - tqdm==4.61.1
    - transaction==3.0.1
    - transforms3d==0.3.1
    - translationstring==1.4
    - triangle==20200424
    - urllib3==1.26.5
    - velruse==1.1.1
    - venusian==3.0.0
    - vispy==0.6.6
    - webob==1.8.7
    - werkzeug==2.0.1
    - zope-deprecation==4.4.0
    - zope-interface==5.4.0
    - mmcv 
prefix: /opt/miniconda3/envs/py37_tmp


================================================
FILE: geometry/__init__.py
================================================


================================================
FILE: geometry/cholesky.py
================================================
# import tensorflow as tf
import torch #as tf
import numpy as np
# from utils.einsum import einsum
from torch import einsum


class _cholesky_solve(torch.autograd.Function):
    @staticmethod
    def forward(ctx, H, b):
        chol = torch.cholesky(H)
        xx = torch.cholesky_solve(b, chol)
        ctx.save_for_backward(chol, xx)

        return xx

    # see OptNet: https://arxiv.org/pdf/1703.00443.pdf
    @staticmethod
    def backward(ctx, dx):
        chol, xx = ctx.saved_tensors

        dz = torch.cholesky_solve(dx, chol)
        xs = torch.squeeze(xx,  -1)
        zs = torch.squeeze(dz, -1)
        dH = -einsum('...i,...j->...ij', xs, zs)

        return dH, dz
def cholesky_solve(H, b):
    return _cholesky_solve.apply(H,b)

def solve(H, b, max_update=1.0):
    """ Solves the linear system Hx = b, H > 0"""

    # small system, solve on cpu
    H = H.to(dtype=torch.float64) 
    b = b.to(dtype=torch.float64) 

    b = torch.unsqueeze(b, -1)
    x = cholesky_solve(H, b)

    # replaces nans and clip large updates
    bad_values = torch.isnan(x) 
    x = torch.where(bad_values, torch.zeros_like(x), x)
    x = torch.clamp(x, -max_update, max_update)

    x = torch.squeeze(x, -1)
    x = x.to(dtype=torch.float32) 
        
    return x


def __test__():
    import numpy as np 
    np.random.seed(0)
    M=np.random.uniform(size=(3,3))
    H=torch.tensor(M@M.transpose(-1,-2), requires_grad=True )

    b=torch.tensor(np.random.uniform(size=(3,) ), requires_grad=True )

    x= solve(H,b )

    x.backward(torch.ones_like(x) )


    print(f"H={H}, b={b}, x={x}, grad={H.grad, b.grad}")

if __name__=="__main__":
    __test__()


================================================
FILE: geometry/diff_render.py
================================================
import torch
import torch.nn as nn 
import torch.nn.functional as F 

import numpy 

from pytorch3d.renderer import (
    OpenGLPerspectiveCameras, look_at_view_transform, look_at_rotation,
    RasterizationSettings, MeshRenderer, MeshRasterizer, BlendParams,
    camera_position_from_spherical_angles, HardPhongShader, PointLights,FoVPerspectiveCameras, PerspectiveCameras, SoftPhongShader, Materials
) 
try:
    from pytorch3d.structures import Meshes, Textures
    use_textures = True
except:
    from pytorch3d.structures import Meshes
    from pytorch3d.renderer import TexturesVertex
    from pytorch3d.renderer import TexturesVertex as Textures

    use_textures = False

import pytorch3d.renderer.mesh.utils as utils
from pytorch3d.io import load_obj, load_ply, load_objs_as_meshes
from pytorch3d.renderer.mesh.rasterizer import Fragments

from plyfile import PlyData
from utils.furthest_point_sample import fragmentation_fps
import time


def rasterize(R, T, meshes, rasterizer, blur_radius=0):
    # It will automatically update the camera settings -> R, T in rasterizer.camera
    fragments = rasterizer(meshes, R=R, T=T)

    # Copy from pytorch3D source code, try if it is necessary to do gradient decent
    if blur_radius > 0.0:
        clipped_bary_coords = utils._clip_barycentric_coordinates(
            fragments.bary_coords
        )
        clipped_zbuf = utils._interpolate_zbuf(
            fragments.pix_to_face, clipped_bary_coords, meshes
        )
        fragments = Fragments(
            bary_coords=clipped_bary_coords,
            zbuf=clipped_zbuf,
            dists=fragments.dists,
            pix_to_face=fragments.pix_to_face,
        )
    return fragments

def set_bary_coords_to_nearest(bary_coords_):
    ori_shape = bary_coords_.shape
    exr = bary_coords_ * (bary_coords_ < 0)
    bary_coords_ = bary_coords_.view(-1, bary_coords_.shape[-1])
    arg_max_idx = bary_coords_.argmax(1)
    return torch.zeros_like(bary_coords_).scatter(1, arg_max_idx.unsqueeze(1), 1.0).view(*ori_shape) + exr

class MeshRendererWithDepth(nn.Module):
    def __init__(self, rasterizer, shader):
        super().__init__()
        self.rasterizer = rasterizer
        self.shader = shader

    def to(self, device):
        # Rasterizer and shader have submodules which are not of type nn.Module
        self.rasterizer.to(device)
        self.shader.to(device)
        return self

    def forward(self, meshes_world, **kwargs) -> torch.Tensor:
        fragments = self.rasterizer(meshes_world, **kwargs)
        images = self.shader(fragments, meshes_world, **kwargs)
        return images, fragments.zbuf

class DiffRender(nn.Module):
    def __init__(self, mesh_path, render_texture=False):
        super().__init__()

        # self.mesh = mesh
        if mesh_path.endswith('.ply'):
            verts, faces = load_ply(mesh_path)
            self.mesh = Meshes(verts=[verts], faces=[faces])
        elif mesh_path.endswith('.obj'):
            verts, faces,_ = load_obj(mesh_path)
            faces=faces.verts_idx
            self.mesh=load_objs_as_meshes([mesh_path])

        self.verts = verts
        self.faces = faces
        self.cam_opencv2pytch3d = torch.tensor(
                                [[-1,0,0,0],
                                [0,-1,0, 0],
                                [0,0, 1, 0],
                                [0,0, 0, 1]], dtype=torch.float32
                                )
        self.render_texture = render_texture

        #get patch infos
        self.pat_centers, self.pat_center_inds,  self.vert_frag_ids= fragmentation_fps(verts.detach().cpu().numpy(), 64)
        self.pat_centers = torch.from_numpy(self.pat_centers)
        self.pat_center_inds = torch.from_numpy(self.pat_center_inds)
        self.vert_frag_ids = torch.from_numpy(self.vert_frag_ids)[...,None] #Nx1


    def to(self, *args, **kwargs):
        if 'device' in kwargs.keys():
            device = kwargs['device']
        else:
            device = args[0]
        super().to(device)
        self.mesh = self.mesh.to(device)
        self.verts = self.verts.to(device)
        self.faces = self.faces.to(device)
        self.pat_centers = self.pat_centers.to(device)
        self.pat_center_inds = self.pat_center_inds.to(device)
        self.vert_frag_ids = self.vert_frag_ids.to(device)
        
        return self

    def get_patch_center_depths(self, T, K):
        #no need to pre-transform, as here we do not use pytorch3d rendering
        # T = self.cam_opencv2pytch3d.to(device=T.device)@T

        ## X_cam = X_world R + t
        R = T[...,:3,:3].transpose(-1,-2)
        t = T[...,:3,3]

        #render depths
        X_cam= (self.pat_centers@R+t) #BxKx3
        depth= X_cam[...,2:] #BxKx1
        x=X_cam@K.transpose(-1,-2)  #BxNx3
        x = x/x[...,-1:]
        img_coords= x[...,:2]


        return depth, img_coords 

    # Calculate interpolated maps -> [n, c, h, w]
    # face_memory.shape: [n_face, 3, c]
    @staticmethod
    def forward_interpolate(R, t, meshes, face_memory, rasterizer, blur_radius=0, mode='bilinear', return_depth=True):

        fragments = rasterize(R, t, meshes, rasterizer, blur_radius=blur_radius)

        # [n, h, w, 1, d]
        if mode == 'nearest':
            out_map = utils.interpolate_face_attributes(fragments.pix_to_face, set_bary_coords_to_nearest(fragments.bary_coords), face_memory)
        else:
            out_map = utils.interpolate_face_attributes(fragments.pix_to_face, fragments.bary_coords, face_memory)

        out_map = out_map.squeeze(dim=3)
        out_map = out_map.transpose(3, 2).transpose(2, 1)
        if return_depth:
            return out_map, fragments.zbuf.permute(0,3,1,2) # depth
        else:
            return out_map

    def render_mesh(self,  T, K, render_image_size, near=0.1, far=6, lights=(1,1,-1) ):
        B=T.shape[0]

        device = T.device
        T = self.cam_opencv2pytch3d.to(device=T.device)@T

        ## X_cam = X_world R + t
        R = T[...,:3,:3].transpose(-1,-2)
        t = T[...,:3,3]

        cameras = PerspectiveCameras(focal_length= torch.stack([K[:,0,0], K[:,1,1] ], dim=-1), 
            principal_point=K[:,:2,2],  R=R, T=t, image_size=[render_image_size]*B, in_ndc=False, device=device)
        lights = PointLights(device=device, location=[lights])

        raster_settings = RasterizationSettings(
            image_size=render_image_size,
            blur_radius=0.0,
            faces_per_pixel=1,
            bin_size=None, #0
            perspective_correct=True
        )
        materials = Materials(
            device=device,
            # specular_color=[[0.0, 1.0, 0.0]],
            shininess=0
        )
        renderer = MeshRendererWithDepth(
            rasterizer=MeshRasterizer(
                cameras=cameras, 
                raster_settings=raster_settings
            ),
            shader=SoftPhongShader(
                device=device, 
                cameras=cameras,
                lights=lights, 
                blend_params=BlendParams(1e-4, 1e-4, (0, 0, 0))
            )
        )
        image,depth =renderer(self.mesh, lights=lights, materials=materials)

        return image.permute(0,3,1,2)[:,:3], depth.permute(0,3,1,2) # to BCHW

    def render_offset_map(self,  T, K, render_image_size, near=0.1, far=6):
        yy, xx = torch.meshgrid(torch.arange(render_image_size[0], device=T.device), torch.arange(render_image_size[1], device=T.device) )
        # xx = xx.to(dtype=torch.float32)
        # yy = yy.to(dtype=torch.float32)
        coords_grid = torch.stack( [ xx.to(dtype=torch.float32),  yy.to(dtype=torch.float32)], dim=-1 )

        #no need to pre-transform, as here we do not use pytorch3d rendering
        # T = self.cam_opencv2pytch3d.to(device=T.device)@T

        ## X_cam = X_world R + t
        R = T[...,:3,:3].transpose(-1,-2)
        t = T[...,:3,3]

        #render depths
        X_cam= (self.pat_centers@R+t)
        x=X_cam@K.transpose(-1,-2)  #BxNx3
        x = x/x[...,-1:]

        offset = x[...,None,None,:2] - coords_grid #BxNx1x1x2-HxWx2
        
        return offset.permute(0,1,4,2,3) #BxNx2xHxW

    def forward(self, vert_attribute, T, K, render_image_size, near=0.1, far=6, mode='bilinear') :
        """
        Args:
            vert_attribute: (N,C)
            T: (B,3,4) or (B,4,4)
            K: (B,3,3)
            render_image_size (tuple): (h,w)
            near (float, optional):  Defaults to 0.1.
            far (int, optional): Defaults to 6.
        """

        if vert_attribute is None:
            return self.render_mesh(T, K, render_image_size, near=0.1, far=6 )
        if self.render_texture:
            ren_tex=self.render_mesh(T, K, render_image_size, near=0.1, far=6 )


        B=T.shape[0]
        face_attribute = vert_attribute[self.faces.long()]

        device = T.device

        T = self.cam_opencv2pytch3d.to(device=T.device)@T

        ## X_cam = X_world R + t
        R = T[...,:3,:3].transpose(-1,-2)
        t = T[...,:3,3]
        # t = -(R@T[...,:3,3:]).squeeze(-1)

        cameras = PerspectiveCameras(focal_length= torch.stack([K[:,0,0], K[:,1,1] ], dim=-1), 
            principal_point=K[:,:2,2], image_size=[render_image_size]*B, in_ndc=False, device=device)

        raster_settings = RasterizationSettings(
            image_size=render_image_size,
            blur_radius=0.0,
            faces_per_pixel=1,
            bin_size=None, #0
            perspective_correct=True
        )

        rasterizer = MeshRasterizer(
            cameras=cameras,
            raster_settings=raster_settings
        )

        out_map, out_depth=self.forward_interpolate(R, t, self.mesh, face_attribute, rasterizer, blur_radius=0, mode=mode)
        
        if not self.render_texture:
            return out_map, out_depth
        else:
            return torch.cat([ren_tex[0], out_map ], dim=1), out_depth

    def render_depth(self, T, K, render_image_size, near=0.1, far=6, mode='neareast'):
        """
        Args:
            T: (B,3,4) or (B,4,4)
            K: (B,3,3)
            render_image_size (tuple): (h,w)
            near (float, optional):  Defaults to 0.1.
            far (int, optional): Defaults to 6.
            mode: 'bilinear' or 'neareast'
        """

        B=T.shape[0]
        device = T.device

        T = self.cam_opencv2pytch3d.to(device=T.device)@T

        ## X_cam = X_world R + t
        R = T[...,:3,:3].transpose(-1,-2)
        t = T[...,:3,3]
        cameras = PerspectiveCameras(focal_length= torch.stack([K[:,0,0], K[:,1,1] ], dim=-1), 
            principal_point=K[:,:2,2], image_size=[render_image_size]*B, in_ndc=False, device=device)

        raster_settings = RasterizationSettings(
            image_size=render_image_size,
            blur_radius=0.0,
            faces_per_pixel=1,
            bin_size=0
        )

        rasterizer = MeshRasterizer(
            cameras=cameras,
            raster_settings=raster_settings
        )


        #render depths
        vert_depths= (self.verts@R+t).squeeze(0)[...,2:]
        face_depths = vert_depths[self.faces.long()]
        out_depth=self.forward_interpolate(R, t, self.mesh, face_depths, rasterizer, blur_radius=0, mode='nearest', return_depth=False)

        return out_depth


class DiffRendererWrapper(nn.Module):
    def __init__(self, obj_paths, device="cuda", render_texture=False ):
        super().__init__()

        self.renderers = []
        for obj_path in obj_paths:
            self.renderers.append( 
                DiffRender(obj_path, render_texture).to(device=device)
            )

        self.renderers=nn.ModuleList(self.renderers)
        self.cls2idx=None #could be updated outside

    def get_patch_center_depths(self, model_names, T, K):
        
        depths= []
        image_coords= []
        for b,_ in enumerate(model_names):
            model_idx = self.cls2idx[model_names[b]]
            depth, img_coord = self.renderers[model_idx].get_patch_center_depths(T[b:b+1], K )
            depths.append(depth)
            image_coords.append(img_coord)
        
        return torch.cat(depths, dim=0), torch.cat(image_coords, dim=0)


    def render_offset_map(self, model_names,  T, K, render_image_size, near=0.1, far=6):
        offsets= []
        for b,_ in enumerate(model_names):
            model_idx = self.cls2idx[model_names[b]]

            offset = self.renderers[model_idx].render_offset_map(T[b:b+1], K[b:b+1], render_image_size, near, far )
            offsets.append(offset)
        
        return torch.cat(offsets, dim=0)

    def render_pat_id(self, model_names,  T, K, render_image_size, near=0.1, far=6):

        pat_ids= []
        for b,_ in enumerate(model_names):
            model_idx = self.cls2idx[model_names[b]]
            
            pat_id,_ = self.renderers[model_idx].forward(self.renderers[model_idx].vert_frag_ids.float()+1,T[b:b+1], K[b:b+1], render_image_size, near, far, 'nearest' )
            pat_ids.append(pat_id-1) #+1 -1, set invalid parts as -1's  
        
        return torch.cat(pat_ids, dim=0)

    def render_depth(self, model_names,  T, K, render_image_size, near=0.1, far=6):

        depth_outputs= []
        for b,_ in enumerate(model_names):
            model_idx = self.cls2idx[model_names[b]]

            depth = self.renderers[model_idx].render_depth( T[b:b+1], K[b:b+1], render_image_size, near, far, 'nearest' )
            depth_outputs.append(depth)
        
        return torch.cat(depth_outputs, dim=0)

    def forward(self, model_names,  vert_attribute, T, K, render_image_size, near=0.1, far=6):

        map_outputs= []
        depth_outputs= []
        for b,_ in enumerate(model_names):
            model_idx = self.cls2idx[model_names[b]]

            feamap, depth= self.renderers[model_idx]( vert_attribute[b], T[b:b+1], K[b:b+1], render_image_size, near, far )

            map_outputs.append(feamap)
            depth_outputs.append(depth)
        return torch.cat(map_outputs, dim=0) , torch.cat(depth_outputs, dim=0)


================================================
FILE: geometry/diff_render_optim.py
================================================
## Speed optimized: sharing the rasterization among different rendering process

import torch
import torch.nn as nn 
import torch.nn.functional as F 

import numpy 

from pytorch3d.renderer import (
    OpenGLPerspectiveCameras, look_at_view_transform, look_at_rotation,
    RasterizationSettings, MeshRenderer, MeshRasterizer, BlendParams,
    camera_position_from_spherical_angles, HardPhongShader, PointLights,FoVPerspectiveCameras, PerspectiveCameras, SoftPhongShader, Materials 
) 
try:
    from pytorch3d.structures import Meshes, Textures
    use_textures = True
except:
    from pytorch3d.structures import Meshes
    from pytorch3d.renderer import TexturesVertex
    from pytorch3d.renderer import TexturesVertex as Textures

    use_textures = False

import pytorch3d.renderer.mesh.utils as utils
from pytorch3d.io import load_obj, load_ply, load_objs_as_meshes
from pytorch3d.renderer.mesh.rasterizer import Fragments

from plyfile import PlyData
from utils.furthest_point_sample import fragmentation_fps


def rasterize(R, T, meshes, rasterizer, blur_radius=0):
    # It will automatically update the camera settings -> R, T in rasterizer.camera
    fragments = rasterizer(meshes, R=R, T=T)

    # Copy from pytorch3D source code, try if it is necessary to do gradient decent
    if blur_radius > 0.0:
        clipped_bary_coords = utils._clip_barycentric_coordinates(
            fragments.bary_coords
        )
        clipped_zbuf = utils._interpolate_zbuf(
            fragments.pix_to_face, clipped_bary_coords, meshes
        )
        fragments = Fragments(
            bary_coords=clipped_bary_coords,
            zbuf=clipped_zbuf,
            dists=fragments.dists,
            pix_to_face=fragments.pix_to_face,
        )
    return fragments

def set_bary_coords_to_nearest(bary_coords_):
    ori_shape = bary_coords_.shape
    exr = bary_coords_ * (bary_coords_ < 0)
    bary_coords_ = bary_coords_.view(-1, bary_coords_.shape[-1])
    arg_max_idx = bary_coords_.argmax(1)
    return torch.zeros_like(bary_coords_).scatter(1, arg_max_idx.unsqueeze(1), 1.0).view(*ori_shape) + exr

class MeshRendererWithDepth(nn.Module):
    def __init__(self, rasterizer, shader):
        super().__init__()
        self.rasterizer = rasterizer
        self.shader = shader

    def to(self, device):
        # Rasterizer and shader have submodules which are not of type nn.Module
        self.rasterizer.to(device)
        self.shader.to(device)
        return self

    def forward(self, meshes_world, **kwargs) -> torch.Tensor:
        fragments = self.rasterizer(meshes_world, **kwargs)
        images = self.shader(fragments, meshes_world, **kwargs)
        return images, fragments.zbuf

class MeshRendererWithDepth_v2(nn.Module):
    def __init__(self, rasterizer, shader):
        super().__init__()
        self.rasterizer = rasterizer
        self.shader = shader

    # def to(self, device):
    def to(self, *args, **kwargs):
        if 'device' in kwargs.keys():
            device = kwargs['device']
        else:
            device = args[0]
        super().to(device)
        # Rasterizer and shader have submodules which are not of type nn.Module
        self.rasterizer.to(device)
        self.shader.to(device)
        return self

    def forward(self, meshes_world, **kwargs) -> torch.Tensor:
        if 'fragments' not in kwargs.keys() or kwargs['fragments'] is None: # sharing fragment results with others for speed, as the rasterizing process occupies most of time
            if 'fragments' in kwargs:
                del kwargs['fragments']
                
            fragments = self.rasterizer(meshes_world, **kwargs)
        else:
            fragments = kwargs['fragments']
            del kwargs['fragments']

        images = self.shader(fragments, meshes_world, **kwargs)
        return images, fragments.zbuf

class DiffRender(nn.Module):
    def __init__(self, mesh_path, render_texture=False):
        super().__init__()

        # self.mesh = mesh
        if mesh_path.endswith('.ply'):
            verts, faces = load_ply(mesh_path)
            self.mesh = Meshes(verts=[verts], faces=[faces])
        elif mesh_path.endswith('.obj'):
            verts, faces,_ = load_obj(mesh_path)
            # import pdb; pdb.set_trace()
            faces=faces.verts_idx
            self.mesh=load_objs_as_meshes([mesh_path])

        # self.mesh = Meshes(verts=verts, faces=faces, textures=None)
        self.verts = verts
        self.faces = faces
        # self.mesh = Meshes(verts=[verts], faces=[faces])
        # self.feature=feature
        self.cam_opencv2pytch3d = torch.tensor(
                                [[-1,0,0,0],
                                [0,-1,0, 0],
                                [0,0, 1, 0],
                                [0,0, 0, 1]], dtype=torch.float32
                                )
        self.render_texture = render_texture

        #get patch infos
        self.pat_centers, self.pat_center_inds,  self.vert_frag_ids= fragmentation_fps(verts.detach().cpu().numpy(), 64)
        self.pat_centers = torch.from_numpy(self.pat_centers)
        self.pat_center_inds = torch.from_numpy(self.pat_center_inds)
        self.vert_frag_ids = torch.from_numpy(self.vert_frag_ids)[...,None] #Nx1


    def to(self, *args, **kwargs):
        if 'device' in kwargs.keys():
            device = kwargs['device']
        else:
            device = args[0]
        super().to(device)
        # self.rasterizer.cameras = self.rasterizer.cameras.to(device)
        # self.face_memory = self.face_memory.to(device)
        self.mesh = self.mesh.to(device)
        self.verts = self.verts.to(device)
        self.faces = self.faces.to(device)
        self.pat_centers = self.pat_centers.to(device)
        self.pat_center_inds = self.pat_center_inds.to(device)
        self.vert_frag_ids = self.vert_frag_ids.to(device)

        
        # self.cam_opencv2pytch3d = self.cam_opencv2pytch3d.to(device=device)
        return self

    def get_patch_center_depths(self, T, K):

        #no need to pre-transform, as here we do not use pytorch3d rendering
        # T = self.cam_opencv2pytch3d.to(device=T.device)@T

        ## X_cam = X_world R + t
        R = T[...,:3,:3].transpose(-1,-2)
        t = T[...,:3,3]

        #render depths
        X_cam= (self.pat_centers@R+t) #BxKx3
        depth= X_cam[...,2:] #BxKx1
        x=X_cam@K.transpose(-1,-2)  #BxNx3
        x = x/x[...,-1:]
        img_coords= x[...,:2]


        return depth, img_coords 

    # Calculate interpolated maps -> [n, c, h, w]
    # face_memory.shape: [n_face, 3, c]
    @staticmethod
    def forward_interpolate(R, t, meshes, face_memory, rasterizer, blur_radius=0, mode='bilinear', return_depth=True):

        fragments = rasterize(R, t, meshes, rasterizer, blur_radius=blur_radius)

        # [n, h, w, 1, d]
        if mode == 'nearest':
            out_map = utils.interpolate_face_attributes(fragments.pix_to_face, set_bary_coords_to_nearest(fragments.bary_coords), face_memory)
        else:
            out_map = utils.interpolate_face_attributes(fragments.pix_to_face, fragments.bary_coords, face_memory)
        out_map = out_map.squeeze(dim=3)
        out_map = out_map.transpose(3, 2).transpose(2, 1)
        if return_depth:
            return out_map, fragments.zbuf.permute(0,3,1,2), fragments # depth
        else:
            return out_map, fragments

    def render_mesh(self,  T, K, render_image_size, near=0.1, far=6, lights=(1,1,-1), fragments=None ):
        B=T.shape[0]
        # face_attribute = vert_attribute[self.faces.long()]

        device = T.device
        T = self.cam_opencv2pytch3d.to(device=T.device)@T

        ## X_cam = X_world R + t
        R = T[...,:3,:3].transpose(-1,-2)
        t = T[...,:3,3]

        cameras = PerspectiveCameras(focal_length= torch.stack([K[:,0,0], K[:,1,1] ], dim=-1), 
            principal_point=K[:,:2,2],  R=R, T=t, image_size=[render_image_size]*B, in_ndc=False, device=device)
        lights = PointLights(device=device, location=[lights])

        raster_settings = RasterizationSettings(
            image_size=render_image_size,
            blur_radius=0.0,
            faces_per_pixel=1, #5,
            bin_size=None, #0
            perspective_correct=True
        )
        materials = Materials(
            device=device,
            # specular_color=[[0.0, 1.0, 0.0]],
            shininess=0
        )
        # renderer = MeshRendererWithDepth(
        renderer = MeshRendererWithDepth_v2(
            rasterizer=MeshRasterizer(
                cameras=cameras, 
                raster_settings=raster_settings
            ),
            shader=SoftPhongShader(
            # shader=SoftGouraudShader(
                device=device, 
                cameras=cameras,
                lights=lights, 
                blend_params=BlendParams(1e-4, 1e-4, (0, 0, 0))
            )
        )
        image,depth =renderer(self.mesh, lights=lights, materials=materials, fragments=fragments)

        return image.permute(0,3,1,2)[:,:3], depth.permute(0,3,1,2) # to BCHW

    def render_offset_map(self,  T, K, render_image_size, near=0.1, far=6):
        yy, xx = torch.meshgrid(torch.arange(render_image_size[0], device=T.device), torch.arange(render_image_size[1], device=T.device) )
        # xx = xx.to(dtype=torch.float32)
        # yy = yy.to(dtype=torch.float32)
        coords_grid = torch.stack( [ xx.to(dtype=torch.float32),  yy.to(dtype=torch.float32)], dim=-1 )

        #no need to pre-transform, as here we do not use pytorch3d rendering
        # T = self.cam_opencv2pytch3d.to(device=T.device)@T

        ## X_cam = X_world R + t
        R = T[...,:3,:3].transpose(-1,-2)
        t = T[...,:3,3]

        #render depths
        X_cam= (self.pat_centers@R+t)#.squeeze(0)[...,2:]
        x=X_cam@K.transpose(-1,-2)  #BxNx3
        x = x/x[...,-1:]

        offset = x[...,None,None,:2] - coords_grid #BxNx1x1x2-HxWx2
        
        return offset.permute(0,1,4,2,3) #BxNx2xHxW

    # def forward(self, face_attribute, T, K, render_image_size, near=0.1, far=6):
    def forward(self, vert_attribute, T, K, render_image_size, near=0.1, far=6, render_texture=None, mode='bilinear') :
        """
        Args:
            vert_attribute: (N,C)
            T: (B,3,4) or (B,4,4)
            K: (B,3,3)
            render_image_size (tuple): (h,w)
            near (float, optional):  Defaults to 0.1.
            far (int, optional): Defaults to 6.
        """

        # use default rendering settings 
        if render_texture is None:
            render_texture= self.render_texture 
            
        if vert_attribute is None:
            # only render the rgb image
            return self.render_mesh(T, K, render_image_size, near=0.1, far=6 )

        B=T.shape[0]
        face_attribute = vert_attribute[self.faces.long()]

        device = T.device

        T = self.cam_opencv2pytch3d.to(device=T.device)@T

        ## X_cam = X_world R + t
        R = T[...,:3,:3].transpose(-1,-2)
        t = T[...,:3,3]
        # t = -(R@T[...,:3,3:]).squeeze(-1)
        
        cameras = PerspectiveCameras(focal_length= torch.stack([K[:,0,0], K[:,1,1] ], dim=-1), 
            principal_point=K[:,:2,2], image_size=[render_image_size]*B, in_ndc=False, device=device)

        raster_settings = RasterizationSettings(
            image_size=render_image_size,
            blur_radius=0.0,
            faces_per_pixel=1,
            bin_size=None, #0
            perspective_correct=True
        )

        rasterizer = MeshRasterizer(
            cameras=cameras,
            raster_settings=raster_settings
        )

        # forward_interpolate(R, T, meshes, face_memory, rasterizer, blur_radius=0, mode='bilinear')
        out_map, out_depth, fragments=self.forward_interpolate(R, t, self.mesh, face_attribute, rasterizer, blur_radius=0, mode=mode)
        
        if not render_texture:
            return out_map, out_depth
        else:
            ren_tex=self.render_mesh(T, K, render_image_size, near=0.1, far=6, fragments=fragments  )

            #The first 3 channels contain the rendered textures
            return torch.cat([ren_tex[0], out_map ], dim=1), out_depth

    def render_depth(self, T, K, render_image_size, near=0.1, far=6, mode='neareast'):
        """
        Args:
            T: (B,3,4) or (B,4,4)
            K: (B,3,3)
            render_image_size (tuple): (h,w)
            near (float, optional):  Defaults to 0.1.
            far (int, optional): Defaults to 6.
            mode: 'bilinear' or 'neareast'
        """

        B=T.shape[0]
        device = T.device

        T = self.cam_opencv2pytch3d.to(device=T.device)@T

        ## X_cam = X_world R + t
        R = T[...,:3,:3].transpose(-1,-2)
        t = T[...,:3,3]
        cameras = PerspectiveCameras(focal_length= torch.stack([K[:,0,0], K[:,1,1] ], dim=-1), 
            principal_point=K[:,:2,2], image_size=[render_image_size]*B, in_ndc=False, device=device)

        raster_settings = RasterizationSettings(
            image_size=render_image_size,
            blur_radius=0.0,
            faces_per_pixel=1,
            bin_size=0
        )

        rasterizer = MeshRasterizer(
            cameras=cameras,
            raster_settings=raster_settings
        )


        #render depths
        vert_depths= (self.verts@R+t).squeeze(0)[...,2:]
        face_depths = vert_depths[self.faces.long()]
        out_depth, _ =self.forward_interpolate(R, t, self.mesh, face_depths, rasterizer, blur_radius=0, mode='nearest', return_depth=False)

        return out_depth

    def render_pointcloud(self, T, K, render_image_size, near=0.1, far=6):
        """
        Args:
            T: (B,3,4) or (B,4,4)
            K: (B,3,3)
            render_image_size (tuple): (h,w)
            near (float, optional):  Defaults to 0.1.
            far (int, optional): Defaults to 6.
            mode: 'bilinear' or 'neareast'
        """

        B=T.shape[0]
        device = T.device

        # T = self.cam_opencv2pytch3d.to(device=T.device)@T

        ## X_cam = X_world R + t
        R = T[...,:3,:3].transpose(-1,-2)
        t = T[...,:3,3]

        #render depths
        # vert_depths= (self.verts@R+t).squeeze(0)[...,2:]
        X_cam= (self.verts@R+t)#.squeeze(0)

        x=X_cam@K.transpose(-1,-2)  #BxNx3
        depth = x[...,-1]
        x = x/x[...,-1:]

        out = torch.zeros([1,1, *render_image_size], dtype=R.dtype, device=R.device)
        out[:, :, 
            torch.round(x[0, :, 1]).long().clamp(0, out.shape[2]-1),
            torch.round(x[0, :, 0]).long().clamp(0, out.shape[3]-1)] = depth 

        return out #1x1xHxW


class DiffRendererWrapper(nn.Module):
    def __init__(self, obj_paths, device="cuda", render_texture=False ):
        super().__init__()

        self.renderers = []
        for obj_path in obj_paths:
            self.renderers.append( 
                DiffRender(obj_path, render_texture).to(device=device)
            )

        self.renderers=nn.ModuleList(self.renderers)
        self.cls2idx=None #updated outside

    def get_patch_center_depths(self, model_names, T, K):
        
        depths= []
        image_coords= []
        for b,_ in enumerate(model_names):
            model_idx = self.cls2idx[model_names[b]]
            depth, img_coord = self.renderers[model_idx].get_patch_center_depths(T[b:b+1], K )
            depths.append(depth)
            image_coords.append(img_coord)
        
        return torch.cat(depths, dim=0), torch.cat(image_coords, dim=0)


    def render_offset_map(self, model_names,  T, K, render_image_size, near=0.1, far=6):
        offsets= []
        for b,_ in enumerate(model_names):
            model_idx = self.cls2idx[model_names[b]]

            offset = self.renderers[model_idx].render_offset_map(T[b:b+1], K[b:b+1], render_image_size, near, far )
            offsets.append(offset)
        
        return torch.cat(offsets, dim=0)

    def render_pat_id(self, model_names,  T, K, render_image_size, near=0.1, far=6):

        pat_ids= []
        for b,_ in enumerate(model_names):
            model_idx = self.cls2idx[model_names[b]]
            # face_pat_id = self.renderers[model_idx].vert_frag_ids[self.renderers[model_idx].faces.long()]
            
            pat_id,_ = self.renderers[model_idx].forward(self.renderers[model_idx].vert_frag_ids.float()+1,T[b:b+1], K[b:b+1], render_image_size, near, far, 'nearest' )
            pat_ids.append(pat_id-1) #+1 -1, set invalid parts as -1's  
        
        return torch.cat(pat_ids, dim=0)

    def render_depth(self, model_names,  T, K, render_image_size, near=0.1, far=6):
    
        depth_outputs= []
        for b,_ in enumerate(model_names):
            model_idx = self.cls2idx[model_names[b]]

            depth = self.renderers[model_idx].render_depth( T[b:b+1], K[b:b+1], render_image_size, near, far, 'nearest' )
            depth_outputs.append(depth)
        
        return torch.cat(depth_outputs, dim=0)
    def render_mesh(self, model_names,  T, K, render_image_size, near=0.1, far=6):

        outputs= []
        for b,_ in enumerate(model_names):
            model_idx = self.cls2idx[model_names[b]]

            img= self.renderers[model_idx].render_mesh( T[b:b+1], K[b:b+1], render_image_size, near, far, )[0]
            outputs.append(img)
        
        return torch.cat(outputs, dim=0)

    def render_pointcloud(self, model_names, T, K, render_image_size, near=0.1, far=6):
        outputs= []
        for b,_ in enumerate(model_names):
            model_idx = self.cls2idx[model_names[b]]
            depth = self.renderers[model_idx].render_pointcloud( T[b:b+1], K[b:b+1], render_image_size, near, far )
            outputs.append(depth)
        
        return torch.cat(outputs, dim=0)

    def forward(self, model_names,  vert_attribute, T, K, render_image_size, near=0.1, far=6, render_tex=False):

        map_outputs= []
        depth_outputs= []
        for b,_ in enumerate(model_names):
            model_idx = self.cls2idx[model_names[b]]

            feamap, depth= self.renderers[model_idx]( vert_attribute[b], T[b:b+1], K[b:b+1], render_image_size, near, far, render_texture=render_tex )

            map_outputs.append(feamap)
            depth_outputs.append(depth)
        return torch.cat(map_outputs, dim=0) , torch.cat(depth_outputs, dim=0)


================================================
FILE: geometry/einsum.py
================================================
# import tensorflow as torch
import torch as torch

import numpy as np
import re
import string

def einsum(equation, *inputs):

    equation = equation.replace(' ', '')
    # input_shapes = [x.get_shape() for x in list(inputs)]
    input_shapes = [x.shape for x in list(inputs)]
    match = re.match('^([a-zA-Z,.]+)(->[a-zA-Z.]*)?$', equation)
    if not match:
        raise ValueError('Indices have incorrect format: %s' % equation)

    input_axis_labels = match.group(1).split(',')
    output_axis_labels = match.group(2)[2:] if match.group(2) else None

    if len(input_shapes) != len(input_axis_labels):
        raise ValueError('Got %d arguments for equation "%s", expecting %d' %
                        (len(input_shapes), equation, len(input_axis_labels)))

    # Resolve Ellipsis
    # Assign axes labels for unspecified dimensions in inputs. Labels taken
    # from unused labels. Follow numpy einsum broadcasting conventions for
    # tensors of different length and unlabeled output.
    ellipsis_axes = ''
    if '...' in equation:
        unused = ''.join([c for c in string.ascii_lowercase
                        if c not in ''.join(input_axis_labels)])
        for i, ax in enumerate(input_axis_labels):
            if '...' in ax:
                parts = ax.split('...')
                if len(parts) != 2:
                    raise ValueError('Unable to resolve ellipsis. Excess number found.')

                # n = input_shapes[i].ndims - len(''.join(parts))
                n = len(input_shapes[i]) - len(''.join(parts))
                if n < 0:
                    raise ValueError('Ellipses lengths do not match.')
                if len(unused) < n:
                    raise ValueError(
                        'Unable to resolve ellipsis, too many distinct labels.')
                replace_axes = unused[-n:] if n > 0 else ''
                input_axis_labels[i] = input_axis_labels[i].replace('...',
                                                                    replace_axes)
                if len(replace_axes) > len(ellipsis_axes):
                    ellipsis_axes = replace_axes
                    
    equation = equation.replace('...', ellipsis_axes)
    out = torch.einsum(equation, *inputs)
    # torch.add_to_collection("checkpoints", out)
    return out


================================================
FILE: geometry/intrinsics.py
================================================
import torch
import numpy as np
# from utils.einsum import einsum
from .einsum import einsum

def intrinsics_vec_to_matrix(kvec):
    fx, fy, cx, cy = torch.unbind(kvec, dim=-1)
    z = torch.zeros_like(fx)
    o = torch.ones_like(fx)

    K = torch.stack([fx, z, cx, z, fy, cy, z, z, o], dim=-1)
    K = torch.reshape(K, list(kvec.shape)[:-1] + [3,3])
    return K

def intrinsics_matrix_to_vec(kmat):
    fx = kmat[..., 0, 0]
    fy = kmat[..., 1, 1]
    cx = kmat[..., 0, 2]
    cy = kmat[..., 1, 2]
    return torch.stack([fx, fy, cx, cy], dim=-1)

def update_intrinsics(intrinsics, delta_focal):
    kvec = intrinsics_matrix_to_vec(intrinsics)
    fx, fy, cx, cy = torch.unstack(kvec, num=4, axis=-1)
    df = torch.squeeze(delta_focal, -1)

    # update the focal lengths
    fx = torch.exp(df) * fx
    fy = torch.exp(df) * fy

    kvec = torch.stack([fx, fy, cx, cy], axis=-1)
    kmat = intrinsics_vec_to_matrix(kvec)
    return kmat

def rescale_depth(depth, downscale=4):
    depth = depth[:,None]
    new_shape = [depth.shape[-2]//downscale, depth.shape[-1]//downscale]
    depth = torch.nn.functional.interpolate(depth, new_shape, mode='nearest')
    return torch.squeeze(depth, dim=1)

def rescale_depth_and_intrinsics(depth, intrinsics, downscale=4):
    sc = torch.tensor([1.0/downscale, 1.0/downscale, 1.0], dtype=torch.float32, device=depth.device)
    intrinsics = einsum('...ij,i->...ij', intrinsics, sc)
    depth = rescale_depth(depth, downscale=downscale)
    return depth, intrinsics

def rescale_depths_and_intrinsics(depth, intrinsics, downscale=4):
    batch, frames, height, width = [depth.shape[i] for i in range(4)]
    depth = torch.reshape(depth, [batch*frames, height, width])
    depth, intrinsics = rescale_depth_and_intrinsics(depth, intrinsics, downscale)
    depth = torch.reshape(depth,
        [batch, frames]+list(depth.shape)[1:])
    return depth, intrinsics


================================================
FILE: geometry/projective_ops.py
================================================
import numpy as np
import torch 

# from utils.einsum import einsum
from torch import einsum


# MIN_DEPTH = 0.1
MIN_DEPTH = 0.01

def normalize_coords_grid(coords):
    """ normalize the coordinates to [-1,1]

    Args:
        coords: BxKxHxWx2
    """
    coords=coords.clone()
    B,K,H,W,_ = coords.shape

    coords[...,0] = 2*coords[...,0]/(W-1)-1
    coords[...,1] = 2*coords[...,1]/(H-1)-1

    return coords

def coords_grid(ref, homogeneous=True):
    """ grid of pixel coordinates """
    shape = ref.shape

    yy, xx = torch.meshgrid(torch.arange(shape[-2], device=ref.device), torch.arange(shape[-1], device=ref.device) )

    xx = xx.to(dtype=torch.float32)
    yy = yy.to(dtype=torch.float32)

    if homogeneous:
        coords = torch.stack([xx, yy, torch.ones_like(xx)], dim=-1)
    else:
        coords = torch.stack([xx, yy], dim=-1)

    new_shape = [1]*len(shape[:-2]) +  list(shape[-2:]) + [-1]
    coords = torch.reshape(coords, new_shape)

    tile = list(shape[:-2])+ [1,1,1]
    coords = coords.repeat(tile)
    return coords # BxKxHxWx2


def extract_and_reshape_intrinsics(intrinsics, shape=None):
    """ Extracts (fx, fy, cx, cy) from intrinsics matrix """

    fx = intrinsics[:, 0, 0]
    fy = intrinsics[:, 1, 1]
    cx = intrinsics[:, 0, 2]
    cy = intrinsics[:, 1, 2]

    if shape is not None:
        batch = list(fx.shape[:1])
        fillr = [1]*len(shape[1:]) 
        k_shape = batch+fillr

        fx = torch.reshape(fx, k_shape)
        fy = torch.reshape(fy, k_shape)
        cx = torch.reshape(cx, k_shape)
        cy = torch.reshape(cy, k_shape)

    return (fx, fy, cx, cy)


def backproject(depth, intrinsics, jacobian=False, depth_coords=None):
    """ backproject depth map to point cloud """
    #depth_coords: (BxKxHxWx2)

    if depth_coords is None:
        coords = coords_grid(depth, homogeneous=True)
        x, y, _ = torch.unbind(coords, axis=-1)
    else:
        x, y =  torch.unbind(depth_coords, axis=-1)

    x_shape = x.shape 
    
    fx, fy, cx, cy = extract_and_reshape_intrinsics(intrinsics, x_shape) #Bx1x1x1

    Z = depth  #BxKxHxW
    X = Z * (x - cx) / fx
    Y = Z * (y - cy) / fy 
    points = torch.stack([X, Y, Z], axis=-1)

    if jacobian:
        o = torch.zeros_like(Z) # used to fill in zeros

        # jacobian w.r.t (fx, fy) , of shape BxKxHxWx4x1
        jacobian_intrinsics = torch.stack([
            torch.stack([-X / fx], dim=-1),
            torch.stack([-Y / fy], dim=-1),
            torch.stack([o], dim=-1),
            torch.stack([o], dim=-1)], axis=-2)

        return points, jacobian_intrinsics
    
    return points
    # return points, coords


def project(points, intrinsics, jacobian=False):
    
    """ project point cloud onto image """
    X, Y, Z = torch.unbind(points, axis=-1)
    Z = torch.clamp(Z, min=MIN_DEPTH)

    x_shape = X.shape
    fx, fy, cx, cy = extract_and_reshape_intrinsics(intrinsics, x_shape)

    x = fx * (X / Z) + cx
    y = fy * (Y / Z) + cy
    coords = torch.stack([x, y], axis=-1)

    if jacobian:
        o = torch.zeros_like(x) # used to fill in zeros
        zinv1 = torch.where(Z <= MIN_DEPTH+.01, torch.zeros_like(Z), 1.0 / Z)
        zinv2 = torch.where(Z <= MIN_DEPTH+.01, torch.zeros_like(Z), 1.0 / Z**2)

        # jacobian w.r.t (X, Y, Z)
        jacobian_points = torch.stack([
            torch.stack([fx * zinv1, o, -fx * X * zinv2], axis=-1),
            torch.stack([o, fy * zinv1, -fy * Y * zinv2], axis=-1)], axis=-2)

        # jacobian w.r.t (fx, fy)
        jacobian_intrinsics = torch.stack([
            torch.stack([X * zinv1], axis=-1),
            torch.stack([Y * zinv1], axis=-1),], axis=-2)

        return coords, (jacobian_points, jacobian_intrinsics)

    return coords


================================================
FILE: geometry/se3.py
================================================
"""
SO3 and SE3 operations, exponentials and logarithms adapted from Sophus
"""

import numpy as np
import torch
from .einsum import einsum


MIN_THETA = 1e-4

def matdotv(A,b):
    return torch.squeeze(torch.matmul(A, torch.expand_dims(b, -1)), -1)

def hat(a):
    a1, a2, a3 = torch.split(a, [1,1,1], dim=-1)
    zz = torch.zeros_like(a1)

    ax = torch.stack([
        torch.cat([zz,-a3,a2], dim=-1),
        torch.cat([a3,zz,-a1], dim=-1),
        torch.cat([-a2,a1,zz], dim=-1)
    ], dim=-2)

    return ax
    

### quaternion functions ###

def quaternion_rotate_point(q, pt, eq=None):
    if eq is None:
        w, vec = torch.split(q, [1, 3], axis=-1)
        uv = 2*matdotv(hat(vec), pt)
        return pt + w*uv + matdotv(hat(vec), uv)
    else:
        w, vec = torch.split(q, [1, 3], axis=-1)
        uv1 = 2*einsum(eq, hat(w*vec), pt)
        uv2 = 2*einsum(eq, hat(vec), pt)
        return pt + uv1 + einsum(eq, hat(vec), uv2)

def quaternion_rotate_matrix(q, mat, eq=None):
    if eq is None:
        w, vec = torch.split(q, [1, 3], axis=-1)
        uv = 2*torch.matmul(hat(vec), mat)
        return mat + w*uv + torch.matmul(hat(vec), uv)
    else:
        w, vec = torch.split(q, [1, 3], axis=-1)
        uv1 = 2*einsum(eq, hat(w*vec), mat)
        uv2 = 2*einsum(eq, hat(vec), mat)
        return mat + uv1 + einsum(eq, hat(vec), uv2)

def quaternion_inverse(q):
    return q * [1, -1, -1, -1]

def quaternion_multiply(a, b):
    aw, ax, ay, az = torch.split(a, [1,1,1,1], axis=-1)
    bw, bx, by, bz = torch.split(b, [1,1,1,1], axis=-1)
    
    q = torch.concat([
        aw * bw - ax * bx - ay * by - az * bz,
        aw * bx + ax * bw + ay * bz - az * by,
        aw * by + ay * bw + az * bx - ax * bz,
        aw * bz + az * bw + ax * by - ay * bx,
    ], axis=-1)

    return q

def quaternion_to_matrix(q):
    w, x, y, z = torch.split(q, [1,1,1,1], axis=-1)

    r11 = 1 - 2 * y**2 - 2 * z**2
    r12 = 2 * x * y - 2 * w * z
    r13 = 2 * z * x + 2 * w * y

    r21 = 2 * x * y + 2 * w * z
    r22 = 1 - 2 * x**2 - 2 * z**2
    r23 = 2 * y * z - 2 * w * x

    r31 = 2 * z * x - 2 * w * y
    r32 = 2 * y * z + 2 * w * x
    r33 = 1 - 2 * x**2 - 2 * y**2
    
    R = torch.stack([
        torch.concat([r11,r12,r13], axis=-1),
        torch.concat([r21,r22,r23], axis=-1),
        torch.concat([r31,r32,r33], axis=-1)
    ], axis=-2)

    return R

def rotation_matrix_to_quaternion(R):
    Rxx, Ryx, Rzx = R[...,0,0], R[...,0,1], R[...,0,2]
    Rxy, Ryy, Rzy = R[...,1,0], R[...,1,1], R[...,1,2]
    Rxz, Ryz, Rzz = R[...,2,0], R[...,2,1], R[...,2,2]

    zz = torch.zeros_like(Rxx)
    k1 = torch.stack([Rxx-Ryy-Rzz, zz, zz, zz], axis=-1)
    k2 = torch.stack([Ryx+Rxy, Ryy-Rxx-Rzz, zz, zz], axis=-1)
    k3 = torch.stack([Rzx+Rxz, Rzy+Ryz, Rzz-Rxx-Ryy,zz], axis=-1)
    k4 = torch.stack([Ryz-Rzy, Rzx-Rxz, Rxy-Ryx, Rxx+Ryy+Rzz], axis=-1)

    K = torch.stack([k1, k2, k3, k4], axis=-2)
    eigvals, eigvecs = torch.linalg.eigh(K)

    x, y, z, w = torch.split(eigvecs[...,-1], [1,1,1,1], axis=-1)
    qvec = torch.concat([w, x, y, z], axis=-1)
    qvec /= torch.sqrt(torch.reduce_sum(qvec**2, axis=-1, keepdims=True))

    return qvec * torch.sign(w)

def so3_expm_and_theta(omega):
    """ omega in \so3 """
    theta_sq = torch.reduce_sum(omega**2, axis=-1)
    theta = torch.sqrt(theta_sq)
    half_theta = 0.5*theta

    ### small ###
    imag_factor = torch.where(theta>MIN_THETA, 
        torch.sin(half_theta) / (theta + 1e-12), 
        0.5 - (1.0/48.0)*theta_sq + (1.0/3840.0)*theta_sq*theta_sq)

    real_factor = torch.where(theta>MIN_THETA, torch.cos(half_theta),
        1.0 - (1.0/8.0)*theta_sq + (1.0/384.0)*theta_sq*theta_sq)

    qw = real_factor
    qx = imag_factor * omega[...,0]
    qy = imag_factor * omega[...,1]
    qz = imag_factor * omega[...,2]

    quat = torch.stack([qw, qx, qy, qz], axis=-1)
    return quat, theta
        
def so3_logm_and_theta(so3):
    w, vec = torch.split(so3, [1,3], axis=-1)
    squared_n = torch.reduce_sum(vec**2, axis=-1, keepdims=True)
    n = torch.sqrt(squared_n)

    two_atan_nbyw_by_n = torch.where(n<MIN_THETA,
        2/w - w*squared_n / (w*w*w),
        2*torch.atan(n/w) / (n+1e-12))

    theta = two_atan_nbyw_by_n * n
    omega = two_atan_nbyw_by_n * vec
    return omega, theta

def se3_expm(xi):
    """ xi in \se3 """
    tau, omega = torch.split(xi, [3, 3], axis=-1)
    q, theta = so3_expm_and_theta(omega)


    theta = theta[...,torch.newaxis,torch.newaxis]
    theta = torch.tile(theta, 
        torch.concat([torch.ones_like(torch.shape(q)[:-1]), [3,3]], axis=-1))

    theta_sq = theta * theta
    Omega = hat(omega)
    Omega_sq = torch.matmul(Omega, Omega)

    Vs = torch.eye(3, batch_shape=torch.shape(xi)[:-1]) + \
         (1-torch.cos(theta)) / (theta_sq + 1e-12) * Omega + \
         (theta - torch.sin(theta)) / (theta_sq*theta + 1e-12) * Omega_sq

    V = torch.where(theta<MIN_THETA, quaternion_to_matrix(q), Vs)
    t = matdotv(V, tau)
    return q, t

def se3_logm(so3, t):
    omega, theta = so3_logm_and_theta(so3)
    Omega = hat(omega)
    Omega_sq = torch.matmul(Omega, Omega)

    theta = theta[...,torch.newaxis]
    theta = torch.tile(theta, 
        torch.concat([torch.ones_like(torch.shape(omega)[:-1]), [3,3]], axis=-1))
    half_theta = 0.5*theta

    Vinv_approx = torch.eye(3, batch_shape=torch.shape(omega)[:-1]) - \
        0.5*Omega + (1.0/12.0) * Omega_sq

    Vinv_exact = torch.eye(3, batch_shape=torch.shape(omega)[:-1]) - \
        0.5*Omega + (1-theta*torch.cos(half_theta) / \
        (2*torch.sin(half_theta)+1e-12)) / (theta*theta + 1e-12) * Omega_sq

    Vinv = torch.where(theta<MIN_THETA, Vinv_approx, Vinv_exact)
    tau = matdotv(Vinv, t)

    upsilon = torch.concat([tau, omega], axis=-1)
    return upsilon


### matrix functions ###

def se3_matrix_inverse(G):
    """ Invert SE3 matrix """
    inp_shape = G.shape 
    G = torch.reshape(G, [-1, 4, 4])

    R, t = G[:, :3, :3], G[:, :3, 3:]
    R = R.permute(0, 2, 1)
    t = -torch.matmul(R, t)

    filler = torch.tensor([0.0, 0.0, 0.0, 1.0], device=G.device, dtype=G.dtype) 
    filler = torch.reshape(filler, [1, 1, 4])
    filler = filler.repeat([G.shape[0], 1, 1]) 

    Ginv = torch.cat([R, t], dim=-1)
    Ginv = torch.cat([Ginv, filler], dim=-2)
    return torch.reshape(Ginv, inp_shape)


def _se3_matrix_expm_grad(grad):
    grad_upsilon_omega = torch.stack([
        grad[..., 0, 3],
        grad[..., 1, 3],
        grad[..., 2, 3],
        grad[..., 2, 1] - grad[..., 1, 2],
        grad[..., 0, 2] - grad[..., 2, 0],
        grad[..., 1, 0] - grad[..., 0, 1]
    ], axis=-1)

    return grad_upsilon_omega

def _se3_matrix_expm_shape(op):
    return [op.inputs[0].get_shape().as_list()[:-1] + [4, 4]]


def _se3_matrix_expm(upsilon_omega):
    """ se3 matrix exponential se(3) -> SE(3), works for arbitrary batch dimensions
    - Note: gradient is overridden with _se3_matrix_expm_grad, which approximates 
    gradient for small upsilon_omega
    """

    eps=1e-12
    inp_shape = upsilon_omega.shape 
    out_shape = list(inp_shape)[:-1]+[4,4] 

    upsilon_omega = torch.reshape(upsilon_omega, [-1, 6])
    batch = upsilon_omega.shape[0]
    v, w = torch.split(upsilon_omega, [3, 3], dim=-1)

    theta_sq = torch.sum(w**2, dim=1 )
    theta_sq = torch.reshape(theta_sq, [-1, 1, 1])

    theta = torch.sqrt(theta_sq)
    theta_po4 = theta_sq * theta_sq

    wx = hat(w)
    wx_sq = torch.matmul(wx, wx)
    I = torch.eye(3, dtype=upsilon_omega.dtype, device=upsilon_omega.device).repeat([batch,1,1])

    ### taylor approximations ###
    R1 =  I + (1.0 - (1.0/6.0)*theta_sq + (1.0/120.0)*theta_po4) * wx + \
        (0.5 - (1.0/12.0)*theta_sq + (1.0/720.0)*theta_po4) * wx_sq
    
    V1 = I + (0.5 - (1.0/24.0)*theta_sq + (1.0/720.0)*theta_po4)*wx + \
        ((1.0/6.0) - (1.0/120.0)*theta_sq + (1.0/5040.0)*theta_po4)*wx_sq

    ### exact values ###
    R2 = I + (torch.sin(theta) / (theta+eps)) * wx +\
        ((1 - torch.cos(theta)) / (theta_sq+eps)) * wx_sq

    V2 = I + ((1 - torch.cos(theta)) / (theta_sq + eps)) * wx + \
        ((theta - torch.sin(theta))/(theta_sq*theta + eps)) * wx_sq

    # print(theta.shape, R1.shape, R2.shape, ">>>", flush=True)
    # R = torch.where(theta[:, 0, 0]<MIN_THETA, R1, R2)
    # V = torch.where(theta[:, 0, 0]<MIN_THETA, V1, V2)
    R = torch.where(theta<MIN_THETA, R1, R2)
    V = torch.where(theta<MIN_THETA, V1, V2)

    t = torch.matmul(V, v[...,None]) 

    fill = torch.tensor([0, 0, 0, 1], dtype=torch.float32, device=R.device)
    fill = torch.reshape(fill, [1, 1, 4])
    fill = fill.repeat([batch, 1, 1])

    G = torch.cat([R, t], dim=2)
    G = torch.cat([G, fill], dim=1)
    G = torch.reshape(G, out_shape)
    return G


class SE3_Matrix_Expm(torch.autograd.Function):
    
    @staticmethod
    def forward(ctx, upsilon_omega):
        G=_se3_matrix_expm(upsilon_omega)
        # ctx.save_for_backward(G)
        return G 

    @staticmethod
    def backward(ctx, grad_output):
        # result, = ctx.saved_tensors
        
        return _se3_matrix_expm_grad(grad_output)


def se3_matrix_expm(upsilon_omega):
    return SE3_Matrix_Expm.apply(upsilon_omega)


def se3_matrix_increment(G, upsilon_omega):
    """ Left increment of rigid body transformation: G = expm(xi) G"""
    dG = se3_matrix_expm(upsilon_omega)
    return torch.matmul(dG, G)

================================================
FILE: geometry/transformation.py
================================================
import torch  
import numpy as np

# from core.config import cfg
from config.default import get_cfg
from .se3 import *
from .intrinsics import *
from . import projective_ops as pops
from . import cholesky

from .einsum import einsum

cholesky_solve = cholesky.solve


MIN_DEPTH = 0.1
MAX_RESIDUAL = 250.0

# can use both matrix or quaternions to represent rotations
DEFAULT_INTERNAL = 'matrix'


def clip_dangerous_gradients(x):
    return x


def jac_local_perturb(pt, fill=False):

    X, Y, Z = torch.split(pt,[1,1,1], dim=-1)  # torch.split(pt, [1, 1, 1], axis=-1)
    o, i = torch.zeros_like(X), torch.ones_like(X)
    if fill:
        j1 = torch.cat([i,  o,  o, o], dim=-1)
        j2 = torch.cat([o,  i,  o, o], dim=-1)
        j3 = torch.cat([o,  o,  i, o], dim=-1)
        j4 = torch.cat([o, -Z,  Y, o], dim=-1)
        j5 = torch.cat([Z,  o, -X, o], dim=-1)
        j6 = torch.cat([-Y,  X,  o, o],dim=-1)
    else:
        j1 = torch.cat([i,  o,  o], dim=-1)
        j2 = torch.cat([o,  i,  o], dim=-1)
        j3 = torch.cat([o,  o,  i], dim=-1)
        j4 = torch.cat([o, -Z,  Y], dim=-1)
        j5 = torch.cat([Z,  o, -X], dim=-1)
        j6 = torch.cat([-Y,  X,  o],dim=-1)
    jac = torch.stack([j1, j2, j3, j4, j5, j6], dim=-1)
    return jac


def cond_transform(cond, T1, T2):
    """ Return T1 if cond, else T2 """

    if T1.internal == 'matrix':
        mat = torch.cond(cond, lambda: T1.matrix(), lambda: T2.matrix())
        T = T1.__class__(matrix=mat, internal=T1.internal)

    elif T1.internal == 'quaternion':
        so3 = torch.cond(cond, lambda: T1.so3, lambda: T2.so3)
        translation = torch.cond(cond, lambda: T1.translation,
                              lambda: T2.translation)
        T = T1.__class__(so3=so3, translation=translation,
                         internal=T1.internal)
    return T


class SE3:
    def __init__(self, upsilon=None, matrix=None, so3=None, translation=None, eq=None, internal=DEFAULT_INTERNAL):
        self.eq = eq
        self.internal = internal

        if internal == 'matrix':
            if upsilon is not None:
                self.G = se3_matrix_expm(upsilon)
            elif matrix is not None:
                self.G = matrix
        else:
            raise NotImplementedError 

    def __call__(self, pt, jacobian=False):
        """ Transform set of points """

        if self.internal == 'matrix':

            pt = torch.cat([pt, torch.ones_like(pt[..., :1])],
                        dim=-1)  # convert to homogenous
            pt = einsum(self.eq, self.G[..., :3, :], pt)
        else:
            raise NotImplementedError

        if jacobian:
            jacobian = jac_local_perturb(pt)
            return pt, jacobian

        return pt

    def __mul__(self, other):
        if self.internal == 'matrix':
            G = torch.matmul(self.G, other.G)
            return self.__class__(matrix=G, internal=self.internal)
        else:
            raise NotImplementedError

    def identity_(self):
        if self.internal == 'matrix':
            shape=self.G.shape
            self.G=torch.eye(4, device=self.G.device).repeat([*shape[:-2],1,1])
        else:
            raise NotImplementedError


    def increment(self, upsilon):
        if self.internal == 'matrix':
            G = se3_matrix_increment(self.G, upsilon)
            return self.__class__(matrix=G, internal=self.internal)
        else:
            raise NotImplementedError

    def concat(self, other, axis=0):
        if self.internal == 'matrix':
            G = torch.concat([self.G, other.G], axis=axis)
        else:
            raise NotImplementedError


    def copy(self, stop_gradients=False):

        if self.internal == 'matrix':
            if stop_gradients:
                # return self.__class__(matrix=torch.stop_gradient(self.G), internal=self.internal)
                return self.__class__(matrix=self.G.detach(), internal=self.internal)
            else:
                return self.__class__(matrix=self.G, internal=self.internal)

        else:
            raise NotImplementedError

    def to_vec(self):
        return torch.concat([self.so3, self.translation], axis=-1)

    def inv(self):
        if self.internal == 'matrix':
            Ginv = se3_matrix_inverse(self.matrix())
            return self.__class__(matrix=Ginv, internal=self.internal)
        else:
            raise NotImplementedError

    def adj(self):
        if self.internal == 'matrix':
            R = self.G[..., :3, :3]
            t = self.G[..., :3, 3]
            A11 = R
            A12 = torch.matmul(hat(t), R)
            A21 = torch.zeros_like(A11)
            A22 = R
        else:
            raise NotImplementedError


        Ax = torch.concat([
            torch.concat([A11, A12], axis=-1),
            torch.concat([A21, A22], axis=-1)
        ], axis=-2)

        return Ax

    def logm(self):
        return se3_logm(self.so3, self.translation)

    def shape(self):
        # return torch.shape(self.so3)[:-1]
        if self.internal == 'matrix':
            my_shape = self.G.shape  # torch.shape(self.G)
        else:
            raise NotImplementedError

        return (my_shape[0], my_shape[1])

    def matrix(self, fill=True):
        if self.internal == 'matrix':
            return self.G
        else:
            raise NotImplementedError
       

    def transform(self, depth, intrinsics, valid_mask=False, return3d=False):
        
        # pt = pops.backproject(depth, intrinsics)
        pt = pops.backproject(depth, intrinsics)
        pt_new = self.__call__(pt)
        coords = pops.project(pt_new, intrinsics)
        if return3d:
            return coords, pt_new
        if valid_mask:
            vmask = (pt[..., -1] > MIN_DEPTH) & (pt_new[..., -1] > MIN_DEPTH)
            # vmask = torch.cast(vmask, torch.float32)[..., torch.newaxis]
            # vmask = vmask.to(dtype=torch.float32)[..., None, :,:] #BxKx1xHxW
            vmask = vmask.to(dtype=torch.float32)[..., :, :, None]  # BxKx1xHxW
            return coords, vmask
        return coords

    def induced_flow(self, depth, intrinsics, valid_mask=False):
        coords0 = pops.coords_grid(depth, homogeneous=False)

        if valid_mask:
            coords1, vmask = self.transform(
                depth, intrinsics, valid_mask=valid_mask)
            return coords1 - coords0, vmask
        coords1 = self.transform(depth, intrinsics, valid_mask=valid_mask)
        return coords1 - coords0

    def depth_change(self, depth, intrinsics):
        pt = pops.backproject(depth, intrinsics)
        pt_new = self.__call__(pt)
        return pt_new[..., -1] - pt[..., -1]
    
    def identity(self):
        """ Push identity transformation to start of collection """
        batch, frames = self.shape()
        if self.internal == 'matrix':
            # I = torch.eye(4, batch_shape=[batch, 1])
            I = torch.eye(4, dtype=self.G.dtype, device=self.G.device).repeat(
                [batch, 1, 1, 1])
            # return self.__class__(matrix=I, internal=self.internal, eq=self.eq)
            return self.__class__(matrix=I, internal=self.internal, eq=self.eq)
        else:
            raise NotImplementedError


class SE3Sequence(SE3):
    """ Stores collection of SE3 objects """

    def __init__(self, upsilon=None, matrix=None, so3=None, translation=None, eq= "aijk,ai...k->ai...j",internal=DEFAULT_INTERNAL):
        super().__init__(
            upsilon, matrix, so3, translation, internal=internal, eq=eq)

        # self.eq = "aijk,ai...k->ai...j"
    def __call__(self, pt, inds=None, jacobian=False):
        if self.internal == 'matrix':
            return super().__call__(pt, jacobian=jacobian)
        else:
            raise NotImplementedError


    def gather(self, inds):
        if self.internal == 'matrix':
            G = torch.index_select(self.G, index=inds, dim=1)
            return SE3Sequence(matrix=G, internal=self.internal)
        else:
            raise NotImplementedError

    # def append_identity(self):
    #     """ Push identity transformation to start of collection """
    #     batch, frames = self.shape()
    #     if self.internal == 'matrix':
    #         # I = torch.eye(4, batch_shape=[batch, 1])
    #         I = torch.eye(4, dtype=self.G.dtype, device=self.G.device).repeat(
    #             [batch, 1, 1, 1])

    #         G = torch.cat([I, self.G], dim=1)
    #         return SE3Sequence(matrix=G, internal=self.internal)
    #     else:
    #         raise NotImplementedError

    def reprojction_optim(self,
                       target,
                       weight,
                       depth,
                       intrinsics,
                       num_iters=2,
                       depth_img_coords=None
                       ):

        target = clip_dangerous_gradients(target).to(dtype=torch.float64)
        weight = clip_dangerous_gradients(weight).to(dtype=torch.float64)

        X0 = pops.backproject(depth, intrinsics, depth_coords=depth_img_coords)
        w = weight[..., None] 

        lm_lmbda = get_cfg("LM").LM_LMBDA
        ep_lmbda = get_cfg("LM").EP_LMBDA

        T = self.copy(stop_gradients=False)
        for i in range(num_iters):
            ### compute the jacobians of the transformation ###
            X1, jtran = T(X0, jacobian=True)
            x1, (jproj, jkvec) = pops.project(X1, intrinsics, jacobian=True)

            v = (X0[..., -1] > MIN_DEPTH) & (X1[..., -1] > MIN_DEPTH)
            # v = v.to(dtype=torch.float32)[..., None, None]
            v = v.to(dtype=torch.float64)[..., None, None]

            ### weighted gauss-newton update ###
            J = einsum('...ij,...jk->...ik', jproj.to(dtype=torch.float64), jtran.to(dtype=torch.float64 ))  

            H = einsum('ai...j,ai...k->aijk', v*w*J, J)
            b = einsum('ai...j,ai...->aij', v*w*J, target-x1)

            ### add dampening and apply increment ###
            H += ep_lmbda*torch.eye(6, dtype=H.dtype, device=H.device) + lm_lmbda*H*torch.eye(6,dtype=H.dtype, device=H.device)
            try:
                delta_upsilon = cholesky_solve(H, b)
            except:
                # print(w.shape,v.shape, w.mean(), v.mean(),H,b, '!!!!')
                raise
            T = T.increment(delta_upsilon)

        # update
        if self.internal == 'matrix':
            self.G = T.matrix()
            T = SE3Sequence(
                matrix=T.matrix(), internal=self.internal)
        else:
            raise NotImplementedError

        return T


    def transform(self, depth, intrinsics, valid_mask=False, return3d=False):
        return super().transform(depth, intrinsics, valid_mask, return3d)


================================================
FILE: model/CFNet.py
================================================


import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from thirdparty.raft.update import BasicUpdateBlock
from thirdparty.raft.extractor import BasicEncoder
from thirdparty.raft.corr import CorrBlock, AlternateCorrBlock
from thirdparty.raft.utils.utils import bilinear_sampler, coords_grid, upflow

try:
    autocast = torch.cuda.amp.autocast
except:
    # dummy autocast for PyTorch < 1.6
    class autocast:
        def __init__(self, enabled):
            pass
        def __enter__(self):
            pass
        def __exit__(self, *args):
            pass


class ImageFeaEncoder(nn.Module):
    def __init__(self, input_dim=3, output_dim=256):
        super().__init__()
        self.fnet = BasicEncoder(output_dim=output_dim, norm_fn='instance', dropout=False, input_dim=input_dim)        

        if 1:#self.args.pretrained_model is not None:
            print("Loading the weights of RAFT...")
            import os             
            self.load_state_dict(
                #  torch.load(self.args.pretrained_model, map_location='cpu'), strict=False
                 torch.load( f"{os.path.dirname(os.path.abspath(__file__)) }/../weights/img_fea_enc.pth", map_location='cpu'), strict=True
            )
        else:
            print("ImageFeaEncoder will be trained from scratch...")

    def forward(self, image1, image2):
        image1 = 2 * (image1 / 255.0) - 1.0
        image2 = 2 * (image2 / 255.0) - 1.0

        image1 = image1.contiguous()
        image2 = image2.contiguous()
        with autocast(enabled=True):
            fmap1, fmap2 = self.fnet([image1, image2])
        return fmap1, fmap2


class GRU_CFUpdator(nn.Module):
    def __init__(self, args):
        super().__init__()
        self.args = args
        input_dim =  args.get("input_dim", 3)

        self.hidden_dim = hdim = 128
        self.context_dim = cdim = 128
        args.corr_levels = 4
        args.corr_radius = 4

        if 'alternate_corr' not in self.args:
            self.args.alternate_corr = False

        self.update_block = BasicUpdateBlock(self.args, hidden_dim=hdim)

        if self.args.pretrained_model is not None:
            print("Loading the weights of RAFT...")
            import os             
            self.load_state_dict(
                #  torch.load(self.args.pretrained_model, map_location='cpu'), strict=False
                 torch.load( f"{os.path.dirname(os.path.abspath(__file__)) }/../weights/gru_update.pth", map_location='cpu'), strict=True
            )
        else:
            print("GRU_CFUpdator will be trained from scratch...")


    def freeze_bn(self):
        for m in self.modules():
            if isinstance(m, nn.BatchNorm2d):
                m.eval()

    def initialize_flow(self, img, downsample_rate=8):
        """ Flow is represented as difference between two coordinate grids flow = coords1 - coords0"""
        N, C, H, W = img.shape
        coords0 = coords_grid(N, H//downsample_rate, W//downsample_rate).to(img.device)
        coords1 = coords_grid(N, H//downsample_rate, W//downsample_rate).to(img.device)

        # optical flow computed as difference: flow = coords1 - coords0
        return coords0, coords1

    def upsample_flow(self, flow, mask, upsample_scale=8):
        """ Upsample flow field [H/8, W/8, 2] -> [H, W, 2] using convex combination """
        N, _, H, W = flow.shape
        mask = mask.view(N, 1, 9, upsample_scale, upsample_scale, H, W)
        mask = torch.softmax(mask, dim=2)

        up_flow = F.unfold(upsample_scale * flow, [3,3], padding=1)
        up_flow = up_flow.view(N, 2, 9, 1, 1, H, W)

        up_flow = torch.sum(mask * up_flow, dim=2)
        up_flow = up_flow.permute(0, 1, 4, 2, 5, 3)
        return up_flow.reshape(N, 2, upsample_scale*H, upsample_scale*W)


    def forward(self, fmap1, fmap2, iters=1, flow_init=None, upsample=True, test_mode=False, context_fea=None, update_corr_fn=True):
        """ Estimate optical flow between pair of frames """

        hdim = self.hidden_dim
        cdim = self.context_dim

        if update_corr_fn: # need carful handling outside
            # run the feature network
            self.fmap1 = fmap1.float()
            self.fmap2 = fmap2.float()
            if self.args.alternate_corr:
                self.corr_fn = AlternateCorrBlock(self.fmap1, self.fmap2, radius=self.args.corr_radius)
            else:
                self.corr_fn = CorrBlock(self.fmap1, self.fmap2, radius=self.args.corr_radius)

        if update_corr_fn: 
            # run the context network
            with autocast(enabled=self.args.mixed_precision):
                assert context_fea is not None
                ds = context_fea.shape[-1]//self.fmap1.shape[-1]
                cnet = F.interpolate(context_fea, scale_factor=1/ds, mode='bilinear', align_corners=True)

                self.net, self.inp = torch.split(cnet, [hdim, cdim], dim=1)
                self.net = torch.tanh(self.net)
                self.inp = torch.relu(self.inp)

        # coords0, coords1 = self.initialize_flow(image1)
        coords0, coords1 = self.initialize_flow(flow_init)

        if flow_init is not None:
            ds = flow_init.shape[-1]//coords0.shape[-1]
            if ds !=1:
                flow_init /=ds
                flow_init = F.interpolate(flow_init, scale_factor=1/ds, mode='bilinear', align_corners=True)

            coords1 = coords1 + flow_init

        flow_predictions = []
        for itr in range(iters):
            coords1 = coords1.detach()
            corr = self.corr_fn(coords1) # index correlation volume

            flow = coords1 - coords0
            with autocast(enabled=self.args.mixed_precision):
                # net, up_mask, delta_flow = self.update_block(net, inp, corr, flow)
                self.net, up_mask, delta_flow = self.update_block(self.net, self.inp, corr, flow)

            # F(t+1) = F(t) + \Delta(t)
            coords1 = coords1 + delta_flow

            # upsample predictions
            if up_mask is None:
                flow_up = upflow(coords1 - coords0, scale=image1.shape[2]//coords0.shape[2],)
            else:
                if self.args.fea_net in ["bigdx4"]:
                    flow_up = self.upsample_flow(coords1 - coords0, up_mask, upsample_scale=4)
                else:
                    flow_up = self.upsample_flow(coords1 - coords0, up_mask)
            
            flow_predictions.append(flow_up)

        if test_mode:
            return coords1 - coords0, flow_up
            
        return flow_predictions


================================================
FILE: model/HybridNet.py
================================================

import torch 
import torch.nn as nn 

from thirdparty.kpconv.kpconv_blocks import *
import torch.nn.functional as F
import numpy as np
from kpconv.lib.utils import square_distance
from model.descriptor2D import  SuperPoint2D
from model.descriptor3D import  KPSuperpoint3Dv2


REGISTERED_HYBRID_NET_CLASSES={}
def register_hybrid_net(cls, name=None):
    global REGISTERED_HYBRID_NET_CLASSES
    if name is None:
        name = cls.__name__
    assert name not in REGISTERED_HYBRID_NET_CLASSES, f"exist class: {REGISTERED_HYBRID_NET_CLASSES}"
    REGISTERED_HYBRID_NET_CLASSES[name] = cls
    return cls


def get_hybrid_net(name):
    global REGISTERED_HYBRID_NET_CLASSES
    assert name in REGISTERED_HYBRID_NET_CLASSES, f"available class: {REGISTERED_HYBRID_NET_CLASSES}"
    return REGISTERED_HYBRID_NET_CLASSES[name]

class ContextFeatureNet(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.context_fea_extractor_3d= KPSuperpoint3Dv2(config['context_fea_extractor_3d'] )
    
    def forward(self, batch):
        # x = batch['features'].clone().detach()
        # assert len(batch['stack_lengths'][-1])==1, "Only support bs=1 for now" 
        len_src_c = batch['stack_lengths'][-1][0]
        pcd_c = batch['model_points'][-1]
        pcd_c = pcd_c[:len_src_c]

        image=batch['image']

        ############### encode 3d and 2d features ###############
        batch3d={
            'points': batch['model_points'], 
            'neighbors': batch['neighbors'], 
            'pools':  batch['pools'], 
            'upsamples': batch['upsamples'],
            'features': batch['model_point_features'], 
            'stack_lengths': batch['stack_lengths'],
        }
        ctx_descriptors_3d = self.context_fea_extractor_3d(batch3d)


        return {
            "ctx_fea_3d":ctx_descriptors_3d,
        }


@register_hybrid_net
class HybridDescNet(nn.Module):
    #independent 2d and 3d network
    def __init__(self, config):
        super().__init__()

        self.corr_fea_extractor_2d= SuperPoint2D(config['keypoints_detector_2d'] )
        self.corr_fea_extractor_3d= KPSuperpoint3Dv2(config['keypoints_detector_3d'] )
        self.descriptors_3d = {}


    def forward(self, batch):
        assert len(set(batch['class_name']))==1, "A batch should contain data of the same class."
        class_name = batch['class_name'][0]

        len_src_c = batch['stack_lengths'][-1][0]
        pcd_c = batch['model_points'][-1]
        pcd_c = pcd_c[:len_src_c]#, pcd_c[len_src_c:]

        image=batch['image']

        ############### encode 3d and 2d features ###############
        batch3d={
            'points': batch['model_points'], 
            'neighbors': batch['neighbors'], 
            'pools':  batch['pools'], 
            'upsamples': batch['upsamples'],
            'features': batch['model_point_features'], 
            'stack_lengths': batch['stack_lengths'],
        }
        if self.training:
            self.descriptors_3d[class_name] = self.corr_fea_extractor_3d(batch3d)
        else:
            if class_name not in self.descriptors_3d:
                self.descriptors_3d[class_name] = self.corr_fea_extractor_3d(batch3d)

        descriptors_2d = self.corr_fea_extractor_2d(image)['descriptors']


        return {
            "descriptors_2d":descriptors_2d,
            "descriptors_3d":self.descriptors_3d[class_name],
            "scores_saliency_3d": None, 
            "scores_overlap_3d":None, 

        }


================================================
FILE: model/PoseRefiner.py
================================================
import os 
import time
import cv2
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.distributed as dist
import numpy as np
from easydict import EasyDict as edict
from functools import partial

from geometry.transformation import *
from geometry.intrinsics import *
from geometry.projective_ops import coords_grid, normalize_coords_grid
from model.CFNet import GRU_CFUpdator , ImageFeaEncoder
from utils.pose_utils import pose_padding
from config.default import get_cfg


EPS = 1e-5
MIN_DEPTH = 0.1
MAX_ERROR = 100.0

# exclude extremly large displacements
MAX_FLOW = 400


def raft_sequence_flow_loss(flow_preds, flow_gt, valid, gamma=0.8, max_flow=MAX_FLOW):
    """ Loss function defined over sequence of flow predictions """

    n_predictions = len(flow_preds)    
    flow_loss = 0.0
    
    # exlude invalid pixels and extremely large diplacements
    mag = torch.sum(flow_gt**2, dim=1).sqrt()
    
    valid = (valid >= 0.5) & (mag < max_flow)

    for i in range(n_predictions):
        i_weight = gamma**(n_predictions - i - 1)
        i_loss = (flow_preds[i] - flow_gt).abs()
        flow_loss += i_weight * (valid[:, None] * i_loss).mean()

    epe = torch.sum((flow_preds[-1] - flow_gt)**2, dim=1).sqrt()
    epe = epe.view(-1)[valid.view(-1)]

    metrics = {
        'epe': epe.mean().item(),
        '1px': (epe < 1).float().mean().item(),
        '3px': (epe < 3).float().mean().item(),
        '5px': (epe < 5).float().mean().item(),
    }

    return flow_loss, metrics


class PoseRefiner(nn.Module):
    def __init__(self, cfg,
                 reuse=False,
                 schedule=None,
                 use_regressor=True,
                 is_calibrated=True,
                 bn_is_training=False,
                 is_training=True,
                 renderer=None,
                 ):

        super().__init__()

        self.legacy=True
        self.cfg = cfg
        self.reuse = reuse
        self.sigma=nn.ParameterList( [nn.Parameter(torch.ones(1)*1 )] )
        self.with_corr_weight = self.cfg.get("with_corr_weight", True)
        if not self.with_corr_weight:
            print("Warning: the correlation weighting is disabled.")

        self.is_calibrated = cfg.IS_CALIBRATED
        if not is_calibrated:
            self.is_calibrated = is_calibrated

        self.is_training = is_training
        self.use_regressor = use_regressor

        self.residual_pose_history = []
        self.Ti_history =[]
        self.coords_history = []
        self.residual_history = []
        self.inds_history = []
        # self.weights_history = []
        self.flow_history = []
        self.intrinsics_history = []
        self.summaries = []

        if self.cfg.FLOW_NET=='raft':
            self.image_fea_enc= ImageFeaEncoder()
            self.cf_net = GRU_CFUpdator(self.cfg.raft) 
        else:
            raise NotImplementedError 
        self.renderer = renderer  

    def _clear(self,):
        self.residual_pose_history = []
        self.Ti_history =[]
        self.coords_history = []
        self.residual_history = []
        self.inds_history = []
        # self.weights_history = []
        self.flow_history = []
        self.intrinsics_history = []
        self.summaries = []

    def __len__(self):
        return len(self.residual_pose_history)

    def render(self, params, render_tex=False):
        """ render a batch of images given the intrinsic and extrinsic params 

        Args:
            params.K: np.array, of shape Bx3x3
            params.camera_extrinsics: np.array, of shape Bx3x4

        Returns:
            [type]: [description]
        """

        bs=params.K.shape[0]
        colors=[]
        depths=[]

        color,depth= self.renderer( params.obj_cls, params.vert_attribute, T=params.camera_extrinsics, K=params.K, 
            render_image_size=params.render_image_size, near=0.1, far=6, render_tex=render_tex)
        #set the invalid values to zeros
        depth[depth==-1] = 0
        return {
            # 1x3xHxW
            "syn_img": color, 
            "syn_depth": depth.detach(), # 1x1xHxW
        }


    def get_affine_transformation(self, mask, crop_center, with_intrinsic_transform=False, output_size=None, margin_ratio=0.4):
        B,_,H,W = mask.shape
        ratio = float(H) / float(W)
        affine_matrices = []
        intrinsic_matrices = []

        for b in range(B):
            zoom_c_x, zoom_c_y = crop_center[b] #crop_center

            ys, xs = np.nonzero(mask[b][0].detach().cpu().numpy() )
            if len(ys)>0 and len(xs)>0:
                obj_imgn_start_x = xs.min() 
                obj_imgn_start_y = ys.min()
                obj_imgn_end_x = xs.max()
                obj_imgn_end_y = ys.max()
            else:
                obj_imgn_start_x=0
                obj_imgn_start_y=0
                obj_imgn_end_x=0
                obj_imgn_end_y=0


            # mask region
            left_dist = zoom_c_x - obj_imgn_start_x
            right_dist = obj_imgn_end_x - zoom_c_x
            up_dist = zoom_c_y - obj_imgn_start_y
            down_dist = obj_imgn_end_y - zoom_c_y
            # crop_height = np.max([ratio * right_dist, ratio * left_dist, up_dist, down_dist]) * 2 * 1.4
            crop_height = np.max([ratio * right_dist, ratio * left_dist, up_dist, down_dist]) * 2 * (1+margin_ratio)
            crop_width = crop_height / ratio

            # affine transformation for PyTorch
            x1 = (zoom_c_x - crop_width / 2) * 2 / W - 1;
            x2 = (zoom_c_x + crop_width / 2) * 2 / W - 1;
            y1 = (zoom_c_y - crop_height / 2) * 2 / H - 1;
            y2 = (zoom_c_y + crop_height / 2) * 2 / H - 1;

            pts1 = np.float32([[x1, y1], [x1, y2], [x2, y1]])
            pts2 = np.float32([[-1, -1], [-1, 1], [1, -1]])
            affine_matrix = torch.tensor(cv2.getAffineTransform(pts2, pts1), device=mask.device, dtype=torch.float32)
            affine_matrices.append(affine_matrix)


            if with_intrinsic_transform:
                # affine transformation for PyTorch
                x1 = (zoom_c_x - crop_width / 2)
                x2 = (zoom_c_x + crop_width / 2)
                y1 = (zoom_c_y - crop_height / 2)
                y2 = (zoom_c_y + crop_height / 2)

                pts1 = np.float32([[x1, y1], [x1, y2], [x2, y1]])
                # pts2 = np.float32([[0, 0], [0, H-1], [W-1, 0]])
                pts2 = np.float32([[0, 0], [0, output_size[0]-1], [output_size[1]-1, 0]])
                # pts2 = np.float32([[0, 0], [0, 1], [1, 0]])
                intrinsic_matrix = torch.tensor(cv2.getAffineTransform(pts2, pts1), device=mask.device, dtype=torch.float32)
                intrinsic_matrices.append(intrinsic_matrix)
                
        if with_intrinsic_transform:
            return  torch.stack(affine_matrices, dim=0), torch.stack(intrinsic_matrices, dim=0)
        else:
            return torch.stack(affine_matrices, dim=0)

    def gen_zoom_crop_grids(self, fg_mask, K, T, output_size, model_center=[0,0,0], margin_ratio=0.4):
        ##Get the projected model center in image (assuming the model is zero-centered, which should be reconsidered!)  
        crop_center=K@T[:,:3,3:]
        crop_center = crop_center[:,:2]/crop_center[:,2:3]

        ##calculate affine transformation parameters
        affine_matrices, crop_intrinsic_transform=self.get_affine_transformation(fg_mask, crop_center=crop_center.detach().cpu().numpy(), with_intrinsic_transform=True, output_size=(output_size[-2], output_size[-1]),margin_ratio=margin_ratio ) 
        grids = F.affine_grid(affine_matrices, torch.Size(output_size) )
        ##Get cropped intrinsic_transform
        intrinsics_crop= torch.inverse(pose_padding(crop_intrinsic_transform) )@K

        return grids, intrinsics_crop

    # def forward(self, image, Ts, intrinsics, fea_3d=None, inds=None, Tj_gt=None, obj_cls=None, geofea_3d=None, geofea_2d=None):
    def forward(self, image, Ts, intrinsics, fea_3d=None, Tj_gt=None, obj_cls=None, geofea_3d=None, geofea_2d=None):
        #clear the history data
        self._clear()
        cfg = self.cfg

        RANDER_IMAGE_SIZE = get_cfg("BASIC").render_image_size
        ZOOM_CROP_SIZE=get_cfg("BASIC").zoom_crop_size 

        if cfg.RESCALE_IMAGES:
            images = 2 * (images / 255.0) - 1.0

        Tij_gt=[]
        syn_imgs=[]
        syn_depths=[]

        Ti = Ts
        Tij = Ti.copy().identity()

        for ren_iter in range(cfg.RENDER_ITER_COUNT):
            # update rendering params
            Ti = Tij*Ti # accumulate Ti 
            Tij.identity_() #set Tij to identity matrix at the begining of each ren_iter
            if self.legacy:
                Tij = Ti*Ti.inv() #set Tij to identity matrix at the begining of each ren_iter

            render_params = edict({
                    "K": intrinsics.detach(), 
                    "camera_extrinsics": Ti.matrix().detach().squeeze(1), 
                    "obj_cls": obj_cls,
                    "render_image_size": RANDER_IMAGE_SIZE, 
                })

            pc_depth = self.renderer.render_pointcloud(obj_cls, T=render_params.camera_extrinsics, K=render_params.K, 
                                    render_image_size=render_params.render_image_size)


            if self.cfg.ONLINE_CROP:
                #get the forground mask 
                fg_mask = pc_depth>0
                B,C,_,_= pc_depth.size()
                
                ############### Get zoom parameters ###############
                grids, intrinsics_crop = self.gen_zoom_crop_grids(fg_mask, render_params.K, render_params.camera_extrinsics, output_size=[B,C, *ZOOM_CROP_SIZE], model_center=None)

                ############### Render reference images ###############
                # Concatentate the 3D ctx feature "fea_3d" and 3d descriptor "geofea_3d" for feature rendering
                if geofea_3d is not None:
                    fea_3d_cat = torch.cat([fea_3d, geofea_3d ], dim=-1) # BxNxC
                else:
                    fea_3d_cat = fea_3d
                render_params.vert_attribute = fea_3d_cat #fea_3d
                render_params.K = intrinsics_crop.detach()
                render_params.render_image_size = ZOOM_CROP_SIZE
                ren_res = self.render(render_params, render_tex=True) 
                if geofea_3d is not None:
                    syn_img, cfea, geofea1 = torch.split(ren_res['syn_img'],[3, fea_3d.shape[-1], geofea_3d.shape[-1] ] ,dim=1)
                    syn_depth = ren_res['syn_depth']
                    
                else:
                    syn_img, cfea = torch.split(ren_res['syn_img'], [3, fea_3d.shape[-1] ] ,dim=1)
                    geofea1=None
                    syn_depth = ren_res['syn_depth']
                cfea = cfea*0.1 # balance the learning rate with the scale 0.1

                ## Crop and zoom images
                syn_image_crop = syn_img 
                image_crop= F.grid_sample(image, grids)
                cfea_crop= cfea 
                if geofea1 is not None and geofea_2d is not None:
                    # geofea1_crop = F.grid_sample(geofea1, grids)
                    geofea1_crop = geofea1
                    geofea2_crop = F.grid_sample(geofea_2d, grids)

                #Render again to get more accurate depth for supervisions in losses 
                #TODO: could be merged into the rendering process above. -> Done
                if self.legacy:#self.training:
                    depth_render_params=edict({
                            "K": intrinsics_crop.detach(), 
                            "camera_extrinsics": Ti.matrix().detach().squeeze(1), 
                            "obj_cls": obj_cls,
                            "render_image_size": ZOOM_CROP_SIZE, 
                        })
                    syn_depth=self.renderer.render_depth(obj_cls, T=depth_render_params.camera_extrinsics, K=depth_render_params.K, 
                                    render_image_size=depth_render_params.render_image_size, near=0.1, far=6)

            #for visualization only
            syn_imgs.append(syn_image_crop)
            syn_imgs.append(image_crop)

            # encode image features
            feats1, feats2=self.image_fea_enc(syn_image_crop, image_crop)
            # depths = torch.index_select(syn_depth, index=ii, dim=1) + EPS
            depths = syn_depth+EPS

            for i in range(cfg.ITER_COUNT):
                # save for loss calculation 
                self.intrinsics_history.append(intrinsics_crop)
                syn_depths.append(syn_depth)

                Tij = Tij.copy(stop_gradients=True)
                intrinsics_crop = intrinsics_crop.detach()
                
                #Get the projection in frame j of visible model points in frame i with the current relative pose estimation Tij
                reproj_coords, vmask = Tij.transform(
                    depths, intrinsics_crop, valid_mask=True)

                uniform_grids = coords_grid(depths)
                flow_init = torch.einsum( "...ijk->...kij", reproj_coords-uniform_grids[..., :2] ) * (depths>EPS) 
                flow = self.cf_net(feats1, feats2, flow_init=flow_init.squeeze(1), context_fea=cfea_crop, update_corr_fn=i==0)


                self.flow_history.append(flow)

                # Get the correspondences in frame j for each point in frame i, based on the current flow estimates
                if isinstance (flow, (list, tuple)): # flow net may return a list of flow maps
                    correspondence_target = torch.einsum("...ijk->...jki", flow[-1]) + uniform_grids[..., :2]
                else:
                    correspondence_target = torch.einsum("...ijk->...jki", flow) + uniform_grids[..., :2]
                
                # Optimize for the pose by minimizing errors between the constructed correspondence field 
                # (with the currently estimated pose) and the estimated correspondence field 
                if self.with_corr_weight and geofea1 is not None and geofea_2d is not None:
                    geofea2_crop_warpped =  F.grid_sample(geofea2_crop, normalize_coords_grid(correspondence_target).squeeze(1) )
                    corr_weight= torch.sum(geofea1_crop*geofea2_crop_warpped, dim=1,keepdim=True).permute(0,2,3,1)[:,None] #insert frame axis
                    corr_weight = torch.exp(-torch.abs(1-corr_weight)/self.sigma[0]) * (syn_depth>0)[...,None].float()
                else: 
                    corr_weight = weight
                
                Tij = Tij.reprojction_optim(
                    correspondence_target, corr_weight, depths, intrinsics_crop, num_iters=cfg.OPTIM_ITER_COUNT )

                reproj_coords, vmask1 = Tij.transform(
                    depths, intrinsics_crop, valid_mask=True)

                # For the loss calculation later
                self.residual_pose_history.append(Tij)

                self.Ti_history.append(Ti.copy(stop_gradients=True) )
                Tij_gt.append( (Tj_gt*Ti.inv()).copy(stop_gradients=True) )

                self.residual_history.append(
                    vmask*vmask1*(reproj_coords-correspondence_target))  # BxKxHxWx3

        # The final update of Ti 
        Ti = Tij*Ti
        return {
            "Tij": Tij,
            "Ti_pred": Ti,
            "intrinsics": intrinsics,
            "flow": self.flow_history[0],
            "vmask": syn_depth>0, 
            "weight": torch.einsum("...ijk->...kij", corr_weight), 
            "syn_depth": syn_depths, #ren_res['syn_depth'],
            "syn_img": syn_imgs+[image_crop, cfea_crop[:,:3]*10,geofea1[:,:3], geofea2_crop[:,:3]],
            "Tij_gt" : Tij_gt
        }

    def compute_loss(self, Tij_gts, depths, intrinsics, loss='l1', log_error=True, loss3d=None, ):

        total_loss = 0.0
        for i in range(len(self.residual_pose_history)):
            intrinsics = self.intrinsics_history[i]

            depth, intrinsics = rescale_depths_and_intrinsics(depths[i], intrinsics, downscale=1)

            Tij = self.residual_pose_history[i]

            Gij = Tij_gts[i] 

            # intrinsics_pred = intrinsics
            zstar = depth + EPS
            flow_pred, valid_mask_pred = Tij.induced_flow(
                zstar, intrinsics, valid_mask=True)  
            flow_star, valid_mask_star = Gij.induced_flow(
                zstar, intrinsics, valid_mask=True)

            valid_mask = valid_mask_pred * valid_mask_star

            #3D alignment loss 
            loss_3d_proj = 0 
            if loss3d is not None:
                Tj_pred=Tij*self.Ti_history[i]
                Tj_gt=Gij*self.Ti_history[i]
                loss_3d_proj = loss3d(
                    R_pred=Tj_pred.G[:, 0, :3, :3], t_pred=Tj_pred.G[:, 0, :3, 3], R_tgt=Tj_gt.G[:, 0, :3, :3], t_tgt=Tj_gt.G[:, 0, :3, 3])

            # flow loss
            if isinstance( self.flow_history[0], (list, tuple)):
                #squeeze the frame dimmension
                self.flow_history[i] = [self.flow_history[i][f].squeeze(1) for f in range(len(self.flow_history[i])) ]
                flow_mask = valid_mask.squeeze(1).squeeze(-1)
                loss_flow,_ = raft_sequence_flow_loss(self.flow_history[i], flow_gt=torch.einsum("...ijk->...kij", flow_star.squeeze(1)), valid= flow_mask, gamma=0.8, max_flow=MAX_FLOW)
            else:
                raise NotImplementedError
            
            # reprojection loss 
            reproj_diff = valid_mask * \
                torch.clamp(
                    torch.abs(flow_pred - flow_star), -MAX_ERROR, MAX_ERROR)
            reproj_loss = torch.mean(reproj_diff)


            total_loss +=self.cfg.get("TRAIN_PCALIGN_WEIGHT", 1)*loss_3d_proj+ self.cfg.TRAIN_FLOW_WEIGHT* loss_flow + self.cfg.TRAIN_REPROJ_WEIGHT*reproj_loss
        
        # clear the intermediate values
        self._clear()

        return {
            "total_loss": total_loss,
            "reproj_loss": reproj_loss,
            "flow_loss":loss_flow,
            "loss_3d_proj": loss_3d_proj,
            "valid_mask": valid_mask,
            "Tij": Tj_pred.G,
            "Gij": Tj_gt.G
        }


================================================
FILE: model/RNNPose.py
================================================
#
import time
import torch
from torch import nn
from torch.nn import functional as F
import torch.distributed as dist
import apex
import numpy as np
import os
from easydict import EasyDict as edict
from transforms3d.euler import mat2euler, euler2mat, euler2quat, quat2euler
from functools import partial


from model.HybridNet import HybridDescNet,ContextFeatureNet
from thirdparty.kpconv.lib.utils import square_distance
from model.PoseRefiner import PoseRefiner  

from utils.pose_utils import pose_padding
from geometry.transformation import SE3Sequence

from geometry.diff_render_optim import DiffRendererWrapper
from config.default import get_cfg
from utils.util import dict_recursive_op

# from model.RNNPose import register_posenet

REGISTERED_NETWORK_CLASSES = {}


def register_posenet(cls, name=None):
    global REGISTERED_NETWORK_CLASSES
    if name is None:
        name = cls.__name__
    assert name not in REGISTERED_NETWORK_CLASSES, f"exist class: {REGISTERED_NETWORK_CLASSES}"
    REGISTERED_NETWORK_CLASSES[name] = cls
    return cls


def get_posenet_class(name):
    global REGISTERED_NETWORK_CLASSES
    assert name in REGISTERED_NETWORK_CLASSES, f"available class: {REGISTERED_NETWORK_CLASSES}"
    return REGISTERED_NETWORK_CLASSES[name]


@register_posenet
class RNNPose(nn.Module):
    def __init__(self,
                 criterions,
                 opt,
                 name="RNNPose",
                 **kwargs):
        super().__init__()

        self.name = name
        self.opt = opt

        self.hybrid_desc_net=HybridDescNet(opt.descriptor_net)
        self.ctx_fea_net = ContextFeatureNet(opt.descriptor_net)

        self.ctx_fea = {} 

        self.render_params = edict({
            "width": opt.input_w,  # 128,
            "height": opt.input_h,  # 128,
            "gpu_id": opt.get('gpu_id', 0),
            "obj_seqs": opt.obj_seqs
        })

        renderer, diff_renderer= self._render_init(self.render_params)
        self.diff_renderer = diff_renderer

        self.motion_net = PoseRefiner(
            opt.motion_net, bn_is_training=self.training, is_training=self.training,
            renderer=diff_renderer 
        )
        self.contrastive_loss = criterions.get(
            "metric_loss", None)
        self.pose_loss = criterions.get("pose_loss", None)

        self.register_buffer("global_step", torch.LongTensor(1).zero_())


    def update_global_step(self):
        self.global_step += 1

    def get_global_step(self):
        return int(self.global_step.cpu().numpy()[0])

    def clear_global_step(self):
        self.global_step.zero_()
    
    def sample_poses(self, pose_tgt):
        SYN_STD_ROTATION = 15
        SYN_STD_TRANSLATION = 0.01
        ANGLE_MAX=45
        pose_src = pose_tgt.copy()
        num = pose_tgt.shape[0]
        for i in range(num):
            euler = mat2euler(pose_tgt[i, :3, :3])
            euler += SYN_STD_ROTATION * np.random.randn(3) * np.pi / 180.0
            pose_src[i, :3, :3] = euler2mat(euler[0], euler[1], euler[2])

            pose_src[i, 0, 3] = pose_tgt[i, 0, 3]+ SYN_STD_TRANSLATION * np.random.randn(1)
            pose_src[i, 1, 3] = pose_tgt[i, 1, 3] + SYN_STD_TRANSLATION * np.random.randn(1)
            pose_src[i, 2, 3] = pose_tgt[i, 2, 3]  + 5 * SYN_STD_TRANSLATION * np.random.randn(1)

            r_dist = np.arccos((np.trace(pose_src[i, :3,:3].transpose(-1,-2) @ pose_tgt[i, :3,:3]) - 1 )/2)/np.pi*180

            while r_dist > ANGLE_MAX:#or not (16 < center_x < (640 - 16) and 16 < center_y < (480 - 16)):
                print("r_dist > ANGLE_MAX, resampling...")
                euler = mat2euler(pose_tgt[i, :3, :3])
                euler += SYN_STD_ROTATION * np.random.randn(3) * np.pi / 180.0
                pose_src[i, :3, :3] = euler2mat(euler[0], euler[1], euler[2])

                pose_src[i, 0, 3] = pose_tgt[i, 0, 3]+ SYN_STD_TRANSLATION * np.random.randn(1)
                pose_src[i, 1, 3] = pose_tgt[i, 1, 3] + SYN_STD_TRANSLATION * np.random.randn(1)
                pose_src[i, 2, 3] = pose_tgt[i, 2, 3]  + 5 * SYN_STD_TRANSLATION * np.random.randn(1)

                r_dist = np.arccos((np.trace(pose_src[i, :3,:3].transpose(-1,-2) @ pose_tgt[i, :3,:3]) - 1 )/2)*np.pi/180
        return pose_src

    def _render_init(self, config):
        # from data.ycb.basic import bop_ycb_class2idx
        print("config.gpu_id:", config.gpu_id)

        obj_paths = []
        tex_paths = []

        # build cls2idx table for the renderer
        cls2idx = {}
        LM_SEQ=["ape", "benchvise", "camera","cam", "can", "cat", "driller", "duck", "eggbox", "glue", "holepuncher", "iron", "lamp", "phone"]
        # YCB_SEQ=bop_ycb_class2idx.keys()
        for i, seq in enumerate(set(config.obj_seqs)):

            if seq in LM_SEQ: 
                obj_path = f'{os.path.dirname(__file__)}/../EXPDATA/LM6d_converted/models/{seq}/textured.obj'
                tex_path = f'{os.path.dirname(__file__)}/../EXPDATA/LM6d_converted/models/{seq}/texture_map.png'
                assert os.path.exists(obj_path), f"'{obj_path}' dose not exist!" 
                assert os.path.exists(tex_path), f"'{tex_path}' dose not exist!" 
                obj_paths.append(obj_path)
                tex_paths.append(tex_path)
                cls2idx[seq] = i
            else:
                raise NotImplementedError
        renderer=None

        diff_renderer = DiffRendererWrapper(obj_paths)
        diff_renderer.cls2idx = cls2idx

        return renderer, diff_renderer


    def forward(self, sample):
        assert len(set(sample['class_name']))==1, "A batch should contain data of the same class."
        class_name = sample['class_name'][0]

        #encode 3d-2d descriptors
        preds_dict=self.hybrid_desc_net(sample)

        len_src_f = sample['stack_lengths'][0][0]
        geofea_3d = preds_dict.get('descriptors_3d', None) 
        geofea_2d = preds_dict.get('descriptors_2d', None) 


        #encode 3D context features 
        if self.training:
            self.ctx_fea[class_name]=self.ctx_fea_net(sample)
        else:
            if class_name not in self.ctx_fea:
                self.ctx_fea[class_name]=self.ctx_fea_net(sample)

        preds_dict.update(self.ctx_fea[class_name])
        ctx_fea_3d = preds_dict['ctx_fea_3d'][:len_src_f]

        if self.training:
            pose=pose_padding(sample['original_RT'])

            if sample.get("rendered_RT", None) is not None:
                syn_pose = pose_padding(sample['rendered_RT'])
            else:
                syn_pose = torch.tensor(self.sample_poses(sample['original_RT'].detach().cpu(
                ).numpy()), device=sample['original_RT'].device, dtype=sample['original_RT'].dtype)
                syn_pose = pose_padding(syn_pose)
        else:
            pose = pose_padding(sample['original_RT'])
            syn_pose = pose_padding(sample['rendered_RT'])
            

        # calculate the GT relative pose and the initial pose
        
        Ts_pred = SE3Sequence(
            matrix=torch.stack([syn_pose ], dim=1))
        mot_res = self.motion_net(
            Ts=Ts_pred,  
            intrinsics=sample['K'],
            image=sample['image'], 
            fea_3d=ctx_fea_3d[None], 
            Tj_gt=SE3Sequence(matrix=pose[:, None]),
            obj_cls=sample['class_name'],
            geofea_2d = geofea_2d, 
            geofea_3d = geofea_3d[None]
        )
        preds_dict.update(mot_res)
        sample['syn_depth'] = mot_res['syn_depth']
        if self.training:
            ret = self.loss(sample, preds_dict)

            ret['syn_img'] = mot_res['syn_img']
            ret['syn_depth'] = mot_res['syn_depth']
            ret['flow'] = mot_res['flow']
            ret['weight'] = mot_res['weight']

            return ret
        else:
            # ret = self.loss(sample, preds_dict)
            ret={}
            ret.update(preds_dict)
            return ret

    
    def loss(self, sample, preds_dict):

        len_src_f = sample['stack_lengths'][0][0]
        RT =sample['RT']
        camera_intrinsic=sample['K']
        descriptors_2d_map = preds_dict['descriptors_2d']
        descriptors_3d = preds_dict['descriptors_3d'][:len_src_f]
        rand_descriptors_3d = preds_dict['descriptors_3d'][len_src_f:]
        model_points=sample['model_points'][0][:len_src_f]
        orig_model_points = sample['original_model_points']
        rand_model_points=sample['model_points'][0][len_src_f:]
        lifted_points=sample['lifted_points'][0].squeeze(0)
        correspondence=sample['correspondences_2d3d'].squeeze(0)
        depth = sample['depth']
        

        # get the foreground 2d descriptors 
        ys_, xs_=torch.nonzero(sample['depth'].squeeze(), as_tuple=True )
        descriptors_2d=descriptors_2d_map[:,:,ys_,xs_].squeeze().permute([1,0])

        if self.training: 
            device= lifted_points.device
            fg_point_num = len(lifted_points)
            model_point_num = len(model_points)
          
            # append bg features 
            ys_bg, xs_bg = torch.nonzero(sample['depth'].squeeze()<=0, as_tuple=True )
            descriptors_2d_bg = descriptors_2d_map[:,:,ys_bg, xs_bg].squeeze().permute([1,0])
            # descriptors_2d_bg = descriptors_2d_bg[np.random.randint(0,len(descriptors_2d_bg), size=len(lifted_points) )]
            descriptors_2d = torch.cat([descriptors_2d, descriptors_2d_bg], dim=0) 
            descriptors_3d = torch.cat([descriptors_3d, descriptors_2d_bg], dim=0 )

            # append handcrafted coordinates to simplify the code(assign the same coordinates for the bg points far away from the fg points, i.e. 10e6 )
            lifted_points = torch.cat([lifted_points, torch.ones([len(descriptors_2d_bg) ,3],device=device )*10e6 ] ) #append very distant points
            model_points = torch.cat([model_points, torch.ones([len(descriptors_2d_bg) ,3], device=device)*10e6 ] ) #append very distant points

            #append one-to-one inds
            if len(descriptors_2d_bg)>0:
                # randomly sample the bg points to balance the learning process
                sample_inds=np.random.randint(0,len(descriptors_2d_bg), size=int(len(correspondence)*0.1) )
           
                bg_corr= torch.stack([
                    torch.arange(fg_point_num, fg_point_num+len(descriptors_2d_bg), device=device )[sample_inds], 
                    torch.arange(model_point_num, model_point_num+len(descriptors_2d_bg), device=device)[sample_inds]
                 ], dim=-1) #Nx2
                correspondence = torch.cat([correspondence, bg_corr ], dim=0)
            
        if len(lifted_points)>0:
            contra_loss=self.contrastive_loss(src_pcd=lifted_points, tgt_pcd=model_points,
                            src_feats=descriptors_2d, tgt_feats=descriptors_3d,
                            correspondence=correspondence,
                            scores_overlap=None, scores_saliency=None)
        else:
            print("Warning: Contrastive loss is skipped, as no lifted point is found!", flush=True)
            contra_loss={
                'circle_loss': torch.zeros([1], device=depth.device),
                'recall': torch.zeros([1], device=depth.device)
            }


        loss3d = partial(self.pose_loss.forward,
                         points=orig_model_points[None])
        motion_loss = self.motion_net.compute_loss(
            preds_dict['Tij_gt'], sample['syn_depth'], intrinsics=sample['K'], loss='l1', log_error=True, loss3d=loss3d)

        
        loss =  self.contrastive_loss.weight * contra_loss['circle_loss']  + motion_loss['total_loss'] 

        res = {
            "loss": loss, 
            "circle_loss": contra_loss['circle_loss'].detach(),
            "recall":contra_loss['recall'].detach(),
            # "geometric_loss": torch.zeros(1),
            "reproj_loss": motion_loss['reproj_loss'].detach(),
            "loss_3d_proj": motion_loss['loss_3d_proj'].detach(), 
            "valid_mask": motion_loss["valid_mask"].detach(),
        }
        return res


================================================
FILE: model/descriptor2D.py
================================================
from easydict import EasyDict as edict
from pathlib import Path
import torch
from torch import nn
from torchplus.nn.modules.common import Empty


class SuperPoint2D(nn.Module):
    """SuperPoint Convolutional Detector and Descriptor

    SuperPoint: Self-Supervised Interest Point Detection and
    Description. Daniel DeTone, Tomasz Malisiewicz, and Andrew
    Rabinovich. In CVPRW, 2019. https://arxiv.org/abs/1712.07629

    """
    default_config = {
        'descriptor_dim': 256,
        'nms_radius': 4,
        'keypoint_threshold': 0.005,
        'max_keypoints': -1,
        'remove_borders': 4,
        'saliency_score_normalization_fuc': 'sigmoid',
        "use_instance_norm": True
    }

    def __init__(self, config):
        super().__init__()
        self.default_config.update(config) 
        self.config=edict(self.default_config)

        self.normalize_output=config.get('normalize_output', True)

        self.saliency_score_normalization_fuc= self.config.saliency_score_normalization_fuc
        assert self.saliency_score_normalization_fuc in ['sigmoid', 'softmax']

        self.relu = nn.ReLU(inplace=True)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        
        if self.config.use_instance_norm:
            self.Normalization=nn.InstanceNorm2d
        else:
            self.Normalization = Empty


        c1, c2, c3, c4, c5 = 64, 64, 128, 128, 256
        self.input_dim=config.input_dim
        # self.conv1a = nn.Conv2d(1, c1, kernel_size=3, stride=1, padding=1)
        self.conv1a = nn.Conv2d(config.input_dim, c1, kernel_size=3, stride=1, padding=1)
        self.conv1b = nn.Conv2d(c1, c1, kernel_size=3, stride=1, padding=1)
        self.conv2a = nn.Conv2d(c1, c2, kernel_size=3, stride=1, padding=1)
        self.conv2b = nn.Conv2d(c2, c2, kernel_size=3, stride=1, padding=1)
        self.conv3a = nn.Conv2d(c2, c3, kernel_size=3, stride=1, padding=1)
        self.conv3b = nn.Conv2d(c3, c3, kernel_size=3, stride=1, padding=1)
        self.conv4a = nn.Conv2d(c3, c4, kernel_size=3, stride=1, padding=1)
        self.conv4b = nn.Conv2d(c4, c4, kernel_size=3, stride=1, padding=1)

        self.convPa = nn.Sequential(
            nn.Conv2d(c4, c5, kernel_size=3, stride=1, padding=1), 
            self.Normalization(c5)
        )

        self.convPb = nn.Conv2d(c5, 1, kernel_size=1, stride=1, padding=0)

        self.convDa = nn.Conv2d(c4, c5, kernel_size=3, stride=1, padding=1)

        self.convDb = nn.Conv2d(
            c5, self.config['descriptor_dim'],
            kernel_size=1, stride=1, padding=0)

        self.decode1=nn.Sequential(
            nn.Upsample(scale_factor=2,mode='bilinear'),
            nn.Conv2d(c4,c4, kernel_size=3, stride=1, padding=1),
            self.Normalization(c4),
            nn.ReLU())
        self.decode2=nn.Sequential(
            nn.Upsample(scale_factor=2,mode='bilinear'),
            nn.Conv2d(c4+c3,c4, kernel_size=3, stride=1, padding=1),
            self.Normalization(c4),
            nn.ReLU()
            )
        
        self.decode3=nn.Sequential(
            nn.Upsample(scale_factor=2,mode='bilinear'),
            nn.Conv2d(c4+c2,c4, kernel_size=3, stride=1, padding=1),
            self.Normalization(c4),
            nn.ReLU()
            )
        
        path = Path(__file__).parent.parent/ 'weights/superpoint_v1.pth'

        self.load_state_dict(torch.load(str(path), map_location='cpu' ), strict=False)

        mk = self.config['max_keypoints']
        if mk == 0 or mk < -1:
            raise ValueError('\"max_keypoints\" must be positive or \"-1\"')

        print('Loaded SuperPoint model')

    def load_state_dict(self,state_dict, strict=True):
 
        if not strict:
            updated_state_dict = {}
            model_dict = self.state_dict()
            for k, v in state_dict.items():
                if k in model_dict and v.shape == model_dict[k].shape:
                    updated_state_dict[k] = v
        else:
            updated_state_dict = state_dict
        super().load_state_dict(updated_state_dict, strict)


    def forward_encoder(self, x):
        if self.input_dim==1:
            x=x.mean(dim=1, keepdims=True) #
        x_skip=[]
        # Shared Encoder
        x = self.relu(self.conv1a(x))
        x = self.relu(self.conv1b(x))
        x_skip.append(x)
        x = self.pool(x)
        x = self.relu(self.conv2a(x))
        x = self.relu(self.conv2b(x))
        x_skip.append(x)
        x = self.pool(x)
        x = self.relu(self.conv3a(x))
        x = self.relu(self.conv3b(x))
        x_skip.append(x)
        x = self.pool(x)
        x = self.relu(self.conv4a(x))
        x = self.relu(self.conv4b(x))

        return x, x_skip
    def forward_decoder(self, x, x_skip):
        #upsample first
        x = self.decode1(x)
        x=torch.cat([x, x_skip[-1]], dim=1)
        x = self.decode2(x)
        x=torch.cat([x, x_skip[-2]], dim=1)
        x = self.decode3(x)


        # Compute the dense keypoint scores
        cPa = self.relu(self.convPa(x))
        scores = self.convPb(cPa)

        if self.saliency_score_normalization_fuc == 'sigmoid':
            scores = nn.functional.sigmoid(scores)
        elif self.saliency_score_normalization_fuc == 'softmax':
            scores_shape = scores.shape
            scores = scores.reshape(*scores.shape[0:2], -1)
            scores = nn.functional.softmax(scores/1, dim=-1)
            scores= scores.reshape(*scores_shape)#.clone()
        else:
            raise ValueError

        keypoints=None

        # Compute the dense descriptors
        cDa = self.relu(self.convDa(x))
        descriptors = self.convDb(cDa)
        if self.normalize_output:
            descriptors = torch.nn.functional.normalize(descriptors, p=2, dim=1)

        return keypoints, scores, descriptors
    def forward(self, data):
        """ Compute keypoints, scores, descriptors for image """
        # Shared Encoder
        x, x_skip=self.forward_encoder(data)

        # Compute the dense keypoint scores
        keypoints, scores, descriptors = self.forward_decoder(x, x_skip)

        return {
            'keypoints': keypoints,
            'scores': scores,
            'descriptors': descriptors,
        }


================================================
FILE: model/descriptor3D.py
================================================
import torch 
import torch.nn as nn 


from kpconv.kpconv_blocks import *
import torch.nn.functional as F
import numpy as np
from kpconv.lib.utils import square_distance

class KPSuperpoint3Dv2(nn.Module):
    #remove useless channels
    def __init__(self, config):
        super().__init__()
        self.normalize_output=config.get('normalize_output', True)

        # build the architectures
        config.architecture = [
        'simple',
        'resnetb',
        ]
        for i in range(config.num_layers-1):
            config.architecture.append('resnetb_strided')
            config.architecture.append('resnetb')
            config.architecture.append('resnetb')
        for i in range(config.num_layers-2):
            config.architecture.append('nearest_upsample')
            config.architecture.append('unary')
        config.architecture.append('nearest_upsample')
        config.architecture.append('last_unary')

        ############
        # Parameters
        ############
        # Current radius of convolution and feature dimension
        layer = 0
        r = config.first_subsampling_dl * config.conv_radius
        in_dim = config.in_features_dim
        out_dim = config.first_feats_dim
        self.K = config.num_kernel_points
        self.epsilon = torch.nn.Parameter(torch.tensor(-5.0))
        self.final_feats_dim = config.final_feats_dim

        #####################
        # List Encoder blocks
        #####################
        # Save all block operations in a list of modules
        self.encoder_blocks = nn.ModuleList()
        self.encoder_skip_dims = []
        self.encoder_skips = []

        # Loop over consecutive blocks
        for block_i, block in enumerate(config.architecture):

            # Check equivariance
            if ('equivariant' in block) and (not out_dim % 3 == 0):
                raise ValueError('Equivariant block but features dimension is not a factor of 3')

            # Detect change to next layer for skip connection
            if np.any([tmp in block for tmp in ['pool', 'strided', 'upsample', 'global']]):
                self.encoder_skips.append(block_i)
                self.encoder_skip_dims.append(in_dim)

            # Detect upsampling block to stop
            if 'upsample' in block:
                break

            # Apply the good block function defining tf ops
            self.encoder_blocks.append(block_decider(block,
                                                    r,
                                                    in_dim,
                                                    out_dim,
                                                    layer,
                                                    config))

            # Update dimension of input from output
            if 'simple' in block:
                in_dim = out_dim // 2
            else:
                in_dim = out_dim

            # Detect change to a subsampled layer
            if 'pool' in block or 'strided' in block:
                # Update radius and feature dimension for next layer
                layer += 1
                r *= 2
                out_dim *= 2

        #####################
        # bottleneck layer 
        #####################
        botneck_feats_dim = config.gnn_feats_dim
        self.bottle = nn.Conv1d(in_dim, botneck_feats_dim,kernel_size=1,bias=True)
        # num_head = config.num_head
        self.proj_gnn = nn.Conv1d(botneck_feats_dim,botneck_feats_dim,kernel_size=1, bias=True)

        
        #####################
        # List Decoder blocks
        #####################
        out_dim = botneck_feats_dim # + 2

        # Save all block operations in a list of modules
        self.decoder_blocks = nn.ModuleList()
        self.decoder_concats = []

        # Find first upsampling block
        start_i = 0
        for block_i, block in enumerate(config.architecture):
            if 'upsample' in block:
                start_i = block_i
                break
        
        # Loop over consecutive blocks
        for block_i, block in enumerate(config.architecture[start_i:]):

            # Add dimension of skip connection concat
            if block_i > 0 and 'upsample' in config.architecture[start_i + block_i - 1]:
                in_dim += self.encoder_skip_dims[layer]
                self.decoder_concats.append(block_i)

            # Apply the good block function defining tf ops
            self.decoder_blocks.append(block_decider(block,
                                                    r,
                                                    in_dim,
                                                    out_dim,
                                                    layer,
                                                    config))

                
            # Update dimension of input from output
            in_dim = out_dim

            # Detect change to a subsampled layer
            if 'upsample' in block:
                # Update radius and feature dimension for next layer
                layer -= 1
                r *= 0.5
                out_dim = out_dim // 2
        return

    def regular_score(self,score):
        score = torch.where(torch.isnan(score), torch.zeros_like(score), score)
        score = torch.where(torch.isinf(score), torch.zeros_like(score), score)
        return score

    def forward_encoder(self, batch):
        # Get input features
        x = batch['features'].clone().detach()
        len_src_c = batch['stack_lengths'][-1][0]
        len_src_f = batch['stack_lengths'][0][0]
        pcd_c = batch['points'][-1]
        pcd_f = batch['points'][0]
        src_pcd_c, tgt_pcd_c = pcd_c[:len_src_c], pcd_c[len_src_c:]

        sigmoid = nn.Sigmoid()
        #################################
        # 1. encoder 
        skip_x = []
        for block_i, block_op in enumerate(self.encoder_blocks):
            if block_i in self.encoder_skips:
                skip_x.append(x)
            x = block_op(x, batch)

        #################################
        # 2. project the bottleneck features
        feats_c = x.transpose(0,1).unsqueeze(0)  #[1, C, N]
        feats_c = self.bottle(feats_c)  #[1, C, N]
        
        return feats_c,skip_x

    def forward_middle(self, x):

        feats_c = self.proj_gnn(x)   
        feats_gnn_raw = feats_c.squeeze(0).transpose(0,1)

        return feats_gnn_raw


    def forward_decoder(self, x, skip_x, batch ):
        sigmoid = nn.Sigmoid()
        for block_i, block_op in enumerate(self.decoder_blocks):
            if block_i in self.decoder_concats:
                x = torch.cat([x, skip_x.pop()], dim=1)
            x = block_op(x, batch)
            
        feats_f = x[:,:self.final_feats_dim]

        # normalise point-wise features
        if self.normalize_output:
            feats_f = F.normalize(feats_f, p=2, dim=1)

        return feats_f 

    def forward(self, batch):
        # Get input features
        feats_c,skip_x = self.forward_encoder(batch)
        x = self.forward_middle(feats_c)
        feats_f = self.forward_decoder(x, skip_x, batch)

        return feats_f


================================================
FILE: model/losses.py
================================================
from sklearn.metrics import precision_recall_fscore_support
from thirdparty.kpconv.lib.utils import square_distance
from abc import ABCMeta, abstractmethod
import time

import numpy as np
import torch
from torch import nn
from torch.autograd import Variable
from torch.nn import functional as F
import torchplus
# from utils.pca import pca_tch

import kornia
# import self_voxelo.utils.pose_utils as pose_utils
import apex.amp as amp

# class Loss(object):


class Loss(nn.Module):
    """Abstract base class for loss functions."""
    __metaclass__ = ABCMeta

    def __init__(self, loss_weight=1):
        super(Loss, self).__init__()
        self._loss_weight = loss_weight

    # def __call__(self,
    def forward(self,
                prediction_tensor,
                target_tensor,
                ignore_nan_targets=False,
                scope=None,
                **params):
        """Call the loss function.

        Args:
          prediction_tensor: an N-d tensor of shape [batch, anchors, ...]
            representing predicted quantities.
          target_tensor: an N-d tensor of shape [batch, anchors, ...] representing
            regression or classification targets.
          ignore_nan_targets: whether to ignore nan targets in the loss computation.
            E.g. can be used if the target tensor is missing groundtruth data that
            shouldn't be factored into the loss.
          scope: Op scope name. Defaults to 'Loss' if None.
          **params: Additional keyword arguments for specific implementations of
                  the Loss.

        Returns:
          loss: a tensor representing the value of the loss function.
        """
        if ignore_nan_targets:
            target_tensor = torch.where(torch.isnan(target_tensor),
                                        prediction_tensor,
                                        target_tensor)
        # ret = self._compute_loss(prediction_tensor, target_tensor, **params)
        # if isinstance(ret, (list, tuple)):
        #     return [self._loss_weight*ret[0]] + list(ret[1:])
        # else:
        return self._loss_weight*self._compute_loss(prediction_tensor, target_tensor, **params)

    @abstractmethod
    @amp.float_function
    def _compute_loss(self, prediction_tensor, target_tensor, **params):
        """Method to be overridden by implementations.

        Args:
          prediction_tensor: a tensor representing predicted quantities
          target_tensor: a tensor representing regression or classification targets
          **params: Additional keyword arguments for specific implementations of
                  the Loss.

        Returns:
          loss: an N-d tensor of shape [batch, anchors, ...] containing the loss per
            anchor
        """

        raise NotImplementedError


class L2Loss(Loss):

    def __init__(self, loss_weight=1):
        super(L2Loss, self).__init__(loss_weight)

    def _compute_loss(self, prediction_tensor, target_tensor, mask=None):
        """Compute loss function.

        Args:
          prediction_tensor: A float tensor of shape [batch_size, num_anchors,
            code_size] representing the (encoded) predicted locations of objects.
          target_tensor: A float tensor of shape [batch_size, num_anchors,
            code_size] representing the regression targets

        Returns:
          loss: a float tensor of shape [batch_size, num_anchors] tensor
            representing the value of the loss function.
        """
        diff = prediction_tensor - target_tensor

        if mask is not None:
            mask = mask.expand_as(diff).byte()
            diff = diff[mask]
        # square_diff = 0.5 * weighted_diff * weighted_diff
        square_diff = diff * diff
        return square_diff.mean()


class AdaptiveWeightedL2Loss(Loss):

    def __init__(self, init_alpha, learn_alpha=True, loss_weight=1, focal_gamma=0):
        super(AdaptiveWeightedL2Loss, self).__init__(loss_weight)
        self.learn_alpha = learn_alpha
        self.alpha = nn.Parameter(torch.Tensor(
            [init_alpha]), requires_grad=learn_alpha)
        self.focal_gamma = focal_gamma
        # self.alpha_shift = -13  # -10# TODO: temporarily test

    def _compute_loss(self, prediction_tensor, target_tensor, mask=None, alpha=None, focal_gamma=None):
        """Compute loss function.

        Args:
          prediction_tensor: A float tensor of shape [batch_size, num_anchors,
            code_size] representing the (encoded) predicted locations of objects.
          target_tensor: A float tensor of shape [batch_size, num_anchors,
            code_size] representing the regression targets

        Returns:
          loss: a float tensor of shape [batch_size, num_anchors] tensor
            representing the value of the loss function.
        """

        if focal_gamma is None:
            focal_gamma = self.focal_gamma
        _alpha = self.alpha
        if mask is None:
            mask = torch.ones_like(target_tensor)
        else:
            mask = mask.expand_as(target_tensor)

        diff = prediction_tensor - target_tensor
        square_diff = (diff * diff) * mask

        # loss = square_diff.mean()
        input_shape = prediction_tensor.shape
        loss = torch.sum(square_diff, dim=list(range(1, len(input_shape)))) / \
            (torch.sum(mask, dim=list(range(1, len(input_shape)))) + 1e-12)  # (B,)

        focal_weight = (torch.exp(-_alpha) * loss)**focal_gamma
        focal_weight = focal_weight/(torch.sum(focal_weight) + 1e-12)

        loss = focal_weight*(torch.exp(-_alpha) * loss)
        loss = loss.sum() + _alpha
        return loss


class MetricLoss(nn.Module):
    """
    We evaluate both contrastive loss and circle loss
    """

    def __init__(self, configs, log_scale=16, pos_optimal=0.1, neg_optimal=1.4, ):
        super(MetricLoss, self).__init__()
        self.log_scale = log_scale
        self.pos_optimal = pos_optimal
        self.neg_optimal = neg_optimal

        self.pos_margin = configs.pos_margin
        self.neg_margin = configs.neg_margin
        self.max_points = configs.max_points

        self.safe_radius = configs.safe_radius
        self.matchability_radius = configs.matchability_radius
        # just to take care of the numeric precision
        self.pos_radius = configs.pos_radius + 0.001
        self.weight = configs.get('loss_weight', 1)

    def get_circle_loss(self, coords_dist, feats_dist):
        """
        Modified from: https://github.com/XuyangBai/D3Feat.pytorch
        """
        pos_mask = coords_dist < self.pos_radius
        neg_mask = coords_dist > self.safe_radius

        # get anchors that have both positive and negative pairs
        row_sel = ((pos_mask.sum(-1) > 0) * (neg_mask.sum(-1) > 0)).detach()
        col_sel = ((pos_mask.sum(-2) > 0) * (neg_mask.sum(-2) > 0)).detach()

        # get alpha for both positive and negative pairs
        pos_weight = feats_dist - 1e5 * \
            (~pos_mask).float()  # mask the non-positive
        # mask the uninformative positive
        pos_weight = (pos_weight - self.pos_optimal)
        pos_weight = torch.max(torch.zeros_like(
            pos_weight), pos_weight).detach()

        neg_weight = feats_dist + 1e5 * \
            (~neg_mask).float()  # mask the non-negative
        # mask the uninformative negative
        neg_weight = (self.neg_optimal - neg_weight)
        neg_weight = torch.max(torch.zeros_like(
            neg_weight), neg_weight).detach()

        lse_pos_row = torch.logsumexp(
            self.log_scale * (feats_dist - self.pos_margin) * pos_weight, dim=-1)
        lse_pos_col = torch.logsumexp(
            self.log_scale * (feats_dist - self.pos_margin) * pos_weight, dim=-2)

        lse_neg_row = torch.logsumexp(
            self.log_scale * (self.neg_margin - feats_dist) * neg_weight, dim=-1)
        lse_neg_col = torch.logsumexp(
            self.log_scale * (self.neg_margin - feats_dist) * neg_weight, dim=-2)

        loss_row = F.softplus(lse_pos_row + lse_neg_row)/self.log_scale
        loss_col = F.softplus(lse_pos_col + lse_neg_col)/self.log_scale

        circle_loss = (loss_row[row_sel].mean() + loss_col[col_sel].mean()) / 2

        return circle_loss

    def get_recall(self, coords_dist, feats_dist):
        """
        Get feature match recall, divided by number of true inliers
        """
        pos_mask = coords_dist < self.pos_radius
        n_gt_pos = (pos_mask.sum(-1) > 0).float().sum()+1e-12
        _, sel_idx = torch.min(feats_dist, -1)

        sel_dist = torch.gather(coords_dist, dim=-1,
                                index=sel_idx[:, None])[pos_mask.sum(-1) > 0]
        n_pred_pos = (sel_dist < self.pos_radius).float().sum()
        recall = n_pred_pos / n_gt_pos
        return recall

    def get_weighted_bce_loss(self, prediction, gt):
        loss = nn.BCELoss(reduction='none')

        class_loss = loss(prediction, gt)

        weights = torch.ones_like(gt)
        w_negative = gt.sum()/gt.size(0)
        w_positive = 1 - w_negative

        weights[gt >= 0.5] = w_positive
        weights[gt < 0.5] = w_negative
        w_class_loss = torch.mean(weights * class_loss)

        #######################################
        # get classification precision and recall
        predicted_labels = prediction.detach().cpu().round().numpy()
        cls_precision, cls_recall, _, _ = precision_recall_fscore_support(
            gt.cpu().numpy(), predicted_labels, average='binary')

        return w_class_loss, cls_precision, cls_recall

    def forward(self, src_pcd, tgt_pcd, src_feats, tgt_feats, correspondence, scores_overlap, scores_saliency, rot=None, trans=None):
        """
        Circle loss for metric learning, here we feed the positive pairs only
        Input:
            src_pcd:        [N, 3], pcd of the 3d model 
            tgt_pcd:        [M, 3], pcd of the lifted model from 2d depth
            rot:            [3, 3], rotation used to rotate the src_pcd to the current frame
            trans:          [3, 1], translation used to translate the src_pcd to the current frame
            src_feats:      [N, C]
            tgt_feats:      [M, C]
        """

        if rot is not None and trans is not None:
            src_pcd = (torch.matmul(rot, src_pcd.transpose(0, 1)) +
                       trans).transpose(0, 1)

        stats = dict()

        #######################################
        # filter some of correspondence
        if(correspondence.size(0) > self.max_points):
            choice = np.random.permutation(correspondence.size(0))[
                :self.max_points]
            correspondence = correspondence[choice]

        src_idx = correspondence[:, 0]
        tgt_idx = correspondence[:, 1]
        src_pcd, tgt_pcd = src_pcd[src_idx], tgt_pcd[tgt_idx]
        src_feats, tgt_feats = src_feats[src_idx], tgt_feats[tgt_idx]

        #######################
        # get L2 distance between source / target point cloud

        coords_dist = torch.sqrt(square_distance(
            src_pcd[None, :, :], tgt_pcd[None, :, :]).squeeze(0))
        feats_dist = torch.sqrt(square_distance(
            src_feats[None, :, :], tgt_feats[None, :, :], normalised=True)).squeeze(0)

        ##############################
        # get FMR and circle loss
        ##############################
        recall = self.get_recall(coords_dist, feats_dist)
        circle_loss = self.get_circle_loss(coords_dist, feats_dist)

        stats['circle_loss'] = circle_loss
        stats['recall'] = recall

        return stats


class PointAlignmentLoss(nn.Module):
    def __init__(self, loss_weight=1, ):
        super().__init__()
        self._loss_weight = loss_weight

    def forward(self, R_pred, t_pred, R_tgt, t_tgt, points):
        return self._loss_weight*self._compute_loss(R_pred, t_pred, R_tgt, t_tgt, points)

    def _compute_loss(self, R_pred, t_pred, R_tgt, t_tgt, points, ):
        """[summary]

        Args:
            R_pred ([type]): [Bx3x3]
            t_pred ([type]): [Bx3]
            R_tgt ([type]): [Bx3x3]
            t_tgt ([type]): [Bx3]
            points ([type]): [BxNx3]

        Returns:
            loss [type]: [loss value]
        """

        loss = 0
        for b in range(len(points)):

            diff = points[b]@R_pred[b].transpose(-1, -2) + t_pred[b] - (
                points[b]@R_tgt[b].transpose(-1, -2)+t_tgt[b])

            square_diff = diff.abs()  
            loss += torch.mean(square_diff)*3

        # loss/=len(points)

        return loss


================================================
FILE: scripts/compile_3rdparty.sh
================================================
#!/usr/bin/bash

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"

cd $SCRIPT_DIR/../thirdparty/kpconv/cpp_wrappers
bash compile_wrappers.sh

cd $SCRIPT_DIR/../thirdparty/nn
python setup.py build_ext --inplace

================================================
FILE: scripts/eval.sh
================================================
export PROJECT_ROOT_PATH=/home/RNNPose/Projects/Works/RNNPose_release

export PYTHONPATH="$PROJECT_ROOT_PATH:$PYTHONPATH"
export PYTHONPATH="$PROJECT_ROOT_PATH/thirdparty:$PYTHONPATH"
export model_dir='outputs'
seq=cat
gpu=1
start_gpu_id=0
mkdir $model_dir

train_file=$PROJECT_ROOT_PATH/tools/eval.py
config_path=/$PROJECT_ROOT_PATH/config/linemod/"$seq"_fw0.5.yml
pretrain=$PROJECT_ROOT_PATH/weights/trained_models/"$seq".tckpt

python -u $train_file multi_proc_train  \
        --config_path $config_path \
        --model_dir $model_dir/results \
        --use_dist True \
        --dist_port 10000 \
        --gpus_per_node $gpu \
        --optim_eval True \
        --use_apex False \
        --world_size $gpu \
        --start_gpu_id $start_gpu_id \
        --pretrained_path $pretrain 
 

================================================
FILE: scripts/eval_lmocc.sh
================================================
export PROJECT_ROOT_PATH=/home/RNNPose/Projects/Works/RNNPose_release

export PYTHONPATH="$PROJECT_ROOT_PATH:$PYTHONPATH"
export PYTHONPATH="$PROJECT_ROOT_PATH/thirdparty:$PYTHONPATH"
export model_dir='outputs'
seq=ape
gpu=1
start_gpu_id=0
mkdir $model_dir

train_file=$PROJECT_ROOT_PATH/tools/eval.py
config_path=/$PROJECT_ROOT_PATH/config/linemod/"$seq"_fw0.5_occ.yml
pretrain=$PROJECT_ROOT_PATH/weights/trained_models/"$seq".tckpt

python -u $train_file multi_proc_train  \
        --config_path $config_path \
        --model_dir $model_dir/results \
        --use_dist True \
        --dist_port 10000 \
        --gpus_per_node $gpu \
        --optim_eval True \
        --use_apex False \
        --world_size $gpu \
        --start_gpu_id $start_gpu_id \
        --pretrained_path $pretrain 
        # --use_apex True \
 

================================================
FILE: scripts/run_dataformatter.sh
================================================
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
PROJ_ROOT=$SCRIPT_DIR/../

python $PROJ_ROOT/tools/transform_data_format.py run --data_type "LM_FUSE_PVNET"  --data_info_path $PROJ_ROOT/EXPDATA/"/data_info/linemod_all_10k_default.info.all"  --image_root $PROJ_ROOT/EXPDATA/raw_data/fuse --depth_root $PROJ_ROOT/EXPDATA/raw_data/orig_renders --save_dir $PROJ_ROOT/EXPDATA/LINEMOD/fuse_formatted  && touch 3.finished 


================================================
FILE: scripts/run_datainfo_generation.sh
================================================

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
PROJ_ROOT=$SCRIPT_DIR/../
export PYTHONPATH=$PROJ_ROOT:PYTHONPATH

mkdir --parent $PROJ_ROOT/EXPDATA/data_info/deepim/

python $PROJ_ROOT/tools/generate_data_info_deepim_0_orig.py  create_data_info  --data_root $PROJ_ROOT/EXPDATA/LM6d_converted/LM6d_refine  --saving_path $PROJ_ROOT/EXPDATA/data_info/deepim/linemod_orig_deepim.info  

python $PROJ_ROOT/tools/generate_data_info_deepim_1_syn.py  create_data_info  --data_root $PROJ_ROOT/EXPDATA/LM6d_converted/LM6d_refine_syn  --saving_path $PROJ_ROOT/EXPDATA/data_info/deepim/linemod_syn_deepim.info  --with_assertion True

python $PROJ_ROOT/tools/generate_data_info_deepim_2_posecnnval.py  create_data_info  --data_root $PROJ_ROOT/EXPDATA/LM6d_converted/LM6d_refine  --saving_path $PROJ_ROOT/EXPDATA/data_info/linemod_posecnn.info  --with_assertion True

python $PROJ_ROOT/tools/generate_data_info_v2_deepim.py  create_data_info  --data_root $PROJ_ROOT/EXPDATA/LINEMOD/fuse_formatted/  --saving_path $PROJ_ROOT/EXPDATA/data_info/linemod_fusesformatted_all10k_deepim.info  --training_data_ratio 1 --shuffle=False


================================================
FILE: scripts/train.sh
================================================
export PROJECT_ROOT_PATH=/home/RNNPose/Projects/Works/RNNPose_release

export PYTHONPATH="$PROJECT_ROOT_PATH:$PYTHONPATH"
export PYTHONPATH="$PROJECT_ROOT_PATH/thirdparty:$PYTHONPATH"
export model_dir='outputs'
seq=cat
gpu=1
start_gpu_id=0
mkdir $model_dir

train_file=$PROJECT_ROOT_PATH/tools/train.py
config_path=/$PROJECT_ROOT_PATH/config/linemod/"$seq"_fw0.5.yml
# pretrain=$PROJECT_ROOT_PATH/weights/trained_models/"$seq".tckpt

python -u $train_file multi_proc_train  \
        --config_path $config_path \
        --model_dir $model_dir/results \
        --use_dist True \
        --dist_port 10000 \
        --gpus_per_node $gpu \
        --optim_eval True \
        --use_apex False \
        --world_size $gpu \
        --start_gpu_id $start_gpu_id \
        # --pretrained_path $pretrain 
 

================================================
FILE: thirdparty/kpconv/__init__.py
================================================


================================================
FILE: thirdparty/kpconv/cpp_wrappers/compile_wrappers.sh
================================================
#!/bin/bash

# Compile cpp subsampling
cd cpp_subsampling
python3 setup.py build_ext --inplace
cd ..

# Compile cpp neighbors
cd cpp_neighbors
python3 setup.py build_ext --inplace
cd ..

================================================
FILE: thirdparty/kpconv/cpp_wrappers/cpp_neighbors/build.bat
================================================
@echo off
py setup.py build_ext --inplace


pause

================================================
FILE: thirdparty/kpconv/cpp_wrappers/cpp_neighbors/neighbors/neighbors.cpp
================================================

#include "neighbors.h"


void brute_neighbors(vector<PointXYZ>& queries, vector<PointXYZ>& supports, vector<int>& neighbors_indices, float radius, int verbose)
{

	// Initialize variables
	// ******************

	// square radius
	float r2 = radius * radius;

	// indices
	int i0 = 0;

	// Counting vector
	int max_count = 0;
	vector<vector<int>> tmp(queries.size());

	// Search neigbors indices
	// ***********************

	for (auto& p0 : queries)
	{
		int i = 0;
		for (auto& p : supports)
		{
			if ((p0 - p).sq_norm() < r2)
			{
				tmp[i0].push_back(i);
				if (tmp[i0].size() > max_count)
					max_count = tmp[i0].size();
			}
			i++;
		}
		i0++;
	}

	// Reserve the memory
	neighbors_indices.resize(queries.size() * max_count);
	i0 = 0;
	for (auto& inds : tmp)
	{
		for (int j = 0; j < max_count; j++)
		{
			if (j < inds.size())
				neighbors_indices[i0 * max_count + j] = inds[j];
			else
				neighbors_indices[i0 * max_count + j] = -1;
		}
		i0++;
	}

	return;
}

void ordered_neighbors(vector<PointXYZ>& queries,
                        vector<PointXYZ>& supports,
                        vector<int>& neighbors_indices,
                        float radius)
{

	// Initialize variables
	// ******************

	// square radius
	float r2 = radius * radius;

	// indices
	int i0 = 0;

	// Counting vector
	int max_count = 0;
	float d2;
	vector<vector<int>> tmp(queries.size());
	vector<vector<float>> dists(queries.size());

	// Search neigbors indices
	// ***********************

	for (auto& p0 : queries)
	{
		int i = 0;
		for (auto& p : supports)
		{
		    d2 = (p0 - p).sq_norm();
			if (d2 < r2)
			{
			    // Find order of the new point
			    auto it = std::upper_bound(dists[i0].begin(), dists[i0].end(), d2);
			    int index = std::distance(dists[i0].begin(), it);

			    // Insert element
                dists[i0].insert(it, d2);
                tmp[i0].insert(tmp[i0].begin() + index, i);

			    // Update max count
				if (tmp[i0].size() > max_count)
					max_count = tmp[i0].size();
			}
			i++;
		}
		i0++;
	}

	// Reserve the memory
	neighbors_indices.resize(queries.size() * max_count);
	i0 = 0;
	for (auto& inds : tmp)
	{
		for (int j = 0; j < max_count; j++)
		{
			if (j < inds.size())
				neighbors_indices[i0 * max_count + j] = inds[j];
			else
				neighbors_indices[i0 * max_count + j] = -1;
		}
		i0++;
	}

	return;
}

void batch_ordered_neighbors(vector<PointXYZ>& queries,
                                vector<PointXYZ>& supports,
                                vector<int>& q_batches,
                                vector<int>& s_batches,
                                vector<int>& neighbors_indices,
                                float radius)
{

	// Initialize variables
	// ******************

	// square radius
	float r2 = radius * radius;

	// indices
	int i0 = 0;

	// Counting vector
	int max_count = 0;
	float d2;
	vector<vector<int>> tmp(queries.size());
	vector<vector<float>> dists(queries.size());

	// batch index
	int b = 0;
	int sum_qb = 0;
	int sum_sb = 0;


	// Search neigbors indices
	// ***********************

	for (auto& p0 : queries)
	{
	    // Check if we changed batch
	    if (i0 == sum_qb + q_batches[b])
	    {
	        sum_qb += q_batches[b];
	        sum_sb += s_batches[b];
	        b++;
	    }

	    // Loop only over the supports of current batch
	    vector<PointXYZ>::iterator p_it;
		int i = 0;
        for(p_it = supports.begin() + sum_sb; p_it < supports.begin() + sum_sb + s_batches[b]; p_it++ )
        {
		    d2 = (p0 - *p_it).sq_norm();
			if (d2 < r2)
			{
			    // Find order of the new point
			    auto it = std::upper_bound(dists[i0].begin(), dists[i0].end(), d2);
			    int index = std::distance(dists[i0].begin(), it);

			    // Insert element
                dists[i0].insert(it, d2);
                tmp[i0].insert(tmp[i0].begin() + index, sum_sb + i);

			    // Update max count
				if (tmp[i0].size() > max_count)
					max_count = tmp[i0].size();
			}
			i++;
		}
		i0++;
	}

	// Reserve the memory
	neighbors_indices.resize(queries.size() * max_count);
	i0 = 0;
	for (auto& inds : tmp)
	{
		for (int j = 0; j < max_count; j++)
		{
			if (j < inds.size())
				neighbors_indices[i0 * max_count + j] = inds[j];
			else
				neighbors_indices[i0 * max_count + j] = supports.size();
		}
		i0++;
	}

	return;
}


void batch_nanoflann_neighbors(vector<PointXYZ>& queries,
                                vector<PointXYZ>& supports,
                                vector<int>& q_batches,
                                vector<int>& s_batches,
                                vector<int>& neighbors_indices,
                                float radius)
{

	// Initialize variables
	// ******************

	// indices
	int i0 = 0;

	// Square radius
	float r2 = radius * radius;

	// Counting vector
	int max_count = 0;
	float d2;
	vector<vector<pair<size_t, float>>> all_inds_dists(queries.size());

	// batch index
	int b = 0;
	int sum_qb = 0;
	int sum_sb = 0;

	// Nanoflann related variables
	// ***************************

	// CLoud variable
	PointCloud current_cloud;

	// Tree parameters
	nanoflann::KDTreeSingleIndexAdaptorParams tree_params(10 /* max leaf */);

	// KDTree type definition
    typedef nanoflann::KDTreeSingleIndexAdaptor< nanoflann::L2_Simple_Adaptor<float, PointCloud > ,
                                                        PointCloud,
                                                        3 > my_kd_tree_t;

    // Pointer to trees
    my_kd_tree_t* index;

    // Build KDTree for the first batch element
    current_cloud.pts = vector<PointXYZ>(supports.begin() + sum_sb, supports.begin() + sum_sb + s_batches[b]);
    index = new my_kd_tree_t(3, current_cloud, tree_params);
    index->buildIndex();


	// Search neigbors indices
	// ***********************

    // Search params
    nanoflann::SearchParams search_params;
    search_params.sorted = true;

	for (auto& p0 : queries)
	{

	    // Check if we changed batch
	    if (i0 == sum_qb + q_batches[b])
	    {
	        sum_qb += q_batches[b];
	        sum_sb += s_batches[b];
	        b++;

	        // Change the points
	        current_cloud.pts.clear();
            current_cloud.pts = vector<PointXYZ>(supports.begin() + sum_sb, supports.begin() + sum_sb + s_batches[b]);

	        // Build KDTree of the current element of the batch
            delete index;
            index = new my_kd_tree_t(3, current_cloud, tree_params);
            index->buildIndex();
	    }

	    // Initial guess of neighbors size
        all_inds_dists[i0].reserve(max_count);

	    // Find neighbors
	    float query_pt[3] = { p0.x, p0.y, p0.z};
		size_t nMatches = index->radiusSearch(query_pt, r2, all_inds_dists[i0], search_params);

        // Update max count
        if (nMatches > max_count)
            max_count = nMatches;

        // Increment query idx
		i0++;
	}

	// Reserve the memory
	neighbors_indices.resize(queries.size() * max_count);
	i0 = 0;
	sum_sb = 0;
	sum_qb = 0;
	b = 0;
	for (auto& inds_dists : all_inds_dists)
	{
	    // Check if we changed batch
	    if (i0 == sum_qb + q_batches[b])
	    {
	        sum_qb += q_batches[b];
	        sum_sb += s_batches[b];
	        b++;
	    }

		for (int j = 0; j < max_count; j++)
		{
			if (j < inds_dists.size())
				neighbors_indices[i0 * max_count + j] = inds_dists[j].first + sum_sb;
			else
				neighbors_indices[i0 * max_count + j] = supports.size();
		}
		i0++;
	}

	delete index;

	return;
}


================================================
FILE: thirdparty/kpconv/cpp_wrappers/cpp_neighbors/neighbors/neighbors.h
================================================


#include "../../cpp_utils/cloud/cloud.h"
#include "../../cpp_utils/nanoflann/nanoflann.hpp"

#include <set>
#include <cstdint>

using namespace std;


void ordered_neighbors(vector<PointXYZ>& queries,
                        vector<PointXYZ>& supports,
                        vector<int>& neighbors_indices,
                        float radius);

void batch_ordered_neighbors(vector<PointXYZ>& queries,
                                vector<PointXYZ>& supports,
                                vector<int>& q_batches,
                                vector<int>& s_batches,
                                vector<int>& neighbors_indices,
                                float radius);

void batch_nanoflann_neighbors(vector<PointXYZ>& queries,
                                vector<PointXYZ>& supports,
                                vector<int>& q_batches,
                                vector<int>& s_batches,
                                vector<int>& neighbors_indices,
                                float radius);


================================================
FILE: thirdparty/kpconv/cpp_wrappers/cpp_neighbors/setup.py
================================================
from distutils.core import setup, Extension
import numpy.distutils.misc_util

# Adding OpenCV to project
# ************************

# Adding sources of the project
# *****************************

SOURCES = ["../cpp_utils/cloud/cloud.cpp",
             "neighbors/neighbors.cpp",
             "wrapper.cpp"]

module = Extension(name="radius_neighbors",
                    sources=SOURCES,
                    extra_compile_args=['-std=c++11',
                                        '-D_GLIBCXX_USE_CXX11_ABI=0'])


setup(ext_modules=[module], include_dirs=numpy.distutils.misc_util.get_numpy_include_dirs())


================================================
FILE: thirdparty/kpconv/cpp_wrappers/cpp_neighbors/wrapper.cpp
================================================
#include <Python.h>
#include <numpy/arrayobject.h>
#include "neighbors/neighbors.h"
#include <string>


// docstrings for our module
// *************************

static char module_docstring[] = "This module provides two methods to compute radius neighbors from pointclouds or batch of pointclouds";

static char batch_query_docstring[] = "Method to get radius neighbors in a batch of stacked pointclouds";


// Declare the functions
// *********************

static PyObject *batch_neighbors(PyObject *self, PyObject *args, PyObject *keywds);


// Specify the members of the module
// *********************************

static PyMethodDef module_methods[] = 
{
	{ "batch_query", (PyCFunction)batch_neighbors, METH_VARARGS | METH_KEYWORDS, batch_query_docstring },
	{NULL, NULL, 0, NULL}
};


// Initialize the module
// *********************

static struct PyModuleDef moduledef = 
{
    PyModuleDef_HEAD_INIT,
    "radius_neighbors",		// m_name
    module_docstring,       // m_doc
    -1,                     // m_size
    module_methods,         // m_methods
    NULL,                   // m_reload
    NULL,                   // m_traverse
    NULL,                   // m_clear
    NULL,                   // m_free
};

PyMODINIT_FUNC PyInit_radius_neighbors(void)
{
    import_array();
	return PyModule_Create(&moduledef);
}


// Definition of the batch_subsample method
// **********************************

static PyObject* batch_neighbors(PyObject* self, PyObject* args, PyObject* keywds)
{

	// Manage inputs
	// *************

	// Args containers
	PyObject* queries_obj = NULL;
	PyObject* supports_obj = NULL;
	PyObject* q_batches_obj = NULL;
	PyObject* s_batches_obj = NULL;

	// Keywords containers
	static char* kwlist[] = { "queries", "supports", "q_batches", "s_batches", "radius", NULL };
	float radius = 0.1;

	// Parse the input  
	if (!PyArg_ParseTupleAndKeywords(args, keywds, "OOOO|$f", kwlist, &queries_obj, &supports_obj, &q_batches_obj, &s_batches_obj, &radius))
	{
		PyErr_SetString(PyExc_RuntimeError, "Error parsing arguments");
		return NULL;
	}


	// Interpret the input objects as numpy arrays.
	PyObject* queries_array = PyArray_FROM_OTF(queries_obj, NPY_FLOAT, NPY_IN_ARRAY);
	PyObject* supports_array = PyArray_FROM_OTF(supports_obj, NPY_FLOAT, NPY_IN_ARRAY);
	PyObject* q_batches_array = PyArray_FROM_OTF(q_batches_obj, NPY_INT, NPY_IN_ARRAY);
	PyObject* s_batches_array = PyArray_FROM_OTF(s_batches_obj, NPY_INT, NPY_IN_ARRAY);

	// Verify data was load correctly.
	if (queries_array == NULL)
	{
		Py_XDECREF(queries_array);
		Py_XDECREF(supports_array);
		Py_XDECREF(q_batches_array);
		Py_XDECREF(s_batches_array);
		PyErr_SetString(PyExc_RuntimeError, "Error converting query points to numpy arrays of type float32");
		return NULL;
	}
	if (supports_array == NULL)
	{
		Py_XDECREF(queries_array);
		Py_XDECREF(supports_array);
		Py_XDECREF(q_batches_array);
		Py_XDECREF(s_batches_array);
		PyErr_SetString(PyExc_RuntimeError, "Error converting support points to numpy arrays of type float32");
		return NULL;
	}
	if (q_batches_array == NULL)
	{
		Py_XDECREF(queries_array);
		Py_XDECREF(supports_array);
		Py_XDECREF(q_batches_array);
		Py_XDECREF(s_batches_array);
		PyErr_SetString(PyExc_RuntimeError, "Error converting query batches to numpy arrays of type int32");
		return NULL;
	}
	if (s_batches_array == NULL)
	{
		Py_XDECREF(queries_array);
		Py_XDECREF(supports_array);
		Py_XDECREF(q_batches_array);
		Py_XDECREF(s_batches_array);
		PyErr_SetString(PyExc_RuntimeError, "Error converting support batches to numpy arrays of type int32");
		return NULL;
	}

	// Check that the input array respect the dims
	if ((int)PyArray_NDIM(queries_array) != 2 || (int)PyArray_DIM(queries_array, 1) != 3)
	{
		Py_XDECREF(queries_array);
		Py_XDECREF(supports_array);
		Py_XDECREF(q_batches_array);
		Py_XDECREF(s_batches_array);
		PyErr_SetString(PyExc_RuntimeError, "Wrong dimensions : query.shape is not (N, 3)");
		return NULL;
	}
	if ((int)PyArray_NDIM(supports_array) != 2 || (int)PyArray_DIM(supports_array, 1) != 3)
	{
		Py_XDECREF(queries_array);
		Py_XDECREF(supports_array);
		Py_XDECREF(q_batches_array);
		Py_XDECREF(s_batches_array);
		PyErr_SetString(PyExc_RuntimeError, "Wrong dimensions : support.shape is not (N, 3)");
		return NULL;
	}
	if ((int)PyArray_NDIM(q_batches_array) > 1)
	{
		Py_XDECREF(queries_array);
		Py_XDECREF(supports_array);
		Py_XDECREF(q_batches_array);
		Py_XDECREF(s_batches_array);
		PyErr_SetString(PyExc_RuntimeError, "Wrong dimensions : queries_batches.shape is not (B,) ");
		return NULL;
	}
	if ((int)PyArray_NDIM(s_batches_array) > 1)
	{
		Py_XDECREF(queries_array);
		Py_XDECREF(supports_array);
		Py_XDECREF(q_batches_array);
		Py_XDECREF(s_batches_array);
		PyErr_SetString(PyExc_RuntimeError, "Wrong dimensions : supports_batches.shape is not (B,) ");
		return NULL;
	}
	if ((int)PyArray_DIM(q_batches_array, 0) != (int)PyArray_DIM(s_batches_array, 0))
	{
		Py_XDECREF(queries_array);
		Py_XDECREF(supports_array);
		Py_XDECREF(q_batches_array);
		Py_XDECREF(s_batches_array);
		PyErr_SetString(PyExc_RuntimeError, "Wrong number of batch elements: different for queries and supports ");
		return NULL;
	}

	// Number of points
	int Nq = (int)PyArray_DIM(queries_array, 0);
	int Ns= (int)PyArray_DIM(supports_array, 0);

	// Number of batches
	int Nb = (int)PyArray_DIM(q_batches_array, 0);

	// Call the C++ function
	// *********************

	// Convert PyArray to Cloud C++ class
	vector<PointXYZ> queries;
	vector<PointXYZ> supports;
	vector<int> q_batches;
	vector<int> s_batches;
	queries = vector<PointXYZ>((PointXYZ*)PyArray_DATA(queries_array), (PointXYZ*)PyArray_DATA(queries_array) + Nq);
	supports = vector<PointXYZ>((PointXYZ*)PyArray_DATA(supports_array), (PointXYZ*)PyArray_DATA(supports_array) + Ns);
	q_batches = vector<int>((int*)PyArray_DATA(q_batches_array), (int*)PyArray_DATA(q_batches_array) + Nb);
	s_batches = vector<int>((int*)PyArray_DATA(s_batches_array), (int*)PyArray_DATA(s_batches_array) + Nb);

	// Create result containers
	vector<int> neighbors_indices;

	// Compute results
	//batch_ordered_neighbors(queries, supports, q_batches, s_batches, neighbors_indices, radius);
	batch_nanoflann_neighbors(queries, supports, q_batches, s_batches, neighbors_indices, radius);

	// Check result
	if (neighbors_indices.size() < 1)
	{
		PyErr_SetString(PyExc_RuntimeError, "Error");
		return NULL;
	}

	// Manage outputs
	// **************

	// Maximal number of neighbors
	int max_neighbors = neighbors_indices.size() / Nq;

	// Dimension of output containers
	npy_intp* neighbors_dims = new npy_intp[2];
	neighbors_dims[0] = Nq;
	neighbors_dims[1] = max_neighbors;

	// Create output array
	PyObject* res_obj = PyArray_SimpleNew(2, neighbors_dims, NPY_INT);
	PyObject* ret = NULL;

	// Fill output array with values
	size_t size_in_bytes = Nq * max_neighbors * sizeof(int);
	memcpy(PyArray_DATA(res_obj), neighbors_indices.data(), size_in_bytes);

	// Merge results
	ret = Py_BuildValue("N", res_obj);

	// Clean up
	// ********

	Py_XDECREF(queries_array);
	Py_XDECREF(supports_array);
	Py_XDECREF(q_batches_array);
	Py_XDECREF(s_batches_array);

	return ret;
}


================================================
FILE: thirdparty/kpconv/cpp_wrappers/cpp_subsampling/build.bat
================================================
@echo off
py setup.py build_ext --inplace


pause

================================================
FILE: thirdparty/kpconv/cpp_wrappers/cpp_subsampling/grid_subsampling/grid_subsampling.cpp
================================================

#include "grid_subsampling.h"


void grid_subsampling(vector<PointXYZ>& original_points,
                      vector<PointXYZ>& subsampled_points,
                      vector<float>& original_features,
                      vector<float>& subsampled_features,
                      vector<int>& original_classes,
                      vector<int>& subsampled_classes,
                      float sampleDl,
                      int verbose) {

	// Initialize variables
	// ******************

	// Number of points in the cloud
	size_t N = original_points.size();

	// Dimension of the features
	size_t fdim = original_features.size() / N;
	size_t ldim = original_classes.size() / N;

	// Limits of the cloud
	PointXYZ minCorner = min_point(original_points);
	PointXYZ maxCorner = max_point(original_points);
	PointXYZ originCorner = floor(minCorner * (1/sampleDl)) * sampleDl;

	// Dimensions of the grid
	size_t sampleNX = (size_t)floor((maxCorner.x - originCorner.x) / sampleDl) + 1;
	size_t sampleNY = (size_t)floor((maxCorner.y - originCorner.y) / sampleDl) + 1;
	//size_t sampleNZ = (size_t)floor((maxCorner.z - originCorner.z) / sampleDl) + 1;

	// Check if features and classes need to be processed
	bool use_feature = original_features.size() > 0;
	bool use_classes = original_classes.size() > 0;


	// Create the sampled map
	// **********************

	// Verbose parameters
	int i = 0;
	int nDisp = N / 100;

	// Initialize variables
	size_t iX, iY, iZ, mapIdx;
	unordered_map<size_t, SampledData> data;

	for (auto& p : original_points)
	{
		// Position of point in sample map
		iX = (size_t)floor((p.x - originCorner.x) / sampleDl);
		iY = (size_t)floor((p.y - originCorner.y) / sampleDl);
		iZ = (size_t)floor((p.z - originCorner.z) / sampleDl);
		mapIdx = iX + sampleNX*iY + sampleNX*sampleNY*iZ;

		// If not already created, create key
		if (data.count(mapIdx) < 1)
			data.emplace(mapIdx, SampledData(fdim, ldim));

		// Fill the sample map
		if (use_feature && use_classes)
			data[mapIdx].update_all(p, original_features.begin() + i * fdim, original_classes.begin() + i * ldim);
		else if (use_feature)
			data[mapIdx].update_features(p, original_features.begin() + i * fdim);
		else if (use_classes)
			data[mapIdx].update_classes(p, original_classes.begin() + i * ldim);
		else
			data[mapIdx].update_points(p);

		// Display
		i++;
		if (verbose > 1 && i%nDisp == 0)
			std::cout << "\rSampled Map : " << std::setw(3) << i / nDisp << "%";

	}

	// Divide for barycentre and transfer to a vector
	subsampled_points.reserve(data.size());
	if (use_feature)
		subsampled_features.reserve(data.size() * fdim);
	if (use_classes)
		subsampled_classes.reserve(data.size() * ldim);
	for (auto& v : data)
	{
		subsampled_points.push_back(v.second.point * (1.0 / v.second.count));
		if (use_feature)
		{
		    float count = (float)v.second.count;
		    transform(v.second.features.begin(),
                      v.second.features.end(),
                      v.second.features.begin(),
                      [count](float f) { return f / count;});
            subsampled_features.insert(subsampled_features.end(),v.second.features.begin(),v.second.features.end());
		}
		if (use_classes)
		{
		    for (int i = 0; i < ldim; i++)
		        subsampled_classes.push_back(max_element(v.second.labels[i].begin(), v.second.labels[i].end(),
		        [](const pair<int, int>&a, const pair<int, int>&b){return a.second < b.second;})->first);
		}
	}

	return;
}


void batch_grid_subsampling(vector<PointXYZ>& original_points,
                              vector<PointXYZ>& subsampled_points,
                              vector<float>& original_features,
                              vector<float>& subsampled_features,
                              vector<int>& original_classes,
                              vector<int>& subsampled_classes,
                              vector<int>& original_batches,
                              vector<int>& subsampled_batches,
                              float sampleDl,
                              int max_p)
{
	// Initialize variables
	// ******************

	int b = 0;
	int sum_b = 0;

	// Number of points in the cloud
	size_t N = original_points.size();

	// Dimension of the features
	size_t fdim = original_features.size() / N;
	size_t ldim = original_classes.size() / N;

	// Handle max_p = 0
	if (max_p < 1)
	    max_p = N;

	// Loop over batches
	// *****************

	for (b = 0; b < original_batches.size(); b++)
	{

	    // Extract batch points features and labels
	    vector<PointXYZ> b_o_points = vector<PointXYZ>(original_points.begin () + sum_b,
	                                                   original_points.begin () + sum_b + original_batches[b]);

        vector<float> b_o_features;
        if (original_features.size() > 0)
        {
            b_o_features = vector<float>(original_features.begin () + sum_b * fdim,
                                         original_features.begin () + (sum_b + original_batches[b]) * fdim);
	    }

	    vector<int> b_o_classes;
        if (original_classes.size() > 0)
        {
            b_o_classes = vector<int>(original_classes.begin () + sum_b * ldim,
                                      original_classes.begin () + sum_b + original_batches[b] * ldim);
	    }


        // Create result containers
        vector<PointXYZ> b_s_points;
        vector<float> b_s_features;
        vector<int> b_s_classes;

        // Compute subsampling on current batch
        grid_subsampling(b_o_points,
                         b_s_points,
                         b_o_features,
                         b_s_features,
                         b_o_classes,
                         b_s_classes,
                         sampleDl,
						 0);

        // Stack batches points features and labels
        // ****************************************

        // If too many points remove some
        if (b_s_points.size() <= max_p)
        {
            subsampled_points.insert(subsampled_points.end(), b_s_points.begin(), b_s_points.end());

            if (original_features.size() > 0)
                subsampled_features.insert(subsampled_features.end(), b_s_features.begin(), b_s_features.end());

            if (original_classes.size() > 0)
                subsampled_classes.insert(subsampled_classes.end(), b_s_classes.begin(), b_s_classes.end());

            subsampled_batches.push_back(b_s_points.size());
        }
        else
        {
            subsampled_points.insert(subsampled_points.end(), b_s_points.begin(), b_s_points.begin() + max_p);

            if (original_features.size() > 0)
                subsampled_features.insert(subsampled_features.end(), b_s_features.begin(), b_s_features.begin() + max_p * fdim);

            if (original_classes.size() > 0)
                subsampled_classes.insert(subsampled_classes.end(), b_s_classes.begin(), b_s_classes.begin() + max_p * ldim);

            subsampled_batches.push_back(max_p);
        }

        // Stack new batch lengths
        sum_b += original_batches[b];
	}

	return;
}


================================================
FILE: thirdparty/kpconv/cpp_wrappers/cpp_subsampling/grid_subsampling/grid_subsampling.h
================================================


#include "../../cpp_utils/cloud/cloud.h"

#include <set>
#include <cstdint>

using namespace std;

class SampledData
{
public:

	// Elements
	// ********

	int count;
	PointXYZ point;
	vector<float> features;
	vector<unordered_map<int, int>> labels;


	// Methods
	// *******

	// Constructor
	SampledData() 
	{ 
		count = 0; 
		point = PointXYZ();
	}

	SampledData(const size_t fdim, const size_t ldim)
	{
		count = 0;
		point = PointXYZ();
	    features = vector<float>(fdim);
	    labels = vector<unordered_map<int, int>>(ldim);
	}

	// Method Update
	void update_all(const PointXYZ p, vector<float>::iterator f_begin, vector<int>::iterator l_begin)
	{
		count += 1;
		point += p;
		transform (features.begin(), features.end(), f_begin, features.begin(), plus<float>());
		int i = 0;
		for(vector<int>::iterator it = l_begin; it != l_begin + labels.size(); ++it)
		{
		    labels[i][*it] += 1;
		    i++;
		}
		return;
	}
	void update_features(const PointXYZ p, vector<float>::iterator f_begin)
	{
		count += 1;
		point += p;
		transform (features.begin(), features.end(), f_begin, features.begin(), plus<float>());
		return;
	}
	void update_classes(const PointXYZ p, vector<int>::iterator l_begin)
	{
		count += 1;
		point += p;
		int i = 0;
		for(vector<int>::iterator it = l_begin; it != l_begin + labels.size(); ++it)
		{
		    labels[i][*it] += 1;
		    i++;
		}
		return;
	}
	void update_points(const PointXYZ p)
	{
		count += 1;
		point += p;
		return;
	}
};

void grid_subsampling(vector<PointXYZ>& original_points,
                      vector<PointXYZ>& subsampled_points,
                      vector<float>& original_features,
                      vector<float>& subsampled_features,
                      vector<int>& original_classes,
                      vector<int>& subsampled_classes,
                      float sampleDl,
                      int verbose);

void batch_grid_subsampling(vector<PointXYZ>& original_points,
                            vector<PointXYZ>& subsampled_points,
                            vector<float>& original_features,
                            vector<float>& subsampled_features,
                            vector<int>& original_classes,
                            vector<int>& subsampled_classes,
                            vector<int>& original_batches,
                            vector<int>& subsampled_batches,
                            float sampleDl,
                            int max_p);


================================================
FILE: thirdparty/kpconv/cpp_wrappers/cpp_subsampling/setup.py
================================================
from distutils.core import setup, Extension
import numpy.distutils.misc_util

# Adding OpenCV to project
# ************************

# Adding sources of the project
# *****************************

SOURCES = ["../cpp_utils/cloud/cloud.cpp",
             "grid_subsampling/grid_subsampling.cpp",
             "wrapper.cpp"]

module = Extension(name="grid_subsampling",
                    sources=SOURCES,
                    extra_compile_args=['-std=c++11',
                                        '-D_GLIBCXX_USE_CXX11_ABI=0'])


setup(ext_modules=[module], include_dirs=numpy.distutils.misc_util.get_numpy_include_dirs())


================================================
FILE: thirdparty/kpconv/cpp_wrappers/cpp_subsampling/wrapper.cpp
================================================
#include <Python.h>
#include <numpy/arrayobject.h>
#include "grid_subsampling/grid_subsampling.h"
#include <string>


// docstrings for our module
// *************************

static char module_docstring[] = "This module provides an interface for the subsampling of a batch of stacked pointclouds";

static char subsample_docstring[] = "function subsampling a pointcloud";

static char subsample_batch_docstring[] = "function subsampling a batch of stacked pointclouds";


// Declare the functions
// *********************

static PyObject *cloud_subsampling(PyObject* self, PyObject* args, PyObject* keywds);
static PyObject *batch_subsampling(PyObject *self, PyObject *args, PyObject *keywds);


// Specify the members of the module
// *********************************

static PyMethodDef module_methods[] = 
{
	{ "subsample", (PyCFunction)cloud_subsampling, METH_VARARGS | METH_KEYWORDS, subsample_docstring },
	{ "subsample_batch", (PyCFunction)batch_subsampling, METH_VARARGS | METH_KEYWORDS, subsample_batch_docstring },
	{NULL, NULL, 0, NULL}
};


// Initialize the module
// *********************

static struct PyModuleDef moduledef = 
{
    PyModuleDef_HEAD_INIT,
    "grid_subsampling",     // m_name
    module_docstring,       // m_doc
    -1,                     // m_size
    module_methods,         // m_methods
    NULL,                   // m_reload
    NULL,                   // m_traverse
    NULL,                   // m_clear
    NULL,                   // m_free
};

PyMODINIT_FUNC PyInit_grid_subsampling(void)
{
    import_array();
	return PyModule_Create(&moduledef);
}


// Definition of the batch_subsample method
// **********************************

static PyObject* batch_subsampling(PyObject* self, PyObject* args, PyObject* keywds)
{

	// Manage inputs
	// *************

	// Args containers
	PyObject* points_obj = NULL;
	PyObject* features_obj = NULL;
	PyObject* classes_obj = NULL;
	PyObject* batches_obj = NULL;

	// Keywords containers
	static char* kwlist[] = { "points", "batches", "features", "classes", "sampleDl", "method", "max_p", "verbose", NULL };
	float sampleDl = 0.1;
	const char* method_buffer = "barycenters";
	int verbose = 0;
	int max_p = 0;

	// Parse the input  
	if (!PyArg_ParseTupleAndKeywords(args, keywds, "OO|$OOfsii", kwlist, &points_obj, &batches_obj, &features_obj, &classes_obj, &sampleDl, &method_buffer, &max_p, &verbose))
	{
		PyErr_SetString(PyExc_RuntimeError, "Error parsing arguments");
		return NULL;
	}

	// Get the method argument
	string method(method_buffer);

	// Interpret method
	if (method.compare("barycenters") && method.compare("voxelcenters"))
	{
		PyErr_SetString(PyExc_RuntimeError, "Error parsing method. Valid method names are \"barycenters\" and \"voxelcenters\" ");
		return NULL;
	}

	// Check if using features or classes
	bool use_feature = true, use_classes = true;
	if (features_obj == NULL)
		use_feature = false;
	if (classes_obj == NULL)
		use_classes = false;

	// Interpret the input objects as numpy arrays.
	PyObject* points_array = PyArray_FROM_OTF(points_obj, NPY_FLOAT, NPY_IN_ARRAY);
	PyObject* batches_array = PyArray_FROM_OTF(batches_obj, NPY_INT, NPY_IN_ARRAY);
	PyObject* features_array = NULL;
	PyObject* classes_array = NULL;
	if (use_feature)
		features_array = PyArray_FROM_OTF(features_obj, NPY_FLOAT, NPY_IN_ARRAY);
	if (use_classes)
		classes_array = PyArray_FROM_OTF(classes_obj, NPY_INT, NPY_IN_ARRAY);

	// Verify data was load correctly.
	if (points_array == NULL)
	{
		Py_XDECREF(points_array);
		Py_XDECREF(batches_array);
		Py_XDECREF(classes_array);
		Py_XDECREF(features_array);
		PyErr_SetString(PyExc_RuntimeError, "Error converting input points to numpy arrays of type float32");
		return NULL;
	}
	if (batches_array == NULL)
	{
		Py_XDECREF(points_array);
		Py_XDECREF(batches_array);
		Py_XDECREF(classes_array);
		Py_XDECREF(features_array);
		PyErr_SetString(PyExc_RuntimeError, "Error converting input batches to numpy arrays of type int32");
		return NULL;
	}
	if (use_feature && features_array == NULL)
	{
		Py_XDECREF(points_array);
		Py_XDECREF(batches_array);
		Py_XDECREF(classes_array);
		Py_XDECREF(features_array);
		PyErr_SetString(PyExc_RuntimeError, "Error converting input features to numpy arrays of type float32");
		return NULL;
	}
	if (use_classes && classes_array == NULL)
	{
		Py_XDECREF(points_array);
		Py_XDECREF(batches_array);
		Py_XDECREF(classes_array);
		Py_XDECREF(features_array);
		PyErr_SetString(PyExc_RuntimeError, "Error converting input classes to numpy arrays of type int32");
		return NULL;
	}

	// Check that the input array respect the dims
	if ((int)PyArray_NDIM(points_array) != 2 || (int)PyArray_DIM(points_array, 1) != 3)
	{
		Py_XDECREF(points_array);
		Py_XDECREF(batches_array);
		Py_XDECREF(classes_array);
		Py_XDECREF(features_array);
		PyErr_SetString(PyExc_RuntimeError, "Wrong dimensions : points.shape is not (N, 3)");
		return NULL;
	}
	if ((int)PyArray_NDIM(batches_array) > 1)
	{
		Py_XDECREF(points_array);
		Py_XDECREF(batches_array);
		Py_XDECREF(classes_array);
		Py_XDECREF(features_array);
		PyErr_SetString(PyExc_RuntimeError, "Wrong dimensions : batches.shape is not (B,) ");
		return NULL;
	}
	if (use_feature && ((int)PyArray_NDIM(features_array) != 2))
	{
		Py_XDECREF(points_array);
		Py_XDECREF(batches_array);
		Py_XDECREF(classes_array);
		Py_XDECREF(features_array);
		PyErr_SetString(PyExc_RuntimeError, "Wrong dimensions : features.shape is not (N, d)");
		return NULL;
	}

	if (use_classes && (int)PyArray_NDIM(classes_array) > 2)
	{
		Py_XDECREF(points_array);
		Py_XDECREF(batches_array);
		Py_XDECREF(classes_array);
		Py_XDECREF(features_array);
		PyErr_SetString(PyExc_RuntimeError, "Wrong dimensions : classes.shape is not (N,) or (N, d)");
		return NULL;
	}

	// Number of points
	int N = (int)PyArray_DIM(points_array, 0);

	// Number of batches
	int Nb = (int)PyArray_DIM(batches_array, 0);

	// Dimension of the features
	int fdim = 0;
	if (use_feature)
		fdim = (int)PyArray_DIM(features_array, 1);

	//Dimension of labels
	int ldim = 1;
	if (use_classes && (int)PyArray_NDIM(classes_array) == 2)
		ldim = (int)PyArray_DIM(classes_array, 1);

	// Check that the input array respect the number of points
	if (use_feature && (int)PyArray_DIM(features_array, 0) != N)
	{
		Py_XDECREF(points_array);
		Py_XDECREF(batches_array);
		Py_XDECREF(classes_array);
		Py_XDECREF(features_array);
		PyErr_SetString(PyExc_RuntimeError, "Wrong dimensions : features.shape is not (N, d)");
		return NULL;
	}
	if (use_classes && (int)PyArray_DIM(classes_array, 0) != N)
	{
		Py_XDECREF(points_array);
		Py_XDECREF(batches_array);
		Py_XDECREF(classes_array);
		Py_XDECREF(features_array);
		PyErr_SetString(PyExc_RuntimeError, "Wrong dimensions : classes.shape is not (N,) or (N, d)");
		return NULL;
	}


	// Call the C++ function
	// *********************

	// Create pyramid
	if (verbose > 0)
		cout << "Computing cloud pyramid with support points: " << endl;


	// Convert PyArray to Cloud C++ class
	vector<PointXYZ> original_points;
	vector<int> original_batches;
	vector<float> original_features;
	vector<int> original_classes;
	original_points = vector<PointXYZ>((PointXYZ*)PyArray_DATA(points_array), (PointXYZ*)PyArray_DATA(points_array) + N);
	original_batches = vector<int>((int*)PyArray_DATA(batches_array), (int*)PyArray_DATA(batches_array) + Nb);
	if (use_feature)
		original_features = vector<float>((float*)PyArray_DATA(features_array), (float*)PyArray_DATA(features_array) + N * fdim);
	if (use_classes)
		original_classes = vector<int>((int*)PyArray_DATA(classes_array), (int*)PyArray_DATA(classes_array) + N * ldim);

	// Subsample
	vector<PointXYZ> subsampled_points;
	vector<float> subsampled_features;
	vector<int> subsampled_classes;
	vector<int> subsampled_batches;
	batch_grid_subsampling(original_points,
							subsampled_points,
							original_features,
							subsampled_features,
							original_classes,
							subsampled_classes,
							original_batches,
							subsampled_batches,
							sampleDl,
							max_p);

	// Check result
	if (subsampled_points.size() < 1)
	{
		PyErr_SetString(PyExc_RuntimeError, "Error");
		return NULL;
	}

	// Manage outputs
	// **************

	// Dimension of input containers
	npy_intp* point_dims = new npy_intp[2];
	point_dims[0] = subsampled_points.size();
	point_dims[1] = 3;
	npy_intp* feature_dims = new npy_intp[2];
	feature_dims[0] = subsampled_points.size();
	feature_dims[1] = fdim;
	npy_intp* classes_dims = new npy_intp[2];
	classes_dims[0] = subsampled_points.size();
	classes_dims[1] = ldim;
	npy_intp* batches_dims = new npy_intp[1];
	batches_dims[0] = Nb;

	// Create output array
	PyObject* res_points_obj = PyArray_SimpleNew(2, point_dims, NPY_FLOAT);
	PyObject* res_batches_obj = PyArray_SimpleNew(1, batches_dims, NPY_INT);
	PyObject* res_features_obj = NULL;
	PyObject* res_classes_obj = NULL;
	PyObject* ret = NULL;

	// Fill output array with values
	size_t size_in_bytes = subsampled_points.size() * 3 * sizeof(float);
	memcpy(PyArray_DATA(res_points_obj), subsampled_points.data(), size_in_bytes);
	size_in_bytes = Nb * sizeof(int);
	memcpy(PyArray_DATA(res_batches_obj), subsampled_batches.data(), size_in_bytes);
	if (use_feature)
	{
		size_in_bytes = subsampled_points.size() * fdim * sizeof(float);
		res_features_obj = PyArray_SimpleNew(2, feature_dims, NPY_FLOAT);
		memcpy(PyArray_DATA(res_features_obj), subsampled_features.data(), size_in_bytes);
	}
	if (use_classes)
	{
		size_in_bytes = subsampled_points.size() * ldim * sizeof(int);
		res_classes_obj = PyArray_SimpleNew(2, classes_dims, NPY_INT);
		memcpy(PyArray_DATA(res_classes_obj), subsampled_classes.data(), size_in_bytes);
	}


	// Merge results
	if (use_feature && use_classes)
		ret = Py_BuildValue("NNNN", res_points_obj, res_batches_obj, res_features_obj, res_classes_obj);
	else if (use_feature)
		ret = Py_BuildValue("NNN", res_points_obj, res_batches_obj, res_features_obj);
	else if (use_classes)
		ret = Py_BuildValue("NNN", res_points_obj, res_batches_obj, res_classes_obj);
	else
		ret = Py_BuildValue("NN", res_points_obj, res_batches_obj);

	// Clean up
	// ********

	Py_DECREF(points_array);
	Py_DECREF(batches_array);
	Py_XDECREF(features_array);
	Py_XDECREF(classes_array);

	return ret;
}

// Definition of the subsample method
// ****************************************

static PyObject* cloud_subsampling(PyObject* self, PyObject* args, PyObject* keywds)
{

	// Manage inputs
	// *************

	// Args containers
	PyObject* points_obj = NULL;
	PyObject* features_obj = NULL;
	PyObject* classes_obj = NULL;

	// Keywords containers
	static char* kwlist[] = { "points", "features", "classes", "sampleDl", "method", "verbose", NULL };
	float sampleDl = 0.1;
	const char* method_buffer = "barycenters";
	int verbose = 0;

	// Parse the input  
	if (!PyArg_ParseTupleAndKeywords(args, keywds, "O|$OOfsi", kwlist, &points_obj, &features_obj, &classes_obj, &sampleDl, &method_buffer, &verbose))
	{
		PyErr_SetString(PyExc_RuntimeError, "Error parsing arguments");
		return NULL;
	}

	// Get the method argument
	string method(method_buffer);

	// Interpret method
	if (method.compare("barycenters") && method.compare("voxelcenters"))
	{
		PyErr_SetString(PyExc_RuntimeError, "Error parsing method. Valid method names are \"barycenters\" and \"voxelcenters\" ");
		return NULL;
	}

	// Check if using features or classes
	bool use_feature = true, use_classes = true;
	if (features_obj == NULL)
		use_feature = false;
	if (classes_obj == NULL)
		use_classes = false;

	// Interpret the input objects as numpy arrays.
	PyObject* points_array = PyArray_FROM_OTF(points_obj, NPY_FLOAT, NPY_IN_ARRAY);
	PyObject* features_array = NULL;
	PyObject* classes_array = NULL;
	if (use_feature)
		features_array = PyArray_FROM_OTF(features_obj, NPY_FLOAT, NPY_IN_ARRAY);
	if (use_classes)
		classes_array = PyArray_FROM_OTF(classes_obj, NPY_INT, NPY_IN_ARRAY);

	// Verify data was load correctly.
	if (points_array == NULL)
	{
		Py_XDECREF(points_array);
		Py_XDECREF(classes_array);
		Py_XDECREF(features_array);
		PyErr_SetString(PyExc_RuntimeError, "Error converting input points to numpy arrays of type float32");
		return NULL;
	}
	if (use_feature && features_array == NULL)
	{
		Py_XDECREF(points_array);
		Py_XDECREF(classes_array);
		Py_XDECREF(features_array);
		PyErr_SetString(PyExc_RuntimeError, "Error converting input features to numpy arrays of type float32");
		return NULL;
	}
	if (use_classes && classes_array == NULL)
	{
		Py_XDECREF(points_array);
		Py_XDECREF(classes_array);
		Py_XDECREF(features_array);
		PyErr_SetString(PyExc_RuntimeError, "Error converting input classes to numpy arrays of type int32");
		return NULL;
	}

	// Check that the input array respect the dims
	if ((int)PyArray_NDIM(points_array) != 2 || (int)PyArray_DIM(points_array, 1) != 3)
	{
		Py_XDECREF(points_array);
		Py_XDECREF(classes_array);
		Py_XDECREF(features_array);
		PyErr_SetString(PyExc_RuntimeError, "Wrong dimensions : points.shape is not (N, 3)");
		return NULL;
	}
	if (use_feature && ((int)PyArray_NDIM(features_array) != 2))
	{
		Py_XDECREF(points_array);
		Py_XDECREF(classes_array);
		Py_XDECREF(features_array);
		PyErr_SetString(PyExc_RuntimeError, "Wrong dimensions : features.shape is not (N, d)");
		return NULL;
	}

	if (use_classes && (int)PyArray_NDIM(classes_array) > 2)
	{
		Py_XDECREF(points_array);
		Py_XDECREF(classes_array);
		Py_XDECREF(features_array);
		PyErr_SetString(PyExc_RuntimeError, "Wrong dimensions : classes.shape is not (N,) or (N, d)");
		return NULL;
	}

	// Number of points
	int N = (int)PyArray_DIM(points_array, 0);

	// Dimension of the features
	int fdim = 0;
	if (use_feature)
		fdim = (int)PyArray_DIM(features_array, 1);

	//Dimension of labels
	int ldim = 1;
	if (use_classes && (int)PyArray_NDIM(classes_array) == 2)
		ldim = (int)PyArray_DIM(classes_array, 1);

	// Check that the input array respect the number of points
	if (use_feature && (int)PyArray_DIM(features_array, 0) != N)
	{
		Py_XDECREF(points_array);
		Py_XDECREF(classes_array);
		Py_XDECREF(features_array);
		PyErr_SetString(PyExc_RuntimeError, "Wrong dimensions : features.shape is not (N, d)");
		return NULL;
	}
	if (use_classes && (int)PyArray_DIM(classes_array, 0) != N)
	{
		Py_XDECREF(points_array);
		Py_XDECREF(classes_array);
		Py_XDECREF(features_array);
		PyErr_SetString(PyExc_RuntimeError, "Wrong dimensions : classes.shape is not (N,) or (N, d)");
		return NULL;
	}


	// Call the C++ function
	// *********************

	// Create pyramid
	if (verbose > 0)
		cout << "Computing cloud pyramid with support points: " << endl;


	// Convert PyArray to Cloud C++ class
	vector<PointXYZ> original_points;
	vector<float> original_features;
	vector<int> original_classes;
	original_points = vector<PointXYZ>((PointXYZ*)PyArray_DATA(points_array), (PointXYZ*)PyArray_DATA(points_array) + N);
	if (use_feature)
		original_features = vector<float>((float*)PyArray_DATA(features_array), (float*)PyArray_DATA(features_array) + N * fdim);
	if (use_classes)
		original_classes = vector<int>((int*)PyArray_DATA(classes_array), (int*)PyArray_DATA(classes_array) + N * ldim);

	// Subsample
	vector<PointXYZ> subsampled_points;
	vector<float> subsampled_features;
	vector<int> subsampled_classes;
	grid_subsampling(original_points,
		subsampled_points,
		original_features,
		subsampled_features,
		original_classes,
		subsampled_classes,
		sampleDl,
		verbose);

	// Check result
	if (subsampled_points.size() < 1)
	{
		PyErr_SetString(PyExc_RuntimeError, "Error");
		return NULL;
	}

	// Manage outputs
	// **************

	// Dimension of input containers
	npy_intp* point_dims = new npy_intp[2];
	point_dims[0] = subsampled_points.size();
	point_dims[1] = 3;
	npy_intp* feature_dims = new npy_intp[2];
	feature_dims[0] = subsampled_points.size();
	feature_dims[1] = fdim;
	npy_intp* classes_dims = new npy_intp[2];
	classes_dims[0] = subsampled_points.size();
	classes_dims[1] = ldim;

	// Create output array
	PyObject* res_points_obj = PyArray_SimpleNew(2, point_dims, NPY_FLOAT);
	PyObject* res_features_obj = NULL;
	PyObject* res_classes_obj = NULL;
	PyObject* ret = NULL;

	// Fill output array with values
	size_t size_in_bytes = subsampled_points.size() * 3 * sizeof(float);
	memcpy(PyArray_DATA(res_points_obj), subsampled_points.data(), size_in_bytes);
	if (use_feature)
	{
		size_in_bytes = subsampled_points.size() * fdim * sizeof(float);
		res_features_obj = PyArray_SimpleNew(2, feature_dims, NPY_FLOAT);
		memcpy(PyArray_DATA(res_features_obj), subsampled_features.data(), size_in_bytes);
	}
	if (use_classes)
	{
		size_in_bytes = subsampled_points.size() * ldim * sizeof(int);
		res_classes_obj = PyArray_SimpleNew(2, classes_dims, NPY_INT);
		memcpy(PyArray_DATA(res_classes_obj), subsampled_classes.data(), size_in_bytes);
	}


	// Merge results
	if (use_feature && use_classes)
		ret = Py_BuildValue("NNN", res_points_obj, res_features_obj, res_classes_obj);
	else if (use_feature)
		ret = Py_BuildValue("NN", res_points_obj, res_features_obj);
	else if (use_classes)
		ret = Py_BuildValue("NN", res_points_obj, res_classes_obj);
	else
		ret = Py_BuildValue("N", res_points_obj);

	// Clean up
	// ********

	Py_DECREF(points_array);
	Py_XDECREF(features_array);
	Py_XDECREF(classes_array);

	return ret;
}

================================================
FILE: thirdparty/kpconv/cpp_wrappers/cpp_utils/cloud/cloud.cpp
================================================
//
//
//		0==========================0
//		|    Local feature test    |
//		0==========================0
//
//		version 1.0 : 
//			> 
//
//---------------------------------------------------
//
//		Cloud source :
//		Define usefull Functions/Methods
//
//----------------------------------------------------
//
//		Hugues THOMAS - 10/02/2017
//


#include "cloud.h"


// Getters
// *******

PointXYZ max_point(std::vector<PointXYZ> points)
{
	// Initialize limits
	PointXYZ maxP(points[0]);

	// Loop over all points
	for (auto p : points)
	{
		if (p.x > maxP.x)
			maxP.x = p.x;

		if (p.y > maxP.y)
			maxP.y = p.y;

		if (p.z > maxP.z)
			maxP.z = p.z;
	}

	return maxP;
}

PointXYZ min_point(std::vector<PointXYZ> points)
{
	// Initialize limits
	PointXYZ minP(points[0]);

	// Loop over all points
	for (auto p : points)
	{
		if (p.x < minP.x)
			minP.x = p.x;

		if (p.y < minP.y)
			minP.y = p.y;

		if (p.z < minP.z)
			minP.z = p.z;
	}

	return minP;
}

================================================
FILE: thirdparty/kpconv/cpp_wrappers/cpp_utils/cloud/cloud.h
================================================
//
//
//		0==========================0
//		|    Local feature test    |
//		0==========================0
//
//		version 1.0 : 
//			> 
//
//---------------------------------------------------
//
//		Cloud header
//
//----------------------------------------------------
//
//		Hugues THOMAS - 10/02/2017
//


# pragma once

#include <vector>
#include <unordered_map>
#include <map>
#include <algorithm>
#include <numeric>
#include <iostream>
#include <iomanip>
#include <cmath>

#include <time.h>


// Point class
// ***********


class PointXYZ
{
public:

	// Elements
	// ********

	float x, y, z;


	// Methods
	// *******
	
	// Constructor
	PointXYZ() { x = 0; y = 0; z = 0; }
	PointXYZ(float x0, float y0, float z0) { x = x0; y = y0; z = z0; }
	
	// array type accessor
	float operator [] (int i) const
	{
		if (i == 0) return x;
		else if (i == 1) return y;
		else return z;
	}

	// opperations
	float dot(const PointXYZ P) const
	{
		return x * P.x + y * P.y + z * P.z;
	}

	float sq_norm()
	{
		return x*x + y*y + z*z;
	}

	PointXYZ cross(const PointXYZ P) const
	{
		return PointXYZ(y*P.z - z*P.y, z*P.x - x*P.z, x*P.y - y*P.x);
	}	

	PointXYZ& operator+=(const PointXYZ& P)
	{
		x += P.x;
		y += P.y;
		z += P.z;
		return *this;
	}

	PointXYZ& operator-=(const PointXYZ& P)
	{
		x -= P.x;
		y -= P.y;
		z -= P.z;
		return *this;
	}

	PointXYZ& operator*=(const float& a)
	{
		x *= a;
		y *= a;
		z *= a;
		return *this;
	}
};


// Point Opperations
// *****************

inline PointXYZ operator + (const PointXYZ A, const PointXYZ B)
{
	return PointXYZ(A.x + B.x, A.y + B.y, A.z + B.z);
}

inline PointXYZ operator - (const PointXYZ A, const PointXYZ B)
{
	return PointXYZ(A.x - B.x, A.y - B.y, A.z - B.z);
}

inline PointXYZ operator * (const PointXYZ P, const float a)
{
	return PointXYZ(P.x * a, P.y * a, P.z * a);
}

inline PointXYZ operator * (const float a, const PointXYZ P)
{
	return PointXYZ(P.x * a, P.y * a, P.z * a);
}

inline std::ostream& operator << (std::ostream& os, const PointXYZ P)
{
	return os << "[" << P.x << ", " << P.y << ", " << P.z << "]";
}

inline bool operator == (const PointXYZ A, const PointXYZ B)
{
	return A.x == B.x && A.y == B.y && A.z == B.z;
}

inline PointXYZ floor(const PointXYZ P)
{
	return PointXYZ(std::floor(P.x), std::floor(P.y), std::floor(P.z));
}


PointXYZ max_point(std::vector<PointXYZ> points);
PointXYZ min_point(std::vector<PointXYZ> points);


struct PointCloud
{

	std::vector<PointXYZ>  pts;

	// Must return the number of data points
	inline size_t kdtree_get_point_count() const { return pts.size(); }

	// Returns the dim'th component of the idx'th point in the class:
	// Since this is inlined and the "dim" argument is typically an immediate value, the
	//  "if/else's" are actually solved at compile time.
	inline float kdtree_get_pt(const size_t idx, const size_t dim) const
	{
		if (dim == 0) return pts[idx].x;
		else if (dim == 1) return pts[idx].y;
		else return pts[idx].z;
	}

	// Optional bounding-box computation: return false to default to a standard bbox computation loop.
	//   Return true if the BBOX was already computed by the class and returned in "bb" so it can be avoided to redo it again.
	//   Look at bb.size() to find out the expected dimensionality (e.g. 2 or 3 for point clouds)
	template <class BBOX>
	bool kdtree_get_bbox(BBOX& /* bb */) const { return false; }

};


================================================
FILE: thirdparty/kpconv/cpp_wrappers/cpp_utils/nanoflann/nanoflann.hpp
================================================
/***********************************************************************
 * Software License Agreement (BSD License)
 *
 * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
 * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
 * Copyright 2011-2016  Jose Luis Blanco (joseluisblancoc@gmail.com).
 *   All rights reserved.
 *
 * THE BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *************************************************************************/

/** \mainpage nanoflann C++ API documentation
 *  nanoflann is a C++ header-only library for building KD-Trees, mostly
 *  optimized for 2D or 3D point clouds.
 *
 *  nanoflann does not require compiling or installing, just an
 *  #include <nanoflann.hpp> in your code.
 *
 *  See:
 *   - <a href="modules.html" >C++ API organized by modules</a>
 *   - <a href="https://github.com/jlblancoc/nanoflann" >Online README</a>
 *   - <a href="http://jlblancoc.github.io/nanoflann/" >Doxygen
 * documentation</a>
 */

#ifndef NANOFLANN_HPP_
#define NANOFLANN_HPP_

#include <algorithm>
#include <array>
#include <cassert>
#include <cmath>   // for abs()
#include <cstdio>  // for fwrite()
#include <cstdlib> // for abs()
#include <functional>
#include <limits> // std::reference_wrapper
#include <stdexcept>
#include <vector>

/** Library version: 0xMmP (M=Major,m=minor,P=patch) */
#define NANOFLANN_VERSION 0x130

// Avoid conflicting declaration of min/max macros in windows headers
#if !defined(NOMINMAX) &&                                                      \
    (defined(_WIN32) || defined(_WIN32_) || defined(WIN32) || defined(_WIN64))
#define NOMINMAX
#ifdef max
#undef max
#undef min
#endif
#endif

namespace nanoflann {
/** @addtogroup nanoflann_grp nanoflann C++ library for ANN
 *  @{ */

/** the PI constant (required to avoid MSVC missing symbols) */
template <typename T> T pi_const() {
  return static_cast<T>(3.14159265358979323846);
}

/**
 * Traits if object is resizable and assignable (typically has a resize | assign
 * method)
 */
template <typename T, typename = int> struct has_resize : std::false_type {};

template <typename T>
struct has_resize<T, decltype((void)std::declval<T>().resize(1), 0)>
    : std::true_type {};

template <typename T, typename = int> struct has_assign : std::false_type {};

template <typename T>
struct has_assign<T, decltype((void)std::declval<T>().assign(1, 0), 0)>
    : std::true_type {};

/**
 * Free function to resize a resizable object
 */
template <typename Container>
inline typename std::enable_if<has_resize<Container>::value, void>::type
resize(Container &c, const size_t nElements) {
  c.resize(nElements);
}

/**
 * Free function that has no effects on non resizable containers (e.g.
 * std::array) It raises an exception if the expected size does not match
 */
template <typename Container>
inline typename std::enable_if<!has_resize<Container>::value, void>::type
resize(Container &c, const size_t nElements) {
  if (nElements != c.size())
    throw std::logic_error("Try to change the size of a std::array.");
}

/**
 * Free function to assign to a container
 */
template <typename Container, typename T>
inline typename std::enable_if<has_assign<Container>::value, void>::type
assign(Container &c, const size_t nElements, const T &value) {
  c.assign(nElements, value);
}

/**
 * Free function to assign to a std::array
 */
template <typename Container, typename T>
inline typename std::enable_if<!has_assign<Container>::value, void>::type
assign(Container &c, const size_t nElements, const T &value) {
  for (size_t i = 0; i < nElements; i++)
    c[i] = value;
}

/** @addtogroup result_sets_grp Result set classes
 *  @{ */
template <typename _DistanceType, typename _IndexType = size_t,
          typename _CountType = size_t>
class KNNResultSet {
public:
  typedef _DistanceType DistanceType;
  typedef _IndexType IndexType;
  typedef _CountType CountType;

private:
  IndexType *indices;
  DistanceType *dists;
  CountType capacity;
  CountType count;

public:
  inline KNNResultSet(CountType capacity_)
      : indices(0), dists(0), capacity(capacity_), count(0) {}

  inline void init(IndexType *indices_, DistanceType *dists_) {
    indices = indices_;
    dists = dists_;
    count = 0;
    if (capacity)
      dists[capacity - 1] = (std::numeric_limits<DistanceType>::max)();
  }

  inline CountType size() const { return count; }

  inline bool full() const { return count == capacity; }

  /**
   * Called during search to add an element matching the criteria.
   * @return true if the search should be continued, false if the results are
   * sufficient
   */
  inline bool addPoint(DistanceType dist, IndexType index) {
    CountType i;
    for (i = count; i > 0; --i) {
#ifdef NANOFLANN_FIRST_MATCH // If defined and two points have the same
                             // distance, the one with the lowest-index will be
                             // returned first.
      if ((dists[i - 1] > dist) ||
          ((dist == dists[i - 1]) && (indices[i - 1] > index))) {
#else
      if (dists[i - 1] > dist) {
#endif
        if (i < capacity) {
          dists[i] = dists[i - 1];
          indices[i] = indices[i - 1];
        }
      } else
        break;
    }
    if (i < capacity) {
      dists[i] = dist;
      indices[i] = index;
    }
    if (count < capacity)
      count++;

    // tell caller that the search shall continue
    return true;
  }

  inline DistanceType worstDist() const { return dists[capacity - 1]; }
};

/** operator "<" for std::sort() */
struct IndexDist_Sorter {
  /** PairType will be typically: std::pair<IndexType,DistanceType> */
  template <typename PairType>
  inline bool operator()(const PairType &p1, const PairType &p2) const {
    return p1.second < p2.second;
  }
};

/**
 * A result-set class used when performing a radius based search.
 */
template <typename _DistanceType, typename _IndexType = size_t>
class RadiusResultSet {
public:
  typedef _DistanceType DistanceType;
  typedef _IndexType IndexType;

public:
  const DistanceType radius;

  std::vector<std::pair<IndexType, DistanceType>> &m_indices_dists;

  inline RadiusResultSet(
      DistanceType radius_,
      std::vector<std::pair<IndexType, DistanceType>> &indices_dists)
      : radius(radius_), m_indices_dists(indices_dists) {
    init();
  }

  inline void init() { clear(); }
  inline void clear() { m_indices_dists.clear(); }

  inline size_t size() const { return m_indices_dists.size(); }

  inline bool full() const { return true; }

  /**
   * Called during search to add an element matching the criteria.
   * @return true if the search should be continued, false if the results are
   * sufficient
   */
  inline bool addPoint(DistanceType dist, IndexType index) {
    if (dist < radius)
      m_indices_dists.push_back(std::make_pair(index, dist));
    return true;
  }

  inline DistanceType worstDist() const { return radius; }

  /**
   * Find the worst result (furtherest neighbor) without copying or sorting
   * Pre-conditions: size() > 0
   */
  std::pair<IndexType, DistanceType> worst_item() const {
    if (m_indices_dists.empty())
      throw std::runtime_error("Cannot invoke RadiusResultSet::worst_item() on "
                               "an empty list of results.");
    typedef
        typename std::vector<std::pair<IndexType, DistanceType>>::const_iterator
            DistIt;
    DistIt it = std::max_element(m_indices_dists.begin(), m_indices_dists.end(),
                                 IndexDist_Sorter());
    return *it;
  }
};

/** @} */

/** @addtogroup loadsave_grp Load/save auxiliary functions
 * @{ */
template <typename T>
void save_value(FILE *stream, const T &value, size_t count = 1) {
  fwrite(&value, sizeof(value), count, stream);
}

template <typename T>
void save_value(FILE *stream, const std::vector<T> &value) {
  size_t size = value.size();
  fwrite(&size, sizeof(size_t), 1, stream);
  fwrite(&value[0], sizeof(T), size, stream);
}

template <typename T>
void load_value(FILE *stream, T &value, size_t count = 1) {
  size_t read_cnt = fread(&value, sizeof(value), count, stream);
  if (read_cnt != count) {
    throw std::runtime_error("Cannot read from file");
  }
}

template <typename T> void load_value(FILE *stream, std::vector<T> &value) {
  size_t size;
  size_t read_cnt = fread(&size, sizeof(size_t), 1, stream);
  if (read_cnt != 1) {
    throw std::runtime_error("Cannot read from file");
  }
  value.resize(size);
  read_cnt = fread(&value[0], sizeof(T), size, stream);
  if (read_cnt != size) {
    throw std::runtime_error("Cannot read from file");
  }
}
/** @} */

/** @addtogroup metric_grp Metric (distance) classes
 * @{ */

struct Metric {};

/** Manhattan distance functor (generic version, optimized for
 * high-dimensionality data sets). Corresponding distance traits:
 * nanoflann::metric_L1 \tparam T Type of the elements (e.g. double, float,
 * uint8_t) \tparam _DistanceType Type of distance variables (must be signed)
 * (e.g. float, double, int64_t)
 */
template <class T, class DataSource, typename _DistanceType = T>
struct L1_Adaptor {
  typedef T ElementType;
  typedef _DistanceType DistanceType;

  const DataSource &data_source;

  L1_Adaptor(const DataSource &_data_source) : data_source(_data_source) {}

  inline DistanceType evalMetric(const T *a, const size_t b_idx, size_t size,
                                 DistanceType worst_dist = -1) const {
    DistanceType result = DistanceType();
    const T *last = a + size;
    const T *lastgroup = last - 3;
    size_t d = 0;

    /* Process 4 items with each loop for efficiency. */
    while (a < lastgroup) {
      const DistanceType diff0 =
          std::abs(a[0] - data_source.kdtree_get_pt(b_idx, d++));
      const DistanceType diff1 =
          std::abs(a[1] - data_source.kdtree_get_pt(b_idx, d++));
      const DistanceType diff2 =
          std::abs(a[2] - data_source.kdtree_get_pt(b_idx, d++));
      const DistanceType diff3 =
          std::abs(a[3] - data_source.kdtree_get_pt(b_idx, d++));
      result += diff0 + diff1 + diff2 + diff3;
      a += 4;
      if ((worst_dist > 0) && (result > worst_dist)) {
        return result;
      }
    }
    /* Process last 0-3 components.  Not needed for standard vector lengths. */
    while (a < last) {
      result += std::abs(*a++ - data_source.kdtree_get_pt(b_idx, d++));
    }
    return result;
  }

  template <typename U, typename V>
  inline DistanceType accum_dist(const U a, const V b, const size_t) const {
    return std::abs(a - b);
  }
};

/** Squared Euclidean distance functor (generic version, optimized for
 * high-dimensionality data sets). Corresponding distance traits:
 * nanoflann::metric_L2 \tparam T Type of the elements (e.g. double, float,
 * uint8_t) \tparam _DistanceType Type of distance variables (must be signed)
 * (e.g. float, double, int64_t)
 */
template <class T, class DataSource, typename _DistanceType = T>
struct L2_Adaptor {
  typedef T ElementType;
  typedef _DistanceType DistanceType;

  const DataSource &data_source;

  L2_Adaptor(const DataSource &_data_source) : data_source(_data_source) {}

  inline DistanceType evalMetric(const T *a, const size_t b_idx, size_t size,
                                 DistanceType worst_dist = -1) const {
    DistanceType result = DistanceType();
    const T *last = a + size;
    const T *lastgroup = last - 3;
    size_t d = 0;

    /* Process 4 items with each loop for efficiency. */
    while (a < lastgroup) {
      const DistanceType diff0 = a[0] - data_source.kdtree_get_pt(b_idx, d++);
      const DistanceType diff1 = a[1] - data_source.kdtree_get_pt(b_idx, d++);
      const DistanceType diff2 = a[2] - data_source.kdtree_get_pt(b_idx, d++);
      const DistanceType diff3 = a[3] - data_source.kdtree_get_pt(b_idx, d++);
      result += diff0 * diff0 + diff1 * diff1 + diff2 * diff2 + diff3 * diff3;
      a += 4;
      if ((worst_dist > 0) && (result > worst_dist)) {
        return result;
      }
    }
    /* Process last 0-3 components.  Not needed for standard vector lengths. */
    while (a < last) {
      const DistanceType diff0 = *a++ - data_source.kdtree_get_pt(b_idx, d++);
      result += diff0 * diff0;
    }
    return result;
  }

  template <typename U, typename V>
  inline DistanceType accum_dist(const U a, const V b, const size_t) const {
    return (a - b) * (a - b);
  }
};

/** Squared Euclidean (L2) distance functor (suitable for low-dimensionality
 * datasets, like 2D or 3D point clouds) Corresponding distance traits:
 * nanoflann::metric_L2_Simple \tparam T Type of the elements (e.g. double,
 * float, uint8_t) \tparam _DistanceType Type of distance variables (must be
 * signed) (e.g. float, double, int64_t)
 */
template <class T, class DataSource, typename _DistanceType = T>
struct L2_Simple_Adaptor {
  typedef T ElementType;
  typedef _DistanceType DistanceType;

  const DataSource &data_source;

  L2_Simple_Adaptor(const DataSource &_data_source)
      : data_source(_data_source) {}

  inline DistanceType evalMetric(const T *a, const size_t b_idx,
                                 size_t size) const {
    DistanceType result = DistanceType();
    for (size_t i = 0; i < size; ++i) {
      const DistanceType diff = a[i] - data_source.kdtree_get_pt(b_idx, i);
      result += diff * diff;
    }
    return result;
  }

  template <typename U, typename V>
  inline DistanceType accum_dist(const U a, const V b, const size_t) const {
    return (a - b) * (a - b);
  }
};

/** SO2 distance functor
 *  Corresponding distance traits: nanoflann::metric_SO2
 * \tparam T Type of the elements (e.g. double, float)
 * \tparam _DistanceType Type of distance variables (must be signed) (e.g.
 * float, double) orientation is constrained to be in [-pi, pi]
 */
template <class T, class DataSource, typename _DistanceType = T>
struct SO2_Adaptor {
  typedef T ElementType;
  typedef _DistanceType DistanceType;

  const DataSource &data_source;

  SO2_Adaptor(const DataSource &_data_source) : data_source(_data_source) {}

  inline DistanceType evalMetric(const T *a, const size_t b_idx,
                                 size_t size) const {
    return accum_dist(a[size - 1], data_source.kdtree_get_pt(b_idx, size - 1),
                      size - 1);
  }

  /** Note: this assumes that input angles are already in the range [-pi,pi] */
  template <typename U, typename V>
  inline DistanceType accum_dist(const U a, const V b, const size_t) const {
    DistanceType result = DistanceType(), PI = pi_const<DistanceType>();
    result = b - a;
    if (result > PI)
      result -= 2 * PI;
    else if (result < -PI)
      result += 2 * PI;
    return result;
  }
};

/** SO3 distance functor (Uses L2_Simple)
 *  Corresponding distance traits: nanoflann::metric_SO3
 * \tparam T Type of the elements (e.g. double, float)
 * \tparam _DistanceType Type of distance variables (must be signed) (e.g.
 * float, double)
 */
template <class T, class DataSource, typename _DistanceType = T>
struct SO3_Adaptor {
  typedef T ElementType;
  typedef _DistanceType DistanceType;

  L2_Simple_Adaptor<T, DataSource> distance_L2_Simple;

  SO3_Adaptor(const DataSource &_data_source)
      : distance_L2_Simple(_data_source) {}

  inline DistanceType evalMetric(const T *a, const size_t b_idx,
                                 size_t size) const {
    return distance_L2_Simple.evalMetric(a, b_idx, size);
  }

  template <typename U, typename V>
  inline DistanceType accum_dist(const U a, const V b, const size_t idx) const {
    return distance_L2_Simple.accum_dist(a, b, idx);
  }
};

/** Metaprogramming helper traits class for the L1 (Manhattan) metric */
struct metric_L1 : public Metric {
  template <class T, class DataSource> struct traits {
    typedef L1_Adaptor<T, DataSource> distance_t;
  };
};
/** Metaprogramming helper traits class for the L2 (Euclidean) metric */
struct metric_L2 : public Metric {
  template <class T, class DataSource> struct traits {
    typedef L2_Adaptor<T, DataSource> distance_t;
  };
};
/** Metaprogramming helper traits class for the L2_simple (Euclidean) metric */
struct metric_L2_Simple : public Metric {
  template <class T, class DataSource> struct traits {
    typedef L2_Simple_Adaptor<T, DataSource> distance_t;
  };
};
/** Metaprogramming helper traits class for the SO3_InnerProdQuat metric */
struct metric_SO2 : public Metric {
  template <class T, class DataSource> struct traits {
    typedef SO2_Adaptor<T, DataSource> distance_t;
  };
};
/** Metaprogramming helper traits class for the SO3_InnerProdQuat metric */
struct metric_SO3 : public Metric {
  template <class T, class DataSource> struct traits {
    typedef SO3_Adaptor<T, DataSource> distance_t;
  };
};

/** @} */

/** @addtogroup param_grp Parameter structs
 * @{ */

/**  Parameters (see README.md) */
struct KDTreeSingleIndexAdaptorParams {
  KDTreeSingleIndexAdaptorParams(size_t _leaf_max_size = 10)
      : leaf_max_size(_leaf_max_size) {}

  size_t leaf_max_size;
};

/** Search options for KDTreeSingleIndexAdaptor::findNeighbors() */
struct SearchParams {
  /** Note: The first argument (checks_IGNORED_) is ignored, but kept for
   * compatibility with the FLANN interface */
  SearchParams(int checks_IGNORED_ = 32, float eps_ = 0, bool sorted_ = true)
      : checks(checks_IGNORED_), eps(eps_), sorted(sorted_) {}

  int checks;  //!< Ignored parameter (Kept for compatibility with the FLANN
               //!< interface).
  float eps;   //!< search for eps-approximate neighbours (default: 0)
  bool sorted; //!< only for radius search, require neighbours sorted by
               //!< distance (default: true)
};
/** @} */

/** @addtogroup memalloc_grp Memory allocation
 * @{ */

/**
 * Allocates (using C's malloc) a generic type T.
 *
 * Params:
 *     count = number of instances to allocate.
 * Returns: pointer (of type T*) to memory buffer
 */
template <typename T> inline T *allocate(size_t count = 1) {
  T *mem = static_cast<T *>(::malloc(sizeof(T) * count));
  return mem;
}

/**
 * Pooled storage allocator
 *
 * The following routines allow for the efficient allocation of storage in
 * small chunks from a specified pool.  Rather than allowing each structure
 * to be freed individually, an entire pool of storage is freed at once.
 * This method has two advantages over just using malloc() and free().  First,
 * it is far more efficient for allocating small objects, as there is
 * no overhead for remembering all the information needed to free each
 * object or consolidating fragmented memory.  Second, the decision about
 * how long to keep an object is made at the time of allocation, and there
 * is no need to track down all the objects to free them.
 *
 */

const size_t WORDSIZE = 16;
const size_t BLOCKSIZE = 8192;

class PooledAllocator {
  /* We maintain memory alignment to word boundaries by requiring that all
      allocations be in multiples of the machine wordsize.  */
  /* Size of machine word in bytes.  Must be power of 2. */
  /* Minimum number of bytes requested at a time from	the system.  Must be
   * multiple of WORDSIZE. */

  size_t remaining; /* Number of bytes left in current block of storage. */
  void *base;       /* Pointer to base of current block of storage. */
  void *loc;        /* Current location in block to next allocate memory. */

  void internal_init() {
    remaining = 0;
    base = NULL;
    usedMemory = 0;
    wastedMemory = 0;
  }

public:
  size_t usedMemory;
  size_t wastedMemory;

  /**
      Default constructor. Initializes a new pool.
   */
  PooledAllocator() { internal_init(); }

  /**
   * Destructor. Frees all the memory allocated in this pool.
   */
  ~PooledAllocator() { free_all(); }

  /** Frees all allocated memory chunks */
  void free_all() {
    while (base != NULL) {
      void *prev =
          *(static_cast<void **>(base)); /* Get pointer to prev block. */
      ::free(base);
      base = prev;
    }
    internal_init();
  }

  /**
   * Returns a pointer to a piece of new memory of the given size in bytes
   * allocated from the pool.
   */
  void *malloc(const size_t req_size) {
    /* Round size up to a multiple of wordsize.  The following expression
        only works for WORDSIZE that is a power of 2, by masking last bits of
        incremented size to zero.
     */
    const size_t size = (req_size + (WORDSIZE - 1)) & ~(WORDSIZE - 1);

    /* Check whether a new block must be allocated.  Note that the first word
        of a block is reserved for a pointer to the previous block.
     */
    if (size > remaining) {

      wastedMemory += remaining;

      /* Allocate new storage. */
      const size_t blocksize =
          (size + sizeof(void *) + (WORDSIZE - 1) > BLOCKSIZE)
              ? size + sizeof(void *) + (WORDSIZE - 1)
              : BLOCKSIZE;

      // use the standard C malloc to allocate memory
      void *m = ::malloc(blocksize);
      if (!m) {
        fprintf(stderr, "Failed to allocate memory.\n");
        return NULL;
      }

      /* Fill first word of new block with pointer to previous block. */
      static_cast<void **>(m)[0] = base;
      base = m;

      size_t shift = 0;
      // int size_t = (WORDSIZE - ( (((size_t)m) + sizeof(void*)) &
      // (WORDSIZE-1))) & (WORDSIZE-1);

      remaining = blocksize - sizeof(void *) - shift;
      loc = (static_cast<char *>(m) + sizeof(void *) + shift);
    }
    void *rloc = loc;
    loc = static_cast<char *>(loc) + size;
    remaining -= size;

    usedMemory += size;

    return rloc;
  }

  /**
   * Allocates (using this pool) a generic type T.
   *
   * Params:
   *     count = number of instances to allocate.
   * Returns: pointer (of type T*) to memory buffer
   */
  template <typename T> T *allocate(const size_t count = 1) {
    T *mem = static_cast<T *>(this->malloc(sizeof(T) * count));
    return mem;
  }
};
/** @} */

/** @addtogroup nanoflann_metaprog_grp Auxiliary metaprogramming stuff
 * @{ */

/** Used to declare fixed-size arrays when DIM>0, dynamically-allocated vectors
 * when DIM=-1. Fixed size version for a generic DIM:
 */
template <int DIM, typename T> struct array_or_vector_selector {
  typedef std::array<T, DIM> container_t;
};
/** Dynamic size version */
template <typename T> struct array_or_vector_selector<-1, T> {
  typedef std::vector<T> container_t;
};

/** @} */

/** kd-tree base-class
 *
 * Contains the member functions common to the classes KDTreeSingleIndexAdaptor
 * and KDTreeSingleIndexDynamicAdaptor_.
 *
 * \tparam Derived The name of the class which inherits this class.
 * \tparam DatasetAdaptor The user-provided adaptor (see comments above).
 * \tparam Distance The distance metric to use, these are all classes derived
 * from nanoflann::Metric \tparam DIM Dimensionality of data points (e.g. 3 for
 * 3D points) \tparam IndexType Will be typically size_t or int
 */

template <class Derived, typename Distance, class DatasetAdaptor, int DIM = -1,
          typename IndexType = size_t>
class KDTreeBaseClass {

public:
  /** Frees the previously-built index. Automatically called within
   * buildIndex(). */
  void freeIndex(Derived &obj) {
    obj.pool.free_all();
    obj.root_node = NULL;
    obj.m_size_at_index_build = 0;
  }

  typedef typename Distance::ElementType ElementType;
  typedef typename Distance::DistanceType DistanceType;

  /*--------------------- Internal Data Structures --------------------------*/
  struct Node {
    /** Union used because a node can be either a LEAF node or a non-leaf node,
     * so both data fields are never used simultaneously */
    union {
      struct leaf {
        IndexType left, right; //!< Indices of points in leaf node
      } lr;
      struct nonleaf {
        int divfeat;                  //!< Dimension used for subdivision.
        DistanceType divlow, divhigh; //!< The values used for subdivision.
      } sub;
    } node_type;
    Node *child1, *child2; //!< Child nodes (both=NULL mean its a leaf node)
  };

  typedef Node *NodePtr;

  struct Interval {
    ElementType low, high;
  };

  /**
   *  Array of indices to vectors in the dataset.
   */
  std::vector<IndexType> vind;

  NodePtr root_node;

  size_t m_leaf_max_size;

  size_t m_size;                //!< Number of current points in the dataset
  size_t m_size_at_index_build; //!< Number of points in the dataset when the
                                //!< index was built
  int dim;                      //!< Dimensionality of each data point

  /** Define "BoundingBox" as a fixed-size or variable-size container depending
   * on "DIM" */
  typedef
      typename array_or_vector_selector<DIM, Interval>::container_t BoundingBox;

  /** Define "distance_vector_t" as a fixed-size or variable-size container
   * depending on "DIM" */
  typedef typename array_or_vector_selector<DIM, DistanceType>::container_t
      distance_vector_t;

  /** The KD-tree used to find neighbours */

  BoundingBox root_bbox;

  /**
   * Pooled memory allocator.
   *
   * Using a pooled memory allocator is more efficient
   * than allocating memory directly when there is a large
   * number small of memory allocations.
   */
  PooledAllocator pool;

  /** Returns number of points in dataset  */
  size_t size(const Derived &obj) const { return obj.m_size; }

  /** Returns the length of each point in the dataset */
  size_t veclen(const Derived &obj) {
    return static_cast<size_t>(DIM > 0 ? DIM : obj.dim);
  }

  /// Helper accessor to the dataset points:
  inline ElementType dataset_get(const Derived &obj, size_t idx,
                                 int component) const {
    return obj.dataset.kdtree_get_pt(idx, component);
  }

  /**
   * Computes the inde memory usage
   * Returns: memory used by the index
   */
  size_t usedMemory(Derived &obj) {
    return obj.pool.usedMemory + obj.pool.wastedMemory +
           obj.dataset.kdtree_get_point_count() *
               sizeof(IndexType); // pool memory and vind array memory
  }

  void computeMinMax(const Derived &obj, IndexType *ind, IndexType count,
                     int element, ElementType &min_elem,
                     ElementType &max_elem) {
    min_elem = dataset_get(obj, ind[0], element);
    max_elem = dataset_get(obj, ind[0], element);
    for (IndexType i = 1; i < count; ++i) {
      ElementType val = dataset_get(obj, ind[i], element);
      if (val < min_elem)
        min_elem = val;
      if (val > max_elem)
        max_elem = val;
    }
  }

  /**
   * Create a tree node that subdivides the list of vecs from vind[first]
   * to vind[last].  The routine is called recursively on each sublist.
   *
   * @param left index of the first vector
   * @param right index of the last vector
   */
  NodePtr divideTree(Derived &obj, const IndexType left, const IndexType right,
                     BoundingBox &bbox) {
    NodePtr node = obj.pool.template allocate<Node>(); // allocate memory

    /* If too few exemplars remain, then make this a leaf node. */
    if ((right - left) <= static_cast<IndexType>(obj.m_leaf_max_size)) {
      node->child1 = node->child2 = NULL; /* Mark as leaf node. */
      node->node_type.lr.left = left;
      node->node_type.lr.right = right;

      // compute bounding-box of leaf points
      for (int i = 0; i < (DIM > 0 ? DIM : obj.dim); ++i) {
        bbox[i].low = dataset_get(obj, obj.vind[left], i);
        bbox[i].high = dataset_get(obj, obj.vind[left], i);
      }
      for (IndexType k = left + 1; k < right; ++k) {
        for (int i = 0; i < (DIM > 0 ? DIM : obj.dim); ++i) {
          if (bbox[i].low > dataset_get(obj, obj.vind[k], i))
            bbox[i].low = dataset_get(obj, obj.vind[k], i);
          if (bbox[i].high < dataset_get(obj, obj.vind[k], i))
            bbox[i].high = dataset_get(obj, obj.vind[k], i);
        }
      }
    } else {
      IndexType idx;
      int cutfeat;
      DistanceType cutval;
      middleSplit_(obj, &obj.vind[0] + left, right - left, idx, cutfeat, cutval,
                   bbox);

      node->node_type.sub.divfeat = cutfeat;

      BoundingBox left_bbox(bbox);
      left_bbox[cutfeat].high = cutval;
      node->child1 = divideTree(obj, left, left + idx, left_bbox);

      BoundingBox right_bbox(bbox);
      right_bbox[cutfeat].low = cutval;
      node->child2 = divideTree(obj, left + idx, right, right_bbox);

      node->node_type.sub.divlow = left_bbox[cutfeat].high;
      node->node_type.sub.divhigh = right_bbox[cutfeat].low;

      for (int i = 0; i < (DIM > 0 ? DIM : obj.dim); ++i) {
        bbox[i].low = std::min(left_bbox[i].low, right_bbox[i].low);
        bbox[i].high = std::max(left_bbox[i].high, right_bbox[i].high);
      }
    }

    return node;
  }

  void middleSplit_(Derived &obj, IndexType *ind, IndexType count,
                    IndexType &index, int &cutfeat, DistanceType &cutval,
                    const BoundingBox &bbox) {
    const DistanceType EPS = static_cast<DistanceType>(0.00001);
    ElementType max_span = bbox[0].high - bbox[0].low;
    for (int i = 1; i < (DIM > 0 ? DIM : obj.dim); ++i) {
      ElementType span = bbox[i].high - bbox[i].low;
      if (span > max_span) {
        max_span = span;
      }
    }
    ElementType max_spread = -1;
    cutfeat = 0;
    for (int i = 0; i < (DIM > 0 ? DIM : obj.dim); ++i) {
      ElementType span = bbox[i].high - bbox[i].low;
      if (span > (1 - EPS) * max_span) {
        ElementType min_elem, max_elem;
        computeMinMax(obj, ind, count, i, min_elem, max_elem);
        ElementType spread = max_elem - min_elem;
        ;
        if (spread > max_spread) {
          cutfeat = i;
          max_spread = spread;
        }
      }
    }
    // split in the middle
    DistanceType split_val = (bbox[cutfeat].low + bbox[cutfeat].high) / 2;
    ElementType min_elem, max_elem;
    computeMinMax(obj, ind, count, cutfeat, min_elem, max_elem);

    if (split_val < min_elem)
      cutval = min_elem;
    else if (split_val > max_elem)
      cutval = max_elem;
    else
      cutval = split_val;

    IndexType lim1, lim2;
    planeSplit(obj, ind, count, cutfeat, cutval, lim1, lim2);

    if (lim1 > count / 2)
      index = lim1;
    else if (lim2 < count / 2)
      index = lim2;
    else
      index = count / 2;
  }

  /**
   *  Subdivide the list of points by a plane perpendicular on axe corresponding
   *  to the 'cutfeat' dimension at 'cutval' position.
   *
   *  On return:
   *  dataset[ind[0..lim1-1]][cutfeat]<cutval
   *  dataset[ind[lim1..lim2-1]][cutfeat]==cutval
   *  dataset[ind[lim2..count]][cutfeat]>cutval
   */
  void planeSplit(Derived &obj, IndexType *ind, const IndexType count,
                  int cutfeat, DistanceType &cutval, IndexType &lim1,
                  IndexType &lim2) {
    /* Move vector indices for left subtree to front of list. */
    IndexType left = 0;
    IndexType right = count - 1;
    for (;;) {
      while (left <= right && dataset_get(obj, ind[left], cutfeat) < cutval)
        ++left;
      while (right && left <= right &&
             dataset_get(obj, ind[right], cutfeat) >= cutval)
        --right;
      if (left > right || !right)
        break; // "!right" was added to support unsigned Index types
      std::swap(ind[left], ind[right]);
      ++left;
      --right;
    }
    /* If either list is empty, it means that all remaining features
     * are identical. Split in the middle to maintain a balanced tree.
     */
    lim1 = left;
    right = count - 1;
    for (;;) {
      while (left <= right && dataset_get(obj, ind[left], cutfeat) <= cutval)
        ++left;
      while (right && left <= right &&
             dataset_get(obj, ind[right], cutfeat) > cutval)
        --right;
      if (left > right || !right)
        break; // "!right" was added to support unsigned Index types
      std::swap(ind[left], ind[right]);
      ++left;
      --right;
    }
    lim2 = left;
  }

  DistanceType computeInitialDistances(const Derived &obj,
                                       const ElementType *vec,
                                       distance_vector_t &dists) const {
    assert(vec);
    DistanceType distsq = DistanceType();

    for (int i = 0; i < (DIM > 0 ? DIM : obj.dim); ++i) {
      if (vec[i] < obj.root_bbox[i].low) {
        dists[i] = obj.distance.accum_dist(vec[i], obj.root_bbox[i].low, i);
        distsq += dists[i];
      }
      if (vec[i] > obj.root_bbox[i].high) {
        dists[i] = obj.distance.accum_dist(vec[i], obj.root_bbox[i].high, i);
        distsq += dists[i];
      }
    }
    return distsq;
  }

  void save_tree(Derived &obj, FILE *stream, NodePtr tree) {
    save_value(stream, *tree);
    if (tree->child1 != NULL) {
      save_tree(obj, stream, tree->child1);
    }
    if (tree->child2 != NULL) {
      save_tree(obj, stream, tree->child2);
    }
  }

  void load_tree(Derived &obj, FILE *stream, NodePtr &tree) {
    tree = obj.pool.template allocate<Node>();
    load_value(stream, *tree);
    if (tree->child1 != NULL) {
      load_tree(obj, stream, tree->child1);
    }
    if (tree->child2 != NULL) {
      load_tree(obj, stream, tree->child2);
    }
  }

  /**  Stores the index in a binary file.
   *   IMPORTANT NOTE: The set of data points is NOT stored in the file, so when
   * loading the index object it must be constructed associated to the same
   * source of data points used while building it. See the example:
   * examples/saveload_example.cpp \sa loadIndex  */
  void saveIndex_(Derived &obj, FILE *stream) {
    save_value(stream, obj.m_size);
    save_value(stream, obj.dim);
    save_value(stream, obj.root_bbox);
    save_value(stream, obj.m_leaf_max_size);
    save_value(stream, obj.vind);
    save_tree(obj, stream, obj.root_node);
  }

  /**  Loads a previous index from a binary file.
   *   IMPORTANT NOTE: The set of data points is NOT stored in the file, so the
   * index object must be constructed associated to the same source of data
   * points used while building the index. See the example:
   * examples/saveload_example.cpp \sa loadIndex  */
  void loadIndex_(Derived &obj, FILE *stream) {
    load_value(stream, obj.m_size);
    load_value(stream, obj.dim);
    load_value(stream, obj.root_bbox);
    load_value(stream, obj.m_leaf_max_size);
    load_value(stream, obj.vind);
    load_tree(obj, stream, obj.root_node);
  }
};

/** @addtogroup kdtrees_grp KD-tree classes and adaptors
 * @{ */

/** kd-tree static index
 *
 * Contains the k-d trees and other information for indexing a set of points
 * for nearest-neighbor matching.
 *
 *  The class "DatasetAdaptor" must provide the following interface (can be
 * non-virtual, inlined methods):
 *
 *  \code
 *   // Must return the number of data poins
 *   inline size_t kdtree_get_point_count() const { ... }
 *
 *
 *   // Must return the dim'th component of the idx'th point in the class:
 *   inline T kdtree_get_pt(const size_t idx, const size_t dim) const { ... }
 *
 *   // Optional bounding-box computation: return false to default to a standard
 * bbox computation loop.
 *   //   Return true if the BBOX was already computed by the class and returned
 * in "bb" so it can be avoided to redo it again.
 *   //   Look at bb.size() to find out the expected dimensionality (e.g. 2 or 3
 * for point clouds) template <class BBOX> bool kdtree_get_bbox(BBOX &bb) const
 *   {
 *      bb[0].low = ...; bb[0].high = ...;  // 0th dimension limits
 *      bb[1].low = ...; bb[1].high = ...;  // 1st dimension limits
 *      ...
 *      return true;
 *   }
 *
 *  \endcode
 *
 * \tparam DatasetAdaptor The user-provided adaptor (see comments above).
 * \tparam Distance The distance metric to use: nanoflann::metric_L1,
 * nanoflann::metric_L2, nanoflann::metric_L2_Simple, etc. \tparam DIM
 * Dimensionality of data points (e.g. 3 for 3D points) \tparam IndexType Will
 * be typically size_t or int
 */
template <typename Distance, class DatasetAdaptor, int DIM = -1,
          typename IndexType = size_t>
class KDTreeSingleIndexAdaptor
    : public KDTreeBaseClass<
          KDTreeSingleIndexAdaptor<Distance, DatasetAdaptor, DIM, IndexType>,
          Distance, DatasetAdaptor, DIM, IndexType> {
public:
  /** Deleted copy constructor*/
  KDTreeSingleIndexAdaptor(
      const KDTreeSingleIndexAdaptor<Distance, DatasetAdaptor, DIM, IndexType>
          &) = delete;

  /**
   * The dataset used by this index
   */
  const DatasetAdaptor &dataset; //!< The source of our data

  const KDTreeSingleIndexAdaptorParams index_params;

  Distance distance;

  typedef typename nanoflann::KDTreeBaseClass<
      nanoflann::KDTreeSingleIndexAdaptor<Distance, DatasetAdaptor, DIM,
                                          IndexType>,
      Distance, DatasetAdaptor, DIM, IndexType>
      BaseClassRef;

  typedef typename BaseClassRef::ElementType ElementType;
  typedef typename BaseClassRef::DistanceType DistanceType;

  typedef typename BaseClassRef::Node Node;
  typedef Node *NodePtr;

  typedef typename BaseClassRef::Interval Interval;
  /** Define "BoundingBox" as a fixed-size or variable-size container depending
   * on "DIM" */
  typedef typename BaseClassRef::BoundingBox BoundingBox;

  /** Define "distance_vector_t" as a fixed-size or variable-size container
   * depending on "DIM" */
  typedef typename BaseClassRef::distance_vector_t distance_vector_t;

  /**
   * KDTree constructor
   *
   * Refer to docs in README.md or online in
   * https://github.com/jlblancoc/nanoflann
   *
   * The KD-Tree point dimension (the length of each point in the datase, e.g. 3
   * for 3D points) is determined by means of:
   *  - The \a DIM template parameter if >0 (highest priority)
   *  - Otherwise, the \a dimensionality parameter of this constructor.
   *
   * @param inputData Dataset with the input features
   * @param params Basically, the maximum leaf node size
   */
  KDTreeSingleIndexAdaptor(const int dimensionality,
                           const DatasetAdaptor &inputData,
                           const KDTreeSingleIndexAdaptorParams &params =
                               KDTreeSingleIndexAdaptorParams())
      : dataset(inputData), index_params(params), distance(inputData) {
    BaseClassRef::root_node = NULL;
    BaseClassRef::m_size = dataset.kdtree_get_point_count();
    BaseClassRef::m_size_at_index_build = BaseClassRef::m_size;
    BaseClassRef::dim = dimensionality;
    if (DIM > 0)
      BaseClassRef::dim = DIM;
    BaseClassRef::m_leaf_max_size = params.leaf_max_size;

    // Create a permutable array of indices to the input vectors.
    init_vind();
  }

  /**
   * Builds the index
   */
  void buildIndex() {
    BaseClassRef::m_size = dataset.kdtree_get_point_count();
    BaseClassRef::m_size_at_index_build = BaseClassRef::m_size;
    init_vind();
    this->freeIndex(*this);
    BaseClassRef::m_size_at_index_build = BaseClassRef::m_size;
    if (BaseClassRef::m_size == 0)
      return;
    computeBoundingBox(BaseClassRef::root_bbox);
    BaseClassRef::root_node =
        this->divideTree(*this, 0, BaseClassRef::m_size,
                         BaseClassRef::root_bbox); // construct the tree
  }

  /** \name Query methods
   * @{ */

  /**
   * Find set of nearest neighbors to vec[0:dim-1]. Their indices are stored
   * inside the result object.
   *
   * Params:
   *     result = the result object in which the indices of the
   * nearest-neighbors are stored vec = the vector for which to search the
   * nearest neighbors
   *
   * \tparam RESULTSET Should be any ResultSet<DistanceType>
   * \return  True if the requested neighbors could be found.
   * \sa knnSearch, radiusSearch
   */
  template <typename RESULTSET>
  bool findNeighbors(RESULTSET &result, const ElementType *vec,
                     const SearchParams &searchParams) const {
    assert(vec);
    if (this->size(*this) == 0)
      return false;
    if (!BaseClassRef::root_node)
      throw std::runtime_error(
          "[nanoflann] findNeighbors() called before building the index.");
    float epsError = 1 + searchParams.eps;

    distance_vector_t
        dists; // fixed or variable-sized container (depending on DIM)
    auto zero = static_cast<decltype(result.worstDist())>(0);
    assign(dists, (DIM > 0 ? DIM : BaseClassRef::dim),
           zero); // Fill it with zeros.
    DistanceType distsq = this->computeInitialDistances(*this, vec, dists);

    searchLevel(result, vec, BaseClassRef::root_node, distsq, dists,
                epsError); // "count_leaf" parameter removed since was neither
                           // used nor returned to the user.

    return result.full();
  }

  /**
   * Find the "num_closest" nearest neighbors to the \a query_point[0:dim-1].
   * Their indices are stored inside the result object. \sa radiusSearch,
   * findNeighbors \note nChecks_IGNORED is ignored but kept for compatibility
   * with the original FLANN interface. \return Number `N` of valid points in
   * the result set. Only the first `N` entries in `out_indices` and
   * `out_distances_sq` will be valid. Return may be less than `num_closest`
   * only if the number of elements in the tree is less than `num_closest`.
   */
  size_t knnSearch(const ElementType *query_point, const size_t num_closest,
                   IndexType *out_indices, DistanceType *out_distances_sq,
                   const int /* nChecks_IGNORED */ = 10) const {
    nanoflann::KNNResultSet<DistanceType, IndexType> resultSet(num_closest);
    resultSet.init(out_indices, out_distances_sq);
    this->findNeighbors(resultSet, query_point, nanoflann::SearchParams());
    return resultSet.size();
  }

  /**
   * Find all the neighbors to \a query_point[0:dim-1] within a maximum radius.
   *  The output is given as a vector of pairs, of which the first element is a
   * point index and the second the corresponding distance. Previous contents of
   * \a IndicesDists are cleared.
   *
   *  If searchParams.sorted==true, the output list is sorted by ascending
   * distances.
   *
   *  For a better performance, it is advisable to do a .reserve() on the vector
   * if you have any wild guess about the number of expected matches.
   *
   *  \sa knnSearch, findNeighbors, radiusSearchCustomCallback
   * \return The number of points within the given radius (i.e. indices.size()
   * or dists.size() )
   */
  size_t
  radiusSearch(const ElementType *query_point, const DistanceType &radius,
               std::vector<std::pair<IndexType, DistanceType>> &IndicesDists,
               const SearchParams &searchParams) const {
    RadiusResultSet<DistanceType, IndexType> resultSet(radius, IndicesDists);
    const size_t nFound =
        radiusSearchCustomCallback(query_point, resultSet, searchParams);
    if (searchParams.sorted)
      std::sort(IndicesDists.begin(), IndicesDists.end(), IndexDist_Sorter());
    return nFound;
  }

  /**
   * Just like radiusSearch() but with a custom callback class for each point
   * found in the radius of the query. See the source of RadiusResultSet<> as a
   * start point for your own classes. \sa radiusSearch
   */
  template <class SEARCH_CALLBACK>
  size_t radiusSearchCustomCallback(
      const ElementType *query_point, SEARCH_CALLBACK &resultSet,
      const SearchParams &searchParams = SearchParams()) const {
    this->findNeighbors(resultSet, query_point, searchParams);
    return resultSet.size();
  }

  /** @} */

public:
  /** Make sure the auxiliary list \a vind has the same size than the current
   * dataset, and re-generate if size has changed. */
  void init_vind() {
    // Create a permutable array of indices to the input vectors.
    BaseClassRef::m_size = dataset.kdtree_get_point_count();
    if (BaseClassRef::vind.size() != BaseClassRef::m_size)
      BaseClassRef::vind.resize(BaseClassRef::m_size);
    for (size_t i = 0; i < BaseClassRef::m_size; i++)
      BaseClassRef::vind[i] = i;
  }

  void computeBoundingBox(BoundingBox &bbox) {
    resize(bbox, (DIM > 0 ? DIM : BaseClassRef::dim));
    if (dataset.kdtree_get_bbox(bbox)) {
      // Done! It was implemented in derived class
    } else {
      const size_t N = dataset.kdtree_get_point_count();
      if (!N)
        throw std::runtime_error("[nanoflann] computeBoundingBox() called but "
                                 "no data points found.");
      for (int i = 0; i < (DIM > 0 ? DIM : BaseClassRef::dim); ++i) {
        bbox[i].low = bbox[i].high = this->dataset_get(*this, 0, i);
      }
      for (size_t k = 1; k < N; ++k) {
        for (int i = 0; i < (DIM > 0 ? DIM : BaseClassRef::dim); ++i) {
          if (this->dataset_get(*this, k, i) < bbox[i].low)
            bbox[i].low = this->dataset_get(*this, k, i);
          if (this->dataset_get(*this, k, i) > bbox[i].high)
            bbox[i].high = this->dataset_get(*this, k, i);
        }
      }
    }
  }

  /**
   * Performs an exact search in the tree starting from a node.
   * \tparam RESULTSET Should be any ResultSet<DistanceType>
   * \return true if the search should be continued, false if the results are
   * sufficient
   */
  template <class RESULTSET>
  bool searchLevel(RESULTSET &result_set, const ElementType *vec,
                   const NodePtr node, DistanceType mindistsq,
                   distance_vector_t &dists, const float epsError) const {
    /* If this is a leaf node, then do check and return. */
    if ((node->child1 == NULL) && (node->child2 == NULL)) {
      // count_leaf += (node->lr.right-node->lr.left);  // Removed since was
      // neither used nor returned to the user.
      DistanceType worst_dist = result_set.worstDist();
      for (IndexType i = node->node_type.lr.left; i < node->node_type.lr.right;
           ++i) {
        const IndexType index = BaseClassRef::vind[i]; // reorder... : i;
        DistanceType dist = distance.evalMetric(
            vec, index, (DIM > 0 ? DIM : BaseClassRef::dim));
        if (dist < worst_dist) {
          if (!result_set.addPoint(dist, BaseClassRef::vind[i])) {
            // the resultset doesn't want to receive any more points, we're done
            // searching!
            return false;
          }
        }
      }
      return true;
    }

    /* Which child branch should be taken first? */
    int idx = node->node_type.sub.divfeat;
    ElementType val = vec[idx];
    DistanceType diff1 = val - node->node_type.sub.divlow;
    DistanceType diff2 = val - node->node_type.sub.divhigh;

    NodePtr bestChild;
    NodePtr otherChild;
    DistanceType cut_dist;
    if ((diff1 + diff2) < 0) {
      bestChild = node->child1;
      otherChild = node->child2;
      cut_dist = distance.accum_dist(val, node->node_type.sub.divhigh, idx);
    } else {
      bestChild = node->child2;
      otherChild = node->child1;
      cut_dist = distance.accum_dist(val, node->node_type.sub.divlow, idx);
    }

    /* Call recursively to search next level down. */
    if (!searchLevel(result_set, vec, bestChild, mindistsq, dists, epsError)) {
      // the resultset doesn't want to receive any more points, we're done
      // searching!
      return false;
    }

    DistanceType dst = dists[idx];
    mindistsq = mindistsq + cut_dist - dst;
    dists[idx] = cut_dist;
    if (mindistsq * epsError <= result_set.worstDist()) {
      if (!searchLevel(result_set, vec, otherChild, mindistsq, dists,
                       epsError)) {
        // the resultset doesn't want to receive any more points, we're done
        // searching!
        return false;
      }
    }
    dists[idx] = dst;
    return true;
  }

public:
  /**  Stores the index in a binary file.
   *   IMPORTANT NOTE: The set of data points is NOT stored in the file, so when
   * loading the index object it must be constructed associated to the same
   * source of data points used while building it. See the example:
   * examples/saveload_example.cpp \sa loadIndex  */
  void saveIndex(FILE *stream) { this->saveIndex_(*this, stream); }

  /**  Loads a previous index from a binary file.
   *   IMPORTANT NOTE: The set of data points is NOT stored in the file, so the
   * index object must be constructed associated to the same source of data
   * points used while building the index. See the example:
   * examples/saveload_example.cpp \sa loadIndex  */
  void loadIndex(FILE *stream) { this->loadIndex_(*this, stream); }

}; // class KDTree

/** kd-tree dynamic index
 *
 * Contains the k-d trees and other information for indexing a set of points
 * for nearest-neighbor matching.
 *
 *  The class "DatasetAdaptor" must provide the following interface (can be
 * non-virtual, inlined methods):
 *
 *  \code
 *   // Must return the number of data poins
 *   inline size_t kdtree_get_point_count() const { ... }
 *
 *   // Must return the dim'th component of the idx'th point in the class:
 *   inline T kdtree_get_pt(const size_t idx, const size_t dim) const { ... }
 *
 *   // Optional bounding-box computation: return false to default to a standard
 * bbox computation loop.
 *   //   Return true if the BBOX was already computed by the class and returned
 * in "bb" so it can be avoided to redo it again.
 *   //   Look at bb.size() to find out the expected dimensionality (e.g. 2 or 3
 * for point clouds) template <class BBOX> bool kdtree_get_bbox(BBOX &bb) const
 *   {
 *      bb[0].low = ...; bb[0].high = ...;  // 0th dimension limits
 *      bb[1].low = ...; bb[1].high = ...;  // 1st dimension limits
 *      ...
 *      return true;
 *   }
 *
 *  \endcode
 *
 * \tparam DatasetAdaptor The user-provided adaptor (see comments above).
 * \tparam Distance The distance metric to use: nanoflann::metric_L1,
 * nanoflann::metric_L2, nanoflann::metric_L2_Simple, etc. \tparam DIM
 * Dimensionality of data points (e.g. 3 for 3D points) \tparam IndexType Will
 * be typically size_t or int
 */
template <typename Distance, class DatasetAdaptor, int DIM = -1,
          typename IndexType = size_t>
class KDTreeSingleIndexDynamicAdaptor_
    : public KDTreeBaseClass<KDTreeSingleIndexDynamicAdaptor_<
                                 Distance, DatasetAdaptor, DIM, IndexType>,
                             Distance, DatasetAdaptor, DIM, IndexType> {
public:
  /**
   * The dataset used by this index
   */
  const DatasetAdaptor &dataset; //!< The source of our data

  KDTreeSingleIndexAdaptorParams index_params;

  std::vector<int> &treeIndex;

  Distance distance;

  typedef typename nanoflann::KDTreeBaseClass<
      nanoflann::KDTreeSingleIndexDynamicAdaptor_<Distance, DatasetAdaptor, DIM,
                                                  IndexType>,
      Distance, DatasetAdaptor, DIM, IndexType>
      BaseClassRef;

  typedef typename BaseClassRef::ElementType ElementType;
  typedef typename BaseClassRef::DistanceType DistanceType;

  typedef typename BaseClassRef::Node Node;
  typedef Node *NodePtr;

  typedef typename BaseClassRef::Interval Interval;
  /** Define "BoundingBox" as a fixed-size or variable-size container depending
   * on "DIM" */
  typedef typename BaseClassRef::BoundingBox BoundingBox;

  /** Define "distance_vector_t" as a fixed-size or variable-size container
   * depending on "DIM" */
  typedef typename BaseClassRef::distance_vector_t distance_vector_t;

  /**
   * KDTree constructor
   *
   * Refer to docs in README.md or online in
   * https://github.com/jlblancoc/nanoflann
   *
   * The KD-Tree point dimension (the length of each point in the datase, e.g. 3
   * for 3D points) is determined by means of:
   *  - The \a DIM template parameter if >0 (highest priority)
   *  - Otherwise, the \a dimensionality parameter of this constructor.
   *
   * @param inputData Dataset with the input features
   * @param params Basically, the maximum leaf node size
   */
  KDTreeSingleIndexDynamicAdaptor_(
      const int dimensionality, const DatasetAdaptor &inputData,
      std::vector<int> &treeIndex_,
      const KDTreeSingleIndexAdaptorParams &params =
          KDTreeSingleIndexAdaptorParams())
      : dataset(inputData), index_params(params), treeIndex(treeIndex_),
        distance(inputData) {
    BaseClassRef::root_node = NULL;
    BaseClassRef::m_size = 0;
    BaseClassRef::m_size_at_index_build = 0;
    BaseClassRef::dim = dimensionality;
    if (DIM > 0)
      BaseClassRef::dim = DIM;
    BaseClassRef::m_leaf_max_size = params.leaf_max_size;
  }

  /** Assignment operator definiton */
  KDTreeSingleIndexDynamicAdaptor_
  operator=(const KDTreeSingleIndexDynamicAdaptor_ &rhs) {
    KDTreeSingleIndexDynamicAdaptor_ tmp(rhs);
    std::swap(BaseClassRef::vind, tmp.BaseClassRef::vind);
    std::swap(BaseClassRef::m_leaf_max_size, tmp.BaseClassRef::m_leaf_max_size);
    std::swap(index_params, tmp.index_params);
    std::swap(treeIndex, tmp.treeIndex);
    std::swap(BaseClassRef::m_size, tmp.BaseClassRef::m_size);
    std::swap(BaseClassRef::m_size_at_index_build,
              tmp.BaseClassRef::m_size_at_index_build);
    std::swap(BaseClassRef::root_node, tmp.BaseClassRef::root_node);
    std::swap(BaseClassRef::root_bbox, tmp.BaseClassRef::root_bbox);
    std::swap(BaseClassRef::pool, tmp.BaseClassRef::pool);
    return *this;
  }

  /**
   * Builds the index
   */
  void buildIndex() {
    BaseClassRef::m_size = BaseClassRef::vind.size();
    this->freeIndex(*this);
    BaseClassRef::m_size_at_index_build = BaseClassRef::m_size;
    if (BaseClassRef::m_size == 0)
      return;
    computeBoundingBox(BaseClassRef::root_bbox);
    BaseClassRef::root_node =
        this->divideTree(*this, 0, BaseClassRef::m_size,
                         BaseClassRef::root_bbox); // construct the tree
  }

  /** \name Query methods
   * @{ */

  /**
   * Find set of nearest neighbors to vec[0:dim-1]. Their indices are stored
   * inside the result object.
   *
   * Params:
   *     result = the result object in which the indices of the
   * nearest-neighbors are stored vec = the vector for which to search the
   * nearest neighbors
   *
   * \tparam RESULTSET Should be any ResultSet<DistanceType>
   * \return  True if the requested neighbors could be found.
   * \sa knnSearch, radiusSearch
   */
  template <typename RESULTSET>
  bool findNeighbors(RESULTSET &result, const ElementType *vec,
                     const SearchParams &searchParams) const {
    assert(vec);
    if (this->size(*this) == 0)
      return false;
    if (!BaseClassRef::root_node)
      return false;
    float epsError = 1 + searchParams.eps;

    // fixed or variable-sized container (depending on DIM)
    distance_vector_t dists;
    // Fill it with zeros.
    assign(dists, (DIM > 0 ? DIM : BaseClassRef::dim),
           static_cast<typename distance_vector_t::value_type>(0));
    DistanceType distsq = this->computeInitialDistances(*this, vec, dists);

    searchLevel(result, vec, BaseClassRef::root_node, distsq, dists,
                epsError); // "count_leaf" parameter removed since was neither
                           // used nor returned to the user.

    return result.full();
  }

  /**
   * Find the "num_closest" nearest neighbors to the \a query_point[0:dim-1].
   * Their indices are stored inside the result object. \sa radiusSearch,
   * findNeighbors \note nChecks_IGNORED is ignored but kept for compatibility
   * with the original FLANN interface. \return Number `N` of valid points in
   * the result set. Only the first `N` entries in `out_indices` and
   * `out_distances_sq` will be valid. Return may be less than `num_closest`
   * only if the number of elements in the tree is less than `num_closest`.
   */
  size_t knnSearch(const ElementType *query_point, const size_t num_closest,
                   IndexType *out_indices, DistanceType *out_distances_sq,
                   const int /* nChecks_IGNORED */ = 10) const {
    nanoflann::KNNResultSet<DistanceType, IndexType> resultSet(num_closest);
    resultSet.init(out_indices, out_distances_sq);
    this->findNeighbors(resultSet, query_point, nanoflann::SearchParams());
    return resultSet.size();
  }

  /**
   * Find all the neighbors to \a query_point[0:dim-1] within a maximum radius.
   *  The output is given as a vector of pairs, of which the first element is a
   * point index and the second the corresponding distance. Previous contents of
   * \a IndicesDists are cleared.
   *
   *  If searchParams.sorted==true, the output list is sorted by ascending
   * distances.
   *
   *  For a better performance, it is advisable to do a .reserve() on the vector
   * if you have any wild guess about the number of expected matches.
   *
   *  \sa knnSearch, findNeighbors, radiusSearchCustomCallback
   * \return The number of points within the given radius (i.e. indices.size()
   * or dists.size() )
   */
  size_t
  radiusSearch(const ElementType *query_point, const DistanceType &radius,
               std::vector<std::pair<IndexType, DistanceType>> &IndicesDists,
               const SearchParams &searchParams) const {
    RadiusResultSet<DistanceType, IndexType> resultSet(radius, IndicesDists);
    const size_t nFound =
        radiusSearchCustomCallback(query_point, resultSet, searchParams);
    if (searchParams.sorted)
      std::sort(IndicesDists.begin(), IndicesDists.end(), IndexDist_Sorter());
    return nFound;
  }

  /**
   * Just like radiusSearch() but with a custom callback class for each point
   * found in the radius of the query. See the source of RadiusResultSet<> as a
   * start point for your own classes. \sa radiusSearch
   */
  template <class SEARCH_CALLBACK>
  size_t radiusSearchCustomCallback(
      const ElementType *query_point, SEARCH_CALLBACK &resultSet,
      const SearchParams &searchParams = SearchParams()) const {
    this->findNeighbors(resultSet, query_point, searchParams);
    return resultSet.size();
  }

  /** @} */

public:
  void computeBoundingBox(BoundingBox &bbox) {
    resize(bbox, (DIM > 0 ? DIM : BaseClassRef::dim));

    if (dataset.kdtree_get_bbox(bbox)) {
      // Done! It was implemented in derived class
    } else {
      const size_t N = BaseClassRef::m_size;
      if (!N)
        throw std::runtime_error("[nanoflann] computeBoundingBox() called but "
                                 "no data points found.");
      for (int i = 0; i < (DIM > 0 ? DIM : BaseClassRef::dim); ++i) {
        bbox[i].low = bbox[i].high =
            this->dataset_get(*this, BaseClassRef::vind[0], i);
      }
      for (size_t k = 1; k < N; ++k) {
        for (int i = 0; i < (DIM > 0 ? DIM : BaseClassRef::dim); ++i) {
          if (this->dataset_get(*this, BaseClassRef::vind[k], i) < bbox[i].low)
            bbox[i].low = this->dataset_get(*this, BaseClassRef::vind[k], i);
          if (this->dataset_get(*this, BaseClassRef::vind[k], i) > bbox[i].high)
            bbox[i].high = this->dataset_get(*this, BaseClassRef::vind[k], i);
        }
      }
    }
  }

  /**
   * Performs an exact search in the tree starting from a node.
   * \tparam RESULTSET Should be any ResultSet<DistanceType>
   */
  template <class RESULTSET>
  void searchLevel(RESULTSET &result_set, const ElementType *vec,
                   const NodePtr node, DistanceType mindistsq,
                   distance_vector_t &dists, const float epsError) const {
    /* If this is a leaf node, then do check and return. */
    if ((node->child1 == NULL) && (node->child2 == NULL)) {
      // count_leaf += (node->lr.right-node->lr.left);  // Removed since was
      // neither used nor returned to the user.
      DistanceType worst_dist = result_set.worstDist();
      for (IndexType i = node->node_type.lr.left; i < node->node_type.lr.right;
           ++i) {
        const IndexType index = BaseClassRef::vind[i]; // reorder... : i;
        if (treeIndex[index] == -1)
          continue;
        DistanceType dist = distance.evalMetric(
            vec, index, (DIM > 0 ? DIM : BaseClassRef::dim));
        if (dist < worst_dist) {
          if (!result_set.addPoint(
                  static_cast<typename RESULTSET::DistanceType>(dist),
                  static_cast<typename RESULTSET::IndexType>(
                      BaseClassRef::vind[i]))) {
            // the resultset doesn't want to receive any more points, we're done
            // searching!
            return; // false;
          }
        }
      }
      return;
    }

    /* Which child branch should be taken first? */
    int idx = node->node_type.sub.divfeat;
    ElementType val = vec[idx];
    DistanceType diff1 = val - node->node_type.sub.divlow;
    DistanceType diff2 = val - node->node_type.sub.divhigh;

    NodePtr bestChild;
    NodePtr otherChild;
    DistanceType cut_dist;
    if ((diff1 + diff2) < 0) {
      bestChild = node->child1;
      otherChild = node->child2;
      cut_dist = distance.accum_dist(val, node->node_type.sub.divhigh, idx);
    } else {
      bestChild = node->child2;
      otherChild = node->child1;
      cut_dist = distance.accum_dist(val, node->node_type.sub.divlow, idx);
    }

    /* Call recursively to search next level down. */
    searchLevel(result_set, vec, bestChild, mindistsq, dists, epsError);

    DistanceType dst = dists[idx];
    mindistsq = mindistsq + cut_dist - dst;
    dists[idx] = cut_dist;
    if (mindistsq * epsError <= result_set.worstDist()) {
      searchLevel(result_set, vec, otherChild, mindistsq, dists, epsError);
    }
    dists[idx] = dst;
  }

public:
  /**  Stores the index in a binary file.
   *   IMPORTANT NOTE: The set of data points is NOT stored in the file, so when
   * loading the index object it must be constructed associated to the same
   * source of data points used while building it. See the example:
   * examples/saveload_example.cpp \sa loadIndex  */
  void saveIndex(FILE *stream) { this->saveIndex_(*this, stream); }

  /**  Loads a previous index from a binary file.
   *   IMPORTANT NOTE: The set of data points is NOT stored in the file, so the
   * index object must be constructed associated to the same source of data
   * points used while building the index. See the example:
   * examples/saveload_example.cpp \sa loadIndex  */
  void loadIndex(FILE *stream) { this->loadIndex_(*this, stream); }
};

/** kd-tree dynaimic index
 *
 * class to create multiple static index and merge their results to behave as
 * single dynamic index as proposed in Logarithmic Approach.
 *
 *  Example of usage:
 *  examples/dynamic_pointcloud_example.cpp
 *
 * \tparam DatasetAdaptor The user-provided adaptor (see comments above).
 * \tparam Distance The distance metric to use: nanoflann::metric_L1,
 * nanoflann::metric_L2, nanoflann::metric_L2_Simple, etc. \tparam DIM
 * Dimensionality of data points (e.g. 3 for 3D points) \tparam IndexType Will
 * be typically size_t or int
 */
template <typename Distance, class DatasetAdaptor, int DIM = -1,
          typename IndexType = size_t>
class KDTreeSingleIndexDynamicAdaptor {
public:
  typedef typename Distance::ElementType ElementType;
  typedef typename Distance::DistanceType DistanceType;

protected:
  size_t m_leaf_max_size;
  size_t treeCount;
  size_t pointCount;

  /**
   * The dataset used by this index
   */
  const DatasetAdaptor &dataset; //!< The source of our data

  std::vector<int> treeIndex; //!< treeIndex[idx] is the index of tree in which
                              //!< point at idx is stored. treeIndex[idx]=-1
                              //!< means that point has been removed.

  KDTreeSingleIndexAdaptorParams index_params;

  int dim; //!< Dimensionality of each data point

  typedef KDTreeSingleIndexDynamicAdaptor_<Distance, DatasetAdaptor, DIM>
      index_container_t;
  std::vector<index_container_t> index;

public:
  /** Get a const ref to the internal list of indices; the number of indices is
   * adapted dynamically as the dataset grows in size. */
  const std::vector<index_container_t> &getAllIndices() const { return index; }

private:
  /** finds position of least significant unset bit */
  int First0Bit(IndexType num) {
    int pos = 0;
    while (num & 1) {
      num = num >> 1;
      pos++;
    }
    return pos;
  }

  /** Creates multiple empty trees to handle dynamic support */
  void init() {
    typedef KDTreeSingleIndexDynamicAdaptor_<Distance, DatasetAdaptor, DIM>
        my_kd_tree_t;
    std::vector<my_kd_tree_t> index_(
        treeCount, my_kd_tree_t(dim /*dim*/, dataset, treeIndex, index_params));
    index = index_;
  }

public:
  Distance distance;

  /**
   * KDTree constructor
   *
   * Refer to docs in README.md or online in
   * https://github.com/jlblancoc/nanoflann
   *
   * The KD-Tree point dimension (the length of each point in the datase, e.g. 3
   * for 3D points) is determined by means of:
   *  - The \a DIM template parameter if >0 (highest priority)
   *  - Otherwise, the \a dimensionality parameter of this constructor.
   *
   * @param inputData Dataset with the input features
   * @param params Basically, the maximum leaf node size
   */
  KDTreeSingleIndexDynamicAdaptor(const int dimensionality,
                                  const DatasetAdaptor &inputData,
                                  const KDTreeSingleIndexAdaptorParams &params =
                                      KDTreeSingleIndexAdaptorParams(),
                                  const size_t maximumPointCount = 1000000000U)
      : dataset(inputData), index_params(params), distance(inputData) {
    treeCount = static_cast<size_t>(std::log2(maximumPointCount));
    pointCount = 0U;
    dim = dimensionality;
    treeIndex.clear();
    if (DIM > 0)
      dim = DIM;
    m_leaf_max_size = params.leaf_max_size;
    init();
    const size_t num_initial_points = dataset.kdtree_get_point_count();
    if (num_initial_points > 0) {
      addPoints(0, num_initial_points - 1);
    }
  }

  /** Deleted copy constructor*/
  KDTreeSingleIndexDynamicAdaptor(
      const KDTreeSingleIndexDynamicAdaptor<Distance, DatasetAdaptor, DIM,
                                            IndexType> &) = delete;

  /** Add points to the set, Inserts all points from [start, end] */
  void addPoints(IndexType start, IndexType end) {
    size_t count = end - start + 1;
    treeIndex.resize(treeIndex.size() + count);
    for (IndexType idx = start; idx <= end; idx++) {
      int pos = First0Bit(pointCount);
      index[pos].vind.clear();
      treeIndex[pointCount] = pos;
      for (int i = 0; i < pos; i++) {
        for (int j = 0; j < static_cast<int>(index[i].vind.size()); j++) {
          index[pos].vind.push_back(index[i].vind[j]);
          if (treeIndex[index[i].vind[j]] != -1)
            treeIndex[index[i].vind[j]] = pos;
        }
        index[i].vind.clear();
        index[i].freeIndex(index[i]);
      }
      index[pos].vind.push_back(idx);
      index[pos].buildIndex();
      pointCount++;
    }
  }

  /** Remove a point from the set (Lazy Deletion) */
  void removePoint(size_t idx) {
    if (idx >= pointCount)
      return;
    treeIndex[idx] = -1;
  }

  /**
   * Find set of nearest neighbors to vec[0:dim-1]. Their indices are stored
   * inside the result object.
   *
   * Params:
   *     result = the result object in which the indices of the
   * nearest-neighbors are stored vec = the vector for which to search the
   * nearest neighbors
   *
   * \tparam RESULTSET Should be any ResultSet<DistanceType>
   * \return  True if the requested neighbors could be found.
   * \sa knnSearch, radiusSearch
   */
  template <typename RESULTSET>
  bool findNeighbors(RESULTSET &result, const ElementType *vec,
                     const SearchParams &searchParams) const {
    for (size_t i = 0; i < treeCount; i++) {
      index[i].findNeighbors(result, &vec[0], searchParams);
    }
    return result.full();
  }
};

/** An L2-metric KD-tree adaptor for working with data directly stored in an
 * Eigen Matrix, without duplicating the data storage. Each row in the matrix
 * represents a point in the state space.
 *
 *  Example of usage:
 * \code
 * 	Eigen::Matrix<num_t,Dynamic,Dynamic>  mat;
 * 	// Fill out "mat"...
 *
 * 	typedef KDTreeEigenMatrixAdaptor< Eigen::Matrix<num_t,Dynamic,Dynamic> >
 * my_kd_tree_t; const int max_leaf = 10; my_kd_tree_t   mat_index(mat, max_leaf
 * ); mat_index.index->buildIndex(); mat_index.index->... \endcode
 *
 *  \tparam DIM If set to >0, it specifies a compile-time fixed dimensionality
 * for the points in the data set, allowing more compiler optimizations. \tparam
 * Distance The distance metric to use: nanoflann::metric_L1,
 * nanoflann::metric_L2, nanoflann::metric_L2_Simple, etc.
 */
template <class MatrixType, int DIM = -1, class Distance = nanoflann::metric_L2>
struct KDTreeEigenMatrixAdaptor {
  typedef KDTreeEigenMatrixAdaptor<MatrixType, DIM, Distance> self_t;
  typedef typename MatrixType::Scalar num_t;
  typedef typename MatrixType::Index IndexType;
  typedef
      typename Distance::template traits<num_t, self_t>::distance_t metric_t;
  typedef KDTreeSingleIndexAdaptor<metric_t, self_t,
                                   MatrixType::ColsAtCompileTime, IndexType>
      index_t;

  index_t *index; //! The kd-tree index for the user to call its methods as
                  //! usual with any other FLANN index.

  /// Constructor: takes a const ref to the matrix object with the data points
  KDTreeEigenMatrixAdaptor(const size_t dimensionality,
                           const std::reference_wrapper<const MatrixType> &mat,
                           const int leaf_max_size = 10)
      : m_data_matrix(mat) {
    const auto dims = mat.get().cols();
    if (size_t(dims) != dimensionality)
      throw std::runtime_error(
          "Error: 'dimensionality' must match column count in data matrix");
    if (DIM > 0 && int(dims) != DIM)
      throw std::runtime_error(
          "Data set dimensionality does not match the 'DIM' template argument");
    index =
        new index_t(static_cast<int>(dims), *this /* adaptor */,
                    nanoflann::KDTreeSingleIndexAdaptorParams(leaf_max_size));
    index->buildIndex();
  }

public:
  /** Deleted copy constructor */
  KDTreeEigenMatrixAdaptor(const self_t &) = delete;

  ~KDTreeEigenMatrixAdaptor() { delete index; }

  const std::reference_wrapper<const MatrixType> m_data_matrix;

  /** Query for the \a num_closest closest points to a given point (entered as
   * query_point[0:dim-1]). Note that this is a short-cut method for
   * index->findNeighbors(). The user can also call index->... methods as
   * desired. \note nChecks_IGNORED is ignored but kept for compatibility with
   * the original FLANN interface.
   */
  inline void query(const num_t *query_point, const size_t num_closest,
                    IndexType *out_indices, num_t *out_distances_sq,
                    const int /* nChecks_IGNORED */ = 10) const {
    nanoflann::KNNResultSet<num_t, IndexType> resultSet(num_closest);
    resultSet.init(out_indices, out_distances_sq);
    index->findNeighbors(resultSet, query_point, nanoflann::SearchParams());
  }

  /** @name Interface expected by KDTreeSingleIndexAdaptor
   * @{ */

  const self_t &derived() const { return *this; }
  self_t &derived() { return *this; }

  // Must return the number of data points
  inline size_t kdtree_get_point_count() const {
    return m_data_matrix.get().rows();
  }

  // Returns the dim'th component of the idx'th point in the class:
  inline num_t kdtree_get_pt(const IndexType idx, size_t dim) const {
    return m_data_matrix.get().coeff(idx, IndexType(dim));
  }

  // Optional bounding-box computation: return false to default to a standard
  // bbox computation loop.
  //   Return true if the BBOX was already computed by the class and returned in
  //   "bb" so it can be avoided to redo it again. Look at bb.size() to find out
  //   the expected dimensionality (e.g. 2 or 3 for point clouds)
  template <class BBOX> bool kdtree_get_bbox(BBOX & /*bb*/) const {
    return false;
  }

  /** @} */

}; // end of KDTreeEigenMatrixAdaptor
   /** @} */

/** @} */ // end of grouping
} // namespace nanoflann

#endif /* NANOFLANN_HPP_ */


================================================
FILE: thirdparty/kpconv/kernels/kernel_points.py
================================================

#
#
#      0=================================0
#      |    Kernel Point Convolutions    |
#      0=================================0
#
#
# ----------------------------------------------------------------------------------------------------------------------
#
#      Functions handling the disposition of kernel points.
#
# ----------------------------------------------------------------------------------------------------------------------
#
#      Hugues THOMAS - 11/06/2018
#

import time
import numpy as np
from os import makedirs
from os.path import join, exists
# from lib.ply import read_ply, write_ply
from kpconv.lib.ply import read_ply, write_ply


# ------------------------------------------------------------------------------------------
#
#           Functions
#       \***************/
#
#

def create_3D_rotations(axis, angle):
    """
    Create rotation matrices from a list of axes and angles. Code from wikipedia on quaternions
    :param axis: float32[N, 3]
    :param angle: float32[N,]
    :return: float32[N, 3, 3]
    """

    t1 = np.cos(angle)
    t2 = 1 - t1
    t3 = axis[:, 0] * axis[:, 0]
    t6 = t2 * axis[:, 0]
    t7 = t6 * axis[:, 1]
    t8 = np.sin(angle)
    t9 = t8 * axis[:, 2]
    t11 = t6 * axis[:, 2]
    t12 = t8 * axis[:, 1]
    t15 = axis[:, 1] * axis[:, 1]
    t19 = t2 * axis[:, 1] * axis[:, 2]
    t20 = t8 * axis[:, 0]
    t24 = axis[:, 2] * axis[:, 2]
    R = np.stack([t1 + t2 * t3,
                  t7 - t9,
                  t11 + t12,
                  t7 + t9,
                  t1 + t2 * t15,
                  t19 - t20,
                  t11 - t12,
                  t19 + t20,
                  t1 + t2 * t24], axis=1)

    return np.reshape(R, (-1, 3, 3))


def spherical_Lloyd(radius, num_cells, dimension=3, fixed='center', approximation='monte-carlo',
                    approx_n=5000, max_iter=500, momentum=0.9, verbose=0):
    """
    Creation of kernel point via Lloyd algorithm. We use an approximation of the algorithm, and compute the Voronoi
    cell centers with discretization  of space. The exact formula is not trivial with part of the sphere as sides.
    :param radius: Radius of the kernels
    :param num_cells: Number of cell (kernel points) in the Voronoi diagram.
    :param dimension: dimension of the space
    :param fixed: fix position of certain kernel points ('none', 'center' or 'verticals')
    :param approximation: Approximation method for Lloyd's algorithm ('discretization', 'monte-carlo')
    :param approx_n: Number of point used for approximation.
    :param max_iter: Maximum nu;ber of iteration for the algorithm.
    :param momentum: Momentum of the low pass filter smoothing kernel point positions
    :param verbose: display option
    :return: points [num_kernels, num_points, dimension]
    """

    #######################
    # Parameters definition
    #######################

    # Radius used for optimization (points are rescaled afterwards)
    radius0 = 1.0

    #######################
    # Kernel initialization
    #######################

    # Random kernel points (Uniform distribution in a sphere)
    kernel_points = np.zeros((0, dimension))
    while kernel_points.shape[0] < num_cells:
        new_points = np.random.rand(num_cells, dimension) * 2 * radius0 - radius0
        kernel_points = np.vstack((kernel_points, new_points))
        d2 = np.sum(np.power(kernel_points, 2), axis=1)
        kernel_points = kernel_points[np.logical_and(d2 < radius0 ** 2, (0.9 * radius0) ** 2 < d2), :]
    kernel_points = kernel_points[:num_cells, :].reshape((num_cells, -1))

    # Optional fixing
    if fixed == 'center':
        kernel_points[0, :] *= 0
    if fixed == 'verticals':
        kernel_points[:3, :] *= 0
        kernel_points[1, -1] += 2 * radius0 / 3
        kernel_points[2, -1] -= 2 * radius0 / 3

    ##############################
    # Approximation initialization
    ##############################

    # Initialize figure
    if verbose > 1:
        fig = plt.figure()

    # Initialize discretization in this method is chosen
    if approximation == 'discretization':
        side_n = int(np.floor(approx_n ** (1. / dimension)))
        dl = 2 * radius0 / side_n
        coords = np.arange(-radius0 + dl/2, radius0, dl)
        if dimension == 2:
            x, y = np.meshgrid(coords, coords)
            X = np.vstack((np.ravel(x), np.ravel(y))).T
        elif dimension == 3:
            x, y, z = np.meshgrid(coords, coords, coords)
            X = np.vstack((np.ravel(x), np.ravel(y), np.ravel(z))).T
        elif dimension == 4:
            x, y, z, t = np.meshgrid(coords, coords, coords, coords)
            X = np.vstack((np.ravel(x), np.ravel(y), np.ravel(z), np.ravel(t))).T
        else:
            raise ValueError('Unsupported dimension (max is 4)')
    elif approximation == 'monte-carlo':
        X = np.zeros((0, dimension))
    else:
        raise ValueError('Wrong approximation method chosen: "{:s}"'.format(approximation))

    # Only points inside the sphere are used
    d2 = np.sum(np.power(X, 2), axis=1)
    X = X[d2 < radius0 * radius0, :]

    #####################
    # Kernel optimization
    #####################

    # Warning if at least one kernel point has no cell
    warning = False

    # moving vectors of kernel points saved to detect convergence
    max_moves = np.zeros((0,))

    for iter in range(max_iter):

        # In the case of monte-carlo, renew the sampled points
        if approximation == 'monte-carlo':
            X = np.random.rand(approx_n, dimension) * 2 * radius0 - radius0
            d2 = np.sum(np.power(X, 2), axis=1)
            X = X[d2 < radius0 * radius0, :]

        # Get the distances matrix [n_approx, K, dim]
        differences = np.expand_dims(X, 1) - kernel_points
        sq_distances = np.sum(np.square(differences), axis=2)

        # Compute cell centers
        cell_inds = np.argmin(sq_distances, axis=1)
        centers = []
        for c in range(num_cells):
            bool_c = (cell_inds == c)
            num_c = np.sum(bool_c.astype(np.int32))
            if num_c > 0:
                centers.append(np.sum(X[bool_c, :], axis=0) / num_c)
            else:
                warning = True
                centers.append(kernel_points[c])

        # Update kernel points with low pass filter to smooth mote carlo
        centers = np.vstack(centers)
        moves = (1 - momentum) * (centers - kernel_points)
        kernel_points += moves

        # Check moves for convergence
        max_moves = np.append(max_moves, np.max(np.linalg.norm(moves, axis=1)))

        # Optional fixing
        if fixed == 'center':
            kernel_points[0, :] *= 0
        if fixed == 'verticals':
            kernel_points[0, :] *= 0
            kernel_points[:3, :-1] *= 0

        if verbose:
            print('iter {:5d} / max move = {:f}'.format(iter, np.max(np.linalg.norm(moves, axis=1))))
            if warning:
                print('{:}WARNING: at least one point has no cell{:}'.format(bcolors.WARNING, bcolors.ENDC))
        if verbose > 1:
            plt.clf()
            plt.scatter(X[:, 0], X[:, 1], c=cell_inds, s=20.0,
                        marker='.', cmap=plt.get_cmap('tab20'))
            #plt.scatter(kernel_points[:, 0], kernel_points[:, 1], c=np.arange(num_cells), s=100.0,
            #            marker='+', cmap=plt.get_cmap('tab20'))
            plt.plot(kernel_points[:, 0], kernel_points[:, 1], 'k+')
            circle = plt.Circle((0, 0), radius0, color='r', fill=False)
            fig.axes[0].add_artist(circle)
            fig.axes[0].set_xlim((-radius0 * 1.1, radius0 * 1.1))
            fig.axes[0].set_ylim((-radius0 * 1.1, radius0 * 1.1))
            fig.axes[0].set_aspect('equal')
            plt.draw()
            plt.pause(0.001)
            plt.show(block=False)

    ###################
    # User verification
    ###################

    # Show the convergence to ask user if this kernel is correct
    if verbose:
        if dimension == 2:
            fig, (ax1, ax2) = plt.subplots(1, 2, figsize=[10.4, 4.8])
            ax1.plot(max_moves)
            ax2.scatter(X[:, 0], X[:, 1], c=cell_inds, s=20.0,
                        marker='.', cmap=plt.get_cmap('tab20'))
            # plt.scatter(kernel_points[:, 0], kernel_points[:, 1], c=np.arange(num_cells), s=100.0,
            #            marker='+', cmap=plt.get_cmap('tab20'))
            ax2.plot(kernel_points[:, 0], kernel_points[:, 1], 'k+')
            circle = plt.Circle((0, 0), radius0, color='r', fill=False)
            ax2.add_artist(circle)
            ax2.set_xlim((-radius0 * 1.1, radius0 * 1.1))
            ax2.set_ylim((-radius0 * 1.1, radius0 * 1.1))
            ax2.set_aspect('equal')
            plt.title('Check if kernel is correct.')
            plt.draw()
            plt.show()

        if dimension > 2:
            plt.figure()
            plt.plot(max_moves)
            plt.title('Check if kernel is correct.')
            plt.show()

    # Rescale kernels with real radius
    return kernel_points * radius


def kernel_point_optimization_debug(radius, num_points, num_kernels=1, dimension=3,
                                    fixed='center', ratio=0.66, verbose=0):
    """
    Creation of kernel point via optimization of potentials.
    :param radius: Radius of the kernels
    :param num_points: points composing kernels
    :param num_kernels: number of wanted kernels
    :param dimension: dimension of the space
    :param fixed: fix position of certain kernel points ('none', 'center' or 'verticals')
    :param ratio: ratio of the radius where you want the kernels points to be placed
    :param verbose: display option
    :return: points [num_kernels, num_points, dimension]
    """

    #######################
    # Parameters definition
    #######################

    # Radius used for optimization (points are rescaled afterwards)
    radius0 = 1
    diameter0 = 2

    # Factor multiplicating gradients for moving points (~learning rate)
    moving_factor = 1e-2
    continuous_moving_decay = 0.9995

    # Gradient threshold to stop optimization
    thresh = 1e-5

    # Gradient clipping value
    clip = 0.05 * radius0

    #######################
    # Kernel initialization
    #######################
    # import pdb 
    # pdb.set_trace()

    # Random kernel points
    kernel_points = np.random.rand(num_kernels * num_points - 1, dimension) * diameter0 - radius0
    while (kernel_points.shape[0] < num_kernels * num_points):
        new_points = np.random.rand(num_kernels * num_points - 1, dimension) * diameter0 - radius0
        kernel_points = np.vstack((kernel_points, new_points))
        d2 = np.sum(np.power(kernel_points, 2), axis=1)
        kernel_points = kernel_points[d2 < 0.5 * radius0 * radius0, :]
    kernel_points = kernel_points[:num_kernels * num_points, :].reshape((num_kernels, num_points, -1))

    # Optionnal fixing
    if fixed == 'center':
        kernel_points[:, 0, :] *= 0
    if fixed == 'verticals':
        kernel_points[:, :3, :] *= 0
        kernel_points[:, 1, -1] += 2 * radius0 / 3
        kernel_points[:, 2, -1] -= 2 * radius0 / 3

    #####################
    # Kernel optimization
    #####################

    # Initialize figure
    if verbose>1:
        fig = plt.figure()

    saved_gradient_norms = np.zeros((10000, num_kernels))
    old_gradient_norms = np.zeros((num_kernels, num_points))
    for iter in range(10000):

        # Compute gradients
        # *****************

        # Derivative of the sum of potentials of all points
        A = np.expand_dims(kernel_points, axis=2)
        B = np.expand_dims(kernel_points, axis=1)
        interd2 = np.sum(np.power(A - B, 2), axis=-1)
        inter_grads = (A - B) / (np.power(np.expand_dims(interd2, -1), 3/2) + 1e-6)
        inter_grads = np.sum(inter_grads, axis=1)

        # Derivative of the radius potential
        circle_grads = 10*kernel_points

        # All gradients
        gradients = inter_grads + circle_grads

        if fixed == 'verticals':
            gradients[:, 1:3, :-1] = 0

        # Stop condition
        # **************

        # Compute norm of gradients
        gradients_norms = np.sqrt(np.sum(np.power(gradients, 2), axis=-1))
        saved_gradient_norms[iter, :] = np.max(gradients_norms, axis=1)

        # Stop if all moving points are gradients fixed (low gradients diff)

        if fixed == 'center' and np.max(np.abs(old_gradient_norms[:, 1:] - gradients_norms[:, 1:])) < thresh:
            break
        elif fixed == 'verticals' and np.max(np.abs(old_gradient_norms[:, 3:] - gradients_norms[:, 3:])) < thresh:
            break
        elif np.max(np.abs(old_gradient_norms - gradients_norms)) < thresh:
            break
        old_gradient_norms = gradients_norms

        # Move points
        # ***********

        # Clip gradient to get moving dists
        moving_dists = np.minimum(moving_factor * gradients_norms, clip)

        # Fix central point
        if fixed == 'center':
            moving_dists[:, 0] = 0
        if fixed == 'verticals':
            moving_dists[:, 0] = 0

        # Move points
        kernel_points -= np.expand_dims(moving_dists, -1) * gradients / np.expand_dims(gradients_norms + 1e-6, -1)

        if verbose:
            print('iter {:5d} / max grad = {:f}'.format(iter, np.max(gradients_norms[:, 3:])))
        if verbose > 1:
            plt.clf()
            plt.plot(kernel_points[0, :, 0], kernel_points[0, :, 1], '.')
            circle = plt.Circle((0, 0), radius, color='r', fill=False)
            fig.axes[0].add_artist(circle)
            fig.axes[0].set_xlim((-radius*1.1, radius*1.1))
            fig.axes[0].set_ylim((-radius*1.1, radius*1.1))
            fig.axes[0].set_aspect('equal')
            plt.draw()
            plt.pause(0.001)
            plt.show(block=False)
            print(moving_factor)

        # moving factor decay
        moving_factor *= continuous_moving_decay

    # Rescale radius to fit the wanted ratio of radius
    r = np.sqrt(np.sum(np.power(kernel_points, 2), axis=-1))
    kernel_points *= ratio / np.mean(r[:, 1:])

    # Rescale kernels with real radius
    return kernel_points * radius, saved_gradient_norms


def load_kernels(radius, num_kpoints, dimension, fixed, lloyd=False):

    # Kernel directory
    kernel_dir = 'kernels/dispositions'
    if not exists(kernel_dir):
        makedirs(kernel_dir)

    # To many points switch to Lloyds
    if num_kpoints > 30:
        lloyd = True

    # Kernel_file
    kernel_file = join(kernel_dir, 'k_{:03d}_{:s}_{:d}D.ply'.format(num_kpoints, fixed, dimension))

    # Check if already done
    if not exists(kernel_file):
        if lloyd:
            # Create kernels
            kernel_points = spherical_Lloyd(1.0,
                                            num_kpoints,
                                            dimension=dimension,
                                            fixed=fixed,
                                            verbose=0)

        else:
            # Create kernels
        
            kernel_points, grad_norms = kernel_point_optimization_debug(1.0,
                                                                        num_kpoints,
                                                                        num_kernels=100,
                                                                        dimension=dimension,
                                                                        fixed=fixed,
                                                                        verbose=0)

            # Find best candidate
            best_k = np.argmin(grad_norms[-1, :])

            # Save points
            kernel_points = kernel_points[best_k, :, :]

        write_ply(kernel_file, kernel_points, ['x', 'y', 'z'])

    else:
        data = read_ply(kernel_file)
        kernel_points = np.vstack((data['x'], data['y'], data['z'])).T

    # Random roations for the kernel
    # N.B. 4D random rotations not supported yet
    R = np.eye(dimension)
    theta = np.random.rand() * 2 * np.pi
    if dimension == 2:
        if fixed != 'vertical':
            c, s = np.cos(theta), np.sin(theta)
            R = np.array([[c, -s], [s, c]], dtype=np.float32)

    elif dimension == 3:
        if fixed != 'vertical':
            c, s = np.cos(theta), np.sin(theta)
            R = np.array([[c, -s, 0], [s, c, 0], [0, 0, 1]], dtype=np.float32)

        else:
            phi = (np.random.rand() - 0.5) * np.pi

            # Create the first vector in carthesian coordinates
            u = np.array([np.cos(theta) * np.cos(phi), np.sin(theta) * np.cos(phi), np.sin(phi)])

            # Choose a random rotation angle
            alpha = np.random.rand() * 2 * np.pi

            # Create the rotation matrix with this vector and angle
            R = create_3D_rotations(np.reshape(u, (1, -1)), np.reshape(alpha, (1, -1)))[0]

            R = R.astype(np.float32)

    # Add a small noise
    kernel_points = kernel_points + np.random.normal(scale=0.01, size=kernel_points.shape)

    # Scale kernels
    kernel_points = radius * kernel_points

    # Rotate kernels
    kernel_points = np.matmul(kernel_points, R)

    return kernel_points.astype(np.float32)


================================================
FILE: thirdparty/kpconv/kpconv_blocks.py
================================================
#
#
#      0=================================0
#      |    Kernel Point Convolutions    |
#      0=================================0
#
#
# ----------------------------------------------------------------------------------------------------------------------
#
#      Define network blocks
#
# ----------------------------------------------------------------------------------------------------------------------
#
#      Hugues THOMAS - 06/03/2020

import time
import math
import torch
import torch.nn as nn
from torch.nn.parameter import Parameter
from torch.nn.init import kaiming_uniform_
# from kernels.kernel_points import load_kernels
from kpconv.kernels.kernel_points import load_kernels

# from lib.ply import write_ply
from kpconv.lib.ply import write_ply


def gather(x, idx, method=2):
    """
    implementation of a custom gather operation for faster backwards.
    :param x: input with shape [N, D_1, ... D_d]
    :param idx: indexing with shape [n_1, ..., n_m]
    :param method: Choice of the method
    :return: x[idx] with shape [n_1, ..., n_m, D_1, ... D_d]
    """

    if method == 0:
        return x[idx]
    elif method == 1:
        x = x.unsqueeze(1)
        x = x.expand((-1, idx.shape[-1], -1))
        idx = idx.unsqueeze(2)
        idx = idx.expand((-1, -1, x.shape[-1]))
        return x.gather(0, idx)
    elif method == 2:
        for i, ni in enumerate(idx.size()[1:]):
            x = x.unsqueeze(i+1)
            new_s = list(x.size())
            new_s[i+1] = ni
            x = x.expand(new_s)
        n = len(idx.size())
        for i, di in enumerate(x.size()[n:]):
            idx = idx.unsqueeze(i+n)
            new_s = list(idx.size())
            new_s[i+n] = di
            idx = idx.expand(new_s)
        return x.gather(0, idx)
    else:
        raise ValueError('Unkown method')


def radius_gaussian(sq_r, sig, eps=1e-9):
    """
    Compute a radius gaussian (gaussian of distance)
    :param sq_r: input radiuses [dn, ..., d1, d0]
    :param sig: extents of gaussians [d1, d0] or [d0] or float
    :return: gaussian of sq_r [dn, ..., d1, d0]
    """
    return torch.exp(-sq_r / (2 * sig**2 + eps))


def closest_pool(x, inds):
    """
    Pools features from the closest neighbors. WARNING: this function assumes the neighbors are ordered.
    :param x: [n1, d] features matrix
    :param inds: [n2, max_num] Only the first column is used for pooling
    :return: [n2, d] pooled features matrix
    """

    # Add a last row with minimum features for shadow pools
    x = torch.cat((x, torch.zeros_like(x[:1, :])), 0)

    # Get features for each pooling location [n2, d]
    return gather(x, inds[:, 0])


def max_pool(x, inds):
    """
    Pools features with the maximum values.
    :param x: [n1, d] features matrix
    :param inds: [n2, max_num] pooling indices
    :return: [n2, d] pooled features matrix
    """

    # Add a last row with minimum features for shadow pools
    x = torch.cat((x, torch.zeros_like(x[:1, :])), 0)

    # Get all features for each pooling location [n2, max_num, d]
    pool_features = gather(x, inds)

    # Pool the maximum [n2, d]
    max_features, _ = torch.max(pool_features, 1)
    return max_features


def global_average(x, batch_lengths):
    """
    Block performing a global average over batch pooling
    :param x: [N, D] input features
    :param batch_lengths: [B] list of batch lengths
    :return: [B, D] averaged features
    """

    # Loop over the clouds of the batch
    averaged_features = []
    i0 = 0
    for b_i, length in enumerate(batch_lengths):

        # Average features for each batch cloud
        averaged_features.append(torch.mean(x[i0:i0 + length], dim=0))

        # Increment for next cloud
        i0 += length

    # Average features in each batch
    return torch.stack(averaged_features)


# ----------------------------------------------------------------------------------------------------------------------
#
#           KPConv class
#       \******************/
#


class KPConv(nn.Module):

    def __init__(self, kernel_size, p_dim, in_channels, out_channels, KP_extent, radius,
                 fixed_kernel_points='center', KP_influence='linear', aggregation_mode='sum',
                 deformable=False, modulated=False):
        """
        Initialize parameters for KPConvDeformable.
        :param kernel_size: Number of kernel points.
        :param p_dim: dimension of the point space.
        :param in_channels: dimension of input features.
        :param out_channels: dimension of output features.
        :param KP_extent: influence radius of each kernel point.
        :param radius: radius used for kernel point init. Even for deformable, use the config.conv_radius
        :param fixed_kernel_points: fix position of certain kernel points ('none', 'center' or 'verticals').
        :param KP_influence: influence function of the kernel points ('constant', 'linear', 'gaussian').
        :param aggregation_mode: choose to sum influences, or only keep the closest ('closest', 'sum').
        :param deformable: choose deformable or not
        :param modulated: choose if kernel weights are modulated in addition to deformed
        """
        super(KPConv, self).__init__()

        # Save parameters
        self.K = kernel_size
        self.p_dim = p_dim
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.radius = radius
        self.KP_extent = KP_extent
        self.fixed_kernel_points = fixed_kernel_points
        self.KP_influence = KP_influence
        self.aggregation_mode = aggregation_mode
        self.deformable = deformable
        self.modulated = modulated

        # Running variable containing deformed KP distance to input points. (used in regularization loss)
        self.min_d2 = None
        self.deformed_KP = None
        self.offset_features = None

        # Initialize weights
        self.weights = Parameter(torch.zeros((self.K, in_channels, out_channels), dtype=torch.float32),
                                 requires_grad=True)

        # Initiate weights for offsets
        if deformable:
            if modulated:
                self.offset_dim = (self.p_dim + 1) * self.K
            else:
                self.offset_dim = self.p_dim * self.K
            self.offset_conv = KPConv(self.K,
                                      self.p_dim,
                                      self.in_channels,
                                      self.offset_dim,
                                      KP_extent,
                                      radius,
                                      fixed_kernel_points=fixed_kernel_points,
                                      KP_influence=KP_influence,
                                      aggregation_mode=aggregation_mode)
            self.offset_bias = Parameter(torch.zeros(self.offset_dim, dtype=torch.float32), requires_grad=True)

        else:
            self.offset_dim = None
            self.offset_conv = None
            self.offset_bias = None

        # Reset parameters
        self.reset_parameters()
        
        # Initialize kernel points
        self.kernel_points = self.init_KP()

        return

    def reset_parameters(self):
        kaiming_uniform_(self.weights, a=math.sqrt(5))
        if self.deformable:
            nn.init.zeros_(self.offset_bias)
        return

    def init_KP(self):
        """
        Initialize the kernel point positions in a sphere
        :return: the tensor of kernel points
        """

        # Create one kernel disposition (as numpy array). Choose the KP distance to center thanks to the KP extent
        K_points_numpy = load_kernels(self.radius,
                                      self.K,
                                      dimension=self.p_dim,
                                      fixed=self.fixed_kernel_points)

        return Parameter(torch.tensor(K_points_numpy, dtype=torch.float32),
                         requires_grad=False)

    def forward(self, q_pts, s_pts, neighb_inds, x):

        ###################
        # Offset generation
        ###################

        if self.deformable:

            # Get offsets with a KPConv that only takes part of the features
            self.offset_features = self.offset_conv(q_pts, s_pts, neighb_inds, x) + self.offset_bias

            if self.modulated:

                # Get offset (in normalized scale) from features
                unscaled_offsets = self.offset_features[:, :self.p_dim * self.K]
                unscaled_offsets = unscaled_offsets.view(-1, self.K, self.p_dim)

                # Get modulations
                modulations = 2 * torch.sigmoid(self.offset_features[:, self.p_dim * self.K:])

            else:

                # Get offset (in normalized scale) from features
                unscaled_offsets = self.offset_features.view(-1, self.K, self.p_dim)

                # No modulations
                modulations = None

            # Rescale offset for this layer
            offsets = unscaled_offsets * self.KP_extent

        else:
            offsets = None
            modulations = None

        ######################
        # Deformed convolution
        ######################

        # Add a fake point in the last row for shadow neighbors
        s_pts = torch.cat((s_pts, torch.zeros_like(s_pts[:1, :]) + 1e6), 0)

        # Get neighbor points [n_points, n_neighbors, dim]
        neighbors = s_pts[neighb_inds, :]

        # Center every neighborhood
        neighbors = neighbors - q_pts.unsqueeze(1)

        # Apply offsets to kernel points [n_points, n_kpoints, dim]
        if self.deformable:
            self.deformed_KP = offsets + self.kernel_points
            deformed_K_points = self.deformed_KP.unsqueeze(1)
        else:
            deformed_K_points = self.kernel_points

        # Get all difference matrices [n_points, n_neighbors, n_kpoints, dim]
        neighbors.unsqueeze_(2)
        differences = neighbors - deformed_K_points

        # Get the square distances [n_points, n_neighbors, n_kpoints]
        sq_distances = torch.sum(differences ** 2, dim=3)

        # Optimization by ignoring points outside a deformed KP range
        if self.deformable:

            # Save distances for loss
            self.min_d2, _ = torch.min(sq_distances, dim=1)

            # Boolean of the neighbors in range of a kernel point [n_points, n_neighbors]
            in_range = torch.any(sq_distances < self.KP_extent ** 2, dim=2).type(torch.int32)

            # New value of max neighbors
            new_max_neighb = torch.max(torch.sum(in_range, dim=1))

            # For each row of neighbors, indices of the ones that are in range [n_points, new_max_neighb]
            neighb_row_bool, neighb_row_inds = torch.topk(in_range, new_max_neighb.item(), dim=1)

            # Gather new neighbor indices [n_points, new_max_neighb]
            new_neighb_inds = neighb_inds.gather(1, neighb_row_inds, sparse_grad=False)

            # Gather new distances to KP [n_points, new_max_neighb, n_kpoints]
            neighb_row_inds.unsqueeze_(2)
            neighb_row_inds = neighb_row_inds.expand(-1, -1, self.K)
            sq_distances = sq_distances.gather(1, neighb_row_inds, sparse_grad=False)

            # New shadow neighbors have to point to the last shadow point
            new_neighb_inds *= neighb_row_bool
            new_neighb_inds -= (neighb_row_bool.type(torch.int64) - 1) * int(s_pts.shape[0] - 1)
        else:
            new_neighb_inds = neighb_inds

        # Get Kernel point influences [n_points, n_kpoints, n_neighbors]
        if self.KP_influence == 'constant':
            # Every point get an influence of 1.
            all_weights = torch.ones_like(sq_distances)
            all_weights = torch.transpose(all_weights, 1, 2)

        elif self.KP_influence == 'linear':
            # Influence decrease linearly with the distance, and get to zero when d = KP_extent.
            all_weights = torch.clamp(1 - torch.sqrt(sq_distances) / self.KP_extent, min=0.0)
            all_weights = torch.transpose(all_weights, 1, 2)

        elif self.KP_influence == 'gaussian':
            # Influence in gaussian of the distance.
            sigma = self.KP_extent * 0.3
            all_weights = radius_gaussian(sq_distances, sigma)
            all_weights = torch.transpose(all_weights, 1, 2)
        else:
            raise ValueError('Unknown influence function type (config.KP_influence)')

        # In case of closest mode, only the closest KP can influence each point
        if self.aggregation_mode == 'closest':
            neighbors_1nn = torch.argmin(sq_distances, dim=2)
            all_weights *= torch.transpose(nn.functional.one_hot(neighbors_1nn, self.K), 1, 2)

        elif self.aggregation_mode != 'sum':
            raise ValueError("Unknown convolution mode. Should be 'closest' or 'sum'")

        # Add a zero feature for shadow neighbors
        x = torch.cat((x, torch.zeros_like(x[:1, :])), 0)

        # Get the features of each neighborhood [n_points, n_neighbors, in_fdim]
        neighb_x = gather(x, new_neighb_inds)

        # Apply distance weights [n_points, n_kpoints, in_fdim]
        weighted_features = torch.matmul(all_weights, neighb_x)

        # Apply modulations
        if self.deformable and self.modulated:
            weighted_features *= modulations.unsqueeze(2)

        # Apply network weights [n_kpoints, n_points, out_fdim]
        weighted_features = weighted_features.permute((1, 0, 2))
        
        kernel_outputs = torch.matmul(weighted_features, self.weights)

        # Convolution sum [n_points, out_fdim]
        # return torch.sum(kernel_outputs, dim=0)
        output_features = torch.sum(kernel_outputs, dim=0, keepdim=False)

        # normalization term.
        neighbor_features_sum = torch.sum(neighb_x, dim=-1)
        neighbor_num = torch.sum(torch.gt(neighbor_features_sum, 0.0), dim=-1)
        neighbor_num = torch.max(neighbor_num, torch.ones_like(neighbor_num))
        output_features = output_features / neighbor_num.unsqueeze(1)

        return output_features

    def __repr__(self):
        return 'KPConv(radius: {:.2f}, extent: {:.2f}, in_feat: {:d}, out_feat: {:d})'.format(self.radius, self.KP_extent,
                                                                              self.in_channels,
                                                                              self.out_channels)

# ----------------------------------------------------------------------------------------------------------------------
#
#           Complex blocks
#       \********************/
#

def block_decider(block_name,
                  radius,
                  in_dim,
                  out_dim,
                  layer_ind,
                  config):

    if block_name == 'unary':
        return UnaryBlock(in_dim, out_dim, config.use_batch_norm, config.batch_norm_momentum)
    
    if block_name == 'last_unary':
        return LastUnaryBlock(in_dim, config.final_feats_dim+2, config.use_batch_norm, config.batch_norm_momentum)

    if block_name == 'last_unary_v2':
        return LastUnaryBlock(in_dim, config.final_feats_dim+config.keypoint_num+1, config.use_batch_norm, config.batch_norm_momentum)
    # if block_name == 'last_unary_pose':
    #     return LastUnaryBlock(in_dim, 7+1, config.use_batch_norm, config.batch_norm_momentum)


    elif block_name in ['simple',
                        'simple_deformable',
                        'simple_invariant',
                        'simple_equivariant',
                        'simple_strided',
                        'simple_deformable_strided',
                        'simple_invariant_strided',
                        'simple_equivariant_strided']:
        return SimpleBlock(block_name, in_dim, out_dim, radius, layer_ind, config)

    elif block_name in ['resnetb',
                        'resnetb_invariant',
                        'resnetb_equivariant',
                        'resnetb_deformable',
                        'resnetb_strided',
                        'resnetb_deformable_strided',
                        'resnetb_equivariant_strided',
                        'resnetb_invariant_strided']:
        return ResnetBottleneckBlock(block_name, in_dim, out_dim, radius, layer_ind, config)

    elif block_name == 'max_pool' or block_name == 'max_pool_wide':
        return MaxPoolBlock(layer_ind)

    elif block_name == 'global_average':
        return GlobalAverageBlock()

    elif block_name == 'nearest_upsample':
        return NearestUpsampleBlock(layer_ind)

    else:
        raise ValueError('Unknown block name in the architecture definition : ' + block_name)


class BatchNormBlock(nn.Module):

    def __init__(self, in_dim, use_bn, bn_momentum):
        """
        Initialize a batch normalization block. If network does not use batch normalization, replace with biases.
        :param in_dim: dimension input features
        :param use_bn: boolean indicating if we use Batch Norm
        :param bn_momentum: Batch norm momentum
        """
        super(BatchNormBlock, self).__init__()
        self.bn_momentum = bn_momentum
        self.use_bn = use_bn
        self.in_dim = in_dim
        if self.use_bn:
            #self.batch_norm = nn.BatchNorm1d(in_dim, momentum=bn_momentum)
            self.batch_norm = nn.InstanceNorm1d(in_dim, momentum=bn_momentum)
        else:
            self.bias = Parameter(torch.zeros(in_dim, dtype=torch.float32), requires_grad=True)
        return

    def reset_parameters(self):
        nn.init.zeros_(self.bias)

    def forward(self, x):
        if self.use_bn:

            x = x.unsqueeze(2)
            x = x.transpose(0, 2)
            x = self.batch_norm(x)
            x = x.transpose(0, 2)
            return x.squeeze()
        else:
            return x + self.bias

    def __repr__(self):
        return 'BatchNormBlock(in_feat: {:d}, momentum: {:.3f}, only_bias: {:s})'.format(self.in_dim,
                                                                                         self.bn_momentum,
                                                                                         str(not self.use_bn))


class UnaryBlock(nn.Module):

    def __init__(self, in_dim, out_dim, use_bn, bn_momentum, no_relu=False):
        """
        Initialize a standard unary block with its ReLU and BatchNorm.
        :param in_dim: dimension input features
        :param out_dim: dimension input features
        :param use_bn: boolean indicating if we use Batch Norm
        :param bn_momentum: Batch norm momentum
        """

        super(UnaryBlock, self).__init__()
        self.bn_momentum = bn_momentum
        self.use_bn = use_bn
        self.no_relu = no_relu
        self.in_dim = in_dim
        self.out_dim = out_dim
        self.mlp = nn.Linear(in_dim, out_dim, bias=False)
        self.batch_norm = BatchNormBlock(out_dim, self.use_bn, self.bn_momentum)
        if not no_relu:
            self.leaky_relu = nn.LeakyReLU(0.1)
        return

    def forward(self, x, batch=None):
        x = self.mlp(x)
        x = self.batch_norm(x)
        if not self.no_relu:
            x = self.leaky_relu(x)
        return x

    def __repr__(self):
        return 'UnaryBlock(in_feat: {:d}, out_feat: {:d}, BN: {:s}, ReLU: {:s})'.format(self.in_dim,
                                                                                        self.out_dim,
                                                                                        str(self.use_bn),
                                                                                        str(not self.no_relu))


class LastUnaryBlock(nn.Module):

    def __init__(self, in_dim, out_dim, use_bn, bn_momentum, no_relu=False):
        """
        Initialize a standard last_unary block without BN, ReLU.
        :param in_dim: dimension input features
        :param out_dim: dimension input features
        :param use_bn: boolean indicating if we use Batch Norm
        :param bn_momentum: Batch norm momentum
        """

        super(LastUnaryBlock, self).__init__()
        self.in_dim = in_dim
        self.out_dim = out_dim
        self.mlp = nn.Linear(in_dim, out_dim, bias=False)
        return

    def forward(self, x, batch=None):
        x = self.mlp(x)
        return x

    def __repr__(self):
        return 'LastUnaryBlock(in_feat: {:d}, out_feat: {:d})'.format(self.in_dim,
                                                                      self.out_dim)


class SimpleBlock(nn.Module):

    def __init__(self, block_name, in_dim, out_dim, radius, layer_ind, config):
        """
        Initialize a simple convolution block with its ReLU and BatchNorm.
        :param in_dim: dimension input features
        :param out_dim: dimension input features
        :param radius: current radius of convolution
        :param config: parameters
        """
        super(SimpleBlock, self).__init__()

        # get KP_extent from current radius
        current_extent = radius * config.KP_extent / config.conv_radius

        # Get other parameters
        self.bn_momentum = config.batch_norm_momentum
        self.use_bn = config.use_batch_norm
        self.layer_ind = layer_ind
        self.block_name = block_name
        self.in_dim = in_dim
        self.out_dim = out_dim

        # Define the KPConv class
        self.KPConv = KPConv(config.num_kernel_points,
                             config.in_points_dim,
                             in_dim,
                             out_dim // 2,
                             current_extent,
                             radius,
                             fixed_kernel_points=config.fixed_kernel_points,
                             KP_influence=config.KP_influence,
                             aggregation_mode=config.aggregation_mode,
                             deformable='deform' in block_name,
                             modulated=config.modulated)

        # Other opperations
        self.batch_norm = BatchNormBlock(out_dim // 2, self.use_bn, self.bn_momentum)
        self.leaky_relu = nn.LeakyReLU(0.1)

        return

    def forward(self, x, batch):

        if 'strided' in self.block_name:
            q_pts = batch['points'][self.layer_ind + 1]
            s_pts = batch['points'][self.layer_ind]
            neighb_inds = batch['pools'][self.layer_ind]
        else:
            q_pts = batch['points'][self.layer_ind]
            s_pts = batch['points'][self.layer_ind]
            neighb_inds = batch['neighbors'][self.layer_ind]

        x = self.KPConv(q_pts, s_pts, neighb_inds, x)
        return self.leaky_relu(self.batch_norm(x))


class ResnetBottleneckBlock(nn.Module):

    def __init__(self, block_name, in_dim, out_dim, radius, layer_ind, config):
        """
        Initialize a resnet bottleneck block.
        :param in_dim: dimension input features
        :param out_dim: dimension input features
        :param radius: current radius of convolution
        :param config: parameters
        """
        super(ResnetBottleneckBlock, self).__init__()

        # get KP_extent from current radius
        current_extent = radius * config.KP_extent / config.conv_radius

        # Get other parameters
        self.bn_momentum = config.batch_norm_momentum
        self.use_bn = config.use_batch_norm
        self.block_name = block_name
        self.layer_ind = layer_ind
        self.in_dim = in_dim
        self.out_dim = out_dim

        # First downscaling mlp
        if in_dim != out_dim // 4:
            self.unary1 = UnaryBlock(in_dim, out_dim // 4, self.use_bn, self.bn_momentum)
        else:
            self.unary1 = nn.Identity()

        # KPConv block
        self.KPConv = KPConv(config.num_kernel_points,
                             config.in_points_dim,
                             out_dim // 4,
                             out_dim // 4,
                             current_extent,
                             radius,
                             fixed_kernel_points=config.fixed_kernel_points,
                             KP_influence=config.KP_influence,
                             aggregation_mode=config.aggregation_mode,
                             deformable='deform' in block_name,
                             modulated=config.modulated)
        self.batch_norm_conv = BatchNormBlock(out_dim // 4, self.use_bn, self.bn_momentum)

        # Second upscaling mlp
        self.unary2 = UnaryBlock(out_dim // 4, out_dim, self.use_bn, self.bn_momentum, no_relu=True)

        # Shortcut optional mpl
        if in_dim != out_dim:
            self.unary_shortcut = UnaryBlock(in_dim, out_dim, self.use_bn, self.bn_momentum, no_relu=True)
        else:
            self.unary_shortcut = nn.Identity()

        # Other operations
        self.leaky_relu = nn.LeakyReLU(0.1)

        return

    def forward(self, features, batch):

        if 'strided' in self.block_name:
            q_pts = batch['points'][self.layer_ind + 1]
            s_pts = batch['points'][self.layer_ind]
            neighb_inds = batch['pools'][self.layer_ind]
        else:
            q_pts = batch['points'][self.layer_ind]
            s_pts = batch['points'][self.layer_ind]
            neighb_inds = batch['neighbors'][self.layer_ind]

        # First downscaling mlp
        x = self.unary1(features)

        # Convolution
        x = self.KPConv(q_pts, s_pts, neighb_inds, x)
        x = self.leaky_relu(self.batch_norm_conv(x))

        # Second upscaling mlp
        x = self.unary2(x)

        # Shortcut
        if 'strided' in self.block_name:
            shortcut = max_pool(features, neighb_inds)
        else:
            shortcut = features
        shortcut = self.unary_shortcut(shortcut)

        return self.leaky_relu(x + shortcut)


class GlobalAverageBlock(nn.Module):

    def __init__(self):
        """
        Initialize a global average block with its ReLU and BatchNorm.
        """
        super(GlobalAverageBlock, self).__init__()
        return

    def forward(self, x, batch):
        return global_average(x, batch['stack_lengths'][-1])


class NearestUpsampleBlock(nn.Module):

    def __init__(self, layer_ind):
        """
        Initialize a nearest upsampling block with its ReLU and BatchNorm.
        """
        super(NearestUpsampleBlock, self).__init__()
        self.layer_ind = layer_ind
        return

    def forward(self, x, batch):
        return closest_pool(x, batch['upsamples'][self.layer_ind - 1])

    def __repr__(self):
        return 'NearestUpsampleBlock(layer: {:d} -> {:d})'.format(self.layer_ind,
                                                                  self.layer_ind - 1)


class MaxPoolBlock(nn.Module):

    def __init__(self, layer_ind):
        """
        Initialize a max pooling block with its ReLU and BatchNorm.
        """
        super(MaxPoolBlock, self).__init__()
        self.layer_ind = layer_ind
        return

    def forward(self, x, batch):
        return max_pool(x, batch['pools'][self.layer_ind + 1])


================================================
FILE: thirdparty/kpconv/lib/__init__.py
================================================


================================================
FILE: thirdparty/kpconv/lib/ply.py
================================================
#
#
#      0===============================0
#      |    PLY files reader/writer    |
#      0===============================0
#
#
# ----------------------------------------------------------------------------------------------------------------------
#
#      function to read/write .ply files
#
# ----------------------------------------------------------------------------------------------------------------------
#
#      Hugues THOMAS - 10/02/2017
#


# ----------------------------------------------------------------------------------------------------------------------
#
#          Imports and global variables
#      \**********************************/
#

# Basic libs
import numpy as np
import sys

# Define PLY types
ply_dtypes = dict([
    (b'int8', 'i1'),
    (b'char', 'i1'),
    (b'uint8', 'u1'),
    (b'uchar', 'u1'),
    (b'int16', 'i2'),
    (b'short', 'i2'),
    (b'uint16', 'u2'),
    (b'ushort', 'u2'),
    (b'int32', 'i4'),
    (b'int', 'i4'),
    (b'uint32', 'u4'),
    (b'uint', 'u4'),
    (b'float32', 'f4'),
    (b'float', 'f4'),
    (b'float64', 'f8'),
    (b'double', 'f8')
])

# Numpy reader format
valid_formats = {'ascii': '', 'binary_big_endian': '>',
                 'binary_little_endian': '<'}


# ----------------------------------------------------------------------------------------------------------------------
#
#           Functions
#       \***************/
#


def parse_header(plyfile, ext):
    # Variables
    line = []
    properties = []
    num_points = None

    while b'end_header' not in line and line != b'':
        line = plyfile.readline()

        if b'element' in line:
            line = line.split()
            num_points = int(line[2])

        elif b'property' in line:
            line = line.split()
            properties.append((line[2].decode(), ext + ply_dtypes[line[1]]))

    return num_points, properties


def parse_mesh_header(plyfile, ext):
    # Variables
    line = []
    vertex_properties = []
    num_points = None
    num_faces = None
    current_element = None

    while b'end_header' not in line and line != b'':
        line = plyfile.readline()

        # Find point element
        if b'element vertex' in line:
            current_element = 'vertex'
            line = line.split()
            num_points = int(line[2])

        elif b'element face' in line:
            current_element = 'face'
            line = line.split()
            num_faces = int(line[2])

        elif b'property' in line:
            if current_element == 'vertex':
                line = line.split()
                vertex_properties.append((line[2].decode(), ext + ply_dtypes[line[1]]))
            elif current_element == 'vertex':
                if not line.startswith('property list uchar int'):
                    raise ValueError('Unsupported faces property : ' + line)

    return num_points, num_faces, vertex_properties


def read_ply(filename, triangular_mesh=False):
    """
    Read ".ply" files

    Parameters
    ----------
    filename : string
        the name of the file to read.

    Returns
    -------
    result : array
        data stored in the file

    Examples
    --------
    Store data in file

    >>> points = np.random.rand(5, 3)
    >>> values = np.random.randint(2, size=10)
    >>> write_ply('example.ply', [points, values], ['x', 'y', 'z', 'values'])

    Read the file

    >>> data = read_ply('example.ply')
    >>> values = data['values']
    array([0, 0, 1, 1, 0])

    >>> points = np.vstack((data['x'], data['y'], data['z'])).T
    array([[ 0.466  0.595  0.324]
           [ 0.538  0.407  0.654]
           [ 0.850  0.018  0.988]
           [ 0.395  0.394  0.363]
           [ 0.873  0.996  0.092]])

    """

    with open(filename, 'rb') as plyfile:

        # Check if the file start with ply
        if b'ply' not in plyfile.readline():
            raise ValueError('The file does not start whith the word ply')

        # get binary_little/big or ascii
        fmt = plyfile.readline().split()[1].decode()
        if fmt == "ascii":
            raise ValueError('The file is not binary')

        # get extension for building the numpy dtypes
        ext = valid_formats[fmt]

        # PointCloud reader vs mesh reader
        if triangular_mesh:

            # Parse header
            num_points, num_faces, properties = parse_mesh_header(plyfile, ext)

            # Get point data
            vertex_data = np.fromfile(plyfile, dtype=properties, count=num_points)

            # Get face data
            face_properties = [('k', ext + 'u1'),
                               ('v1', ext + 'i4'),
                               ('v2', ext + 'i4'),
                               ('v3', ext + 'i4')]
            faces_data = np.fromfile(plyfile, dtype=face_properties, count=num_faces)

            # Return vertex data and concatenated faces
            faces = np.vstack((faces_data['v1'], faces_data['v2'], faces_data['v3'])).T
            data = [vertex_data, faces]

        else:

            # Parse header
            num_points, properties = parse_header(plyfile, ext)

            # Get data
            data = np.fromfile(plyfile, dtype=properties, count=num_points)

    return data


def header_properties(field_list, field_names):
    # List of lines to write
    lines = []

    # First line describing element vertex
    lines.append('element vertex %d' % field_list[0].shape[0])

    # Properties lines
    i = 0
    for fields in field_list:
        for field in fields.T:
            lines.append('property %s %s' % (field.dtype.name, field_names[i]))
            i += 1

    return lines


def write_ply(filename, field_list, field_names, triangular_faces=None):
    """
    Write ".ply" files

    Parameters
    ----------
    filename : string
        the name of the file to which the data is saved. A '.ply' extension will be appended to the
        file name if it does no already have one.

    field_list : list, tuple, numpy array
        the fields to be saved in the ply file. Either a numpy array, a list of numpy arrays or a
        tuple of numpy arrays. Each 1D numpy array and each column of 2D numpy arrays are considered
        as one field.

    field_names : list
        the name of each fields as a list of strings. Has to be the same length as the number of
        fields.

    Examples
    --------
    >>> points = np.random.rand(10, 3)
    >>> write_ply('example1.ply', points, ['x', 'y', 'z'])

    >>> values = np.random.randint(2, size=10)
    >>> write_ply('example2.ply', [points, values], ['x', 'y', 'z', 'values'])

    >>> colors = np.random.randint(255, size=(10,3), dtype=np.uint8)
    >>> field_names = ['x', 'y', 'z', 'red', 'green', 'blue', values']
    >>> write_ply('example3.ply', [points, colors, values], field_names)

    """

    # Format list input to the right form
    field_list = list(field_list) if (type(field_list) == list or type(field_list) == tuple) else list((field_list,))
    for i, field in enumerate(field_list):
        if field.ndim < 2:
            field_list[i] = field.reshape(-1, 1)
        if field.ndim > 2:
            print('fields have more than 2 dimensions')
            return False

            # check all fields have the same number of data
    n_points = [field.shape[0] for field in field_list]
    if not np.all(np.equal(n_points, n_points[0])):
        print('wrong field dimensions')
        return False

        # Check if field_names and field_list have same nb of column
    n_fields = np.sum([field.shape[1] for field in field_list])
    if (n_fields != len(field_names)):
        print('wrong number of field names')
        return False

    # Add extension if not there
    if not filename.endswith('.ply'):
        filename += '.ply'

    # open in text mode to write the header
    with open(filename, 'w') as plyfile:

        # First magical word
        header = ['ply']

        # Encoding format
        header.append('format binary_' + sys.byteorder + '_endian 1.0')

        # Points properties description
        header.extend(header_properties(field_list, field_names))

        # Add faces if needded
        if triangular_faces is not None:
            header.append('element face {:d}'.format(triangular_faces.shape[0]))
            header.append('property list uchar int vertex_indices')

        # End of header
        header.append('end_header')

        # Write all lines
        for line in header:
            plyfile.write("%s\n" % line)

    # open in binary/append to use tofile
    with open(filename, 'ab') as plyfile:

        # Create a structured array
        i = 0
        type_list = []
        for fields in field_list:
            for field in fields.T:
                type_list += [(field_names[i], field.dtype.str)]
                i += 1
        data = np.empty(field_list[0].shape[0], dtype=type_list)
        i = 0
        for fields in field_list:
            for field in fields.T:
                data[field_names[i]] = field
                i += 1

        data.tofile(plyfile)

        if triangular_faces is not None:
            triangular_faces = triangular_faces.astype(np.int32)
            type_list = [('k', 'uint8')] + [(str(ind), 'int32') for ind in range(3)]
            data = np.empty(triangular_faces.shape[0], dtype=type_list)
            data['k'] = np.full((triangular_faces.shape[0],), 3, dtype=np.uint8)
            data['0'] = triangular_faces[:, 0]
            data['1'] = triangular_faces[:, 1]
            data['2'] = triangular_faces[:, 2]
            data.tofile(plyfile)

    return True


def describe_element(name, df):
    """ Takes the columns of the dataframe and builds a ply-like description

    Parameters
    ----------
    name: str
    df: pandas DataFrame

    Returns
    -------
    element: list[str]
    """
    property_formats = {'f': 'float', 'u': 'uchar', 'i': 'int'}
    element = ['element ' + name + ' ' + str(len(df))]

    if name == 'face':
        element.append("property list uchar int points_indices")

    else:
        for i in range(len(df.columns)):
            # get first letter of dtype to infer format
            f = property_formats[str(df.dtypes[i])[0]]
            element.append('property ' + f + ' ' + df.columns.values[i])

    return element


================================================
FILE: thirdparty/kpconv/lib/timer.py
================================================
import time


class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0.0
        self.sq_sum = 0.0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
        self.sq_sum += val ** 2 * n
        self.var = self.sq_sum / self.count - self.avg ** 2


class Timer(object):
    """A simple timer."""

    def __init__(self):
        self.total_time = 0.
        self.calls = 0
        self.start_time = 0.
        self.diff = 0.
        self.avg = 0.

    def reset(self):
        self.total_time = 0
        self.calls = 0
        self.start_time = 0
        self.diff = 0
        self.avg = 0

    def tic(self):
        # using time.time instead of time.clock because time time.clock
        # does not normalize for multithreading
        self.start_time = time.time()

    def toc(self, average=True):
        self.diff = time.time() - self.start_time
        self.total_time += self.diff
        self.calls += 1
        self.avg = self.total_time / self.calls
        if average:
            return self.avg
        else:
            return self.diff


================================================
FILE: thirdparty/kpconv/lib/utils.py
================================================
"""
General utility functions

Author: Shengyu Huang
Last modified: 30.11.2020
"""

import os,re,sys,json,yaml,random, argparse, torch, pickle
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from scipy.spatial.transform import Rotation

from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import minkowski
_EPS = 1e-7  # To prevent division by zero


class Logger:
    def __init__(self, path):
        self.path = path
        self.fw = open(self.path+'/log','a')

    def write(self, text):
        self.fw.write(text)
        self.fw.flush()

    def close(self):
        self.fw.close()

def save_obj(obj, path ):
    """
    save a dictionary to a pickle file
    """
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

def load_obj(path):
    """
    read a dictionary from a pickle file
    """
    with open(path, 'rb') as f:
        return pickle.load(f)

def load_config(path):
    """
    Loads config file:

    Args:
        path (str): path to the config file

    Returns: 
        config (dict): dictionary of the configuration parameters

    """
    with open(path,'r') as f:
        cfg = yaml.safe_load(f)

    return cfg


def setup_seed(seed):
    """
    fix random seed for deterministic training
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True

def square_distance(src, dst, normalised = False):
    """
    Calculate Euclid distance between each two points.
    Args:
        src: source points, [B, N, C]
        dst: target points, [B, M, C]
    Returns:
        dist: per-point square distance, [B, N, M]
    """
    # print(src.shape, dst.shape, '!!!',flush=True)
    B, N, _ = src.shape
    _, M, _ = dst.shape
    dist = -2 * torch.matmul(src, dst.permute(0, 2, 1))
    if(normalised):
        dist += 2
    else:
        dist += torch.sum(src ** 2, dim=-1)[:, :, None]
        dist += torch.sum(dst ** 2, dim=-1)[:, None, :]

    dist = torch.clamp(dist, min=1e-12, max=None)
    return dist
    

def validate_gradient(model):
    """
    Confirm all the gradients are non-nan and non-inf
    """
    for name, param in model.named_parameters():
        if param.grad is not None:
            if torch.any(torch.isnan(param.grad)):
                return False
            if torch.any(torch.isinf(param.grad)):
                return False
    return True


def natural_key(string_):
    """
    Sort strings by numbers in the name
    """
    return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_)]

================================================
FILE: thirdparty/nn/_ext.c
================================================
#define _CFFI_

/* We try to define Py_LIMITED_API before including Python.h.

   Mess: we can only define it if Py_DEBUG, Py_TRACE_REFS and
   Py_REF_DEBUG are not defined.  This is a best-effort approximation:
   we can learn about Py_DEBUG from pyconfig.h, but it is unclear if
   the same works for the other two macros.  Py_DEBUG implies them,
   but not the other way around.

   Issue #350 is still open: on Windows, the code here causes it to link
   with PYTHON36.DLL (for example) instead of PYTHON3.DLL.  A fix was
   attempted in 164e526a5515 and 14ce6985e1c3, but reverted: virtualenv
   does not make PYTHON3.DLL available, and so the "correctly" compiled
   version would not run inside a virtualenv.  We will re-apply the fix
   after virtualenv has been fixed for some time.  For explanation, see
   issue #355.  For a workaround if you want PYTHON3.DLL and don't worry
   about virtualenv, see issue #350.  See also 'py_limited_api' in
   setuptools_ext.py.
*/
#if !defined(_CFFI_USE_EMBEDDING) && !defined(Py_LIMITED_API)
#  include <pyconfig.h>
#  if !defined(Py_DEBUG) && !defined(Py_TRACE_REFS) && !defined(Py_REF_DEBUG)
#    define Py_LIMITED_API
#  endif
#endif

#include <Python.h>
#ifdef __cplusplus
extern "C" {
#endif
#include <stddef.h>

/* This part is from file 'cffi/parse_c_type.h'.  It is copied at the
   beginning of C sources generated by CFFI's ffi.set_source(). */

typedef void *_cffi_opcode_t;

#define _CFFI_OP(opcode, arg)   (_cffi_opcode_t)(opcode | (((uintptr_t)(arg)) << 8))
#define _CFFI_GETOP(cffi_opcode)    ((unsigned char)(uintptr_t)cffi_opcode)
#define _CFFI_GETARG(cffi_opcode)   (((intptr_t)cffi_opcode) >> 8)

#define _CFFI_OP_PRIMITIVE       1
#define _CFFI_OP_POINTER         3
#define _CFFI_OP_ARRAY           5
#define _CFFI_OP_OPEN_ARRAY      7
#define _CFFI_OP_STRUCT_UNION    9
#define _CFFI_OP_ENUM           11
#define _CFFI_OP_FUNCTION       13
#define _CFFI_OP_FUNCTION_END   15
#define _CFFI_OP_NOOP           17
#define _CFFI_OP_BITFIELD       19
#define _CFFI_OP_TYPENAME       21
#define _CFFI_OP_CPYTHON_BLTN_V 23   // varargs
#define _CFFI_OP_CPYTHON_BLTN_N 25   // noargs
#define _CFFI_OP_CPYTHON_BLTN_O 27   // O  (i.e. a single arg)
#define _CFFI_OP_CONSTANT       29
#define _CFFI_OP_CONSTANT_INT   31
#define _CFFI_OP_GLOBAL_VAR     33
#define _CFFI_OP_DLOPEN_FUNC    35
#define _CFFI_OP_DLOPEN_CONST   37
#define _CFFI_OP_GLOBAL_VAR_F   39
#define _CFFI_OP_EXTERN_PYTHON  41

#define _CFFI_PRIM_VOID          0
#define _CFFI_PRIM_BOOL          1
#define _CFFI_PRIM_CHAR          2
#define _CFFI_PRIM_SCHAR         3
#define _CFFI_PRIM_UCHAR         4
#define _CFFI_PRIM_SHORT         5
#define _CFFI_PRIM_USHORT        6
#define _CFFI_PRIM_INT           7
#define _CFFI_PRIM_UINT          8
#define _CFFI_PRIM_LONG          9
#define _CFFI_PRIM_ULONG        10
#define _CFFI_PRIM_LONGLONG     11
#define _CFFI_PRIM_ULONGLONG    12
#define _CFFI_PRIM_FLOAT        13
#define _CFFI_PRIM_DOUBLE       14
#define _CFFI_PRIM_LONGDOUBLE   15

#define _CFFI_PRIM_WCHAR        16
#define _CFFI_PRIM_INT8         17
#define _CFFI_PRIM_UINT8        18
#define _CFFI_PRIM_INT16        19
#define _CFFI_PRIM_UINT16       20
#define _CFFI_PRIM_INT32        21
#define _CFFI_PRIM_UINT32       22
#define _CFFI_PRIM_INT64        23
#define _CFFI_PRIM_UINT64       24
#define _CFFI_PRIM_INTPTR       25
#define _CFFI_PRIM_UINTPTR      26
#define _CFFI_PRIM_PTRDIFF      27
#define _CFFI_PRIM_SIZE         28
#define _CFFI_PRIM_SSIZE        29
#define _CFFI_PRIM_INT_LEAST8   30
#define _CFFI_PRIM_UINT_LEAST8  31
#define _CFFI_PRIM_INT_LEAST16  32
#define _CFFI_PRIM_UINT_LEAST16 33
#define _CFFI_PRIM_INT_LEAST32  34
#define _CFFI_PRIM_UINT_LEAST32 35
#define _CFFI_PRIM_INT_LEAST64  36
#define _CFFI_PRIM_UINT_LEAST64 37
#define _CFFI_PRIM_INT_FAST8    38
#define _CFFI_PRIM_UINT_FAST8   39
#define _CFFI_PRIM_INT_FAST16   40
#define _CFFI_PRIM_UINT_FAST16  41
#define _CFFI_PRIM_INT_FAST32   42
#define _CFFI_PRIM_UINT_FAST32  43
#define _CFFI_PRIM_INT_FAST64   44
#define _CFFI_PRIM_UINT_FAST64  45
#define _CFFI_PRIM_INTMAX       46
#define _CFFI_PRIM_UINTMAX      47
#define _CFFI_PRIM_FLOATCOMPLEX 48
#define _CFFI_PRIM_DOUBLECOMPLEX 49
#define _CFFI_PRIM_CHAR16       50
#define _CFFI_PRIM_CHAR32       51

#define _CFFI__NUM_PRIM         52
#define _CFFI__UNKNOWN_PRIM           (-1)
#define _CFFI__UNKNOWN_FLOAT_PRIM     (-2)
#define _CFFI__UNKNOWN_LONG_DOUBLE    (-3)

#define _CFFI__IO_FILE_STRUCT         (-1)


struct _cffi_global_s {
    const char *name;
    void *address;
    _cffi_opcode_t type_op;
    void *size_or_direct_fn;  // OP_GLOBAL_VAR: size, or 0 if unknown
                              // OP_CPYTHON_BLTN_*: addr of direct function
};

struct _cffi_getconst_s {
    unsigned long long value;
    const struct _cffi_type_context_s *ctx;
    int gindex;
};

struct _cffi_struct_union_s {
    const char *name;
    int type_index;          // -> _cffi_types, on a OP_STRUCT_UNION
    int flags;               // _CFFI_F_* flags below
    size_t size;
    int alignment;
    int first_field_index;   // -> _cffi_fields array
    int num_fields;
};
#define _CFFI_F_UNION         0x01   // is a union, not a struct
#define _CFFI_F_CHECK_FIELDS  0x02   // complain if fields are not in the
                                     // "standard layout" or if some are missing
#define _CFFI_F_PACKED        0x04   // for CHECK_FIELDS, assume a packed struct
#define _CFFI_F_EXTERNAL      0x08   // in some other ffi.include()
#define _CFFI_F_OPAQUE        0x10   // opaque

struct _cffi_field_s {
    const char *name;
    size_t field_offset;
    size_t field_size;
    _cffi_opcode_t field_type_op;
};

struct _cffi_enum_s {
    const char *name;
    int type_index;          // -> _cffi_types, on a OP_ENUM
    int type_prim;           // _CFFI_PRIM_xxx
    const char *enumerators; // comma-delimited string
};

struct _cffi_typename_s {
    const char *name;
    int type_index;   /* if opaque, points to a possibly artificial
                         OP_STRUCT which is itself opaque */
};

struct _cffi_type_context_s {
    _cffi_opcode_t *types;
    const struct _cffi_global_s *globals;
    const struct _cffi_field_s *fields;
    const struct _cffi_struct_union_s *struct_unions;
    const struct _cffi_enum_s *enums;
    const struct _cffi_typename_s *typenames;
    int num_globals;
    int num_struct_unions;
    int num_enums;
    int num_typenames;
    const char *const *includes;
    int num_types;
    int flags;      /* future extension */
};

struct _cffi_parse_info_s {
    const struct _cffi_type_context_s *ctx;
    _cffi_opcode_t *output;
    unsigned int output_size;
    size_t error_location;
    const char *error_message;
};

struct _cffi_externpy_s {
    const char *name;
    size_t size_of_result;
    void *reserved1, *reserved2;
};

#ifdef _CFFI_INTERNAL
static int parse_c_type(struct _cffi_parse_info_s *info, const char *input);
static int search_in_globals(const struct _cffi_type_context_s *ctx,
                             const char *search, size_t search_len);
static int search_in_struct_unions(const struct _cffi_type_context_s *ctx,
                                   const char *search, size_t search_len);
#endif

/* this block of #ifs should be kept exactly identical between
   c/_cffi_backend.c, cffi/vengine_cpy.py, cffi/vengine_gen.py
   and cffi/_cffi_include.h */
#if defined(_MSC_VER)
# include <malloc.h>   /* for alloca() */
# if _MSC_VER < 1600   /* MSVC < 2010 */
   typedef __int8 int8_t;
   typedef __int16 int16_t;
   typedef __int32 int32_t;
   typedef __int64 int64_t;
   typedef unsigned __int8 uint8_t;
   typedef unsigned __int16 uint16_t;
   typedef unsigned __int32 uint32_t;
   typedef unsigned __int64 uint64_t;
   typedef __int8 int_least8_t;
   typedef __int16 int_least16_t;
   typedef __int32 int_least32_t;
   typedef __int64 int_least64_t;
   typedef unsigned __int8 uint_least8_t;
   typedef unsigned __int16 uint_least16_t;
   typedef unsigned __int32 uint_least32_t;
   typedef unsigned __int64 uint_least64_t;
   typedef __int8 int_fast8_t;
   typedef __int16 int_fast16_t;
   typedef __int32 int_fast32_t;
   typedef __int64 int_fast64_t;
   typedef unsigned __int8 uint_fast8_t;
   typedef unsigned __int16 uint_fast16_t;
   typedef unsigned __int32 uint_fast32_t;
   typedef unsigned __int64 uint_fast64_t;
   typedef __int64 intmax_t;
   typedef unsigned __int64 uintmax_t;
# else
#  include <stdint.h>
# endif
# if _MSC_VER < 1800   /* MSVC < 2013 */
#  ifndef __cplusplus
    typedef unsigned char _Bool;
#  endif
# endif
#else
# include <stdint.h>
# if (defined (__SVR4) && defined (__sun)) || defined(_AIX) || defined(__hpux)
#  include <alloca.h>
# endif
#endif

#ifdef __GNUC__
# define _CFFI_UNUSED_FN  __attribute__((unused))
#else
# define _CFFI_UNUSED_FN  /* nothing */
#endif

#ifdef __cplusplus
# ifndef _Bool
   typedef bool _Bool;   /* semi-hackish: C++ has no _Bool; bool is builtin */
# endif
#endif

/**********  CPython-specific section  **********/
#ifndef PYPY_VERSION


#if PY_MAJOR_VERSION >= 3
# define PyInt_FromLong PyLong_FromLong
#endif

#define _cffi_from_c_double PyFloat_FromDouble
#define _cffi_from_c_float PyFloat_FromDouble
#define _cffi_from_c_long PyInt_FromLong
#define _cffi_from_c_ulong PyLong_FromUnsignedLong
#define _cffi_from_c_longlong PyLong_FromLongLong
#define _cffi_from_c_ulonglong PyLong_FromUnsignedLongLong
#define _cffi_from_c__Bool PyBool_FromLong

#define _cffi_to_c_double PyFloat_AsDouble
#define _cffi_to_c_float PyFloat_AsDouble

#define _cffi_from_c_int(x, type)                                        \
    (((type)-1) > 0 ? /* unsigned */                                     \
        (sizeof(type) < sizeof(long) ?                                   \
            PyInt_FromLong((long)x) :                                    \
         sizeof(type) == sizeof(long) ?                                  \
            PyLong_FromUnsignedLong((unsigned long)x) :                  \
            PyLong_FromUnsignedLongLong((unsigned long long)x)) :        \
        (sizeof(type) <= sizeof(long) ?                                  \
            PyInt_FromLong((long)x) :                                    \
            PyLong_FromLongLong((long long)x)))

#define _cffi_to_c_int(o, type)                                          \
    ((type)(                                                             \
     sizeof(type) == 1 ? (((type)-1) > 0 ? (type)_cffi_to_c_u8(o)        \
                                         : (type)_cffi_to_c_i8(o)) :     \
     sizeof(type) == 2 ? (((type)-1) > 0 ? (type)_cffi_to_c_u16(o)       \
                                         : (type)_cffi_to_c_i16(o)) :    \
     sizeof(type) == 4 ? (((type)-1) > 0 ? (type)_cffi_to_c_u32(o)       \
                                         : (type)_cffi_to_c_i32(o)) :    \
     sizeof(type) == 8 ? (((type)-1) > 0 ? (type)_cffi_to_c_u64(o)       \
                                         : (type)_cffi_to_c_i64(o)) :    \
     (Py_FatalError("unsupported size for type " #type), (type)0)))

#define _cffi_to_c_i8                                                    \
                 ((int(*)(PyObject *))_cffi_exports[1])
#define _cffi_to_c_u8                                                    \
                 ((int(*)(PyObject *))_cffi_exports[2])
#define _cffi_to_c_i16                                                   \
                 ((int(*)(PyObject *))_cffi_exports[3])
#define _cffi_to_c_u16                                                   \
                 ((int(*)(PyObject *))_cffi_exports[4])
#define _cffi_to_c_i32                                                   \
                 ((int(*)(PyObject *))_cffi_exports[5])
#define _cffi_to_c_u32                                                   \
                 ((unsigned int(*)(PyObject *))_cffi_exports[6])
#define _cffi_to_c_i64                                                   \
                 ((long long(*)(PyObject *))_cffi_exports[7])
#define _cffi_to_c_u64                                                   \
                 ((unsigned long long(*)(PyObject *))_cffi_exports[8])
#define _cffi_to_c_char                                                  \
                 ((int(*)(PyObject *))_cffi_exports[9])
#define _cffi_from_c_pointer                                             \
    ((PyObject *(*)(char *, struct _cffi_ctypedescr *))_cffi_exports[10])
#define _cffi_to_c_pointer                                               \
    ((char *(*)(PyObject *, struct _cffi_ctypedescr *))_cffi_exports[11])
#define _cffi_get_struct_layout                                          \
    not used any more
#define _cffi_restore_errno                                              \
    ((void(*)(void))_cffi_exports[13])
#define _cffi_save_errno                                                 \
    ((void(*)(void))_cffi_exports[14])
#define _cffi_from_c_char                                                \
    ((PyObject *(*)(char))_cffi_exports[15])
#define _cffi_from_c_deref                                               \
    ((PyObject *(*)(char *, struct _cffi_ctypedescr *))_cffi_exports[16])
#define _cffi_to_c                                                       \
    ((int(*)(char *, struct _cffi_ctypedescr *, PyObject *))_cffi_exports[17])
#define _cffi_from_c_struct                                              \
    ((PyObject *(*)(char *, struct _cffi_ctypedescr *))_cffi_exports[18])
#define _cffi_to_c_wchar_t                                               \
    ((_cffi_wchar_t(*)(PyObject *))_cffi_exports[19])
#define _cffi_from_c_wchar_t                                             \
    ((PyObject *(*)(_cffi_wchar_t))_cffi_exports[20])
#define _cffi_to_c_long_double                                           \
    ((long double(*)(PyObject *))_cffi_exports[21])
#define _cffi_to_c__Bool                                                 \
    ((_Bool(*)(PyObject *))_cffi_exports[22])
#define _cffi_prepare_pointer_call_argument                              \
    ((Py_ssize_t(*)(struct _cffi_ctypedescr *,                           \
                    PyObject *, char **))_cffi_exports[23])
#define _cffi_convert_array_from_object                                  \
    ((int(*)(char *, struct _cffi_ctypedescr *, PyObject *))_cffi_exports[24])
#define _CFFI_CPIDX  25
#define _cffi_call_python                                                \
    ((void(*)(struct _cffi_externpy_s *, char *))_cffi_exports[_CFFI_CPIDX])
#define _cffi_to_c_wchar3216_t                                           \
    ((int(*)(PyObject *))_cffi_exports[26])
#define _cffi_from_c_wchar3216_t                                         \
    ((PyObject *(*)(int))_cffi_exports[27])
#define _CFFI_NUM_EXPORTS 28

struct _cffi_ctypedescr;

static void *_cffi_exports[_CFFI_NUM_EXPORTS];

#define _cffi_type(index)   (                           \
    assert((((uintptr_t)_cffi_types[index]) & 1) == 0), \
    (struct _cffi_ctypedescr *)_cffi_types[index])

static PyObject *_cffi_init(const char *module_name, Py_ssize_t version,
                            const struct _cffi_type_context_s *ctx)
{
    PyObject *module, *o_arg, *new_module;
    void *raw[] = {
        (void *)module_name,
        (void *)version,
        (void *)_cffi_exports,
        (void *)ctx,
    };

    module = PyImport_ImportModule("_cffi_backend");
    if (module == NULL)
        goto failure;

    o_arg = PyLong_FromVoidPtr((void *)raw);
    if (o_arg == NULL)
        goto failure;

    new_module = PyObject_CallMethod(
        module, (char *)"_init_cffi_1_0_external_module", (char *)"O", o_arg);

    Py_DECREF(o_arg);
    Py_DECREF(module);
    return new_module;

  failure:
    Py_XDECREF(module);
    return NULL;
}


#ifdef HAVE_WCHAR_H
typedef wchar_t _cffi_wchar_t;
#else
typedef uint16_t _cffi_wchar_t;   /* same random pick as _cffi_backend.c */
#endif

_CFFI_UNUSED_FN static uint16_t _cffi_to_c_char16_t(PyObject *o)
{
    if (sizeof(_cffi_wchar_t) == 2)
        return (uint16_t)_cffi_to_c_wchar_t(o);
    else
        return (uint16_t)_cffi_to_c_wchar3216_t(o);
}

_CFFI_UNUSED_FN static PyObject *_cffi_from_c_char16_t(uint16_t x)
{
    if (sizeof(_cffi_wchar_t) == 2)
        return _cffi_from_c_wchar_t((_cffi_wchar_t)x);
    else
        return _cffi_from_c_wchar3216_t((int)x);
}

_CFFI_UNUSED_FN static int _cffi_to_c_char32_t(PyObject *o)
{
    if (sizeof(_cffi_wchar_t) == 4)
        return (int)_cffi_to_c_wchar_t(o);
    else
        return (int)_cffi_to_c_wchar3216_t(o);
}

_CFFI_UNUSED_FN static PyObject *_cffi_from_c_char32_t(int x)
{
    if (sizeof(_cffi_wchar_t) == 4)
        return _cffi_from_c_wchar_t((_cffi_wchar_t)x);
    else
        return _cffi_from_c_wchar3216_t(x);
}


/**********  end CPython-specific section  **********/
#else
_CFFI_UNUSED_FN
static void (*_cffi_call_python_org)(struct _cffi_externpy_s *, char *);
# define _cffi_call_python  _cffi_call_python_org
#endif


#define _cffi_array_len(array)   (sizeof(array) / sizeof((array)[0]))

#define _cffi_prim_int(size, sign)                                      \
    ((size) == 1 ? ((sign) ? _CFFI_PRIM_INT8  : _CFFI_PRIM_UINT8)  :    \
     (size) == 2 ? ((sign) ? _CFFI_PRIM_INT16 : _CFFI_PRIM_UINT16) :    \
     (size) == 4 ? ((sign) ? _CFFI_PRIM_INT32 : _CFFI_PRIM_UINT32) :    \
     (size) == 8 ? ((sign) ? _CFFI_PRIM_INT64 : _CFFI_PRIM_UINT64) :    \
     _CFFI__UNKNOWN_PRIM)

#define _cffi_prim_float(size)                                          \
    ((size) == sizeof(float) ? _CFFI_PRIM_FLOAT :                       \
     (size) == sizeof(double) ? _CFFI_PRIM_DOUBLE :                     \
     (size) == sizeof(long double) ? _CFFI__UNKNOWN_LONG_DOUBLE :       \
     _CFFI__UNKNOWN_FLOAT_PRIM)

#define _cffi_check_int(got, got_nonpos, expected)      \
    ((got_nonpos) == (expected <= 0) &&                 \
     (got) == (unsigned long long)expected)

#ifdef MS_WIN32
# define _cffi_stdcall  __stdcall
#else
# define _cffi_stdcall  /* nothing */
#endif

#ifdef __cplusplus
}
#endif

/************************************************************/


    #include "src/ext.h"
    

/************************************************************/

static void *_cffi_types[] = {
/*  0 */ _CFFI_OP(_CFFI_OP_FUNCTION, 11), // void()(float *, float *, int *, int, int, int, int, int)
/*  1 */ _CFFI_OP(_CFFI_OP_POINTER, 10), // float *
/*  2 */ _CFFI_OP(_CFFI_OP_NOOP, 1),
/*  3 */ _CFFI_OP(_CFFI_OP_POINTER, 4), // int *
/*  4 */ _CFFI_OP(_CFFI_OP_PRIMITIVE, 7), // int
/*  5 */ _CFFI_OP(_CFFI_OP_PRIMITIVE, 7),
/*  6 */ _CFFI_OP(_CFFI_OP_PRIMITIVE, 7),
/*  7 */ _CFFI_OP(_CFFI_OP_PRIMITIVE, 7),
/*  8 */ _CFFI_OP(_CFFI_OP_PRIMITIVE, 7),
/*  9 */ _CFFI_OP(_CFFI_OP_FUNCTION_END, 0),
/* 10 */ _CFFI_OP(_CFFI_OP_PRIMITIVE, 13), // float
/* 11 */ _CFFI_OP(_CFFI_OP_PRIMITIVE, 0), // void
};

static void _cffi_d_findNearestPointIdxLauncher(float * x0, float * x1, int * x2, int x3, int x4, int x5, int x6, int x7)
{
  findNearestPointIdxLauncher(x0, x1, x2, x3, x4, x5, x6, x7);
}
#ifndef PYPY_VERSION
static PyObject *
_cffi_f_findNearestPointIdxLauncher(PyObject *self, PyObject *args)
{
  float * x0;
  float * x1;
  int * x2;
  int x3;
  int x4;
  int x5;
  int x6;
  int x7;
  Py_ssize_t datasize;
  PyObject *arg0;
  PyObject *arg1;
  PyObject *arg2;
  PyObject *arg3;
  PyObject *arg4;
  PyObject *arg5;
  PyObject *arg6;
  PyObject *arg7;

  if (!PyArg_UnpackTuple(args, "findNearestPointIdxLauncher", 8, 8, &arg0, &arg1, &arg2, &arg3, &arg4, &arg5, &arg6, &arg7))
    return NULL;

  datasize = _cffi_prepare_pointer_call_argument(
      _cffi_type(1), arg0, (char **)&x0);
  if (datasize != 0) {
    if (datasize < 0)
      return NULL;
    x0 = (float *)alloca((size_t)datasize);
    memset((void *)x0, 0, (size_t)datasize);
    if (_cffi_convert_array_from_object((char *)x0, _cffi_type(1), arg0) < 0)
      return NULL;
  }

  datasize = _cffi_prepare_pointer_call_argument(
      _cffi_type(1), arg1, (char **)&x1);
  if (datasize != 0) {
    if (datasize < 0)
      return NULL;
    x1 = (float *)alloca((size_t)datasize);
    memset((void *)x1, 0, (size_t)datasize);
    if (_cffi_convert_array_from_object((char *)x1, _cffi_type(1), arg1) < 0)
      return NULL;
  }

  datasize = _cffi_prepare_pointer_call_argument(
      _cffi_type(3), arg2, (char **)&x2);
  if (datasize != 0) {
    if (datasize < 0)
      return NULL;
    x2 = (int *)alloca((size_t)datasize);
    memset((void *)x2, 0, (size_t)datasize);
    if (_cffi_convert_array_from_object((char *)x2, _cffi_type(3), arg2) < 0)
      return NULL;
  }

  x3 = _cffi_to_c_int(arg3, int);
  if (x3 == (int)-1 && PyErr_Occurred())
    return NULL;

  x4 = _cffi_to_c_int(arg4, int);
  if (x4 == (int)-1 && PyErr_Occurred())
    return NULL;

  x5 = _cffi_to_c_int(arg5, int);
  if (x5 == (int)-1 && PyErr_Occurred())
    return NULL;

  x6 = _cffi_to_c_int(arg6, int);
  if (x6 == (int)-1 && PyErr_Occurred())
    return NULL;

  x7 = _cffi_to_c_int(arg7, int);
  if (x7 == (int)-1 && PyErr_Occurred())
    return NULL;

  Py_BEGIN_ALLOW_THREADS
  _cffi_restore_errno();
  { findNearestPointIdxLauncher(x0, x1, x2, x3, x4, x5, x6, x7); }
  _cffi_save_errno();
  Py_END_ALLOW_THREADS

  (void)self; /* unused */
  Py_INCREF(Py_None);
  return Py_None;
}
#else
#  define _cffi_f_findNearestPointIdxLauncher _cffi_d_findNearestPointIdxLauncher
#endif

static const struct _cffi_global_s _cffi_globals[] = {
  { "findNearestPointIdxLauncher", (void *)_cffi_f_findNearestPointIdxLauncher, _CFFI_OP(_CFFI_OP_CPYTHON_BLTN_V, 0), (void *)_cffi_d_findNearestPointIdxLauncher },
};

static const struct _cffi_type_context_s _cffi_type_context = {
  _cffi_types,
  _cffi_globals,
  NULL,  /* no fields */
  NULL,  /* no struct_unions */
  NULL,  /* no enums */
  NULL,  /* no typenames */
  1,  /* num_globals */
  0,  /* num_struct_unions */
  0,  /* num_enums */
  0,  /* num_typenames */
  NULL,  /* no includes */
  12,  /* num_types */
  0,  /* flags */
};

#ifdef __GNUC__
#  pragma GCC visibility push(default)  /* for -fvisibility= */
#endif

#ifdef PYPY_VERSION
PyMODINIT_FUNC
_cffi_pypyinit__ext(const void *p[])
{
    p[0] = (const void *)0x2601;
    p[1] = &_cffi_type_context;
#if PY_MAJOR_VERSION >= 3
    return NULL;
#endif
}
#  ifdef _MSC_VER
     PyMODINIT_FUNC
#  if PY_MAJOR_VERSION >= 3
     PyInit__ext(void) { return NULL; }
#  else
     init_ext(void) { }
#  endif
#  endif
#elif PY_MAJOR_VERSION >= 3
PyMODINIT_FUNC
PyInit__ext(void)
{
  return _cffi_init("_ext", 0x2601, &_cffi_type_context);
}
#else
PyMODINIT_FUNC
init_ext(void)
{
  _cffi_init("_ext", 0x2601, &_cffi_type_context);
}
#endif

#ifdef __GNUC__
#  pragma GCC visibility pop
#endif


================================================
FILE: thirdparty/nn/nn_utils.py
================================================
# from lib.csrc.nn._ext import lib, ffi
from thirdparty.nn._ext import lib, ffi
import numpy as np


def find_nearest_point_idx(ref_pts, que_pts):
    assert(ref_pts.shape[1] == que_pts.shape[1] and 1 < que_pts.shape[1] <= 3)
    pn1 = ref_pts.shape[0]
    pn2 = que_pts.shape[0]
    dim = ref_pts.shape[1]

    ref_pts = np.ascontiguousarray(ref_pts[None,:,:], np.float32)
    que_pts = np.ascontiguousarray(que_pts[None,:,:], np.float32)
    idxs = np.zeros([1, pn2], np.int32)

    ref_pts_ptr = ffi.cast('float *', ref_pts.ctypes.data)
    que_pts_ptr = ffi.cast('float *', que_pts.ctypes.data)
    idxs_ptr = ffi.cast('int *', idxs.ctypes.data)
    lib.findNearestPointIdxLauncher(ref_pts_ptr, que_pts_ptr, idxs_ptr, 1, pn1, pn2, dim, 0)

    return idxs[0]


================================================
FILE: thirdparty/nn/setup.py
================================================
import os

cuda_include=os.path.join(os.environ.get('CUDA_HOME'), 'include')
os.system('nvcc src/nearest_neighborhood.cu -c -o src/nearest_neighborhood.cu.o -x cu -Xcompiler -fPIC -O2 -arch=sm_52 -I {}'.format(cuda_include))

from cffi import FFI
ffibuilder = FFI()


with open(os.path.join(os.path.dirname(__file__), "src/ext.h")) as f:
    ffibuilder.cdef(f.read())

ffibuilder.set_source(
    "_ext",
    """
    #include "src/ext.h"
    """,
    extra_objects=['src/nearest_neighborhood.cu.o',
                   os.path.join(os.environ.get('CUDA_HOME'),'lib64/libcudart.so')],
    libraries=['stdc++']
)


if __name__ == "__main__":
    ffibuilder.compile(verbose=True)
    os.system("rm src/*.o")
    os.system("rm *.o")


================================================
FILE: thirdparty/nn/src/ext.h
================================================
void findNearestPointIdxLauncher(
    float* ref_pts,   // [b,pn1,dim]
    float* que_pts,   // [b,pn2,dim]
    int* idxs,        // [b,pn2]
    int b,
    int pn1,
    int pn2,
    int dim,
    int exclude_self
);


================================================
FILE: thirdparty/nn/src/nearest_neighborhood.cu
================================================
#include <float.h>
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <stdio.h>

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

int infTwoExp(int val)
{
    int inf=1;
    while(val>inf) inf<<=1;
    return inf;
}

void getGPULayout(
        int dim0,int dim1,int dim2,
        int* bdim0,int* bdim1,int* bdim2,
        int* tdim0,int* tdim1,int* tdim2
)
{
    (*tdim2)=64;
    if(dim2<(*tdim2)) (*tdim2)=infTwoExp(dim2);
    (*bdim2)=dim2/(*tdim2);
    if(dim2%(*tdim2)>0) (*bdim2)++;

    (*tdim1)=1024/(*tdim2);
    if(dim1<(*tdim1)) (*tdim1)=infTwoExp(dim1);
    (*bdim1)=dim1/(*tdim1);
    if(dim1%(*tdim1)>0) (*bdim1)++;

    (*tdim0)=1024/((*tdim1)*(*tdim2));
    if(dim0<(*tdim0)) (*tdim0)=infTwoExp(dim0);
    (*bdim0)=dim0/(*tdim0);
    if(dim0%(*tdim0)>0) (*bdim0)++;
}

__global__
void findNearestPoint3DIdxKernel(
    float* ref_pts,   // [b,pn1,3]
    float* que_pts,   // [b,pn2,3]
    int* idxs,        // [b,pn2]
    int b,
    int pn1,
    int pn2,
    int exclude_self
)
{
    int bi = threadIdx.x + blockIdx.x*blockDim.x;
    int p2i = threadIdx.y + blockIdx.y*blockDim.y;
    if(p2i>=pn2||bi>=b) return;

    float x2=que_pts[bi*pn2*3+p2i*3];
    float y2=que_pts[bi*pn2*3+p2i*3+1];
    float z2=que_pts[bi*pn2*3+p2i*3+2];
    float min_dist=FLT_MAX;
    int min_idx=0;
    for(int p1i=0;p1i<pn1;p1i++)
    {
        if(exclude_self&&p1i==p2i) continue;
        float x1=ref_pts[bi*pn1*3+p1i*3];
        float y1=ref_pts[bi*pn1*3+p1i*3+1];
        float z1=ref_pts[bi*pn1*3+p1i*3+2];

        float dist=(x1-x2)*(x1-x2)+(y1-y2)*(y1-y2)+(z1-z2)*(z1-z2);
        if(dist<min_dist)
        {
            min_dist=dist;
            min_idx=p1i;
        }
    }
    idxs[bi*pn2+p2i]=min_idx;
}
__global__
void findNearestPoint2DIdxKernel(
    float* ref_pts,   // [b,pn1,2]
    float* que_pts,   // [b,pn2,2]
    int* idxs,        // [b,pn2]
    int b,
    int pn1,
    int pn2,
    int exclude_self
)
{
    int bi = threadIdx.x + blockIdx.x*blockDim.x;
    int p2i = threadIdx.y + blockIdx.y*blockDim.y;
    if(p2i>=pn2||bi>=b) return;

    float x2=que_pts[bi*pn2*2+p2i*2];
    float y2=que_pts[bi*pn2*2+p2i*2+1];
    float min_dist=FLT_MAX;
    int min_idx=0;
    for(int p1i=0;p1i<pn1;p1i++)
    {
        if(exclude_self&&p1i==p2i) continue;
        float x1=ref_pts[bi*pn1*2+p1i*2];
        float y1=ref_pts[bi*pn1*2+p1i*2+1];

        float dist=(x1-x2)*(x1-x2)+(y1-y2)*(y1-y2);
        if(dist<min_dist)
        {
            min_dist=dist;
            min_idx=p1i;
        }
    }
    idxs[bi*pn2+p2i]=min_idx;
}

#ifdef __cplusplus
extern "C" {
#endif

void findNearestPointIdxLauncher(
    float* ref_pts,   // [b,pn1,dim]
    float* que_pts,   // [b,pn2,dim]
    int* idxs,        // [b,pn2]
    int b,
    int pn1,
    int pn2,
    int dim,
    int exclude_self
)
{
    float* ref_pts_dev,* que_pts_dev;
    int* idxs_dev;
    gpuErrchk(cudaMalloc(&ref_pts_dev,b*pn1*sizeof(float)*dim))
    gpuErrchk(cudaMalloc(&que_pts_dev,b*pn2*sizeof(float)*dim))
    gpuErrchk(cudaMalloc(&idxs_dev,b*pn2*sizeof(int)))

    gpuErrchk(cudaMemcpy(ref_pts_dev,ref_pts,b*pn1*sizeof(float)*dim,cudaMemcpyHostToDevice))
    gpuErrchk(cudaMemcpy(que_pts_dev,que_pts,b*pn2*sizeof(float)*dim,cudaMemcpyHostToDevice))
    gpuErrchk(cudaMemcpy(idxs_dev,idxs,b*pn2*sizeof(int),cudaMemcpyHostToDevice))

    int bdim0,bdim1,bdim2;
    int tdim0,tdim1,tdim2;

    getGPULayout(b,pn2,1,&bdim0,&bdim1,&bdim2,&tdim0,&tdim1,&tdim2);

    dim3 bdim(bdim0,bdim1,bdim2);
    dim3 tdim(tdim0,tdim1,tdim2);

    if(dim==3)
        findNearestPoint3DIdxKernel<<<bdim,tdim>>>(ref_pts_dev,que_pts_dev,idxs_dev,b,pn1,pn2,exclude_self);
    else
        findNearestPoint2DIdxKernel<<<bdim,tdim>>>(ref_pts_dev,que_pts_dev,idxs_dev,b,pn1,pn2,exclude_self);
    gpuErrchk(cudaGetLastError())

    gpuErrchk(cudaMemcpy(idxs,idxs_dev,b*pn2*sizeof(int),cudaMemcpyDeviceToHost))
    gpuErrchk(cudaFree(ref_pts_dev))
    gpuErrchk(cudaFree(que_pts_dev))
    gpuErrchk(cudaFree(idxs_dev))

}

#ifdef __cplusplus
}
#endif


================================================
FILE: thirdparty/raft/corr.py
================================================
import torch
import torch.nn.functional as F
from .utils.utils import bilinear_sampler, coords_grid

try:
    import alt_cuda_corr
except:
    # alt_cuda_corr is not compiled
    pass


class CorrBlock:
    def __init__(self, fmap1, fmap2, num_levels=4, radius=4, downsample_rate=1):
        self.num_levels = num_levels
        self.radius = radius
        self.corr_pyramid = []

        # all pairs correlation
        corr = CorrBlock.corr(fmap1, fmap2)
        if downsample_rate>1:
            batch, h1, w1, dim, h2, w2 = corr.shape
            corr=torch.nn.MaxPool2d(corr.reshape(batch, -1, dim, h2,w2), downsample_rate, stride=downsample_rate)
            corr=torch.nn.MaxPool2d(corr.reshape(batch, h1,w1, dim, -1).permute(0,4,3,1,2), downsample_rate, stride=downsample_rate)
            corr = corr.permute(0,3,4,3,1).reshape(batch,h1//downsample_rate, w1//downsample_rate, dim, h2//downsample_rate, w2//downsample_rate)


        batch, h1, w1, dim, h2, w2 = corr.shape
        corr = corr.reshape(batch*h1*w1, dim, h2, w2)
        
        self.corr_pyramid.append(corr)
        for i in range(self.num_levels-1):
            corr = F.avg_pool2d(corr, 2, stride=2)
            self.corr_pyramid.append(corr)

    def __call__(self, coords):
        r = self.radius
        coords = coords.permute(0, 2, 3, 1)
        batch, h1, w1, _ = coords.shape

        out_pyramid = []
        for i in range(self.num_levels):
            corr = self.corr_pyramid[i]
            dx = torch.linspace(-r, r, 2*r+1)
            dy = torch.linspace(-r, r, 2*r+1)
            delta = torch.stack(torch.meshgrid(dy, dx), axis=-1).to(coords.device)

            centroid_lvl = coords.reshape(batch*h1*w1, 1, 1, 2) / 2**i
            delta_lvl = delta.view(1, 2*r+1, 2*r+1, 2)
            coords_lvl = centroid_lvl + delta_lvl

            corr = bilinear_sampler(corr, coords_lvl)
            corr = corr.view(batch, h1, w1, -1)
            out_pyramid.append(corr)

        out = torch.cat(out_pyramid, dim=-1)
        return out.permute(0, 3, 1, 2).contiguous().float()

    @staticmethod
    def corr(fmap1, fmap2):
        batch, dim, ht, wd = fmap1.shape
        fmap1 = fmap1.view(batch, dim, ht*wd)
        fmap2 = fmap2.view(batch, dim, ht*wd) 
        
        corr = torch.matmul(fmap1.transpose(1,2), fmap2)
        corr = corr.view(batch, ht, wd, 1, ht, wd)
        return corr  / torch.sqrt(torch.tensor(dim).float())


class AlternateCorrBlock:
    def __init__(self, fmap1, fmap2, num_levels=4, radius=4):
        self.num_levels = num_levels
        self.radius = radius

        self.pyramid = [(fmap1, fmap2)]
        for i in range(self.num_levels):
            fmap1 = F.avg_pool2d(fmap1, 2, stride=2)
            fmap2 = F.avg_pool2d(fmap2, 2, stride=2)
            self.pyramid.append((fmap1, fmap2))

    def __call__(self, coords):
        coords = coords.permute(0, 2, 3, 1)
        B, H, W, _ = coords.shape
        dim = self.pyramid[0][0].shape[1]

        corr_list = []
        for i in range(self.num_levels):
            r = self.radius
            fmap1_i = self.pyramid[0][0].permute(0, 2, 3, 1).contiguous()
            fmap2_i = self.pyramid[i][1].permute(0, 2, 3, 1).contiguous()

            coords_i = (coords / 2**i).reshape(B, 1, H, W, 2).contiguous()
            corr, = alt_cuda_corr.forward(fmap1_i, fmap2_i, coords_i, r)
            corr_list.append(corr.squeeze(1))

        corr = torch.stack(corr_list, dim=1)
        corr = corr.reshape(B, -1, H, W)
        return corr / torch.sqrt(torch.tensor(dim).float())


================================================
FILE: thirdparty/raft/extractor.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F


class ResidualBlock(nn.Module):
    def __init__(self, in_planes, planes, norm_fn='group', stride=1):
        super(ResidualBlock, self).__init__()
  
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, padding=1, stride=stride)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1)
        self.relu = nn.ReLU(inplace=True)

        num_groups = planes // 8

        if norm_fn == 'group':
            self.norm1 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
            self.norm2 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
            if not stride == 1:
                self.norm3 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
        
        elif norm_fn == 'batch':
            self.norm1 = nn.BatchNorm2d(planes)
            self.norm2 = nn.BatchNorm2d(planes)
            if not stride == 1:
                self.norm3 = nn.BatchNorm2d(planes)
        
        elif norm_fn == 'instance':
            self.norm1 = nn.InstanceNorm2d(planes)
            self.norm2 = nn.InstanceNorm2d(planes)
            if not stride == 1:
                self.norm3 = nn.InstanceNorm2d(planes)

        elif norm_fn == 'none':
            self.norm1 = nn.Sequential()
            self.norm2 = nn.Sequential()
            if not stride == 1:
                self.norm3 = nn.Sequential()

        if stride == 1:
            self.downsample = None
        
        else:    
            self.downsample = nn.Sequential(
                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm3)


    def forward(self, x):
        y = x
        y = self.relu(self.norm1(self.conv1(y)))
        y = self.relu(self.norm2(self.conv2(y)))

        if self.downsample is not None:
            x = self.downsample(x)

        return self.relu(x+y)


class BottleneckBlock(nn.Module):
    def __init__(self, in_planes, planes, norm_fn='group', stride=1):
        super(BottleneckBlock, self).__init__()
  
        self.conv1 = nn.Conv2d(in_planes, planes//4, kernel_size=1, padding=0)
        self.conv2 = nn.Conv2d(planes//4, planes//4, kernel_size=3, padding=1, stride=stride)
        self.conv3 = nn.Conv2d(planes//4, planes, kernel_size=1, padding=0)
        self.relu = nn.ReLU(inplace=True)

        num_groups = planes // 8

        if norm_fn == 'group':
            self.norm1 = nn.GroupNorm(num_groups=num_groups, num_channels=planes//4)
            self.norm2 = nn.GroupNorm(num_groups=num_groups, num_channels=planes//4)
            self.norm3 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
            if not stride == 1:
                self.norm4 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
        
        elif norm_fn == 'batch':
            self.norm1 = nn.BatchNorm2d(planes//4)
            self.norm2 = nn.BatchNorm2d(planes//4)
            self.norm3 = nn.BatchNorm2d(planes)
            if not stride == 1:
                self.norm4 = nn.BatchNorm2d(planes)
        
        elif norm_fn == 'instance':
            self.norm1 = nn.InstanceNorm2d(planes//4)
            self.norm2 = nn.InstanceNorm2d(planes//4)
            self.norm3 = nn.InstanceNorm2d(planes)
            if not stride == 1:
                self.norm4 = nn.InstanceNorm2d(planes)

        elif norm_fn == 'none':
            self.norm1 = nn.Sequential()
            self.norm2 = nn.Sequential()
            self.norm3 = nn.Sequential()
            if not stride == 1:
                self.norm4 = nn.Sequential()

        if stride == 1:
            self.downsample = None
        
        else:    
            self.downsample = nn.Sequential(
                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm4)


    def forward(self, x):
        y = x
        y = self.relu(self.norm1(self.conv1(y)))
        y = self.relu(self.norm2(self.conv2(y)))
        y = self.relu(self.norm3(self.conv3(y)))

        if self.downsample is not None:
            x = self.downsample(x)

        return self.relu(x+y)

class BasicEncoder(nn.Module):
    def __init__(self, output_dim=128, norm_fn='batch', dropout=0.0, input_dim=3, with_decoder=False, decoder_dim=0):
        super(BasicEncoder, self).__init__()
        self.norm_fn = norm_fn

        if self.norm_fn == 'group':
            self.norm1 = nn.GroupNorm(num_groups=8, num_channels=64)
            
        elif self.norm_fn == 'batch':
            self.norm1 = nn.BatchNorm2d(64)

        elif self.norm_fn == 'instance':
            self.norm1 = nn.InstanceNorm2d(64)

        elif self.norm_fn == 'none':
            self.norm1 = nn.Sequential()

        # self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
        self.conv1 = nn.Conv2d(input_dim, 64, kernel_size=7, stride=2, padding=3)
        self.relu1 = nn.ReLU(inplace=True)

        self.in_planes = 64
        self.layer1 = self._make_layer(64,  stride=1)
        self.layer2 = self._make_layer(96, stride=2)
        self.layer3 = self._make_layer(128, stride=2)

        # output convolution
        self.conv2 = nn.Conv2d(128, output_dim, kernel_size=1)

        self.dropout = None
        if dropout > 0:
            self.dropout = nn.Dropout2d(p=dropout)
        
        self.with_decoder = with_decoder
        if with_decoder:
            self.dec_layer3 = nn.Sequential(
                            nn.Conv2d(128, 96, 3, 1, 1, bias=False),
                            nn.BatchNorm2d(96),
                            self.relu1,
                            nn.UpsamplingBilinear2d(scale_factor=2),
            )
            self.dec_layer2 = nn.Sequential(
                            nn.Conv2d(96+96, 64, 3, 1, 1, bias=False),
                            nn.BatchNorm2d(64),
                            self.relu1,
                            nn.UpsamplingBilinear2d(scale_factor=2),
            )
            self.dec_layer1 = nn.Sequential(
                            nn.Conv2d(64+64, 64, 3, 1, 1, bias=False),
                            nn.BatchNorm2d(64),
                            self.relu1,
                            nn.UpsamplingBilinear2d(scale_factor=2),
            )
            self.reg_layer = nn.Conv2d(64, decoder_dim, 1, 1, 0, bias=False)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
                if m.weight is not None:
                    nn.init.constant_(m.weight, 1)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
        

    def _make_layer(self, dim, stride=1):
        layer1 = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride)
        layer2 = ResidualBlock(dim, dim, self.norm_fn, stride=1)
        layers = (layer1, layer2)
        
        self.in_planes = dim
        return nn.Sequential(*layers)


    def forward(self, x):
        
        ups=[]
        # if input is list, combine batch dimension
        is_list = isinstance(x, tuple) or isinstance(x, list)
        if is_list:
            batch_dim = x[0].shape[0]
            x = torch.cat(x, dim=0)

        x = self.conv1(x)
        x = self.norm1(x)
        x = self.relu1(x)
        ups.append(x) #added

        x = self.layer1(x)
        x = self.layer2(x)
        ups.append(x) #added
        x = self.layer3(x)

        if self.with_decoder:
            dec_x=self.dec_layer3(x)
            dec_x=self.dec_layer2(torch.cat( [dec_x, ups[-1]], dim=1 ) )
            dec_x=self.dec_layer1(torch.cat([dec_x, ups[-2]], dim=1 ) )
            dec_x = self.reg_layer(dec_x)

        x = self.conv2(x)


        if self.training and self.dropout is not None:
            x = self.dropout(x)


        if is_list:
            x = torch.split(x, [batch_dim, batch_dim], dim=0)
            if self.with_decoder:
                dec_x = torch.split(dec_x, [batch_dim, batch_dim], dim=0)
        if self.with_decoder:
            return x, dec_x
        else:
            return x

class BasicEncoder_dx4(nn.Module):
    def __init__(self, output_dim=128, norm_fn='batch', dropout=0.0):
        super(BasicEncoder_dx4, self).__init__()
        self.norm_fn = norm_fn

        if self.norm_fn == 'group':
            self.norm1 = nn.GroupNorm(num_groups=8, num_channels=64)
            
        elif self.norm_fn == 'batch':
            self.norm1 = nn.BatchNorm2d(64)

        elif self.norm_fn == 'instance':
            self.norm1 = nn.InstanceNorm2d(64)

        elif self.norm_fn == 'none':
            self.norm1 = nn.Sequential()

        # self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=1, padding=3)
        self.relu1 = nn.ReLU(inplace=True)

        self.in_planes = 64
        self.layer1 = self._make_layer(64,  stride=1)
        self.layer2 = self._make_layer(96, stride=2)
        self.layer3 = self._make_layer(128, stride=2)

        # output convolution
        self.conv2 = nn.Conv2d(128, output_dim, kernel_size=1)

        self.dropout = None
        if dropout > 0:
            self.dropout = nn.Dropout2d(p=dropout)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
                if m.weight is not None:
                    nn.init.constant_(m.weight, 1)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def _make_layer(self, dim, stride=1):
        layer1 = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride)
        layer2 = ResidualBlock(dim, dim, self.norm_fn, stride=1)
        layers = (layer1, layer2)
        
        self.in_planes = dim
        return nn.Sequential(*layers)


    def forward(self, x):

        # if input is list, combine batch dimension
        is_list = isinstance(x, tuple) or isinstance(x, list)
        if is_list:
            batch_dim = x[0].shape[0]
            x = torch.cat(x, dim=0)

        x = self.conv1(x)
        x = self.norm1(x)
        x = self.relu1(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)

        x = self.conv2(x)

        if self.training and self.dropout is not None:
            x = self.dropout(x)

        if is_list:
            x = torch.split(x, [batch_dim, batch_dim], dim=0)

        return x


class SmallEncoder(nn.Module):
    def __init__(self, output_dim=128, norm_fn='batch', dropout=0.0):
        super(SmallEncoder, self).__init__()
        self.norm_fn = norm_fn

        if self.norm_fn == 'group':
            self.norm1 = nn.GroupNorm(num_groups=8, num_channels=32)
            
        elif self.norm_fn == 'batch':
            self.norm1 = nn.BatchNorm2d(32)

        elif self.norm_fn == 'instance':
            self.norm1 = nn.InstanceNorm2d(32)

        elif self.norm_fn == 'none':
            self.norm1 = nn.Sequential()

        self.conv1 = nn.Conv2d(3, 32, kernel_size=7, stride=2, padding=3)
        self.relu1 = nn.ReLU(inplace=True)

        self.in_planes = 32
        self.layer1 = self._make_layer(32,  stride=1)
        self.layer2 = self._make_layer(64, stride=2)
        self.layer3 = self._make_layer(96, stride=2)

        self.dropout = None
        if dropout > 0:
            self.dropout = nn.Dropout2d(p=dropout)
        
        self.conv2 = nn.Conv2d(96, output_dim, kernel_size=1)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
                if m.weight is not None:
                    nn.init.constant_(m.weight, 1)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def _make_layer(self, dim, stride=1):
        layer1 = BottleneckBlock(self.in_planes, dim, self.norm_fn, stride=stride)
        layer2 = BottleneckBlock(dim, dim, self.norm_fn, stride=1)
        layers = (layer1, layer2)
    
        self.in_planes = dim
        return nn.Sequential(*layers)


    def forward(self, x):

        # if input is list, combine batch dimension
        is_list = isinstance(x, tuple) or isinstance(x, list)
        if is_list:
            batch_dim = x[0].shape[0]
            x = torch.cat(x, dim=0)

        x = self.conv1(x)
        x = self.norm1(x)
        x = self.relu1(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.conv2(x)

        if self.training and self.dropout is not None:
            x = self.dropout(x)

        if is_list:
            x = torch.split(x, [batch_dim, batch_dim], dim=0)

        return x

class SmallEncoder_dx4(nn.Module):
    def __init__(self, output_dim=128, norm_fn='batch', dropout=0.0):
        super(SmallEncoder_dx4, self).__init__()
        self.norm_fn = norm_fn

        if self.norm_fn == 'group':
            self.norm1 = nn.GroupNorm(num_groups=8, num_channels=32)
            
        elif self.norm_fn == 'batch':
            self.norm1 = nn.BatchNorm2d(32)

        elif self.norm_fn == 'instance':
            self.norm1 = nn.InstanceNorm2d(32)

        elif self.norm_fn == 'none':
            self.norm1 = nn.Sequential()

        # self.conv1 = nn.Conv2d(3, 32, kernel_size=7, stride=2, padding=3)
        self.conv1 = nn.Conv2d(3, 32, kernel_size=7, stride=1, padding=3)
        self.relu1 = nn.ReLU(inplace=True)

        self.in_planes = 32
        self.layer1 = self._make_layer(32,  stride=1)
        self.layer2 = self._make_layer(64, stride=2)
        self.layer3 = self._make_layer(96, stride=2)

        self.dropout = None
        if dropout > 0:
            self.dropout = nn.Dropout2d(p=dropout)
        
        self.conv2 = nn.Conv2d(96, output_dim, kernel_size=1)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
                if m.weight is not None:
                    nn.init.constant_(m.weight, 1)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def _make_layer(self, dim, stride=1):
        layer1 = BottleneckBlock(self.in_planes, dim, self.norm_fn, stride=stride)
        layer2 = BottleneckBlock(dim, dim, self.norm_fn, stride=1)
        layers = (layer1, layer2)
    
        self.in_planes = dim
        return nn.Sequential(*layers)


    def forward(self, x):

        # if input is list, combine batch dimension
        is_list = isinstance(x, tuple) or isinstance(x, list)
        if is_list:
            batch_dim = x[0].shape[0]
            x = torch.cat(x, dim=0)

        x = self.conv1(x)
        x = self.norm1(x)
        x = self.relu1(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.conv2(x)

        if self.training and self.dropout is not None:
            x = self.dropout(x)

        if is_list:
            x = torch.split(x, [batch_dim, batch_dim], dim=0)

        return x

================================================
FILE: thirdparty/raft/update.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F


class FlowHead(nn.Module):
    def __init__(self, input_dim=128, hidden_dim=256):
        super(FlowHead, self).__init__()
        self.conv1 = nn.Conv2d(input_dim, hidden_dim, 3, padding=1)
        self.conv2 = nn.Conv2d(hidden_dim, 2, 3, padding=1)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        return self.conv2(self.relu(self.conv1(x)))

class ConvGRU(nn.Module):
    def __init__(self, hidden_dim=128, input_dim=192+128):
        super(ConvGRU, self).__init__()
        self.convz = nn.Conv2d(hidden_dim+input_dim, hidden_dim, 3, padding=1)
        self.convr = nn.Conv2d(hidden_dim+input_dim, hidden_dim, 3, padding=1)
        self.convq = nn.Conv2d(hidden_dim+input_dim, hidden_dim, 3, padding=1)

    def forward(self, h, x):
        hx = torch.cat([h, x], dim=1)

        z = torch.sigmoid(self.convz(hx))
        r = torch.sigmoid(self.convr(hx))
        q = torch.tanh(self.convq(torch.cat([r*h, x], dim=1)))

        h = (1-z) * h + z * q
        return h

class SepConvGRU(nn.Module):
    def __init__(self, hidden_dim=128, input_dim=192+128):
        super(SepConvGRU, self).__init__()
        self.convz1 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (1,5), padding=(0,2))
        self.convr1 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (1,5), padding=(0,2))
        self.convq1 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (1,5), padding=(0,2))

        self.convz2 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (5,1), padding=(2,0))
        self.convr2 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (5,1), padding=(2,0))
        self.convq2 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (5,1), padding=(2,0))


    def forward(self, h, x):
        # horizontal
        hx = torch.cat([h, x], dim=1)
        z = torch.sigmoid(self.convz1(hx))
        r = torch.sigmoid(self.convr1(hx))
        q = torch.tanh(self.convq1(torch.cat([r*h, x], dim=1)))        
        h = (1-z) * h + z * q

        # vertical
        hx = torch.cat([h, x], dim=1)
        z = torch.sigmoid(self.convz2(hx))
        r = torch.sigmoid(self.convr2(hx))
        q = torch.tanh(self.convq2(torch.cat([r*h, x], dim=1)))       
        h = (1-z) * h + z * q

        return h

class SmallMotionEncoder(nn.Module):
    def __init__(self, args):
        super(SmallMotionEncoder, self).__init__()
        cor_planes = args.corr_levels * (2*args.corr_radius + 1)**2
        self.convc1 = nn.Conv2d(cor_planes, 96, 1, padding=0)
        self.convf1 = nn.Conv2d(2, 64, 7, padding=3)
        self.convf2 = nn.Conv2d(64, 32, 3, padding=1)
        self.conv = nn.Conv2d(128, 80, 3, padding=1)

    def forward(self, flow, corr):
        cor = F.relu(self.convc1(corr))
        flo = F.relu(self.convf1(flow))
        flo = F.relu(self.convf2(flo))
        cor_flo = torch.cat([cor, flo], dim=1)
        out = F.relu(self.conv(cor_flo))
        return torch.cat([out, flow], dim=1)

class BasicMotionEncoder(nn.Module):
    def __init__(self, args):
        super(BasicMotionEncoder, self).__init__()
        cor_planes = args.corr_levels * (2*args.corr_radius + 1)**2
        self.convc1 = nn.Conv2d(cor_planes, 256, 1, padding=0)
        self.convc2 = nn.Conv2d(256, 192, 3, padding=1)
        self.convf1 = nn.Conv2d(2, 128, 7, padding=3)
        self.convf2 = nn.Conv2d(128, 64, 3, padding=1)
        self.conv = nn.Conv2d(64+192, 128-2, 3, padding=1)

    def forward(self, flow, corr):
        cor = F.relu(self.convc1(corr))
        cor = F.relu(self.convc2(cor))
        flo = F.relu(self.convf1(flow))
        flo = F.relu(self.convf2(flo))

        cor_flo = torch.cat([cor, flo], dim=1)
        out = F.relu(self.conv(cor_flo))
        return torch.cat([out, flow], dim=1)

class BasicMotionEncoderGeo(nn.Module):
    def __init__(self, args):
        super(BasicMotionEncoderGeo, self).__init__()
        cor_planes = args.corr_levels * (2*args.corr_radius + 1)**2
        self.convc1 = nn.Conv2d(cor_planes, 256, 1, padding=0)
        self.convc2 = nn.Conv2d(256, 192, 3, padding=1)
        self.convc1_geo = nn.Conv2d(cor_planes, 256, 1, padding=0)
        self.convc2_geo = nn.Conv2d(256, 192, 3, padding=1)
        self.convf1 = nn.Conv2d(2, 128, 7, padding=3)
        self.convf2 = nn.Conv2d(128, 64, 3, padding=1)
        self.conv = nn.Conv2d(64+192, 128-2, 3, padding=1)

    def forward(self, flow, corr, corr_geo):
        cor = F.relu(self.convc1(corr))
        cor = F.relu(self.convc2(cor))
        
        cor_geo = F.relu(self.convc1_geo(corr_geo))
        cor_geo = F.relu(self.convc2_geo(cor_geo))
        # cor_geo = F.leaky_relu(self.convc1_geo(corr_geo))
        # cor_geo = F.leaky_relu(self.convc2_geo(cor_geo))

        flo = F.relu(self.convf1(flow))
        flo = F.relu(self.convf2(flo))

        # cor_flo = torch.cat([cor, flo], dim=1)
        cor_flo = torch.cat([cor+cor_geo, flo], dim=1)
        out = F.relu(self.conv(cor_flo))
        return torch.cat([out, flow], dim=1)

class SmallUpdateBlock(nn.Module):
    def __init__(self, args, hidden_dim=96):
        super(SmallUpdateBlock, self).__init__()
        self.encoder = SmallMotionEncoder(args)
        self.gru = ConvGRU(hidden_dim=hidden_dim, input_dim=82+64)
        self.flow_head = FlowHead(hidden_dim, hidden_dim=128)

    def forward(self, net, inp, corr, flow):
        motion_features = self.encoder(flow, corr)
        inp = torch.cat([inp, motion_features], dim=1)
        net = self.gru(net, inp)
        delta_flow = self.flow_head(net)

        return net, None, delta_flow

class SmallUpdateBlockUpMask(nn.Module):
    def __init__(self, args, hidden_dim=96):
        super(SmallUpdateBlockUpMask, self).__init__()
        self.encoder = SmallMotionEncoder(args)
        self.gru = ConvGRU(hidden_dim=hidden_dim, input_dim=82+64)
        self.flow_head = FlowHead(hidden_dim, hidden_dim=128)
        self.mask = nn.Sequential(
            nn.Conv2d(96, 192, 3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(192, 64*9, 1, padding=0))

    def forward(self, net, inp, corr, flow):
        motion_features = self.encoder(flow, corr)
        inp = torch.cat([inp, motion_features], dim=1)
        net = self.gru(net, inp)
        delta_flow = self.flow_head(net)
        mask = .25 * self.mask(net)

        # return net, None, delta_flow
        return net, mask, delta_flow

class BasicUpdateBlock(nn.Module):
    def __init__(self, args, hidden_dim=128, input_dim=128, downsample_scale=8):
        super(BasicUpdateBlock, self).__init__()
        self.args = args
        self.encoder = BasicMotionEncoder(args)
        self.gru = SepConvGRU(hidden_dim=hidden_dim, input_dim=128+hidden_dim)
        self.flow_head = FlowHead(hidden_dim, hidden_dim=256)

        self.mask = nn.Sequential(
            nn.Conv2d(128, 256, 3, padding=1),
            nn.ReLU(inplace=True),
            # nn.Conv2d(256, 64*9, 1, padding=0))
            nn.Conv2d(256, downsample_scale*downsample_scale*9, 1, padding=0))

    def forward(self, net, inp, corr, flow, upsample=True):
        
        motion_features = self.encoder(flow, corr)
        inp = torch.cat([inp, motion_features], dim=1)

        net = self.gru(net, inp)
        delta_flow = self.flow_head(net)

        # scale mask to balence gradients
        mask = .25 * self.mask(net)
        return net, mask, delta_flow

class BasicUpdateBlockGeo(nn.Module):
    #add geo corr input
    def __init__(self, args, hidden_dim=128, input_dim=128, downsample_scale=8):
        super(BasicUpdateBlockGeo, self).__init__()
        self.args = args
        self.encoder = BasicMotionEncoderGeo(args)
        self.gru = SepConvGRU(hidden_dim=hidden_dim, input_dim=128+hidden_dim)
        self.flow_head = FlowHead(hidden_dim, hidden_dim=256)

        self.mask = nn.Sequential(
            nn.Conv2d(128, 256, 3, padding=1),
            nn.ReLU(inplace=True),
            # nn.Conv2d(256, 64*9, 1, padding=0))
            nn.Conv2d(256, downsample_scale*downsample_scale*9, 1, padding=0))

    def forward(self, net, inp, corr, geo_corr, flow, upsample=True):
        
        motion_features = self.encoder(flow, corr, geo_corr)
        inp = torch.cat([inp, motion_features], dim=1)

        net = self.gru(net, inp)
        delta_flow = self.flow_head(net)

        # scale mask to balence gradients
        mask = .25 * self.mask(net)
        return net, mask, delta_flow


================================================
FILE: thirdparty/raft/utils/__init__.py
================================================


================================================
FILE: thirdparty/raft/utils/augmentor.py
================================================
import numpy as np
import random
import math
from PIL import Image

import cv2
cv2.setNumThreads(0)
cv2.ocl.setUseOpenCL(False)

import torch
from torchvision.transforms import ColorJitter
import torch.nn.functional as F


class FlowAugmentor:
    def __init__(self, crop_size, min_scale=-0.2, max_scale=0.5, do_flip=True):
        
        # spatial augmentation params
        self.crop_size = crop_size
        self.min_scale = min_scale
        self.max_scale = max_scale
        self.spatial_aug_prob = 0.8
        self.stretch_prob = 0.8
        self.max_stretch = 0.2

        # flip augmentation params
        self.do_flip = do_flip
        self.h_flip_prob = 0.5
        self.v_flip_prob = 0.1

        # photometric augmentation params
        self.photo_aug = ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.5/3.14)
        self.asymmetric_color_aug_prob = 0.2
        self.eraser_aug_prob = 0.5

    def color_transform(self, img1, img2):
        """ Photometric augmentation """

        # asymmetric
        if np.random.rand() < self.asymmetric_color_aug_prob:
            img1 = np.array(self.photo_aug(Image.fromarray(img1)), dtype=np.uint8)
            img2 = np.array(self.photo_aug(Image.fromarray(img2)), dtype=np.uint8)

        # symmetric
        else:
            image_stack = np.concatenate([img1, img2], axis=0)
            image_stack = np.array(self.photo_aug(Image.fromarray(image_stack)), dtype=np.uint8)
            img1, img2 = np.split(image_stack, 2, axis=0)

        return img1, img2

    def eraser_transform(self, img1, img2, bounds=[50, 100]):
        """ Occlusion augmentation """

        ht, wd = img1.shape[:2]
        if np.random.rand() < self.eraser_aug_prob:
            mean_color = np.mean(img2.reshape(-1, 3), axis=0)
            for _ in range(np.random.randint(1, 3)):
                x0 = np.random.randint(0, wd)
                y0 = np.random.randint(0, ht)
                dx = np.random.randint(bounds[0], bounds[1])
                dy = np.random.randint(bounds[0], bounds[1])
                img2[y0:y0+dy, x0:x0+dx, :] = mean_color

        return img1, img2

    def spatial_transform(self, img1, img2, flow):
        # randomly sample scale
        ht, wd = img1.shape[:2]
        min_scale = np.maximum(
            (self.crop_size[0] + 8) / float(ht), 
            (self.crop_size[1] + 8) / float(wd))

        scale = 2 ** np.random.uniform(self.min_scale, self.max_scale)
        scale_x = scale
        scale_y = scale
        if np.random.rand() < self.stretch_prob:
            scale_x *= 2 ** np.random.uniform(-self.max_stretch, self.max_stretch)
            scale_y *= 2 ** np.random.uniform(-self.max_stretch, self.max_stretch)
        
        scale_x = np.clip(scale_x, min_scale, None)
        scale_y = np.clip(scale_y, min_scale, None)

        if np.random.rand() < self.spatial_aug_prob:
            # rescale the images
            img1 = cv2.resize(img1, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR)
            img2 = cv2.resize(img2, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR)
            flow = cv2.resize(flow, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR)
            flow = flow * [scale_x, scale_y]

        if self.do_flip:
            if np.random.rand() < self.h_flip_prob: # h-flip
                img1 = img1[:, ::-1]
                img2 = img2[:, ::-1]
                flow = flow[:, ::-1] * [-1.0, 1.0]

            if np.random.rand() < self.v_flip_prob: # v-flip
                img1 = img1[::-1, :]
                img2 = img2[::-1, :]
                flow = flow[::-1, :] * [1.0, -1.0]

        y0 = np.random.randint(0, img1.shape[0] - self.crop_size[0])
        x0 = np.random.randint(0, img1.shape[1] - self.crop_size[1])
        
        img1 = img1[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]]
        img2 = img2[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]]
        flow = flow[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]]

        return img1, img2, flow

    def __call__(self, img1, img2, flow):
        img1, img2 = self.color_transform(img1, img2)
        img1, img2 = self.eraser_transform(img1, img2)
        img1, img2, flow = self.spatial_transform(img1, img2, flow)

        img1 = np.ascontiguousarray(img1)
        img2 = np.ascontiguousarray(img2)
        flow = np.ascontiguousarray(flow)

        return img1, img2, flow

class SparseFlowAugmentor:
    def __init__(self, crop_size, min_scale=-0.2, max_scale=0.5, do_flip=False):
        # spatial augmentation params
        self.crop_size = crop_size
        self.min_scale = min_scale
        self.max_scale = max_scale
        self.spatial_aug_prob = 0.8
        self.stretch_prob = 0.8
        self.max_stretch = 0.2

        # flip augmentation params
        self.do_flip = do_flip
        self.h_flip_prob = 0.5
        self.v_flip_prob = 0.1

        # photometric augmentation params
        self.photo_aug = ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0.3/3.14)
        self.asymmetric_color_aug_prob = 0.2
        self.eraser_aug_prob = 0.5
        
    def color_transform(self, img1, img2):
        image_stack = np.concatenate([img1, img2], axis=0)
        image_stack = np.array(self.photo_aug(Image.fromarray(image_stack)), dtype=np.uint8)
        img1, img2 = np.split(image_stack, 2, axis=0)
        return img1, img2

    def eraser_transform(self, img1, img2):
        ht, wd = img1.shape[:2]
        if np.random.rand() < self.eraser_aug_prob:
            mean_color = np.mean(img2.reshape(-1, 3), axis=0)
            for _ in range(np.random.randint(1, 3)):
                x0 = np.random.randint(0, wd)
                y0 = np.random.randint(0, ht)
                dx = np.random.randint(50, 100)
                dy = np.random.randint(50, 100)
                img2[y0:y0+dy, x0:x0+dx, :] = mean_color

        return img1, img2

    def resize_sparse_flow_map(self, flow, valid, fx=1.0, fy=1.0):
        ht, wd = flow.shape[:2]
        coords = np.meshgrid(np.arange(wd), np.arange(ht))
        coords = np.stack(coords, axis=-1)

        coords = coords.reshape(-1, 2).astype(np.float32)
        flow = flow.reshape(-1, 2).astype(np.float32)
        valid = valid.reshape(-1).astype(np.float32)

        coords0 = coords[valid>=1]
        flow0 = flow[valid>=1]

        ht1 = int(round(ht * fy))
        wd1 = int(round(wd * fx))

        coords1 = coords0 * [fx, fy]
        flow1 = flow0 * [fx, fy]

        xx = np.round(coords1[:,0]).astype(np.int32)
        yy = np.round(coords1[:,1]).astype(np.int32)

        v = (xx > 0) & (xx < wd1) & (yy > 0) & (yy < ht1)
        xx = xx[v]
        yy = yy[v]
        flow1 = flow1[v]

        flow_img = np.zeros([ht1, wd1, 2], dtype=np.float32)
        valid_img = np.zeros([ht1, wd1], dtype=np.int32)

        flow_img[yy, xx] = flow1
        valid_img[yy, xx] = 1

        return flow_img, valid_img

    def spatial_transform(self, img1, img2, flow, valid):
        # randomly sample scale

        ht, wd = img1.shape[:2]
        min_scale = np.maximum(
            (self.crop_size[0] + 1) / float(ht), 
            (self.crop_size[1] + 1) / float(wd))

        scale = 2 ** np.random.uniform(self.min_scale, self.max_scale)
        scale_x = np.clip(scale, min_scale, None)
        scale_y = np.clip(scale, min_scale, None)

        if np.random.rand() < self.spatial_aug_prob:
            # rescale the images
            img1 = cv2.resize(img1, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR)
            img2 = cv2.resize(img2, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR)
            flow, valid = self.resize_sparse_flow_map(flow, valid, fx=scale_x, fy=scale_y)

        if self.do_flip:
            if np.random.rand() < 0.5: # h-flip
                img1 = img1[:, ::-1]
                img2 = img2[:, ::-1]
                flow = flow[:, ::-1] * [-1.0, 1.0]
                valid = valid[:, ::-1]

        margin_y = 20
        margin_x = 50

        y0 = np.random.randint(0, img1.shape[0] - self.crop_size[0] + margin_y)
        x0 = np.random.randint(-margin_x, img1.shape[1] - self.crop_size[1] + margin_x)

        y0 = np.clip(y0, 0, img1.shape[0] - self.crop_size[0])
        x0 = np.clip(x0, 0, img1.shape[1] - self.crop_size[1])

        img1 = img1[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]]
        img2 = img2[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]]
        flow = flow[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]]
        valid = valid[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]]
        return img1, img2, flow, valid


    def __call__(self, img1, img2, flow, valid):
        img1, img2 = self.color_transform(img1, img2)
        img1, img2 = self.eraser_transform(img1, img2)
        img1, img2, flow, valid = self.spatial_transform(img1, img2, flow, valid)

        img1 = np.ascontiguousarray(img1)
        img2 = np.ascontiguousarray(img2)
        flow = np.ascontiguousarray(flow)
        valid = np.ascontiguousarray(valid)

        return img1, img2, flow, valid


================================================
FILE: thirdparty/raft/utils/flow_viz.py
================================================
# Flow visualization code used from https://github.com/tomrunia/OpticalFlow_Visualization


# MIT License
#
# Copyright (c) 2018 Tom Runia
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to conditions.
#
# Author: Tom Runia
# Date Created: 2018-08-03

import numpy as np

def make_colorwheel():
    """
    Generates a color wheel for optical flow visualization as presented in:
        Baker et al. "A Database and Evaluation Methodology for Optical Flow" (ICCV, 2007)
        URL: http://vision.middlebury.edu/flow/flowEval-iccv07.pdf

    Code follows the original C++ source code of Daniel Scharstein.
    Code follows the the Matlab source code of Deqing Sun.

    Returns:
        np.ndarray: Color wheel
    """

    RY = 15
    YG = 6
    GC = 4
    CB = 11
    BM = 13
    MR = 6

    ncols = RY + YG + GC + CB + BM + MR
    colorwheel = np.zeros((ncols, 3))
    col = 0

    # RY
    colorwheel[0:RY, 0] = 255
    colorwheel[0:RY, 1] = np.floor(255*np.arange(0,RY)/RY)
    col = col+RY
    # YG
    colorwheel[col:col+YG, 0] = 255 - np.floor(255*np.arange(0,YG)/YG)
    colorwheel[col:col+YG, 1] = 255
    col = col+YG
    # GC
    colorwheel[col:col+GC, 1] = 255
    colorwheel[col:col+GC, 2] = np.floor(255*np.arange(0,GC)/GC)
    col = col+GC
    # CB
    colorwheel[col:col+CB, 1] = 255 - np.floor(255*np.arange(CB)/CB)
    colorwheel[col:col+CB, 2] = 255
    col = col+CB
    # BM
    colorwheel[col:col+BM, 2] = 255
    colorwheel[col:col+BM, 0] = np.floor(255*np.arange(0,BM)/BM)
    col = col+BM
    # MR
    colorwheel[col:col+MR, 2] = 255 - np.floor(255*np.arange(MR)/MR)
    colorwheel[col:col+MR, 0] = 255
    return colorwheel


def flow_uv_to_colors(u, v, convert_to_bgr=False):
    """
    Applies the flow color wheel to (possibly clipped) flow components u and v.

    According to the C++ source code of Daniel Scharstein
    According to the Matlab source code of Deqing Sun

    Args:
        u (np.ndarray): Input horizontal flow of shape [H,W]
        v (np.ndarray): Input vertical flow of shape [H,W]
        convert_to_bgr (bool, optional): Convert output image to BGR. Defaults to False.

    Returns:
        np.ndarray: Flow visualization image of shape [H,W,3]
    """
    flow_image = np.zeros((u.shape[0], u.shape[1], 3), np.uint8)
    colorwheel = make_colorwheel()  # shape [55x3]
    ncols = colorwheel.shape[0]
    rad = np.sqrt(np.square(u) + np.square(v))
    a = np.arctan2(-v, -u)/np.pi
    fk = (a+1) / 2*(ncols-1)
    k0 = np.floor(fk).astype(np.int32)
    k1 = k0 + 1
    k1[k1 == ncols] = 0
    f = fk - k0
    for i in range(colorwheel.shape[1]):
        tmp = colorwheel[:,i]
        col0 = tmp[k0] / 255.0
        col1 = tmp[k1] / 255.0
        col = (1-f)*col0 + f*col1
        idx = (rad <= 1)
        col[idx]  = 1 - rad[idx] * (1-col[idx])
        col[~idx] = col[~idx] * 0.75   # out of range
        # Note the 2-i => BGR instead of RGB
        ch_idx = 2-i if convert_to_bgr else i
        flow_image[:,:,ch_idx] = np.floor(255 * col)
    return flow_image


def flow_to_image(flow_uv, clip_flow=None, convert_to_bgr=False):
    """
    Expects a two dimensional flow image of shape.

    Args:
        flow_uv (np.ndarray): Flow UV image of shape [H,W,2]
        clip_flow (float, optional): Clip maximum of flow values. Defaults to None.
        convert_to_bgr (bool, optional): Convert output image to BGR. Defaults to False.

    Returns:
        np.ndarray: Flow visualization image of shape [H,W,3]
    """
    assert flow_uv.ndim == 3, 'input flow must have three dimensions'
    assert flow_uv.shape[2] == 2, 'input flow must have shape [H,W,2]'
    if clip_flow is not None:
        flow_uv = np.clip(flow_uv, 0, clip_flow)
    u = flow_uv[:,:,0]
    v = flow_uv[:,:,1]
    rad = np.sqrt(np.square(u) + np.square(v))
    rad_max = np.max(rad)
    epsilon = 1e-5
    u = u / (rad_max + epsilon)
    v = v / (rad_max + epsilon)
    return flow_uv_to_colors(u, v, convert_to_bgr)

================================================
FILE: thirdparty/raft/utils/frame_utils.py
================================================
import numpy as np
from PIL import Image
from os.path import *
import re

import cv2
cv2.setNumThreads(0)
cv2.ocl.setUseOpenCL(False)

TAG_CHAR = np.array([202021.25], np.float32)

def readFlow(fn):
    """ Read .flo file in Middlebury format"""
    # Code adapted from:
    # http://stackoverflow.com/questions/28013200/reading-middlebury-flow-files-with-python-bytes-array-numpy

    # WARNING: this will work on little-endian architectures (eg Intel x86) only!
    # print 'fn = %s'%(fn)
    with open(fn, 'rb') as f:
        magic = np.fromfile(f, np.float32, count=1)
        if 202021.25 != magic:
            print('Magic number incorrect. Invalid .flo file')
            return None
        else:
            w = np.fromfile(f, np.int32, count=1)
            h = np.fromfile(f, np.int32, count=1)
            # print 'Reading %d x %d flo file\n' % (w, h)
            data = np.fromfile(f, np.float32, count=2*int(w)*int(h))
            # Reshape data into 3D array (columns, rows, bands)
            # The reshape here is for visualization, the original code is (w,h,2)
            return np.resize(data, (int(h), int(w), 2))

def readPFM(file):
    file = open(file, 'rb')

    color = None
    width = None
    height = None
    scale = None
    endian = None

    header = file.readline().rstrip()
    if header == b'PF':
        color = True
    elif header == b'Pf':
        color = False
    else:
        raise Exception('Not a PFM file.')

    dim_match = re.match(rb'^(\d+)\s(\d+)\s$', file.readline())
    if dim_match:
        width, height = map(int, dim_match.groups())
    else:
        raise Exception('Malformed PFM header.')

    scale = float(file.readline().rstrip())
    if scale < 0: # little-endian
        endian = '<'
        scale = -scale
    else:
        endian = '>' # big-endian

    data = np.fromfile(file, endian + 'f')
    shape = (height, width, 3) if color else (height, width)

    data = np.reshape(data, shape)
    data = np.flipud(data)
    return data

def writeFlow(filename,uv,v=None):
    """ Write optical flow to file.
    
    If v is None, uv is assumed to contain both u and v channels,
    stacked in depth.
    Original code by Deqing Sun, adapted from Daniel Scharstein.
    """
    nBands = 2

    if v is None:
        assert(uv.ndim == 3)
        assert(uv.shape[2] == 2)
        u = uv[:,:,0]
        v = uv[:,:,1]
    else:
        u = uv

    assert(u.shape == v.shape)
    height,width = u.shape
    f = open(filename,'wb')
    # write the header
    f.write(TAG_CHAR)
    np.array(width).astype(np.int32).tofile(f)
    np.array(height).astype(np.int32).tofile(f)
    # arrange into matrix form
    tmp = np.zeros((height, width*nBands))
    tmp[:,np.arange(width)*2] = u
    tmp[:,np.arange(width)*2 + 1] = v
    tmp.astype(np.float32).tofile(f)
    f.close()


def readFlowKITTI(filename):
    flow = cv2.imread(filename, cv2.IMREAD_ANYDEPTH|cv2.IMREAD_COLOR)
    flow = flow[:,:,::-1].astype(np.float32)
    flow, valid = flow[:, :, :2], flow[:, :, 2]
    flow = (flow - 2**15) / 64.0
    return flow, valid

def readDispKITTI(filename):
    disp = cv2.imread(filename, cv2.IMREAD_ANYDEPTH) / 256.0
    valid = disp > 0.0
    flow = np.stack([-disp, np.zeros_like(disp)], -1)
    return flow, valid


def writeFlowKITTI(filename, uv):
    uv = 64.0 * uv + 2**15
    valid = np.ones([uv.shape[0], uv.shape[1], 1])
    uv = np.concatenate([uv, valid], axis=-1).astype(np.uint16)
    cv2.imwrite(filename, uv[..., ::-1])
    

def read_gen(file_name, pil=False):
    ext = splitext(file_name)[-1]
    if ext == '.png' or ext == '.jpeg' or ext == '.ppm' or ext == '.jpg':
        return Image.open(file_name)
    elif ext == '.bin' or ext == '.raw':
        return np.load(file_name)
    elif ext == '.flo':
        return readFlow(file_name).astype(np.float32)
    elif ext == '.pfm':
        flow = readPFM(file_name).astype(np.float32)
        if len(flow.shape) == 2:
            return flow
        else:
            return flow[:, :, :-1]
    return []

================================================
FILE: thirdparty/raft/utils/utils.py
================================================
import torch
import torch.nn.functional as F
import numpy as np
from scipy import interpolate


class InputPadder:
    """ Pads images such that dimensions are divisible by 8 """
    def __init__(self, dims, mode='sintel'):
        self.ht, self.wd = dims[-2:]
        pad_ht = (((self.ht // 8) + 1) * 8 - self.ht) % 8
        pad_wd = (((self.wd // 8) + 1) * 8 - self.wd) % 8
        if mode == 'sintel':
            self._pad = [pad_wd//2, pad_wd - pad_wd//2, pad_ht//2, pad_ht - pad_ht//2]
        else:
            self._pad = [pad_wd//2, pad_wd - pad_wd//2, 0, pad_ht]

    def pad(self, *inputs):
        return [F.pad(x, self._pad, mode='replicate') for x in inputs]

    def unpad(self,x):
        ht, wd = x.shape[-2:]
        c = [self._pad[2], ht-self._pad[3], self._pad[0], wd-self._pad[1]]
        return x[..., c[0]:c[1], c[2]:c[3]]

def forward_interpolate(flow):
    flow = flow.detach().cpu().numpy()
    dx, dy = flow[0], flow[1]

    ht, wd = dx.shape
    x0, y0 = np.meshgrid(np.arange(wd), np.arange(ht))

    x1 = x0 + dx
    y1 = y0 + dy
    
    x1 = x1.reshape(-1)
    y1 = y1.reshape(-1)
    dx = dx.reshape(-1)
    dy = dy.reshape(-1)

    valid = (x1 > 0) & (x1 < wd) & (y1 > 0) & (y1 < ht)
    x1 = x1[valid]
    y1 = y1[valid]
    dx = dx[valid]
    dy = dy[valid]

    flow_x = interpolate.griddata(
        (x1, y1), dx, (x0, y0), method='nearest', fill_value=0)

    flow_y = interpolate.griddata(
        (x1, y1), dy, (x0, y0), method='nearest', fill_value=0)

    flow = np.stack([flow_x, flow_y], axis=0)
    return torch.from_numpy(flow).float()


def bilinear_sampler(img, coords, mode='bilinear', mask=False):
    """ Wrapper for grid_sample, uses pixel coordinates """
    H, W = img.shape[-2:]
    xgrid, ygrid = coords.split([1,1], dim=-1)
    xgrid = 2*xgrid/(W-1) - 1
    ygrid = 2*ygrid/(H-1) - 1

    grid = torch.cat([xgrid, ygrid], dim=-1)
    img = F.grid_sample(img, grid, align_corners=True)

    if mask:
        mask = (xgrid > -1) & (ygrid > -1) & (xgrid < 1) & (ygrid < 1)
        return img, mask.float()

    return img


def coords_grid(batch, ht, wd):
    coords = torch.meshgrid(torch.arange(ht), torch.arange(wd))
    coords = torch.stack(coords[::-1], dim=0).float()
    return coords[None].repeat(batch, 1, 1, 1)


def upflow8(flow, mode='bilinear'):
    new_size = (8 * flow.shape[2], 8 * flow.shape[3])
    return  8 * F.interpolate(flow, size=new_size, mode=mode, align_corners=True)

def upflow(flow, mode='bilinear', scale=8):
    new_size = (scale * flow.shape[2], scale * flow.shape[3])
    return  scale * F.interpolate(flow, size=new_size, mode=mode, align_corners=True)

================================================
FILE: thirdparty/vsd/inout.py
================================================
# Author: Tomas Hodan (hodantom@cmp.felk.cvut.cz)
# Center for Machine Perception, Czech Technical University in Prague

import struct
import itertools
import numpy as np
import scipy.misc


def load_depth(path):
    d = scipy.misc.imread(path)
    d = d.astype(np.float32)
    return d


def load_ply(path):
    """
    Loads a 3D mesh model from a PLY file.
    :param path: Path to a PLY file.
    :return: The loaded model given by a dictionary with items:
    'pts' (nx3 ndarray), 'normals' (nx3 ndarray), 'colors' (nx3 ndarray),
    'faces' (mx3 ndarray) - the latter three are optional.
    """
    f = open(path, 'r')

    n_pts = 0
    n_faces = 0
    face_n_corners = 3 # Only triangular faces are supported
    pt_props = []
    face_props = []
    is_binary = False
    header_vertex_section = False
    header_face_section = False

    # Read header
    while True:
        line = f.readline().rstrip('\n').rstrip('\r') # Strip the newline character(s)
        if line.startswith('element vertex'):
            n_pts = int(line.split()[-1])
            header_vertex_section = True
            header_face_section = False
        elif line.startswith('element face'):
            n_faces = int(line.split()[-1])
            header_vertex_section = False
            header_face_section = True
        elif line.startswith('element'): # Some other element
            header_vertex_section = False
            header_face_section = False
        elif line.startswith('property') and header_vertex_section:
            # (name of the property, data type)
            pt_props.append((line.split()[-1], line.split()[-2]))
        elif line.startswith('property list') and header_face_section:
            elems = line.split()
            if elems[-1] == 'vertex_indices':
                # (name of the property, data type)
                face_props.append(('n_corners', elems[2]))
                for i in range(face_n_corners):
                    face_props.append(('ind_' + str(i), elems[3]))
            else:
                print(('Warning: Not supported face property: ' + elems[-1]))
        elif line.startswith('format'):
            if 'binary' in line:
                is_binary = True
        elif line.startswith('end_header'):
            break

    # Prepare data structures
    model = {}
    model['pts'] = np.zeros((n_pts, 3), np.float)
    if n_faces > 0:
        model['faces'] = np.zeros((n_faces, face_n_corners), np.float)

    pt_props_names = [p[0] for p in pt_props]
    is_normal = False
    if {'nx', 'ny', 'nz'}.issubset(set(pt_props_names)):
        is_normal = True
        model['normals'] = np.zeros((n_pts, 3), np.float)

    is_color = False
    if {'red', 'green', 'blue'}.issubset(set(pt_props_names)):
        is_color = True
        model['colors'] = np.zeros((n_pts, 3), np.float)

    is_texture = False
    if {'texture_u', 'texture_v'}.issubset(set(pt_props_names)):
        is_texture = True
        model['texture_uv'] = np.zeros((n_pts, 2), np.float)

    formats = { # For binary format
        'float': ('f', 4),
        'double': ('d', 8),
        'int': ('i', 4),
        'uchar': ('B', 1)
    }

    # Load vertices
    for pt_id in range(n_pts):
        prop_vals = {}
        load_props = ['x', 'y', 'z', 'nx', 'ny', 'nz',
                      'red', 'green', 'blue', 'texture_u', 'texture_v']
        if is_binary:
            for prop in pt_props:
                format = formats[prop[1]]
                val = struct.unpack(format[0], f.read(format[1]))[0]
                if prop[0] in load_props:
                    prop_vals[prop[0]] = val
        else:
            elems = f.readline().rstrip('\n').rstrip('\r').split()
            for prop_id, prop in enumerate(pt_props):
                if prop[0] in load_props:
                    prop_vals[prop[0]] = elems[prop_id]

        model['pts'][pt_id, 0] = float(prop_vals['x'])
        model['pts'][pt_id, 1] = float(prop_vals['y'])
        model['pts'][pt_id, 2] = float(prop_vals['z'])

        if is_normal:
            model['normals'][pt_id, 0] = float(prop_vals['nx'])
            model['normals'][pt_id, 1] = float(prop_vals['ny'])
            model['normals'][pt_id, 2] = float(prop_vals['nz'])

        if is_color:
            model['colors'][pt_id, 0] = float(prop_vals['red'])
            model['colors'][pt_id, 1] = float(prop_vals['green'])
            model['colors'][pt_id, 2] = float(prop_vals['blue'])

        if is_texture:
            model['texture_uv'][pt_id, 0] = float(prop_vals['texture_u'])
            model['texture_uv'][pt_id, 1] = float(prop_vals['texture_v'])

    # Load faces
    for face_id in range(n_faces):
        prop_vals = {}
        if is_binary:
            for prop in face_props:
                format = formats[prop[1]]
                val = struct.unpack(format[0], f.read(format[1]))[0]
                if prop[0] == 'n_corners':
                    if val != face_n_corners:
                        print('Error: Only triangular faces are supported.')
                        print(('Number of face corners: ' + str(val)))
                        exit(-1)
                else:
                    prop_vals[prop[0]] = val
        else:
            elems = f.readline().rstrip('\n').rstrip('\r').split()
            for prop_id, prop in enumerate(face_props):
                if prop[0] == 'n_corners':
                    if int(elems[prop_id]) != face_n_corners:
                        print('Error: Only triangular faces are supported.')
                        print(('Number of face corners: ' + str(int(elems[prop_id]))))
                        exit(-1)
                else:
                    prop_vals[prop[0]] = elems[prop_id]

        model['faces'][face_id, 0] = int(prop_vals['ind_0'])
        model['faces'][face_id, 1] = int(prop_vals['ind_1'])
        model['faces'][face_id, 2] = int(prop_vals['ind_2'])

    f.close()

    return model


================================================
FILE: tools/eval.py
================================================
#CERTIFICATED
import torch
import numpy as np 
import tensorboard
from pathlib import Path
import json
import random
import re
import torch.backends.cudnn as cudnn
import torch.multiprocessing as mp
import time
import fire
import torch.distributed as dist
import os
from collections import defaultdict
import ast 
import flow_vis
import copy

from utils.progress_bar import ProgressBar
from utils.log_tool import SimpleModelLog
from data.preprocess import merge_batch, get_dataloader, get_dataloader_deepim # merge_second_batch_multigpu
from utils.config_io import merge_cfg, save_cfg
import torchplus
from builder import (
    dataset_builder,
    input_reader_builder,
    lr_scheduler_builder,
    optimizer_builder,
    rnnpose_builder
)
from utils.distributed_utils import dist_init, average_gradients, DistModule, ParallelWrapper, DistributedSequatialSampler, DistributedGivenIterationSampler, DistributedGivenIterationSamplerEpoch 
from utils.util import modify_parameter_name_with_map
from utils.eval_metric import *
from config.default import get_cfg


GLOBAL_GPUS_PER_DEVICE = 1  
GLOBAL_STEP = 0
RANK=-1
WORLD_SIZE=-1


def load_example_to_device(example,
                             device=None) -> dict:
    example_torch = {}

    for k, v in example.items():  
        if k in ['idx', 'class_name']:
            example_torch[k]=v
            continue

        if type(v) == list:
            example_torch[k] = [item.to(device=device) for item in v]
        else:
            example_torch[k] = v.to(device=device)

    return example_torch
def build_network(model_cfg, measure_time=False, testing=False):
    net = rnnpose_builder.build(
        model_cfg, measure_time=measure_time, testing=testing)
    return net


def _worker_init_fn(worker_id):
    global GLOBAL_STEP
    time_seed = GLOBAL_STEP
    np.random.seed(time_seed + worker_id)
    print(f"WORKER {worker_id} seed:", np.random.get_state()[1][0])


def freeze_params(params: dict, include: str = None, exclude: str = None):
    assert isinstance(params, dict)
    include_re = None
    if include is not None:
        include_re = re.compile(include)
    exclude_re = None
    if exclude is not None:
        exclude_re = re.compile(exclude)
    remain_params = []
    for k, p in params.items():
        if include_re is not None:
            if include_re.match(k) is not None:
                continue
        if exclude_re is not None:
            if exclude_re.match(k) is None:
                continue
        remain_params.append(p)
    return remain_params


def freeze_params_v2(params: dict, include: str = None, exclude: str = None):
    assert isinstance(params, dict)
    include_re = None
    if include is not None:
        include_re = re.compile(include)
    exclude_re = None
    if exclude is not None:
        exclude_re = re.compile(exclude)
    for k, p in params.items():
        if include_re is not None:
            if include_re.match(k) is not None:
                p.requires_grad = False
        if exclude_re is not None:
            if exclude_re.match(k) is None:
                p.requires_grad = False


def filter_param_dict(state_dict: dict, include: str = None, exclude: str = None):
    assert isinstance(state_dict, dict)
    include_re = None
    if include is not None:
        include_re = re.compile(include)
    exclude_re = None
    if exclude is not None:
        exclude_re = re.compile(exclude)
    res_dict = {}
    for k, p in state_dict.items():
        if include_re is not None:
            if include_re.match(k) is None:
                continue
        if exclude_re is not None:
            if exclude_re.match(k) is not None:
                continue
        res_dict[k] = p
    return res_dict

def chk_rank(rank_, use_dist=False):
    if not use_dist:
        return True
    global RANK
    if RANK<0:
        RANK=dist.get_rank()
    cur_rank = RANK#dist.get_rank()
    # self.world_size = dist.get_world_size()
    return cur_rank == rank_

def get_rank(use_dist=False):
    if not use_dist:
        return 0
    else:
        # return dist.get_rank()
        global RANK 
        if RANK<0:
            RANK=dist.get_rank()
        return RANK 

def get_world(use_dist):
    if not use_dist:
        return 1
    else:
        global WORLD_SIZE 
        if WORLD_SIZE<0:
            WORLD_SIZE=dist.get_world_size()
        return WORLD_SIZE #dist.get_world_size()
def get_ngpus_per_node():
    global GLOBAL_GPUS_PER_DEVICE
    return GLOBAL_GPUS_PER_DEVICE


def multi_proc_train(
          config_path,
          model_dir,
          use_apex,
          world_size,
          result_path=None,
          create_folder=False,
          display_step=50,
          summary_step=5,
          pretrained_path=None,
          pretrained_include=None,
          pretrained_exclude=None,
          pretrained_param_map=None,
          freeze_include=None,
          freeze_exclude=None,
          measure_time=False,
          resume=False,
          use_dist=False,
          gpus_per_node=1,
          start_gpu_id=0,
          optim_eval=False,
          seed=7,
          dist_port="23335",
         force_resume_step=None,
         batch_size=None,
         apex_opt_level='O0'
          ):
    
    params = {
          "config_path": config_path,
          "model_dir": model_dir,
          "use_apex": use_apex,
          "result_path": result_path,
          "create_folder": create_folder,
          "display_step": display_step,
          "summary_step": summary_step,
          "pretrained_path": pretrained_path,
          "pretrained_include": pretrained_include,
          "pretrained_exclude": pretrained_exclude,
          "pretrained_param_map": pretrained_param_map,
          "freeze_include": freeze_include,
          "freeze_exclude": freeze_exclude,
        #   "multi_gpu": multi_gpu,
          "measure_time": measure_time,
          "resume": resume,
          "use_dist": use_dist,
          "gpus_per_node": gpus_per_node,
          "optim_eval": optim_eval,
          "seed": seed,
          "dist_port": dist_port,
          "world_size": world_size,
          "force_resume_step":force_resume_step,
          "batch_size": batch_size,
          "apex_opt_level":apex_opt_level
    }
    from types import SimpleNamespace 
    params = SimpleNamespace(**params)

    os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(
        str(x) for x in range(start_gpu_id, start_gpu_id+gpus_per_node))
    print(f"CUDA_VISIBLE_DEVICES={os.environ['CUDA_VISIBLE_DEVICES']}"  )

    mp.spawn(train_worker, nprocs=gpus_per_node,
                args=( params,) )

def train_worker(rank, params):
    global RANK, WORLD_SIZE
    RANK = rank
    WORLD_SIZE=params.world_size
    
    eval(config_path=params.config_path,
          model_dir=params.model_dir,
          use_apex=params.use_apex,
          result_path=params.result_path,
          create_folder=params.create_folder,
          display_step=params.display_step,
          pretrained_path=params.pretrained_path,
          pretrained_include=params.pretrained_include,
          pretrained_exclude=params.pretrained_exclude,
          pretrained_param_map=params.pretrained_param_map,
          freeze_include=params.freeze_include,
          freeze_exclude=params.freeze_exclude,
          measure_time=params.measure_time,
          resume=params.resume,
          use_dist=params.use_dist,
          dist_port=params.dist_port,
          gpus_per_node=params.gpus_per_node,
          optim_eval=params.optim_eval,
          seed=params.seed,
          force_resume_step=params.force_resume_step,
          batch_size = params.batch_size,
          apex_opt_level=params.apex_opt_level
          ) 


def eval(
         config_path,
          model_dir,
          use_apex,
          result_path=None,
          create_folder=False,
          display_step=50,
          summary_step=5,
          pretrained_path=None,
          pretrained_include=None,
          pretrained_exclude=None,
          pretrained_param_map=None,
          freeze_include=None,
          freeze_exclude=None,
          multi_gpu=False,
          measure_time=False,
          resume=False,
          use_dist=False,
          dist_port="23335",
          gpus_per_node=1,
          optim_eval=False,
          seed=7,
          force_resume_step=None,
          batch_size=None,
          apex_opt_level='O0',
          verbose=False
          ):
    """train a VoxelNet model specified by a config file.
    """

    print("force_resume_step:", force_resume_step)
    print("torch.cuda.is_available()=", torch.cuda.is_available())
    print("torch.version.cuda=",torch.version.cuda) 
    dist_url=f"tcp://127.0.0.1:{dist_port}"
    print(f"dist_url={dist_url}", flush=True)
    global RANK, WORLD_SIZE
    # RANK, WORLD_SIZE=rank, world_size
    if RANK<0:
        RANK=0
    if WORLD_SIZE<0:
        WORLD_SIZE=1

    global GLOBAL_GPUS_PER_DEVICE
    GLOBAL_GPUS_PER_DEVICE = gpus_per_node

  
    ######################################## initialize the distributed env #########################################
    if use_dist:
        if use_apex:
            dist.init_process_group(
                backend="nccl", init_method=dist_url, world_size=get_world(use_dist), rank=get_rank(use_dist))
        else:
            # rank, world_size = dist_init(str(dist_port))
            dist.init_process_group(
                backend="nccl", init_method=dist_url, world_size=get_world(use_dist), rank=get_rank(use_dist))
    
    print(get_rank(use_dist)%GLOBAL_GPUS_PER_DEVICE, flush=True)
    #set cuda device number
    torch.cuda.set_device(get_rank(use_dist)%GLOBAL_GPUS_PER_DEVICE)

    ############################################ create folders ############################################
    print(f"Set seed={seed}", flush=True)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    model_dir = str(Path(model_dir).resolve())
    model_dir = Path(model_dir)
    if chk_rank(0, use_dist):
        if not resume and model_dir.exists():
            raise ValueError("model dir exists and you don't specify resume.")
            print("Warning: model dir exists and you don't specify resume.")

        model_dir.mkdir(parents=True, exist_ok=True)
    if result_path is None:
        result_path = model_dir / 'results'
    config_file_bkp = "pipeline.config"

    ############################################# read config proto ############################################
    config = merge_cfg(
        [config_path], intersection=True)
    if chk_rank(0, use_dist):
        print(json.dumps(config, indent=4))

    if chk_rank(0, use_dist):
        # save_cfg([default_config_path, custom_config_path],
        save_cfg([config_path, config_path],
                 str(model_dir / config_file_bkp))

    #update the global config object
    get_cfg().merge(config.get("BASIC",{}),"BASIC" )  

    input_cfg = config.train_input_reader
    eval_input_cfg = config.eval_input_reader
    model_cfg = config.model
    train_cfg = config.train_config
    optimizer_cfg = train_cfg.optimizer
    loss_scale = train_cfg.loss_scale_factor


    ############################################# Update default options ############################################

    if batch_size is not None:
        input_cfg.batch_size = batch_size 
        eval_input_cfg.batch_size = batch_size
    print(input_cfg.batch_size)
    
    ############################################# build network, optimizer etc. ############################################
    #dummy dataset
    dataset_tmp = input_reader_builder.build(
        eval_input_cfg,
        training=False,
    )
 
    model_cfg.obj_seqs = copy.copy(dataset_tmp._dataset.infos["seqs"])

    net = build_network(model_cfg, measure_time)  # .to(device)
    net.cuda()

    fastai_optimizer = optimizer_builder.build(
        optimizer_cfg, net,
        mixed=False,
        loss_scale=loss_scale)

    print("# parameters:", len(list(net.parameters())))

    ############################################# load pretrained model ############################################
    if pretrained_path is not None:
        model_dict = net.state_dict()
        pretrained_dict = torch.load(pretrained_path, map_location='cpu')

        if verbose:
            print("Pretrained keys:", pretrained_dict.keys())
            print("Model keys:", model_dict.keys())

        pretrained_dict = filter_param_dict(
            pretrained_dict, pretrained_include, pretrained_exclude)

        pretrained_dict = modify_parameter_name_with_map(
            pretrained_dict, ast.literal_eval(str(pretrained_param_map)))
        new_pretrained_dict = {}
        
        for k, v in pretrained_dict.items():
            if k in model_dict and v.shape == model_dict[k].shape:
                new_pretrained_dict[k] = v
            else:
                print("Fail to load:", k )

        model_dict.update(new_pretrained_dict)
        net.load_state_dict(model_dict)
        freeze_params_v2(dict(net.named_parameters()),
                         freeze_include, freeze_exclude)
        net.clear_global_step()
        # net.clear_metrics()
        del pretrained_dict
    else:
    ############################################# try to resume from the latest chkpt ############################################
        torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    # torchplus.train.try_restore_latest_checkpoints(model_dir,
    #                                                [fastai_optimizer])

    ######################################## parallel the network  #########################################
    if use_dist:
        if use_apex:
            import apex
            net, amp_optimizer = apex.amp.initialize(net.cuda(
            ), fastai_optimizer, opt_level="O0", keep_batchnorm_fp32=None, loss_scale=None)
            net_parallel = apex.parallel.DistributedDataParallel(net)
        else:
            # net_parallel = ParallelWrapper(net.cuda(), 'dist')
            # amp_optimizer = fastai_optimizer
            amp_optimizer = optimizer_builder.build(
                optimizer_cfg, net,
                mixed=False,
                loss_scale=loss_scale)
            net_parallel = torch.nn.parallel.DistributedDataParallel(net, device_ids=[get_rank(use_dist)], output_device=get_rank(use_dist) ,find_unused_parameters=True)
    else:
        net_parallel = net.cuda()

    ############################################# build lr_scheduler ############################################
    lr_scheduler = lr_scheduler_builder.build(optimizer_cfg, amp_optimizer,
                                              train_cfg.steps)

    float_dtype = torch.float32
    ######################################## build dataloaders #########################################
    if use_dist:
        num_gpu = 1
        collate_fn = merge_batch
    else:
        raise NotImplementedError

    print(f"MULTI-GPU: using {num_gpu} GPU(s)")

        
    ######################
    # PREPARE INPUT
    ######################
    dataset = input_reader_builder.build(
        input_cfg,
        training=True,
    )
    eval_dataset = input_reader_builder.build(
        eval_input_cfg,
        training=False,
    )


    if use_dist:
        train_sampler = DistributedGivenIterationSamplerEpoch(
            dataset, train_cfg.steps, input_cfg.batch_size, last_iter=net.get_global_step()-1, review_cycle=-1)
        # train_sampler=DistributedSequatialSampler(dataset)
        shuffle = False
        eval_sampler = DistributedSequatialSampler(eval_dataset)

    else:
        train_sampler = None
        eval_sampler = None
        eval_train_sampler = None
        shuffle = True

    dataloader, neighborhood_limits=get_dataloader_deepim(dataset=dataset,
                                                kpconv_config=model_cfg.descriptor_net.keypoints_detector_3d,
                                                batch_size=input_cfg.batch_size * num_gpu,
                                                shuffle=shuffle,
                                                num_workers= 2 ,#input_cfg.preprocess.num_workers * num_gpu,
                                                sampler=train_sampler 
                                        )
    eval_dataloader, _ =get_dataloader_deepim(dataset=eval_dataset,
                                                kpconv_config=model_cfg.descriptor_net.keypoints_detector_3d,
                                    batch_size=eval_input_cfg.batch_size,
                                    shuffle=False,
                                    num_workers= 2, #eval_input_cfg.preprocess.num_workers,
                                    sampler=eval_sampler,
                                    neighborhood_limits=neighborhood_limits
                                   )

    #########################################################################################
    #                                            TRAINING
    ##########################################################################################
    model_logging = SimpleModelLog(model_dir, disable=get_rank(use_dist) != 0)
    model_logging.open()
    start_step = net.get_global_step()
    total_step = train_cfg.steps
    t = time.time()
    steps_per_eval = train_cfg.steps_per_eval

    amp_optimizer.zero_grad()
    step = start_step
    epoch = 0
    net_parallel.eval()

    classes=list(set(eval_dataset.dataset.infos['seqs']))
    
    evaluator = dict([(c,LineMODEvaluator(f"{c}",result_path) ) for c in classes ] )

    ang_errs=[]
    trans_errs=[]
    try:
        for example in eval_dataloader:
            global GLOBAL_STEP
            GLOBAL_STEP = step


            lr_scheduler.step(net.get_global_step())
            example_torch=load_example_to_device(example, device=torch.device("cuda"))

            batch_size = example["image"].shape[0]
            with torch.no_grad():
                # t1=time.time()
                ret_dict = net_parallel(example_torch)
                # print("time:", time.time()-t1)

            eval_results=evaluator[example['class_name'][0]].evaluate_rnnpose(ret_dict, example_torch )
            ang_errs.append(eval_results['ang_err'])
            trans_errs.append(eval_results['trans_err'])

            if step%10 ==0:
                model_logging.log_metrics({
                    "ang_err": float(ang_errs[-1]), 
                    "trans_err": float(trans_errs[-1]), 
                }, GLOBAL_STEP)

                model_logging.log_images(
                    {
                        "pc_proj_vis": eval_results['pc_proj_vis'].transpose([2,0,1])[None], # HWC->NCHW
                        "pc_proj_vis_pred": eval_results['pc_proj_vis_pred'].transpose([2,0,1])[None], # HWC->NCHW
                        "syn_img": torch.cat(ret_dict["syn_img"], dim=0).detach().cpu(),
                        # "image": example["image"].cpu(),
                        # "image_f": (example["image"].cpu()+ret_dict["syn_img"][0].detach().cpu())/2,
                    }, GLOBAL_STEP, prefix="")


            net.update_global_step()
            metrics = defaultdict(dict)
            GLOBAL_STEP = net.get_global_step()

            step += 1


        for k in evaluator:
            print(f"###############Evaluation results of class {k}###############")
            evaluator[k].summarize()
                
    except Exception as e:
        model_logging.log_text(str(e), step)
        raise e
    finally:
        model_logging.close()


if __name__ == '__main__':
    fire.Fire()


================================================
FILE: tools/generate_data_info_deepim_0_orig.py
================================================
import os
import numpy as np
import copy
import pickle
import fire
import glob
import re
from data.linemod import linemod_config


def parse_pose_file(file):
    with open(file) as f:
        lines = [line.strip() for line in f.readlines()]

    poses = []
    for line in lines:
        poses.append(
            np.array([np.float32(l) for l in line.split()],
                     dtype=np.float32).reshape((3, 4))
        )
    return poses


def parse_calib_file(file):
    info = {}
    with open(file) as f:
        lines = [line.strip() for line in f.readlines()]
    for i, l in enumerate(lines):
        nums = np.array([np.float32(x)
                         for x in l.split(' ')[1:]], dtype=np.float32)
        if i < 4:
            info[f"calib/P{i}"] = nums.reshape((3, 4))
        else:
            info[f"calib/Tr_velo_to_cam"] = nums.reshape((3, 4))
    return info


# def create_data_info(data_root, saving_path, is_test_data=False):
# def create_data_info(data_root, saving_path, data_type='train'):
# def create_data_info(data_root, saving_path, training_data_ratio=0.8, shuffle=True, ):
def create_data_info(data_root, saving_path, with_assertion=True):
    """[summary]
        info structure:
        {
            0:[
                {
                "index": idx,
                "lidar_bin_path": lidar_bin_paths[idx],
                "RT": poses[idx],
                "K": poses[idx],
                },
                {
                "index": idx,
                "lidar_bin_path": lidar_bin_paths[idx],
                "RT": poses[idx],
                "K": poses[idx],
               
                },
            }
            ...
            ],
            1:[

            ]
            ...
        }

    """
    idx2class = {
        1: "ape",
        2: "benchvise",
        # 3: 'bowl',
        4: "camera",
        5: "can",
        6: "cat",
        # 7: 'cup',
        8: "driller",
        9: "duck",
        10: "eggbox",
        11: "glue",
        12: "holepuncher",
        13: "iron",
        14: "lamp",
        15: "phone",
    }
    class2idx = dict([[idx2class[k],k ] for k in idx2class.keys() ])

    seqs=class2idx.keys()

    observed_dir = os.path.join('', 'data/observed')
    gt_observed_dir = os.path.join('', 'data/gt_observed')
    rendered_dir = os.path.join('', 'data/rendered')
    set_split_dir = os.path.join('','image_set/observed')

    # max_items_per_seq=10000#8000#100#10000#2000
    # create training data
    res = {}
   
    for seq in seqs:
        res[seq] = []

        rgb_orig_dir = os.path.join(observed_dir, f"{class2idx[seq]:02d}")        
        rgb_noisy_rendered_dir = os.path.join(rendered_dir, seq )        

        depth_orig_dir= os.path.join(observed_dir, f"{class2idx[seq]:02d}")    
        depth_rendered_dir= os.path.join(gt_observed_dir, seq)    
        depth_noisy_rendered_dir= os.path.join(rendered_dir, seq)    

        gt_pose_dir = os.path.join(gt_observed_dir, seq)
        noisy_pose_dir = os.path.join(rendered_dir, seq)

        label_dir = os.path.join(observed_dir,  f"{class2idx[seq]:02d}" )


        rgb_orig_paths = glob.glob(r'{}/*color.png'.format(rgb_orig_dir) ) 
        train_split_file=os.path.join(data_root,set_split_dir, f"{seq}_train.txt")
        
        with open(train_split_file, 'r') as f:
            train_split = f.readlines()
            train_split = [ int(t.split('/')[-1] ) for t in train_split]

        rgb_orig_paths.sort(key=lambda s: int(re.split( '\.|_|-' ,os.path.basename(s))[0]) )


        NUM_RENDERED=10
        
        for idx in train_split:
            
            #original data paths
            gt_pose=np.loadtxt(os.path.join(data_root,gt_pose_dir, f"{idx:06d}-pose.txt"), skiprows=1).reshape(3,4)
            rgb_orig = os.path.join(rgb_orig_dir, f"{idx:06d}-color.png" )
            depth_orig=os.path.join(depth_orig_dir, f"{idx:06d}-depth.png" )
            depth_rendered = os.path.join(depth_rendered_dir, f"{idx:06d}-depth.png")
            label_orig=os.path.join(label_dir, f"{idx:06d}-label.png" )

            #rendered data paths
            rgb_noisy_rendered = [os.path.join(rgb_noisy_rendered_dir, f"{idx:06d}_{i}-color.png" ) for i in range(NUM_RENDERED) ]
            depth_noisy_rendered = [os.path.join(depth_noisy_rendered_dir, f"{idx:06d}_{i}-depth.png" ) for i in range(NUM_RENDERED) ]

            pose_noisy_rendered = [os.path.join(data_root, noisy_pose_dir, f"{idx:06d}_{i}-pose.txt" ) for i in range(NUM_RENDERED) ]
            pose_noisy_rendered = [np.loadtxt(p, skiprows=1).reshape(3,4) for p in pose_noisy_rendered ]

            #generate data pairs

            for noisy_data_idx in range(NUM_RENDERED):
                if with_assertion:
                    assert os.path.exists(os.path.join(data_root, rgb_orig) ), os.path.join(data_root, rgb_orig) 
                    assert os.path.exists(os.path.join(data_root, depth_orig) ), os.path.join(data_root, depth_orig) 
                    assert os.path.exists(os.path.join(data_root, label_orig) ), os.path.join(data_root, label_orig) 
                    assert os.path.exists(os.path.join(data_root, rgb_noisy_rendered[noisy_data_idx]) ), os.path.join(data_root, rgb_noisy_rendered[noisy_data_idx]) 
                    assert os.path.exists(os.path.join(data_root, depth_noisy_rendered[noisy_data_idx] ) ), os.path.join(data_root, depth_noisy_rendered[noisy_data_idx] ) 

                info = {
                    "index": idx,
                    # "rgb_orig_path": rgb_orig,
                    "rgb_observed_path": rgb_orig,
                    "depth_observed_path": depth_orig,
                    "depth_gt_observed_path": depth_rendered,
                    "gt_pose": gt_pose,

                    "rgb_noisy_rendered": rgb_noisy_rendered[noisy_data_idx],
                    "depth_noisy_rendered": depth_noisy_rendered[noisy_data_idx],
                    "pose_noisy_rendered": pose_noisy_rendered[noisy_data_idx],

                    "model_points_path": f"{seq}.bin",
                    #legacy
                    "RT": gt_pose,
                    "K": linemod_config.linemod_K,
                }
                res[seq].append(info)

                print(info['rgb_observed_path'], info['rgb_noisy_rendered'])
    
    train_saving_path=saving_path+'.train'
    with open(train_saving_path, 'wb+') as f:
        print("Total data amount:", np.sum([len(res[r]) for r in res]))
        pickle.dump(res, f)

    # eval_saving_path=saving_path+'.eval'
    # with open(eval_saving_path, 'wb+') as f:
    #     print("Total data amount:", np.sum([len(test_res[r]) for r in test_res]))
    #     pickle.dump(test_res, f)


if __name__ == '__main__':
    fire.Fire()


================================================
FILE: tools/generate_data_info_deepim_1_syn.py
================================================
import os
import numpy as np
import copy
import pickle
import fire
import glob
import re
from data.linemod import linemod_config


def parse_pose_file(file):
    with open(file) as f:
        lines = [line.strip() for line in f.readlines()]

    poses = []
    for line in lines:
        poses.append(
            np.array([np.float32(l) for l in line.split()],
                     dtype=np.float32).reshape((3, 4))
        )
    return poses


def parse_calib_file(file):
    info = {}
    with open(file) as f:
        lines = [line.strip() for line in f.readlines()]
    for i, l in enumerate(lines):
        nums = np.array([np.float32(x)
                         for x in l.split(' ')[1:]], dtype=np.float32)
        if i < 4:
            info[f"calib/P{i}"] = nums.reshape((3, 4))
        else:
            info[f"calib/Tr_velo_to_cam"] = nums.reshape((3, 4))
    return info


def create_data_info(data_root, saving_path, with_assertion=False ):
    """[summary]
        info structure:
        {
            0:[
                {
                "index": idx,
                "lidar_bin_path": lidar_bin_paths[idx],
                "RT": poses[idx],
                "K": poses[idx],
                },
                {
                "index": idx,
                "lidar_bin_path": lidar_bin_paths[idx],
                "RT": poses[idx],
                "K": poses[idx],
               
                },
            }
            ...
            ],
            1:[

            ]
            ...
        }

    """


    idx2class = {
        1: "ape",
        2: "benchvise",
        # 3: 'bowl',
        4: "camera",
        5: "can",
        6: "cat",
        # 7: 'cup',
        8: "driller",
        9: "duck",
        10: "eggbox",
        11: "glue",
        12: "holepuncher",
        13: "iron",
        14: "lamp",
        15: "phone",
    }
    class2idx = dict([[idx2class[k],k ] for k in idx2class.keys() ])

    seqs=class2idx.keys()

    observed_dir = os.path.join('', 'data/observed')
    gt_observed_dir = os.path.join('', 'data/gt_observed')
    rendered_dir = os.path.join('', 'data/rendered')
    set_split_dir = os.path.join('','image_set/observed')

    # create training data
    res = {}
    test_res = {}
    for seq in seqs:
        res[seq] = []

        # rgb_orig_dir = os.path.join(observed_dir, f"{class2idx[seq]:02d}")        
        rgb_orig_dir = os.path.join(observed_dir, seq)        
        rgb_noisy_rendered_dir = os.path.join(rendered_dir, seq )        

        # depth_orig_dir= os.path.join(observed_dir, f"{class2idx[seq]:02d}")    
        depth_orig_dir= os.path.join(observed_dir, seq)    

        # depth_renderd_dir= os.path.join(gt_observed_dir, seq)    
        depth_rendered_dir= os.path.join(gt_observed_dir, seq)    
        depth_noisy_rendered_dir= os.path.join(rendered_dir, seq)    

        gt_pose_dir = os.path.join(gt_observed_dir, seq)
        noisy_pose_dir = os.path.join(rendered_dir, seq)

        label_dir = os.path.join(observed_dir, seq)


        rgb_orig_paths = glob.glob(r'{}/*color.png'.format(rgb_orig_dir) ) 

        # train_split_file=os.path.join(data_root,set_split_dir, f"{seq}_train.txt")
        train_split_file=os.path.join(data_root,set_split_dir, f"LM6d_data_syn_train_observed_{seq}.txt")

        with open(train_split_file, 'r') as f:
            train_split = f.readlines()
            train_split = [ int(t.split('/')[-1] ) for t in train_split]


        rgb_orig_paths.sort(key=lambda s: int(re.split( '\.|_|-' ,os.path.basename(s))[0]) )


        # data_num=len(image_paths[:max_items_per_seq] ) 
        # if shuffle: 
            
        #     permute=np.random.permutation(data_num)
        # else:
        #     permute = np.arange(data_num)
        # train_split=permute[:int(data_num*training_data_ratio)]
        # eval_split= permute[int(data_num*training_data_ratio):]

        # for idx in range(len(image_paths[:max_items_per_seq])):
        NUM_RENDERED=1
        for idx in train_split:
            
            #original data paths
            gt_pose=np.loadtxt(os.path.join(data_root,gt_pose_dir, f"{idx:06d}-pose.txt"), skiprows=1).reshape(3,4)
            rgb_orig = os.path.join(rgb_orig_dir, f"{idx:06d}-color.png" )
            depth_orig=os.path.join(depth_orig_dir, f"{idx:06d}-depth.png" )
            label_orig=os.path.join(label_dir, f"{idx:06d}-label.png" )

            depth_rendered = os.path.join(depth_rendered_dir, f"{idx:06d}-depth.png")

           
            #rendered data paths
            rgb_noisy_rendered = [os.path.join(rgb_noisy_rendered_dir, f"{seq}_{idx:06d}_{i}-color.png" ) for i in range(NUM_RENDERED) ]
            depth_noisy_rendered = [os.path.join(depth_noisy_rendered_dir, f"{seq}_{idx:06d}_{i}-depth.png" ) for i in range(NUM_RENDERED) ]

            pose_noisy_rendered = [os.path.join(data_root, noisy_pose_dir, f"{seq}_{idx:06d}_{i}-pose.txt" ) for i in range(NUM_RENDERED) ]
            pose_noisy_rendered = [np.loadtxt(p, skiprows=1).reshape(3,4) for p in pose_noisy_rendered ]

            #generate data pairs

            for noisy_data_idx in range(NUM_RENDERED):
                if with_assertion:
                    assert os.path.exists(os.path.join(data_root, rgb_orig) ), os.path.join(data_root, rgb_orig) 
                    assert os.path.exists(os.path.join(data_root, depth_orig) ), os.path.join(data_root, depth_orig) 
                    assert os.path.exists(os.path.join(data_root, label_orig) ), os.path.join(data_root, label_orig)
                    assert os.path.exists(os.path.join(data_root, rgb_noisy_rendered[noisy_data_idx]) ), os.path.join(data_root, rgb_noisy_rendered[noisy_data_idx])
                    assert os.path.exists(os.path.join(data_root, depth_noisy_rendered[noisy_data_idx] ) ), os.path.join(data_root, depth_noisy_rendered[noisy_data_idx] )

                    assert os.path.exists(os.path.join(data_root, depth_rendered) ), os.path.join(data_root, depth_rendered) 
                    # assert os.path.exists(os.path.join(data_root, pose_noisy_rendered[noisy_data_idx] ) ), os.path.join(data_root, pose_noisy_rendered[noisy_data_idx] )

                info = {
                     "index": idx,
                    # "rgb_orig_path": rgb_orig,
                    "rgb_observed_path": rgb_orig,
                    "depth_observed_path": depth_orig,
                    "depth_gt_observed_path": depth_rendered,
                    "gt_pose": gt_pose,

                    "rgb_noisy_rendered": rgb_noisy_rendered[noisy_data_idx],
                    "depth_noisy_rendered": depth_noisy_rendered[noisy_data_idx],
                    "pose_noisy_rendered": pose_noisy_rendered[noisy_data_idx],

                    "model_points_path": f"{seq}.bin",
                    #legacy
                    "RT": gt_pose,
                    "K": linemod_config.linemod_K,
                }

            print(info['rgb_observed_path'], info['rgb_noisy_rendered'])

            res[seq].append(info)


    train_saving_path=saving_path+'.train'
    with open(train_saving_path, 'wb+') as f:
        print("Total data amount:", np.sum([len(res[r]) for r in res]))
        pickle.dump(res, f)

    # eval_saving_path=saving_path+'.eval'
    # with open(eval_saving_path, 'wb+') as f:
    #     print("Total data amount:", np.sum([len(test_res[r]) for r in test_res]))
    #     pickle.dump(test_res, f)


if __name__ == '__main__':
    fire.Fire()


================================================
FILE: tools/generate_data_info_deepim_2_posecnnval.py
================================================
import os
import numpy as np
import copy
import pickle
import fire
import glob
import re
from data.linemod import linemod_config
import scipy.io as sio


def parse_pose_file(file):
    with open(file) as f:
        lines = [line.strip() for line in f.readlines()]

    poses = []
    for line in lines:
        poses.append(
            np.array([np.float32(l) for l in line.split()],
                     dtype=np.float32).reshape((3, 4))
        )
    return poses


def parse_calib_file(file):
    info = {}
    with open(file) as f:
        lines = [line.strip() for line in f.readlines()]
    for i, l in enumerate(lines):
        nums = np.array([np.float32(x)
                         for x in l.split(' ')[1:]], dtype=np.float32)
        if i < 4:
            info[f"calib/P{i}"] = nums.reshape((3, 4))
        else:
            info[f"calib/Tr_velo_to_cam"] = nums.reshape((3, 4))
    return info


def create_data_info(data_root, saving_path, with_assertion=True):
    """[summary]
        info structure:
        {
            0:[
                {
                "index": idx,
                "lidar_bin_path": lidar_bin_paths[idx],
                "RT": poses[idx],
                "K": poses[idx],
                },
                {
                "index": idx,
                "lidar_bin_path": lidar_bin_paths[idx],
                "RT": poses[idx],
                "K": poses[idx],
               
                },
            }
            ...
            ],
            1:[

            ]
            ...
        }

    """
    # seqs=['cat', 'ape', 'camera', 'duck', 'glue', 'iron', 'phone','benchvise', 'can', 'driller', 'eggbox', 'holepuncher', 'lamp']
    idx2class = {
        1: "ape",
        2: "benchvise",
        # 3: 'bowl',
        4: "camera",
        5: "can",
        6: "cat",
        # 7: 'cup',
        8: "driller",
        9: "duck",
        10: "eggbox",
        11: "glue",
        12: "holepuncher",
        13: "iron",
        14: "lamp",
        15: "phone",
    }
    class2idx = dict([[idx2class[k],k ] for k in idx2class.keys() ])

    seqs=class2idx.keys()

    observed_dir = os.path.join('', 'data/observed')
    gt_observed_dir = os.path.join('', 'data/gt_observed')
    # rendered_dir = os.path.join('', 'data/rendered')
    rendered_dir = os.path.join('', 'data/rendered_val_PoseCNN')
    set_split_dir = os.path.join('','image_set/observed')

    # max_items_per_seq=10000#8000#100#10000#2000
    # create training data
    res = {}
   
    for seq in seqs:
        res[seq] = []

        rgb_orig_dir = os.path.join(observed_dir, f"{class2idx[seq]:02d}")        
        rgb_noisy_rendered_dir = os.path.join(rendered_dir,  f"{class2idx[seq]:02d}", seq )        

        depth_orig_dir= os.path.join(observed_dir, f"{class2idx[seq]:02d}")    
        depth_rendered_dir= os.path.join(gt_observed_dir, seq)    
        depth_noisy_rendered_dir= os.path.join(rendered_dir, f"{class2idx[seq]:02d}", seq)    

        gt_pose_dir = os.path.join(gt_observed_dir, seq)
        noisy_pose_dir = os.path.join(rendered_dir,  f"{class2idx[seq]:02d}" ,seq)

        label_dir = os.path.join(observed_dir,  f"{class2idx[seq]:02d}" )
        meta_dir = os.path.join(observed_dir,  f"{class2idx[seq]:02d}" )


        rgb_orig_paths = glob.glob(r'{}/*color.png'.format(rgb_orig_dir) ) 
        test_split_file=os.path.join(data_root,set_split_dir, f"{seq}_test.txt")
        
        with open(test_split_file, 'r') as f:
            test_split = f.readlines()
            test_split = [ int(t.split('/')[-1] ) for t in test_split]

        rgb_orig_paths.sort(key=lambda s: int(re.split( '\.|_|-' ,os.path.basename(s))[0]) )


        # for idx in range(len(image_paths[:max_items_per_seq])):
        NUM_RENDERED=1
        
        for idx in test_split:
            
            #original data paths
            gt_pose=np.loadtxt(os.path.join(data_root,gt_pose_dir, f"{idx:06d}-pose.txt"), skiprows=1).reshape(3,4)
            rgb_orig = os.path.join(rgb_orig_dir, f"{idx:06d}-color.png" )
            depth_orig=os.path.join(depth_orig_dir, f"{idx:06d}-depth.png" )
            depth_rendered = os.path.join(depth_rendered_dir, f"{idx:06d}-depth.png")
            label_orig=os.path.join(label_dir, f"{idx:06d}-label.png" )
            meta_path=os.path.join(meta_dir, f"{idx:06d}-meta.mat")

            #rendered data paths
            rgb_noisy_rendered = [os.path.join(rgb_noisy_rendered_dir, f"{seq}_{idx:06d}_{i}-color.png" ) for i in range(NUM_RENDERED) ]
            depth_noisy_rendered = [os.path.join(depth_noisy_rendered_dir, f"{seq}_{idx:06d}_{i}-depth.png" ) for i in range(NUM_RENDERED) ]

            pose_noisy_rendered = [os.path.join(data_root, noisy_pose_dir, f"{seq}_{idx:06d}_{i}-pose.txt" ) for i in range(NUM_RENDERED) ]
            pose_noisy_rendered = [np.loadtxt(p, skiprows=1).reshape(3,4) for p in pose_noisy_rendered ]

            # meta_data=sio.loadmat(meta_path)
            # assert meta_data["boxes"].shape[0] == 1
            #generate data pairs
            for noisy_data_idx in range(NUM_RENDERED):
                if with_assertion:
                    assert os.path.exists(os.path.join(data_root, rgb_orig) ), os.path.join(data_root, rgb_orig) 
                    assert os.path.exists(os.path.join(data_root, depth_orig) ), os.path.join(data_root, depth_orig) 
                    assert os.path.exists(os.path.join(data_root, label_orig) ), os.path.join(data_root, label_orig) 
                    assert os.path.exists(os.path.join(data_root, rgb_noisy_rendered[noisy_data_idx]) ), os.path.join(data_root, rgb_noisy_rendered[noisy_data_idx]) 
                    assert os.path.exists(os.path.join(data_root, depth_noisy_rendered[noisy_data_idx] ) ), os.path.join(data_root, depth_noisy_rendered[noisy_data_idx] ) 

                info = {
                    "index": idx,
                    # "rgb_orig_path": rgb_orig,
                    "rgb_observed_path": rgb_orig,
                    "depth_observed_path": depth_orig,
                    "depth_gt_observed_path": depth_rendered,
                    "gt_pose": gt_pose,

                    "rgb_noisy_rendered": rgb_noisy_rendered[noisy_data_idx],
                    "depth_noisy_rendered": depth_noisy_rendered[noisy_data_idx],
                    "pose_noisy_rendered": pose_noisy_rendered[noisy_data_idx],

                    "model_points_path": f"{seq}.bin",
                    #legacy
                    "RT": gt_pose,
                    "K": linemod_config.linemod_K,
                    # "boxes":  meta_data["boxes"]
                }
                res[seq].append(info)

                print(info['rgb_observed_path'], info['rgb_noisy_rendered'])
    
    # train_saving_path=saving_path+'.train'
    train_saving_path=saving_path+'.eval'
    with open(train_saving_path, 'wb+') as f:
        print("Total data amount:", np.sum([len(res[r]) for r in res]))
        pickle.dump(res, f)

    # eval_saving_path=saving_path+'.eval'
    # with open(eval_saving_path, 'wb+') as f:
    #     print("Total data amount:", np.sum([len(test_res[r]) for r in test_res]))
    #     pickle.dump(test_res, f)


if __name__ == '__main__':
    fire.Fire()


================================================
FILE: tools/generate_data_info_v2_deepim.py
================================================
#The version compatible with deepim   
import os
import numpy as np
import copy
import pickle
import fire
import glob
import re


def parse_pose_file(file):
    with open(file) as f:
        lines = [line.strip() for line in f.readlines()]

    poses = []
    for line in lines:
        poses.append(
            np.array([np.float32(l) for l in line.split()],
                     dtype=np.float32).reshape((3, 4))
        )
    return poses


def parse_calib_file(file):
    info = {}
    with open(file) as f:
        lines = [line.strip() for line in f.readlines()]
    for i, l in enumerate(lines):
        nums = np.array([np.float32(x)
                         for x in l.split(' ')[1:]], dtype=np.float32)
        if i < 4:
            info[f"calib/P{i}"] = nums.reshape((3, 4))
        else:
            info[f"calib/Tr_velo_to_cam"] = nums.reshape((3, 4))
    return info


# def create_data_info(data_root, saving_path, is_test_data=False):
# def create_data_info(data_root, saving_path, data_type='train'):
def create_data_info(data_root, saving_path, training_data_ratio=0.8, shuffle=True, ):
    """[summary]
        info structure:
        {
            0:[
                {
                "index": idx,
                "lidar_bin_path": lidar_bin_paths[idx],
                "RT": poses[idx],
                "K": poses[idx],
                },
                {
                "index": idx,
                "lidar_bin_path": lidar_bin_paths[idx],
                "RT": poses[idx],
                "K": poses[idx],
               
                },
            }
            ...
            ],
            1:[

            ]
            ...
        }

    """

    image_dir = os.path.join(data_root, )
    pose_dir = os.path.join(data_root )
    depth_dir = os.path.join(data_root)
    # blender_to_bop_pose=np.load("/DATA/yxu/LINEMOD/metricpose/blender2bop_RT.npy", allow_pickle=True).flat[0]
    blender_to_bop_pose=np.load(f"{os.path.dirname(os.path.abspath(__file__)) }/../EXPDATA/init_poses/pose_conversion/blender2bop_RT.npy", allow_pickle=True).flat[0]
    # seqs=['cat']
    seqs=['cat', 'ape', 'cam', 'duck', 'glue', 'iron', 'phone','benchvise', 'can', 'driller', 'eggbox', 'holepuncher', 'lamp']
    print(seqs)
    max_items_per_seq=10000
    # create training data
    res = {}
    eval_res = {}
    for seq in seqs:
        res[seq] = []
        eval_res[seq]=[]

        image_path_dir = os.path.join(image_dir, seq)
        depth_path_dir = os.path.join(depth_dir, seq)
        pose_path_dir = os.path.join(pose_dir, seq)
        # image_paths = os.listdir(lidar_bin_dir)
        image_paths = glob.glob(r'{}/*.jpg'.format(image_path_dir) ) 
        depth_paths = glob.glob(r'{}/*depth*.npy'.format(depth_path_dir) ) 

        pose_paths = glob.glob(r'{}/*RT.pkl'.format(pose_path_dir) ) 
        #for compatibility
        if len(pose_paths) ==0:
            pose_paths = glob.glob(r'{}/*params.pkl'.format(pose_path_dir) ) 


        # image_paths.sort(key=lambda s: int(os.path.basename(s).split('.')[0]) )

        image_paths.sort(key=lambda s: int(re.split( '\.|_' ,os.path.basename(s))[0]) )
        depth_paths.sort(key=lambda s: int(os.path.basename(s).split('_')[0]))
        pose_paths.sort(key=lambda s: int(os.path.basename(s).split('_')[0]))


        data_num=len(image_paths[:max_items_per_seq] ) 
        if shuffle: 
            
            permute=np.random.permutation(data_num)
        else:
            permute = np.arange(data_num)
        train_split=permute[:int(data_num*training_data_ratio)]
        eval_split= permute[int(data_num*training_data_ratio):]

        # for idx in range(len(image_paths[:max_items_per_seq])):
        for idx in train_split:
            # print(image_paths[idx], depth_paths[idx])
            with open(pose_paths[idx],'rb') as f:
                pose = pickle.load(f) 

            # pose['K'] = np.array([[572.4114, 0., 325.2611],
            #                   [0., 573.57043, 242.04899],
            #                   [0., 0., 1.]])
            if seq=='cam':
                bl2bo = blender_to_bop_pose['camera']
            else:
                bl2bo = blender_to_bop_pose[seq]

            pose["RT"][:3,:3] =  pose["RT"][:3,:3]@bl2bo[:3,:3].T
            # pose["RT"][:3,3:] =  -pose["RT"][:3,:3]@bl2bo[:3,:3].T @bl2bo[:3,3:]  + pose["RT"][:3,3:] 
            pose["RT"][:3,3:] =  -pose["RT"][:3,:3] @bl2bo[:3,3:]  + pose["RT"][:3,3:] 
            info = {
                "index": idx,
                # "image_path": image_paths[idx].replace(image_dir+'/',''),
                # "depth_path": depth_paths[idx].replace(depth_dir+'/',''),
                "rgb_observed_path": image_paths[idx].replace(image_dir,'./'),
                "depth_gt_observed_path": depth_paths[idx].replace(depth_dir,'./'),
                "rgb_noisy_rendered": None,
                "depth_noisy_rendered": None,
                "pose_noisy_rendered": None,
                "model_points_path": f"{seq}.bin",
                # "RT": pose["RT"],
                "gt_pose":  pose["RT"],
                "K": pose["K"],
                "bbox":pose.get('bbox', None)
            }

            print(info['rgb_observed_path'], info['depth_gt_observed_path'], image_dir)#, bl2bo[:3,:3])
            res[seq].append(info)


    train_saving_path=saving_path+'.train'
    # eval_saving_path=saving_path+'.eval'
    with open(train_saving_path, 'wb+') as f:

        print("Total data amount:", np.sum([len(res[r]) for r in res]))
        pickle.dump(res, f)

    # with open(eval_saving_path, 'wb+') as f:

    #     print("Total data amount:", np.sum([len(eval_res[r]) for r in eval_res]))
    #     pickle.dump(eval_res, f)

if __name__ == '__main__':
    fire.Fire()


================================================
FILE: tools/train.py
================================================
import numpy as np 
import torch

from pathlib import Path
import json
import random
import re
import torch.backends.cudnn as cudnn
import torch.multiprocessing as mp
import time
import fire
import torch.distributed as dist
import os
from collections import defaultdict
import kornia
import flow_vis
import copy
import ast 

from utils.progress_bar import ProgressBar
from utils.log_tool import SimpleModelLog
from data.preprocess import merge_batch, get_dataloader_deepim  # merge_second_batch_multigpu
from utils.config_io import merge_cfg, save_cfg
import torchplus
from builder import (
    dataset_builder,
    input_reader_builder,
    lr_scheduler_builder,
    optimizer_builder,
    rnnpose_builder
)
from utils.distributed_utils import dist_init, average_gradients, DistModule, ParallelWrapper, DistributedSequatialSampler, DistributedGivenIterationSampler, DistributedGivenIterationSamplerEpoch 
# from utils.visualize import vis_pointclouds_cv2, vis_2d_keypoints_cv2
from utils.util import modify_parameter_name_with_map
from config.default import get_cfg
from data.ycb.basic import bop_ycb_class2idx


GLOBAL_GPUS_PER_DEVICE = 1  # None
GLOBAL_STEP = 0
RANK=-1
WORLD_SIZE=-1


def load_example_to_device(example,
                             device=None) -> dict:
    # global GLOBAL_GPUS_PER_DEVICE
    # device = device % GLOBAL_GPUS_PER_DEVICE or torch.device("cuda:0")
    # example_torch = defaultdict(list)

    example_torch = {}

    for k, v in example.items():  
        if k in ['class_name', 'idx'] or example[k] is None:
            example_torch[k] = v
            continue

        if type(v) == list:
            example_torch[k] = [item.to(device=device) for item in v]
        else:
            example_torch[k] = v.to(device=device)

    return example_torch
def build_network(model_cfg, measure_time=False, testing=False):
    net = rnnpose_builder.build(
        model_cfg, measure_time=measure_time, testing=testing)
    return net


def _worker_init_fn(worker_id):
    global GLOBAL_STEP
    time_seed = GLOBAL_STEP
    np.random.seed(time_seed + worker_id)
    print(f"WORKER {worker_id} seed:", np.random.get_state()[1][0])


def freeze_params(params: dict, include: str = None, exclude: str = None):
    assert isinstance(params, dict)
    include_re = None
    if include is not None:
        include_re = re.compile(include)
    exclude_re = None
    if exclude is not None:
        exclude_re = re.compile(exclude)
    remain_params = []
    for k, p in params.items():
        if include_re is not None:
            if include_re.match(k) is not None:
                continue
        if exclude_re is not None:
            if exclude_re.match(k) is None:
                continue
        remain_params.append(p)
    return remain_params


def freeze_params_v2(params: dict, include: str = None, exclude: str = None):
    assert isinstance(params, dict)
    include_re = None
    if include is not None:
        include_re = re.compile(include)
    exclude_re = None
    if exclude is not None:
        exclude_re = re.compile(exclude)
    for k, p in params.items():
        if include_re is not None:
            if include_re.match(k) is not None:
                p.requires_grad = False
        if exclude_re is not None:
            if exclude_re.match(k) is None:
                p.requires_grad = False


def filter_param_dict(state_dict: dict, include: str = None, exclude: str = None):
    assert isinstance(state_dict, dict)
    include_re = None
    if include is not None:
        include_re = re.compile(include)
    exclude_re = None
    if exclude is not None:
        exclude_re = re.compile(exclude)
    res_dict = {}
    for k, p in state_dict.items():
        if include_re is not None:
            if include_re.match(k) is None:
                continue
        if exclude_re is not None:
            if exclude_re.match(k) is not None:
                continue
        res_dict[k] = p
    return res_dict

def chk_rank(rank_, use_dist=False):
    if not use_dist:
        return True
    global RANK
    if RANK<0:
        RANK=dist.get_rank()
    cur_rank = RANK#dist.get_rank()
    # self.world_size = dist.get_world_size()
    return cur_rank == rank_

def get_rank(use_dist=False):
    if not use_dist:
        return 0
    else:
        # return dist.get_rank()
        global RANK 
        if RANK<0:
            RANK=dist.get_rank()
        return RANK 

def get_world(use_dist):
    if not use_dist:
        return 1
    else:
        global WORLD_SIZE 
        if WORLD_SIZE<0:
            WORLD_SIZE=dist.get_world_size()
        return WORLD_SIZE #dist.get_world_size()
def get_ngpus_per_node():
    global GLOBAL_GPUS_PER_DEVICE
    return GLOBAL_GPUS_PER_DEVICE
def get_logger():
    logger_name = "main-logger"
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.INFO)
    handler = logging.StreamHandler()
    fmt = "[%(asctime)s %(levelname)s %(filename)s line %(lineno)d %(process)d] %(message)s"
    handler.setFormatter(logging.Formatter(fmt))
    logger.addHandler(handler)
    return logger


def multi_proc_train(
          config_path,
          model_dir,
          use_apex,
          world_size,
          result_path=None,
          create_folder=False,
          display_step=50,
          summary_step=5,
          pretrained_path=None,
          pretrained_include=None,
          pretrained_exclude=None,
          pretrained_param_map=None,
          freeze_include=None,
          freeze_exclude=None,
          measure_time=False,
          resume=False,
          use_dist=False,
          gpus_per_node=1,
          start_gpu_id=0,
          optim_eval=False,
          seed=7,
          dist_port="23335",
         force_resume_step=None,
         batch_size=None,
         apex_opt_level='O0'
          ):
    
    params = {
          "config_path": config_path,
          "model_dir": model_dir,
          "use_apex": use_apex,
          "result_path": result_path,
          "create_folder": create_folder,
          "display_step": display_step,
          "summary_step": summary_step,
          "pretrained_path": pretrained_path,
          "pretrained_include": pretrained_include,
          "pretrained_exclude": pretrained_exclude,
          "pretrained_param_map": pretrained_param_map,
          "freeze_include": freeze_include,
          "freeze_exclude": freeze_exclude,
        #   "multi_gpu": multi_gpu,
          "measure_time": measure_time,
          "resume": resume,
          "use_dist": use_dist,
          "gpus_per_node": gpus_per_node,
          "optim_eval": optim_eval,
          "seed": seed,
          "dist_port": dist_port,
          "world_size": world_size,
          "force_resume_step":force_resume_step,
          "batch_size": batch_size,
          "apex_opt_level":apex_opt_level,
          "start_gpu_id": start_gpu_id
    }
    from types import SimpleNamespace 
    params = SimpleNamespace(**params)

    os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(
        str(x) for x in range(start_gpu_id, start_gpu_id+gpus_per_node))
    print(f"CUDA_VISIBLE_DEVICES={os.environ['CUDA_VISIBLE_DEVICES']}"  )

    mp.spawn(train_worker, nprocs=gpus_per_node,
                args=( params,) )

def train_worker(rank, params):
    global RANK, WORLD_SIZE
    RANK = rank
    WORLD_SIZE=params.world_size

    # import os
    # os.environ["CUDA_VISIBLE_DEVICES"] = str(params.start_gpu_id+rank) 
    print(f"CUDA_VISIBLE_DEVICES={os.environ['CUDA_VISIBLE_DEVICES']}")
    torch.cuda.set_device(rank%params.gpus_per_node)
    
    train(config_path=params.config_path,
          model_dir=params.model_dir,
          use_apex=params.use_apex,
          result_path=params.result_path,
          create_folder=params.create_folder,
          display_step=params.display_step,
          pretrained_path=params.pretrained_path,
          pretrained_include=params.pretrained_include,
          pretrained_exclude=params.pretrained_exclude,
          pretrained_param_map=params.pretrained_param_map,
          freeze_include=params.freeze_include,
          freeze_exclude=params.freeze_exclude,
        #   multi_gpu=params.multi_gpu,
          measure_time=params.measure_time,
          resume=params.resume,
          use_dist=params.use_dist,
          dist_port=params.dist_port,
          gpus_per_node=params.gpus_per_node,
          optim_eval=params.optim_eval,
          seed=params.seed,
          force_resume_step=params.force_resume_step,
          batch_size = params.batch_size,
          apex_opt_level=params.apex_opt_level,
          gpu_id=params.start_gpu_id+rank, 
          ) 


def train(
         config_path,
          model_dir,
          use_apex,
          result_path=None,
          create_folder=False,
          display_step=50,
          summary_step=5,
          pretrained_path=None,
          pretrained_include=None,
          pretrained_exclude=None,
          pretrained_param_map=None,
          freeze_include=None,
          freeze_exclude=None,
          multi_gpu=False,
          measure_time=False,
          resume=False,
          use_dist=False,
          dist_port="23335",
          gpus_per_node=1,
          optim_eval=False,
          seed=7,
          force_resume_step=None,
          batch_size=None,
          apex_opt_level='O0',
          gpu_id=None
          ):
    """train a VoxelNet model specified by a config file.
    """

    print("force_resume_step:", force_resume_step)
    print("torch.cuda.is_available()=", torch.cuda.is_available())
    print("torch.version.cuda=",torch.version.cuda) 
    dist_url=f"tcp://127.0.0.1:{dist_port}"
    print(f"dist_url={dist_url}", flush=True)
    global RANK, WORLD_SIZE
    # RANK, WORLD_SIZE=rank, world_size
    if RANK<0:
        RANK=0
    if WORLD_SIZE<0:
        WORLD_SIZE=1

    global GLOBAL_GPUS_PER_DEVICE
    GLOBAL_GPUS_PER_DEVICE = gpus_per_node

    #fix the seeds 
    print(f"Set seed={seed}", flush=True)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)


    ######################################## initialize the distributed env #########################################
    if use_dist:
        # torch.cuda.set_device(get_rank(use_dist))
        if use_apex:
            dist.init_process_group(
                backend="nccl", init_method=dist_url, world_size=get_world(use_dist), rank=get_rank(use_dist))
        else:
            # rank, world_size = dist_init(str(dist_port))
            dist.init_process_group(
                backend="nccl", init_method=dist_url, world_size=get_world(use_dist), rank=get_rank(use_dist))
    

    ############################################ create folders ############################################

    model_dir = str(Path(model_dir).resolve())
    model_dir = Path(model_dir)
    if chk_rank(0, use_dist):
        if not resume and model_dir.exists():
            raise ValueError("model dir exists and you don't specify resume.")
            print("Warning: model dir exists and you don't specify resume.")

        model_dir.mkdir(parents=True, exist_ok=True)
    if result_path is None:
        result_path = model_dir / 'results'
    config_file_bkp = "pipeline.config"

    ############################################# read config proto ############################################
    config = merge_cfg(
        [config_path], intersection=True)
    if chk_rank(0, use_dist):
        print(json.dumps(config, indent=4))

    if chk_rank(0, use_dist):
        # save_cfg([default_config_path, custom_config_path],
        save_cfg([config_path, config_path],
                 str(model_dir / config_file_bkp))
    #update the global config object
    get_cfg().merge(config.get("BASIC",{}),"BASIC" )  

    input_cfg = config.train_input_reader
    eval_input_cfg = config.eval_input_reader
    model_cfg = config.model
    train_cfg = config.train_config
    optimizer_cfg = train_cfg.optimizer
    loss_scale = train_cfg.loss_scale_factor


    ############################################# Update default options ############################################

    if batch_size is not None:
        input_cfg.batch_size = batch_size 
    print(input_cfg.batch_size)
    
    ############################################# build network, optimizer etc. ############################################
    #dummy dataset to get obj_seqs
    dataset_tmp = input_reader_builder.build(
        input_cfg,
        training=True,
    )
 
    model_cfg.obj_seqs = copy.copy(dataset_tmp._dataset.infos["seqs"])
    print(model_cfg.obj_seqs, type(model_cfg.obj_seqs), flush=True)

    model_cfg.gpu_id=gpu_id
    net = build_network(model_cfg, measure_time)  #.to(device)
    net.cuda()

    fastai_optimizer = optimizer_builder.build(
        optimizer_cfg, net,
        mixed=False,
        loss_scale=loss_scale)

    print("num parameters:", len(list(net.parameters())))

    ############################################# load pretrained model ############################################
    if pretrained_path is not None:
        model_dict = net.state_dict()
        pretrained_dict = torch.load(pretrained_path, map_location='cpu')
        print("Pretrained keys:", pretrained_dict.keys())
        print("Model keys:", model_dict.keys())

        pretrained_dict = filter_param_dict(
            pretrained_dict, pretrained_include, pretrained_exclude)

        pretrained_dict = modify_parameter_name_with_map(
            pretrained_dict, ast.literal_eval(str(pretrained_param_map)))
        new_pretrained_dict = {}
        for k, v in pretrained_dict.items():
            if k in model_dict and v.shape == model_dict[k].shape:
                new_pretrained_dict[k] = v
        print("Load pretrained parameters:")
        for k, v in new_pretrained_dict.items():
            print(k, v.shape)
        model_dict.update(new_pretrained_dict)
        net.load_state_dict(model_dict)
        freeze_params_v2(dict(net.named_parameters()),
                         freeze_include, freeze_exclude)
        net.clear_global_step()
        # net.clear_metrics()
        del pretrained_dict
    ############################################# try to resume from the latest chkpt ############################################
    torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    torchplus.train.try_restore_latest_checkpoints(model_dir,
                                                   [fastai_optimizer])

    ######################################## parallel the network  #########################################
    if use_dist:
        if use_apex:
            import apex
            net = apex.parallel.convert_syncbn_model(net)
            net, amp_optimizer = apex.amp.initialize(net.cuda(
            ), fastai_optimizer, opt_level="O0", keep_batchnorm_fp32=None, loss_scale=None)
            net_parallel = apex.parallel.DistributedDataParallel(net)
        else:
            amp_optimizer = optimizer_builder.build(
                optimizer_cfg, net,
                mixed=False,
                loss_scale=loss_scale)
            net_parallel = torch.nn.parallel.DistributedDataParallel(net, device_ids=[get_rank(use_dist)], output_device=get_rank(use_dist) ,find_unused_parameters=True)
    else:
        net_parallel = net.cuda()

    ############################################# build lr_scheduler ############################################
    lr_scheduler = lr_scheduler_builder.build(optimizer_cfg, amp_optimizer,
                                              train_cfg.steps)

    if 0:  # TODO:
        float_dtype = torch.float16
    else:
        float_dtype = torch.float32
    ######################################## build dataloaders #########################################
    if use_dist:
        num_gpu = 1
        collate_fn = merge_batch
    else:
        raise NotImplementedError
    print(f"MULTI-GPU: use {num_gpu} gpu")

        
    ######################
    # PREPARE INPUT
    ######################
    dataset = input_reader_builder.build(
        input_cfg,
        training=True,
    )

    eval_dataset = input_reader_builder.build(
        eval_input_cfg,
        training=False,
    )


    if use_dist:
        train_sampler = DistributedGivenIterationSamplerEpoch(
            dataset, train_cfg.steps, input_cfg.batch_size, last_iter=net.get_global_step()-1, review_cycle=-1)
        shuffle = False
        eval_sampler = DistributedSequatialSampler(eval_dataset)

    else:
        train_sampler = None
        eval_sampler = None
        eval_train_sampler = None
        shuffle = True
    
    dataloader, neighborhood_limits=get_dataloader_deepim(dataset=dataset,
                                                kpconv_config=model_cfg.descriptor_net.keypoints_detector_3d,
                                                batch_size=input_cfg.batch_size * num_gpu,
                                                shuffle=shuffle,
                                                num_workers=input_cfg.preprocess.num_workers * num_gpu,
                                                sampler=train_sampler 
                                        )
    eval_dataloader, _ =get_dataloader_deepim(dataset=eval_dataset,
                                                kpconv_config=model_cfg.descriptor_net.keypoints_detector_3d,
                                    batch_size=eval_input_cfg.batch_size,
                                    shuffle=False,
                                    num_workers=eval_input_cfg.preprocess.num_workers,
                                    sampler=eval_sampler,
                                    neighborhood_limits=neighborhood_limits
                                   )

    ##########################################################################################
    #                                            TRAINING
    ##########################################################################################
    model_logging = SimpleModelLog(model_dir, disable=get_rank(use_dist) != 0)
    model_logging.open()
    start_step = net.get_global_step()
    total_step = train_cfg.steps
    t = time.time()
    steps_per_eval = train_cfg.steps_per_eval

    amp_optimizer.zero_grad()
    step_times = []
    step = start_step
    epoch = 0
    net_parallel.train()

    try:
        while True:

            if use_dist:
                epoch = (net.get_global_step() *
                         input_cfg.batch_size) // len(dataloader)

                dataloader.sampler.set_epoch(epoch)
            else:
                epoch += 1
            for example in dataloader:

                global GLOBAL_STEP
                GLOBAL_STEP = step

                lr_scheduler.step(net.get_global_step())
                example_torch=load_example_to_device(example, device=torch.device("cuda"))

                batch_size = example["image"].shape[0]

                ret_dict = net_parallel(example_torch)

                loss = ret_dict["loss"].mean()  # /get_world(use_dist)
                recall = ret_dict['recall'].mean()

                reduced_loss = loss.data.clone() / get_world(use_dist)
                reduced_recall = recall.data.clone() / get_world(use_dist)
                if use_dist:
                    dist.all_reduce_multigpu(
                        [reduced_loss])
                    dist.all_reduce_multigpu(
                        [reduced_recall])

                amp_optimizer.zero_grad()
                if use_apex:
                    with apex.amp.scale_loss(loss, amp_optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss = loss/get_world(use_dist)
                    loss.backward()
                    if use_dist:
                        average_gradients(net_parallel)


                torch.nn.utils.clip_grad_norm_(net.parameters(), 10.0)
                amp_optimizer.step()

                net.update_global_step()
                step_time = (time.time() - t)
                step_times.append(step_time)
                t = time.time()
                metrics = defaultdict(dict)
                GLOBAL_STEP = net.get_global_step()

                if chk_rank(0, use_dist) and GLOBAL_STEP % display_step == 0:
                    print(f'Model directory: {str(model_dir)}')
                    if measure_time:
                        for name, val in net.get_avg_time_dict().items():
                            print(f"avg {name} time = {val * 1000:.3f} ms")

                    metrics["runtime"] = {
                        "step": GLOBAL_STEP,
                        "steptime": np.mean(step_times),
                    }

                    metrics["loss"]["loss"] = float(
                        reduced_loss.detach().cpu().numpy())
                    # metrics["loss"]["translation_loss"] = float(
                    metrics["recall"] = float(reduced_recall.detach().cpu().numpy())

                    metrics['learning_rate'] = amp_optimizer.lr
                    metrics['reproj_loss'] = float(ret_dict['reproj_loss'].median().detach().cpu().numpy())
                    metrics['loss_3d_proj'] = float(ret_dict['loss_3d_proj'].median().detach().cpu().numpy())
                    # metrics['chamfer_loss'] = float(ret_dict['chamfer_loss'].median().detach().cpu().numpy())
                    if hasattr(net.motion_net, "sigma"):
                        metrics['sigma'] = float(net.motion_net.sigma[0].detach().cpu().numpy())

                    metrics['epoch'] = epoch

                    model_logging.log_metrics(metrics, GLOBAL_STEP)


                    if isinstance(ret_dict["flow"], (list, tuple) ):
                        ret_dict["flow"] = ret_dict["flow"][-1]
                    flow=flow_vis.flow_to_color(ret_dict["flow"][0].squeeze().permute(1,2,0).detach().cpu().numpy(), convert_to_bgr=False)

                    model_logging.log_images(
                    {
                        "image": example["image"].cpu(),
                        "flow": flow.transpose(2,0,1)[None],
                        "weight": ret_dict["weight"].squeeze(1).mean(1,keepdims=True).detach().cpu(),
                        "syn_img": torch.cat(ret_dict["syn_img"], dim=0)[:,:3].detach().cpu(),
                        "syn_depth": (ret_dict["syn_depth"][-1]/ret_dict["syn_depth"][-1].max()).detach().cpu(),
                        "valid_mask": ret_dict["valid_mask"].detach().squeeze(1).permute(0,3,1,2).cpu(),
                        "ren_mask": example["ren_mask"][:,None].cpu()
                    }, GLOBAL_STEP, prefix="") 

                if optim_eval and GLOBAL_STEP < total_step/2 and GLOBAL_STEP > train_cfg.steps_per_eval*2:
                    steps_per_eval = 2*train_cfg.steps_per_eval
                else:
                    steps_per_eval = train_cfg.steps_per_eval

                if GLOBAL_STEP % steps_per_eval == 0:
                    if chk_rank(0, use_dist):  # logging
                        torchplus.train.save_models(model_dir, [net, amp_optimizer],
                                                    net.get_global_step())
                    eval_once(net,
                              eval_dataset=eval_dataset, eval_dataloader=eval_dataloader, eval_input_cfg=eval_input_cfg,
                              result_path=result_path,
                              global_step=GLOBAL_STEP,
                              model_logging=model_logging,
                              metrics=metrics,
                              float_dtype=float_dtype,
                              use_dist=use_dist,
                              prefix='eval_')

                    net.train()

                step += 1
                if step >= total_step:
                    break
            if step >= total_step:
                break
    except Exception as e:
        model_logging.log_text(str(e), step)
        raise e
    finally:
        model_logging.close()
    torchplus.train.save_models_cpu(model_dir, [net, amp_optimizer],
                                    net.get_global_step())


def eval_once(net,
              eval_dataset, eval_dataloader, eval_input_cfg,
              result_path, global_step, model_logging, metrics, float_dtype, use_dist, prefix='eval_'):
    from utils.eval_metric import LineMODEvaluator#, YCBEvaluator

    net.eval()
    result_path_step = result_path / \
        f"step_{global_step}"
    if chk_rank(0, use_dist):
        result_path_step.mkdir(parents=True, exist_ok=True)
        model_logging.log_text("#################################",
                               global_step)
        model_logging.log_text("# EVAL", global_step)
        model_logging.log_text("#################################",
                               global_step)
        model_logging.log_text(
            "Generate output labels...", global_step)
        prog_bar = ProgressBar()
        prog_bar.start(len(eval_dataloader))

    # RES={
    #     # "recall":[]
    #     "recall":[],
    #     "3d_proj_error":[],
    #     "2d_proj_error":[],
    # }
    t = 0
    detections = []
    
    #construct evaluator
    classes=list(set(eval_dataset.dataset.infos['seqs']))
    if classes[0] in bop_ycb_class2idx.keys():
        evaluator = dict([(c,YCBEvaluator(f"{c}",result_path) ) for c in classes ] )
    else:
        evaluator = dict([(c,LineMODEvaluator(f"{c}",result_path) ) for c in classes ] )

    # cnt = 0  
    for i, example in enumerate(eval_dataloader):
        example_torch=load_example_to_device(example, device=torch.device("cuda"))
        batch_size = example["image"].shape[0]
        device = example_torch["image"].device
        with torch.no_grad():
            ret_dict = net(example_torch)

        # RES['recall'].append(recall.detach())
        # RES['3d_proj_error'].append(ret_dict['loss_3d_proj'].mean().detach() )
        # RES['2d_proj_error'].append(ret_dict['reproj_loss'].mean().detach() )

        #do evaluation
        evaluator[example['class_name'][0]].evaluate_rnnpose(ret_dict, example)


        if chk_rank(0, use_dist) and i % 10 == 0:
            prog_bar.print_bar(finished_size=10)
    eval_res={} 
    for k in evaluator:
        eval_res[k]=evaluator[k].summarize()

    if use_dist:  # chk_rank(0, use_dist):
        #gather and summarize evaluation results
        for k in eval_res:
            eval_res[k]['seq_len'] =  torch.tensor(eval_res[k]['seq_len'], device=device)
            gather_list = [torch.zeros_like(eval_res[k]['seq_len'])
                           for i in range(get_world(use_dist))]
            dist.all_gather(gather_list, eval_res[k]['seq_len'])
            seq_len = torch.stack(gather_list, dim=-1).sum()

            for kk in eval_res[k]:
                if kk =='seq_len':
                    continue
                eval_res[k][kk] =  torch.tensor(eval_res[k][kk], device=device)
                eval_res[k][kk] =  eval_res[k][kk]*eval_res[k]['seq_len'] # calculate the sum 
                gather_list = [torch.zeros_like(eval_res[k][kk] )
                           for i in range(get_world(use_dist))]
                dist.all_gather(gather_list, eval_res[k][kk])
                eval_res[k][kk] = torch.stack(gather_list, dim=-1).sum()/seq_len #divide with the whole length to get the mean value 


    if chk_rank(0, use_dist):
        for k in eval_res:
            for kk in eval_res[k]:
                if kk == 'seq_len':
                    metrics[f'{k}_{kk}'] = float(seq_len.cpu().numpy() )
                metrics[f'{k}_{kk}'] = float(eval_res[k][kk].cpu().numpy())


    if chk_rank(0, use_dist):
        model_logging.log_metrics(metrics, global_step)
    # del RES 
    del eval_res
    net.train()


if __name__ == '__main__':
    fire.Fire()


================================================
FILE: tools/transform_data_format.py
================================================
import numpy as np 
import cv2
import pickle
import fire
import os
import argparse


linemod_K = np.array([[572.4114, 0., 325.2611],
                  [0., 573.57043, 242.04899],
                  [0., 0., 1.]])


blender_K = np.array([[700., 0., 320.],
                    [0., 700., 240.],
                    [0., 0., 1.]])


def range_to_depth(mask, range, K):
    '''
       Transform the range image to depth image
    '''
    f=K[0,0]
    cx=K[0,2]
    cy=K[1,2]

    ys_, xs_=np.nonzero(mask)
    rngs=range[ys_,xs_]
    xs,ys=np.asarray(xs_,np.float32)+0.5,np.asarray(ys_,np.float32)+0.5

    Zs=f*rngs/( f**2 + (cx-xs)**2 + (cy-ys)**2 )**0.5
    depth = np.zeros_like(range)
    depth[ys_,xs_] = Zs
    return  depth

def crop(image, depth, mask, K_old, margin_ratio=0.1, output_size=128 ):
    '''
        image: HxWx3
        mask: HxW
        K_old: 3x3
    '''

    H,W, _ = image.shape
    

    mask=mask.astype('uint8')*255
    _x,_y,_w,_h = cv2.boundingRect(mask) 

    center=[_x+_w/2, _y+_h/2]

    L=int (max(_w,_h)* (1+2*margin_ratio))


    x=max(0, int(center[0]- L/2) )
    y=max(0, int(center[1]- L/2) )
    
    crop=image[y:y+L, x:x+L]
    depth_crop=depth[y:y+L, x:x+L]

    w=h=L # actual crop size

    #automatically handle the "out of range" problem
    patch=np.zeros([h,w,3], dtype=image.dtype)
    depth_patch=np.ones([h,w], dtype=depth.dtype)
    try:
        xp = 0
        yp = 0
        patch[xp : xp+crop.shape[0], yp:yp+crop.shape[1] ] =  crop
        depth_patch[xp : xp+crop.shape[0], yp:yp+crop.shape[1] ] = depth_crop 
    except:
        import pdb 
        pdb.set_trace()
    patch=cv2.resize(patch, (output_size,output_size), interpolation=cv2.INTER_LINEAR )
    depth_patch=cv2.resize(depth_patch, (output_size,output_size), interpolation=cv2.INTER_NEAREST )

    #update the intrinsic parameters
    K_new=np.zeros_like(K_old)
    scale=output_size/L
    K_new[0,2] = (K_old[0,2]-x)*scale
    K_new[1,2] = (K_old[1,2]-y)*scale
    K_new[0,0] = K_old[0,0]*scale
    K_new[1,1] = K_old[1,1]*scale
    K_new[2,2] = 1

    return patch, depth_patch, K_new 

class DataFormatter(object):
    def __init__(self, data_type, data_info_path, crop_param=None ):
        assert data_type in ['LM_SYN_PVNET', "LM_SYN_PVNET_LMK",'LM_FUSE_PVNET','LM_FUSE_SINGLE_PVNET' ]
        self.data_type=data_type
        self.crop_param=crop_param
        with open(data_info_path, 'rb') as f:
            self.data_info=pickle.load(f)
        pass 

    
    def process(self, data_root,depth_root, save_root):

        if self.data_type == "LM_SYN_PVNET":
            self._proc_LM_SYN_PVNET(self.data_info, data_root, save_root)
        elif self.data_type=='LM_SYN_PVNET_LMK':
            self._proc_LM_SYN_PVNET_LMK(self.data_info, data_root, save_root)
        elif self.data_type=='LM_FUSE_PVNET':
            self._proc_LM_FUSE_PVNET(self.data_info, data_root,depth_root, save_root)
        elif self.data_type=='LM_FUSE_SINGLE_PVNET':
            self._proc_LM_FUSE_SINGLE_PVNET(self.data_info, data_root,depth_root, save_root)
        else:
            raise NotImplementedError

    def _proc_LM_SYN_PVNET(self, data_info, data_root, save_root):

        for seq in data_info:
            for idx in range(len(data_info[seq]) ):
                # info = {
                #     "index": idx,
                #     "image_path": image_paths[idx].replace(image_path_dir+'/',''),
                #     "depth_path": depth_paths[idx].replace(depth_path_dir+'/',''),
                #     "RT": pose["RT"],
                #     "K": pose["K"],
                # }
                info = data_info[seq][idx]
                # image=cv2.imread( os.path.join(data_root, seq, info['image_path']) )
                # depth=np.load(os.path.join(data_root, seq, info['depth_path'])) 
                image=cv2.imread( os.path.join(data_root,  info['image_path']) )
                depth=np.load(os.path.join(data_root,  info['depth_path'])) 
                # K_old = info["K"]
                K_old = blender_K.copy()

                # maximum depth value = 1, which indicates the invalid regions 

                hs,ws=np.nonzero(depth<1)
                hmin,hmax=np.min(hs),np.max(hs)
                wmin,wmax=np.min(ws),np.max(ws)
                bbox= [hmin, wmin, hmax, wmax]

                mask=depth<1
                #transform the range map to depth map 
                depth = range_to_depth(depth<1, depth*2, K_old)
                if self.crop_param is not None:
                    image, depth, K_new=crop(image, depth, mask, K_old, margin_ratio=self.crop_param['margin_ratio'], output_size=self.crop_param['output_size'] )
                else:
                    K_new = K_old


                print(info['image_path'], info['depth_path'])
                patch_save_path=os.path.join(save_root, seq, f"{info['index']:05d}.jpg")
                depth_patch_save_path=os.path.join(save_root, seq, f"{info['index']:05d}_depth.npy")
                pose_save_path=os.path.join(save_root, seq, f"{info['index']:05d}_params.pkl")

                if not os.path.exists(os.path.join(save_root, seq)):
                    os.makedirs(os.path.join(save_root, seq))
                #save
                cv2.imwrite(patch_save_path,image)
                np.save(depth_patch_save_path, depth)
                with open(pose_save_path, 'wb+') as f:
                    pickle.dump({
                        "RT": info["RT"],
                        "K": K_new,
                        "bbox": bbox
                    },f)

    def _proc_LM_SYN_PVNET_LMK(self, data_info, data_root, save_root):
        #cam intrinsic is LM 
    
        for seq in data_info:
            for idx in range(len(data_info[seq]) ):
                # info = {
                #     "index": idx,
                #     "image_path": image_paths[idx].replace(image_path_dir+'/',''),
                #     "depth_path": depth_paths[idx].replace(depth_path_dir+'/',''),
                #     "RT": pose["RT"],
                #     "K": pose["K"],
                # }
                info = data_info[seq][idx]

                image=cv2.imread( os.path.join(data_root,  info['image_path']) )
                depth=np.load(os.path.join(data_root,  info['depth_path'])) 
                with open( os.path.join(data_root,  info['image_path'].replace(".jpg", "_RT.pkl")), 'rb' ) as f:
                    old_params=pickle.load(f)
                # K_old = info["K"]
                K_old = old_params["K"] #linemod_K.copy()

                # maximum depth value = 1, which indicates the invalid regions 

                hs,ws=np.nonzero(depth<1)
                hmin,hmax=np.min(hs),np.max(hs)
                wmin,wmax=np.min(ws),np.max(ws)
                bbox= [hmin, wmin, hmax, wmax]

                mask=depth<1
                #transform the range map to depth map 
                depth = range_to_depth(depth<1, depth*2, K_old)
                if self.crop_param is not None:
                    image, depth, K_new=crop(image, depth, mask, K_old, margin_ratio=self.crop_param['margin_ratio'], output_size=self.crop_param['output_size'] )
                else:
                    K_new = K_old


                print(info['image_path'], info['depth_path'], "...")
                patch_save_path=os.path.join(save_root, seq, f"{info['index']:05d}.jpg")
                depth_patch_save_path=os.path.join(save_root, seq, f"{info['index']:05d}_depth.npy")
                pose_save_path=os.path.join(save_root, seq, f"{info['index']:05d}_params.pkl")

                if not os.path.exists(os.path.join(save_root, seq)):
                    os.makedirs(os.path.join(save_root, seq))
                #save
                cv2.imwrite(patch_save_path,image)
                np.save(depth_patch_save_path, depth)
                with open(pose_save_path, 'wb+') as f:
                    pickle.dump({
                        "RT": old_params["RT"] ,#info["RT"],
                        "K": K_new,
                        "bbox": bbox
                    },f)

    def _proc_LM_FUSE_PVNET(self, data_info, data_root, depth_root, save_root):
    
        # The class name list used during the fusing process, which is used to find the respective mask index  
        linemod_cls_names=['ape','cam','cat','duck','glue','iron','phone', 'benchvise','can','driller','eggbox','holepuncher','lamp']

        for seq in data_info:
            seq_idx = linemod_cls_names.index(seq)
            for idx in range(len(data_info[seq]) ):
                # info = {
                #     "index": idx,
                #     "image_path": image_paths[idx].replace(image_path_dir+'/',''),
                #     "depth_path": depth_paths[idx].replace(depth_path_dir+'/',''),
                #     "RT": pose["RT"],
                #     "K": pose["K"],
                # }
                info = data_info[seq][idx]
                # if info['image_path'] =='cat/2744.jpg':
                #     info =  data_info[seq][idx+1]

                with open(os.path.join(data_root,  info['image_path']).split('.jpg')[0].replace(seq,'')+'_info.pkl', 'rb'  ) as f:
                    fuse_info = pickle.load(f )

                image=cv2.imread( os.path.join(data_root,  info['image_path']).split('.jpg')[0].replace(seq,'')+'_rgb.jpg' )
                try: 
                    depth_idx = fuse_info[2][seq_idx]['img_idx']
                except:
                    import pdb 
                    pdb.set_trace()

                rendered_depth=np.load( os.path.dirname(os.path.join(depth_root, info['image_path']  ))+ f'/{depth_idx}_depth.png.npy'  ) 

                fuse_mask=cv2.imread( os.path.join(data_root,  info['image_path']).split('.jpg')[0].replace(seq,'')+'_mask.png', ) 
                fuse_mask = fuse_mask[...,0]==(seq_idx+1) # fuse mask id starts from 1

                # """
                #may have bug
                hs,ws=np.nonzero(rendered_depth<1)
                hmin,hmax=np.min(hs),np.max(hs)
                wmin,wmax=np.min(ws),np.max(ws)
                # """
                bbox= [hmin+fuse_info[0][seq_idx][0], wmin+fuse_info[0][seq_idx][1], hmax+fuse_info[0][seq_idx][0], wmax+fuse_info[0][seq_idx][1]]

                depth = np.ones_like(rendered_depth) 

                try:
                    depth[hmin+fuse_info[0][seq_idx][0]: fuse_info[0][seq_idx][0]+hmax+1, wmin+fuse_info[0][seq_idx][1]: wmax+fuse_info[0][seq_idx][1] +1] = rendered_depth[hmin:hmax+1, wmin:wmax+1]
                except:
                    print(info['image_path'],"failed!") 
                    continue
                    """
                    #TODO: temp fix, may fail
                    patch_save_path=os.path.join(save_root, seq, f"{info['index']:05d}_pat.jpg")
                    depth_patch_save_path=os.path.join(save_root, seq, f"{info['index']:05d}_pat_depth.npy")
                    pose_save_path=os.path.join(save_root, seq, f"{info['index']:05d}_RT.pkl")
                    cv2.imwrite(patch_save_path, patch)
                    np.save(depth_patch_save_path, depth_patch)
                    with open(pose_save_path, 'wb+') as f:
                        pickle.dump({
                            # "RT": fuse_info[1][seq_idx], #info["RT"],
                            "RT": fuse_info_old[1][seq_idx], #info["RT"],
                            "K": K_new
                        },f)
                    # import pdb 
                    # pdb.set_trace()
                    # instance_inds.append(len(instance_inds))
                    continue
                    """
                # fuse_info_old = copy.deepcopy(fuse_info)
                
                # K_old = info["K"]
                K_old = linemod_K.copy()
                K_old[0,2] = (K_old[0,2]+fuse_info[0][seq_idx][1])
                K_old[1,2] = (K_old[1,2]+fuse_info[0][seq_idx][0])

                # maximum depth value = 1, which indicates the invalid regions 
                mask=depth<1
                # transform the range map to depth map 
                depth = range_to_depth(mask, depth*2, K_old)

                # depth = depth* fuse_mask + (1-fuse_mask) # use 1's to indicate the invalid depths
                # depth = depth* fuse_mask  # use 0's to indicate the invalid depths
                depth = depth # keep all the depths including occluded ones
                

                if self.crop_param is not None:
                    # image, depth, K_new=crop(image, depth, mask, K_old, margin_ratio=0.1, output_size=128 )
                    image, depth, K_new=crop(image, depth, mask, K_old, margin_ratio=self.crop_param['margin_ratio'], output_size=self.crop_param['output_size'] )
                else:
                    K_new = K_old

                print(info['image_path'], info['depth_path'], bbox)
                patch_save_path=os.path.join(save_root, seq, f"{info['index']:05d}.jpg")
                depth_patch_save_path=os.path.join(save_root, seq, f"{info['index']:05d}_depth.npy")
                # pose_save_path=os.path.join(save_root, seq, f"{info['index']:05d}_RT.pkl")
                pose_save_path=os.path.join(save_root, seq, f"{info['index']:05d}_params.pkl")
                mask_visb_save_path = os.path.join(save_root, seq, f"{info['index']:05d}_mask_visb.png")

                if not os.path.exists(os.path.join(save_root, seq)):
                    os.makedirs(os.path.join(save_root, seq))
                #save
                cv2.imwrite(patch_save_path,image)
                cv2.imwrite(mask_visb_save_path, fuse_mask*255 )

                np.save(depth_patch_save_path, depth)
                with open(pose_save_path, 'wb+') as f:
                    pickle.dump({
                        "RT": fuse_info[1][seq_idx], #info["RT"],
                        "K": K_new,
                        "bbox": bbox
                    },f)
    
    def _proc_LM_FUSE_SINGLE_PVNET(self, data_info, data_root, depth_root, save_root):
        
        # The class name list used during the fusing process, which is used to find the respective mask index  
        linemod_cls_names=['ape','cam','cat','duck','glue','iron','phone', 'benchvise','can','driller','eggbox','holepuncher','lamp']

        for seq in data_info:
            # seq_idx = linemod_cls_names.index(seq)
            seq_idx = 0 
            for idx in range(len(data_info[seq]) ):
                # info = {
                #     "index": idx,
                #     "image_path": image_paths[idx].replace(image_path_dir+'/',''),
                #     "depth_path": depth_paths[idx].replace(depth_path_dir+'/',''),
                #     "RT": pose["RT"],
                #     "K": pose["K"],
                # }
                info = data_info[seq][idx]
                # if info['image_path'] =='cat/2744.jpg':
                #     info =  data_info[seq][idx+1]
                with open(os.path.join(data_root, info['image_path'].split('.jpg')[0]+'_info.pkl'  ), 'rb'  ) as f:
                    fuse_info = pickle.load(f )

                # image=cv2.imread( os.path.join(data_root,  seq, info['image_path'].split('.jpg')[0].replace(seq,'')+'_rgb.jpg' ) )
                image=cv2.imread( os.path.join(data_root,  info['image_path'].split('.jpg')[0]+'_rgb.jpg' ) )
                try: 
                    depth_idx = fuse_info[2][seq_idx]['img_idx']
                except:
                    import pdb 
                    pdb.set_trace()

                rendered_depth=np.load( os.path.dirname(os.path.join(depth_root, info['image_path']  ))+ f'/{depth_idx}_depth.png.npy'  ) 

                fuse_mask=cv2.imread( os.path.join(data_root,  info['image_path']).split('.jpg')[0]+'_mask.png', ) 
                fuse_mask = fuse_mask[...,0]==(seq_idx+1) # fuse mask id starts from 1

                # """
                #may have bug
                hs,ws=np.nonzero(rendered_depth<1)
                hmin,hmax=np.min(hs),np.max(hs)
                wmin,wmax=np.min(ws),np.max(ws)
                # """
                bbox= [hmin+fuse_info[0][seq_idx][0], wmin+fuse_info[0][seq_idx][1], hmax+fuse_info[0][seq_idx][0], wmax+fuse_info[0][seq_idx][1]]

                depth = np.ones_like(rendered_depth) 

                try:
                    depth[hmin+fuse_info[0][seq_idx][0]: fuse_info[0][seq_idx][0]+hmax+1, wmin+fuse_info[0][seq_idx][1]: wmax+fuse_info[0][seq_idx][1] +1] = rendered_depth[hmin:hmax+1, wmin:wmax+1]
                except:
                    import pdb 
                    pdb.set_trace()
                    print(info['image_path'],"failed!") 
                    continue
                    """
                    #TODO: temp fix, may fail
                    patch_save_path=os.path.join(save_root, seq, f"{info['index']:05d}_pat.jpg")
                    depth_patch_save_path=os.path.join(save_root, seq, f"{info['index']:05d}_pat_depth.npy")
                    pose_save_path=os.path.join(save_root, seq, f"{info['index']:05d}_RT.pkl")
                    cv2.imwrite(patch_save_path, patch)
                    np.save(depth_patch_save_path, depth_patch)
                    with open(pose_save_path, 'wb+') as f:
                        pickle.dump({
                            # "RT": fuse_info[1][seq_idx], #info["RT"],
                            "RT": fuse_info_old[1][seq_idx], #info["RT"],
                            "K": K_new
                        },f)
                    # import pdb 
                    # pdb.set_trace()
                    # instance_inds.append(len(instance_inds))
                    continue
                    """
                # fuse_info_old = copy.deepcopy(fuse_info)
                
                # K_old = info["K"]
                K_old = linemod_K.copy()
                K_old[0,2] = (K_old[0,2]+fuse_info[0][seq_idx][1])
                K_old[1,2] = (K_old[1,2]+fuse_info[0][seq_idx][0])

                # maximum depth value = 1, which indicates the invalid regions 
                mask=depth<1
                # transform the range map to depth map 
                depth = range_to_depth(mask, depth*2, K_old)

                # depth = depth* fuse_mask + (1-fuse_mask) # use 1's to indicate the invalid depths
                depth = depth* fuse_mask  # use 0's to indicate the invalid depths
                

                if self.crop_param is not None:
                    # image, depth, K_new=crop(image, depth, mask, K_old, margin_ratio=0.1, output_size=128 )
                    image, depth, K_new=crop(image, depth, mask, K_old, margin_ratio=self.crop_param['margin_ratio'], output_size=self.crop_param['output_size'] )
                else:
                    K_new = K_old

                print(info['image_path'], info['depth_path'], bbox)
                patch_save_path=os.path.join(save_root, seq, f"{info['index']:05d}.jpg")
                depth_patch_save_path=os.path.join(save_root, seq, f"{info['index']:05d}_depth.npy")
                # pose_save_path=os.path.join(save_root, seq, f"{info['index']:05d}_RT.pkl")
                pose_save_path=os.path.join(save_root, seq, f"{info['index']:05d}_params.pkl")

                if not os.path.exists(os.path.join(save_root, seq)):
                    os.makedirs(os.path.join(save_root, seq))
                #save
                cv2.imwrite(patch_save_path,image)
                np.save(depth_patch_save_path, depth)
                with open(pose_save_path, 'wb+') as f:
                    pickle.dump({
                        "RT": fuse_info[1][seq_idx], #info["RT"],
                        "K": K_new,
                        "bbox": bbox
                    },f)


def run(data_type,data_info_path, image_root, depth_root, save_dir, crop_param=None):
    df = DataFormatter(data_type,data_info_path)
    df.process(image_root, depth_root,save_dir)

if __name__=='__main__':
    fire.Fire(run)
    

================================================
FILE: torchplus/__init__.py
================================================
from . import train
from . import nn
from . import metrics
from . import tools

from .tools import change_default_args
from torchplus.ops.array_ops import scatter_nd, gather_nd, roll


================================================
FILE: torchplus/metrics.py
================================================
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn


class Scalar(nn.Module):
    def __init__(self):
        super().__init__()
        self.register_buffer('total', torch.FloatTensor([0.0]))
        self.register_buffer('count', torch.FloatTensor([0.0]))

    def forward(self, scalar):
        if not scalar.eq(0.0):
            self.count += 1
            self.total += scalar.data.float()
        return self.value.cpu()

    @property
    def value(self):
        return self.total / self.count

    def clear(self):
        self.total.zero_()
        self.count.zero_()

class Accuracy(nn.Module):
    def __init__(self,
                 dim=1,
                 ignore_idx=-1,
                 threshold=0.5,
                 encode_background_as_zeros=True):
        super().__init__()
        self.register_buffer('total', torch.FloatTensor([0.0]))
        self.register_buffer('count', torch.FloatTensor([0.0]))
        self._ignore_idx = ignore_idx
        self._dim = dim
        self._threshold = threshold
        self._encode_background_as_zeros = encode_background_as_zeros

    def forward(self, labels, preds, weights=None):
        # labels: [N, ...]
        # preds: [N, C, ...]
        if self._encode_background_as_zeros:
            scores = torch.sigmoid(preds)
            labels_pred = torch.max(preds, dim=self._dim)[1] + 1
            pred_labels = torch.where((scores > self._threshold).any(self._dim),
                                      labels_pred,
                                      torch.tensor(0).type_as(labels_pred))
        else:
            pred_labels = torch.max(preds, dim=self._dim)[1]
        N, *Ds = labels.shape
        labels = labels.view(N, int(np.prod(Ds)))
        pred_labels = pred_labels.view(N, int(np.prod(Ds)))
        if weights is None:
            weights = (labels != self._ignore_idx).float()
        else:
            weights = weights.float()

        num_examples = torch.sum(weights)
        num_examples = torch.clamp(num_examples, min=1.0).float()
        total = torch.sum((pred_labels == labels.long()).float())
        self.count += num_examples
        self.total += total
        return self.value.cpu()
        # return (total /  num_examples.data).cpu()
    @property
    def value(self):
        return self.total / self.count

    def clear(self):
        self.total.zero_()
        self.count.zero_()


class Precision(nn.Module):
    def __init__(self, dim=1, ignore_idx=-1, threshold=0.5):
        super().__init__()
        self.register_buffer('total', torch.FloatTensor([0.0]))
        self.register_buffer('count', torch.FloatTensor([0.0]))
        self._ignore_idx = ignore_idx
        self._dim = dim
        self._threshold = threshold

    def forward(self, labels, preds, weights=None):
        # labels: [N, ...]
        # preds: [N, C, ...]
        if preds.shape[self._dim] == 1:  # BCE
            pred_labels = (torch.sigmoid(preds) >
                           self._threshold).long().squeeze(self._dim)
        else:
            assert preds.shape[
                self._dim] == 2, "precision only support 2 class"
            pred_labels = torch.max(preds, dim=self._dim)[1]
        N, *Ds = labels.shape
        labels = labels.view(N, int(np.prod(Ds)))
        pred_labels = pred_labels.view(N, int(np.prod(Ds)))
        if weights is None:
            weights = (labels != self._ignore_idx).float()
        else:
            weights = weights.float()

        pred_trues = pred_labels > 0
        pred_falses = pred_labels == 0
        trues = labels > 0
        falses = labels == 0
        true_positives = (weights * (trues & pred_trues).float()).sum()
        true_negatives = (weights * (falses & pred_falses).float()).sum()
        false_positives = (weights * (falses & pred_trues).float()).sum()
        false_negatives = (weights * (trues & pred_falses).float()).sum()
        count = true_positives + false_positives
        # print(count, true_positives)
        if count > 0:
            self.count += count
            self.total += true_positives
        return self.value.cpu()
        # return (total /  num_examples.data).cpu()
    @property
    def value(self):
        return self.total / self.count
    def clear(self):
        self.total.zero_()
        self.count.zero_()


class Recall(nn.Module):
    def __init__(self, dim=1, ignore_idx=-1, threshold=0.5):
        super().__init__()
        self.register_buffer('total', torch.FloatTensor([0.0]))
        self.register_buffer('count', torch.FloatTensor([0.0]))
        self._ignore_idx = ignore_idx
        self._dim = dim
        self._threshold = threshold

    def forward(self, labels, preds, weights=None):
        # labels: [N, ...]
        # preds: [N, C, ...]
        if preds.shape[self._dim] == 1:  # BCE
            pred_labels = (torch.sigmoid(preds) >
                           self._threshold).long().squeeze(self._dim)
        else:
            assert preds.shape[
                self._dim] == 2, "precision only support 2 class"
            pred_labels = torch.max(preds, dim=self._dim)[1]
        N, *Ds = labels.shape
        labels = labels.view(N, int(np.prod(Ds)))
        pred_labels = pred_labels.view(N, int(np.prod(Ds)))
        if weights is None:
            weights = (labels != self._ignore_idx).float()
        else:
            weights = weights.float()
        pred_trues = pred_labels == 1
        pred_falses = pred_labels == 0
        trues = labels == 1
        falses = labels == 0
        true_positives = (weights * (trues & pred_trues).float()).sum()
        true_negatives = (weights * (falses & pred_falses).float()).sum()
        false_positives = (weights * (falses & pred_trues).float()).sum()
        false_negatives = (weights * (trues & pred_falses).float()).sum()
        count = true_positives + false_negatives
        if count > 0:
            self.count += count
            self.total += true_positives
        return self.value.cpu()
        # return (total /  num_examples.data).cpu()
    @property
    def value(self):
        return self.total / self.count
    def clear(self):
        self.total.zero_()
        self.count.zero_()


def _calc_binary_metrics(labels,
                         scores,
                         weights=None,
                         ignore_idx=-1,
                         threshold=0.5):

    pred_labels = (scores > threshold).long()
    N, *Ds = labels.shape
    labels = labels.view(N, int(np.prod(Ds)))
    pred_labels = pred_labels.view(N, int(np.prod(Ds)))
    pred_trues = pred_labels > 0
    pred_falses = pred_labels == 0
    trues = labels > 0
    falses = labels == 0
    true_positives = (weights * (trues & pred_trues).float()).sum()
    true_negatives = (weights * (falses & pred_falses).float()).sum()
    false_positives = (weights * (falses & pred_trues).float()).sum()
    false_negatives = (weights * (trues & pred_falses).float()).sum()
    return true_positives, true_negatives, false_positives, false_negatives


class PrecisionRecall(nn.Module):
    def __init__(self,
                 dim=1,
                 ignore_idx=-1,
                 thresholds=0.5,
                 use_sigmoid_score=False,
                 encode_background_as_zeros=True):
        super().__init__()
        if not isinstance(thresholds, (list, tuple)):
            thresholds = [thresholds]

        self.register_buffer('prec_total',
                             torch.FloatTensor(len(thresholds)).zero_())
        self.register_buffer('prec_count',
                             torch.FloatTensor(len(thresholds)).zero_())
        self.register_buffer('rec_total',
                             torch.FloatTensor(len(thresholds)).zero_())
        self.register_buffer('rec_count',
                             torch.FloatTensor(len(thresholds)).zero_())

        self._ignore_idx = ignore_idx
        self._dim = dim
        self._thresholds = thresholds
        self._use_sigmoid_score = use_sigmoid_score
        self._encode_background_as_zeros = encode_background_as_zeros

    def forward(self, labels, preds, weights=None):
        # labels: [N, ...]
        # preds: [N, ..., C]
        if self._encode_background_as_zeros:
            # this don't support softmax
            assert self._use_sigmoid_score is True
            total_scores = torch.sigmoid(preds)
            # scores, label_preds = torch.max(total_scores, dim=1)
        else:
            if self._use_sigmoid_score:
                total_scores = torch.sigmoid(preds)[..., 1:]
            else:
                total_scores = F.softmax(preds, dim=-1)[..., 1:]
        """
        if preds.shape[self._dim] == 1:  # BCE
            scores = torch.sigmoid(preds)
        else:
            # assert preds.shape[
            #     self._dim] == 2, "precision only support 2 class"
            # TODO: add support for [N, C, ...] format.
            # TODO: add multiclass support
            if self._use_sigmoid_score:
                scores = torch.sigmoid(preds)[:, ..., 1:].sum(-1)
            else:
                scores = F.softmax(preds, dim=self._dim)[:, ..., 1:].sum(-1)
        """
        scores = torch.max(total_scores, dim=-1)[0]
        if weights is None:
            weights = (labels != self._ignore_idx).float()
        else:
            weights = weights.float()
        for i, thresh in enumerate(self._thresholds):
            tp, tn, fp, fn = _calc_binary_metrics(labels, scores, weights,
                                                  self._ignore_idx, thresh)
            rec_count = tp + fn
            prec_count = tp + fp
            if rec_count > 0:
                self.rec_count[i] += rec_count
                self.rec_total[i] += tp
            if prec_count > 0:
                self.prec_count[i] += prec_count
                self.prec_total[i] += tp

        return self.value
        # return (total /  num_examples.data).cpu()
    @property
    def value(self):
        prec_count = torch.clamp(self.prec_count, min=1.0)
        rec_count = torch.clamp(self.rec_count, min=1.0)
        return ((self.prec_total / prec_count).cpu(),
                (self.rec_total / rec_count).cpu())

    @property
    def thresholds(self):
        return self._thresholds

    def clear(self):
        self.rec_count.zero_()
        self.prec_count.zero_()
        self.prec_total.zero_()
        self.rec_total.zero_()


================================================
FILE: torchplus/nn/__init__.py
================================================
from torchplus.nn.functional import one_hot
from torchplus.nn.modules.common import Empty, Sequential
from torchplus.nn.modules.normalization import GroupNorm


================================================
FILE: torchplus/nn/functional.py
================================================
import torch

def one_hot(tensor, depth, dim=-1, on_value=1.0, dtype=torch.float32):
    tensor_onehot = torch.zeros(
        *list(tensor.shape), depth, dtype=dtype, device=tensor.device)
    tensor_onehot.scatter_(dim, tensor.unsqueeze(dim).long(), on_value)
    return tensor_onehot


================================================
FILE: torchplus/nn/modules/__init__.py
================================================


================================================
FILE: torchplus/nn/modules/common.py
================================================
import sys
from collections import OrderedDict

import torch
from torch.nn import functional as F


class Empty(torch.nn.Module):
    def __init__(self, *args, **kwargs):
        super(Empty, self).__init__()
        self.weight = torch.zeros([1, ])  # dummy varaible

    def forward(self, *args, **kwargs):
        if len(args) == 1:
            return args[0]
        elif len(args) == 0:
            return None
        return args


class Sequential(torch.nn.Module):
    r"""A sequential container.
    Modules will be added to it in the order they are passed in the constructor.
    Alternatively, an ordered dict of modules can also be passed in.

    To make it easier to understand, given is a small example::

        # Example of using Sequential
        model = Sequential(
                  nn.Conv2d(1,20,5),
                  nn.ReLU(),
                  nn.Conv2d(20,64,5),
                  nn.ReLU()
                )

        # Example of using Sequential with OrderedDict
        model = Sequential(OrderedDict([
                  ('conv1', nn.Conv2d(1,20,5)),
                  ('relu1', nn.ReLU()),
                  ('conv2', nn.Conv2d(20,64,5)),
                  ('relu2', nn.ReLU())
                ]))

        # Example of using Sequential with kwargs(python 3.6+)
        model = Sequential(
                  conv1=nn.Conv2d(1,20,5),
                  relu1=nn.ReLU(),
                  conv2=nn.Conv2d(20,64,5),
                  relu2=nn.ReLU()
                )
    """

    def __init__(self, *args, **kwargs):
        super(Sequential, self).__init__()
        if len(args) == 1 and isinstance(args[0], OrderedDict):
            for key, module in args[0].items():
                self.add_module(key, module)
        else:
            for idx, module in enumerate(args):
                self.add_module(str(idx), module)
        for name, module in kwargs.items():
            if sys.version_info < (3, 6):
                raise ValueError("kwargs only supported in py36+")
            if name in self._modules:
                raise ValueError("name exists.")
            self.add_module(name, module)

    def __getitem__(self, idx):
        if not (-len(self) <= idx < len(self)):
            raise IndexError('index {} is out of range'.format(idx))
        if idx < 0:
            idx += len(self)
        it = iter(self._modules.values())
        for i in range(idx):
            next(it)
        return next(it)

    def __len__(self):
        return len(self._modules)

    def add(self, module, name=None):
        if name is None:
            name = str(len(self._modules))
            if name in self._modules:
                raise KeyError("name exists")
        self.add_module(name, module)

    def forward(self, input):
        # i = 0
        for module in self._modules.values():
            # print(i)
            input = module(input)
            # i += 1
        return input


================================================
FILE: torchplus/nn/modules/normalization.py
================================================
import torch


class GroupNorm(torch.nn.GroupNorm):
    def __init__(self, num_channels, num_groups, eps=1e-5, affine=True):
        super().__init__(
            num_groups=num_groups,
            num_channels=num_channels,
            eps=eps,
            affine=affine)


================================================
FILE: torchplus/ops/__init__.py
================================================


================================================
FILE: torchplus/ops/array_ops.py
================================================
import ctypes
import math
import time
import torch
from typing import Optional


def scatter_nd(indices, updates, shape):
    """pytorch edition of tensorflow scatter_nd.
    this function don't contain except handle code. so use this carefully
    when indice repeats, don't support repeat add which is supported
    in tensorflow.
    """
    ret = torch.zeros(*shape, dtype=updates.dtype, device=updates.device)
    ndim = indices.shape[-1]
    output_shape = list(indices.shape[:-1]) + shape[indices.shape[-1]:]
    flatted_indices = indices.view(-1, ndim)
    slices = [flatted_indices[:, i] for i in range(ndim)]
    slices += [Ellipsis]
    ret[slices] = updates.view(*output_shape)
    return ret


def gather_nd(params, indices):
    # this function has a limit that MAX_ADVINDEX_CALC_DIMS=5
    ndim = indices.shape[-1]
    output_shape = list(indices.shape[:-1]) + list(params.shape[indices.shape[-1]:])
    flatted_indices = indices.view(-1, ndim)
    slices = [flatted_indices[:, i] for i in range(ndim)]
    slices += [Ellipsis]
    return params[slices].view(*output_shape)


def roll(x: torch.Tensor, shift: int, dim: int = -1, fill_pad: Optional[int] = None):
    """
        shift<0: left roll
        shift>0: right roll
    """  
    device = x.device
    
    if 0 == shift:
        return x

    elif shift < 0:
        shift = -shift
        gap = x.index_select(dim, torch.arange(shift, device=device))
        if fill_pad is not None:
            gap = fill_pad * torch.ones_like(gap, device=device)
        return torch.cat([x.index_select(dim, torch.arange(shift, x.size(dim), device=device)), gap], dim=dim)

    else:
        shift = x.size(dim) - shift
        gap = x.index_select(dim, torch.arange(shift, x.size(dim), device=device))
        if fill_pad is not None:
            gap = fill_pad * torch.ones_like(gap, device=device)
        return torch.cat([gap, x.index_select(dim, torch.arange(shift, device=device))], dim=dim) 

================================================
FILE: torchplus/tools.py
================================================
import functools
import inspect
import sys
from collections import OrderedDict

import numba
import numpy as np
import torch


def get_pos_to_kw_map(func):
    pos_to_kw = {}
    fsig = inspect.signature(func)
    pos = 0
    for name, info in fsig.parameters.items():
        if info.kind is info.POSITIONAL_OR_KEYWORD:
            pos_to_kw[pos] = name
        pos += 1
    return pos_to_kw


def get_kw_to_default_map(func):
    kw_to_default = {}
    fsig = inspect.signature(func)
    for name, info in fsig.parameters.items():
        if info.kind is info.POSITIONAL_OR_KEYWORD:
            if info.default is not info.empty:
                kw_to_default[name] = info.default
    return kw_to_default


# def change_default_args(**kwargs):
#     def layer_wrapper(layer_class):
#         class DefaultArgLayer(layer_class):
#             def __init__(self, *args, **kw):
#                 pos_to_kw = get_pos_to_kw_map(layer_class.__init__)
#                 kw_to_pos = {kw: pos for pos, kw in pos_to_kw.items()}
#                 for key, val in kwargs.items():
#                     if key not in kw and kw_to_pos[key] > len(args):
#                         kw[key] = val
#                 super().__init__(*args, **kw)

#         return DefaultArgLayer

#     return layer_wrapper

def change_default_args(**kwargs):
    def layer_wrapper(layer_class):
        class DefaultArgLayer(layer_class):
            def __init__(self, *args, **kw):
                pos_to_kw = get_pos_to_kw_map(layer_class.__init__)
                kw_to_pos = {kw: pos for pos, kw in pos_to_kw.items()}
                for key, val in kwargs.items():
                    if key not in kw and kw_to_pos[key] > len(args):
                        kw[key] = val
                super(DefaultArgLayer,self).__init__(*args, **kw)

        return DefaultArgLayer

    return layer_wrapper
def torch_to_np_dtype(ttype):
    type_map = {
        torch.float16: np.dtype(np.float16),
        torch.float32: np.dtype(np.float32),
        torch.float16: np.dtype(np.float64),
        torch.int32: np.dtype(np.int32),
        torch.int64: np.dtype(np.int64),
        torch.uint8: np.dtype(np.uint8),
    }
    return type_map[ttype]


================================================
FILE: torchplus/train/__init__.py
================================================
from torchplus.train.checkpoint import (latest_checkpoint, restore,
                                        restore_latest_checkpoints,
                                        restore_models, save, save_models,
                                        try_restore_latest_checkpoints,
                                        save_models_cpu
                                        )
from torchplus.train.common import create_folder
from torchplus.train.optim import MixedPrecisionWrapper


================================================
FILE: torchplus/train/checkpoint.py
================================================
import json
import logging
import os
import signal
from pathlib import Path

import torch


class DelayedKeyboardInterrupt(object):
    def __enter__(self):
        self.signal_received = False
        self.old_handler = signal.signal(signal.SIGINT, self.handler)

    def handler(self, sig, frame):
        self.signal_received = (sig, frame)
        logging.debug('SIGINT received. Delaying KeyboardInterrupt.')

    def __exit__(self, type, value, traceback):
        signal.signal(signal.SIGINT, self.old_handler)
        if self.signal_received:
            self.old_handler(*self.signal_received)


def latest_checkpoint(model_dir, model_name):
    """return path of latest checkpoint in a model_dir
    Args:
        model_dir: string, indicate your model dir(save ckpts, summarys,
            logs, etc).
        model_name: name of your model. we find ckpts by name
    Returns:
        path: None if isn't exist or latest checkpoint path.
    """
    ckpt_info_path = Path(model_dir) / "checkpoints.json"
    if not ckpt_info_path.is_file():
        return None
    with open(ckpt_info_path, 'r') as f:
        ckpt_dict = json.loads(f.read())
    if model_name not in ckpt_dict['latest_ckpt']:
        return None
    latest_ckpt = ckpt_dict['latest_ckpt'][model_name]
    ckpt_file_name = Path(model_dir) / latest_ckpt
    if not ckpt_file_name.is_file():
        return None

    return str(ckpt_file_name)


def _ordered_unique(seq):
    seen = set()
    return [x for x in seq if not (x in seen or seen.add(x))]


def save(model_dir,
         model,
         model_name,
         global_step,
         max_to_keep=8,
         keep_latest=True):
    """save a model into model_dir.
    Args:
        model_dir: string, indicate your model dir(save ckpts, summarys,
            logs, etc).
        model: torch.nn.Module instance.
        model_name: name of your model. we find ckpts by name
        global_step: int, indicate current global step.
        max_to_keep: int, maximum checkpoints to keep.
        keep_latest: bool, if True and there are too much ckpts, 
            will delete oldest ckpt. else will delete ckpt which has
            smallest global step.
    Returns:
        path: None if isn't exist or latest checkpoint path.
    """

    # prevent save incomplete checkpoint due to key interrupt
    with DelayedKeyboardInterrupt():
        ckpt_info_path = Path(model_dir) / "checkpoints.json"
        ckpt_filename = "{}-{}.tckpt".format(model_name, global_step)
        ckpt_path = Path(model_dir) / ckpt_filename
        if not ckpt_info_path.is_file():
            ckpt_info_dict = {'latest_ckpt': {}, 'all_ckpts': {}}
        else:
            with open(ckpt_info_path, 'r') as f:
                ckpt_info_dict = json.loads(f.read())
        ckpt_info_dict['latest_ckpt'][model_name] = ckpt_filename
        if model_name in ckpt_info_dict['all_ckpts']:
            ckpt_info_dict['all_ckpts'][model_name].append(ckpt_filename)
        else:
            ckpt_info_dict['all_ckpts'][model_name] = [ckpt_filename]
        all_ckpts = ckpt_info_dict['all_ckpts'][model_name]

        torch.save(model.state_dict(), ckpt_path)
        # check ckpt in all_ckpts is exist, if not, delete it from all_ckpts
        all_ckpts_checked = []
        for ckpt in all_ckpts:
            ckpt_path_uncheck = Path(model_dir) / ckpt
            if ckpt_path_uncheck.is_file():
                all_ckpts_checked.append(str(ckpt_path_uncheck))
        all_ckpts = all_ckpts_checked
        if len(all_ckpts) > max_to_keep:
           
            if keep_latest:
                ckpt_to_delete = all_ckpts.pop(0)
            else:
                # delete smallest step
                def get_step(name): return int(
                    name.split('.')[0].split('-')[1])
                min_step = min([get_step(name) for name in all_ckpts])
                ckpt_to_delete = "{}-{}.tckpt".format(model_name, min_step)
                all_ckpts.remove(ckpt_to_delete)
            os.remove(str(Path(model_dir) / ckpt_to_delete))
        all_ckpts_filename = _ordered_unique([Path(f).name for f in all_ckpts])
        ckpt_info_dict['all_ckpts'][model_name] = all_ckpts_filename
        with open(ckpt_info_path, 'w') as f:
            f.write(json.dumps(ckpt_info_dict, indent=2))


def restore(ckpt_path, model, map_func=None, map_location='cpu'):
    if not Path(ckpt_path).is_file():
        raise ValueError("checkpoint {} not exist.".format(ckpt_path))
    state_dict = torch.load(ckpt_path, map_location=map_location)
    if map_func is not None:
        # map_func(state_dict)
        state_dict = map_func(state_dict) #modified @01/01/2020
    model.load_state_dict(state_dict)
    print("Restoring parameters from {}".format(ckpt_path))


def _check_model_names(models):
    model_names = []
    for model in models:
        if not hasattr(model, "name"):
            raise ValueError("models must have name attr")
        model_names.append(model.name)
    if len(model_names) != len(set(model_names)):
        raise ValueError("models must have unique name: {}".format(
            ", ".join(model_names)))


def _get_name_to_model_map(models):
    if isinstance(models, dict):
        name_to_model = {name: m for name, m in models.items()}
    else:
        _check_model_names(models)
        name_to_model = {m.name: m for m in models}
    return name_to_model


def try_restore_latest_checkpoints(model_dir, models, map_func=None, map_location='cpu'):
    name_to_model = _get_name_to_model_map(models)
    for name, model in name_to_model.items():
        latest_ckpt = latest_checkpoint(model_dir, name)
        if latest_ckpt is not None:
            restore(latest_ckpt, model, map_func, map_location)


def restore_latest_checkpoints(model_dir, models, map_func=None,  map_location='cpu'):
    name_to_model = _get_name_to_model_map(models)
    for name, model in name_to_model.items():
        latest_ckpt = latest_checkpoint(model_dir, name)
        if latest_ckpt is not None:
            restore(latest_ckpt, model, map_func,map_location)
        else:
            raise ValueError("model {}\'s ckpt isn't exist".format(name))


def restore_models(model_dir, models, global_step, map_func=None, map_location='cpu'):
    name_to_model = _get_name_to_model_map(models)
    for name, model in name_to_model.items():
        ckpt_filename = "{}-{}.tckpt".format(name, global_step)
        ckpt_path = model_dir + "/" + ckpt_filename
        restore(ckpt_path, model, map_func, map_location)


def save_models(model_dir,
                models,
                global_step,
                max_to_keep=15,
                keep_latest=True):
    with DelayedKeyboardInterrupt():
        name_to_model = _get_name_to_model_map(models)
        for name, model in name_to_model.items():
            save(model_dir, model, name, global_step, max_to_keep, keep_latest)


def gpu_to_cpu(models):
    if len(models) == 1:
        models[0].cpu()
        return
    for state in models[1].state.values():
        for k, v in state.items():
            if isinstance(v, torch.Tensor):
                state[k] = v.cpu()
    models[0].cpu()


def cpu_to_gpu(models):
    if len(models) == 1:
        models[0].cuda()
        return
    for state in models[1].state.values():
        for k, v in state.items():
            if torch.is_tensor(v):
                state[k] = v.cuda()
    models[0].cuda()


def save_models_cpu(model_dir,
                    models,
                    global_step,
                    max_to_keep=15,
                    keep_latest=True):
    with DelayedKeyboardInterrupt():
        name_to_model = _get_name_to_model_map(models)
        gpu_to_cpu(models)
        for name, model in name_to_model.items():
            save(model_dir, model, name, global_step, max_to_keep, keep_latest)
        cpu_to_gpu(models)


================================================
FILE: torchplus/train/common.py
================================================
import datetime
import os
import shutil

def create_folder(prefix, add_time=True, add_str=None, delete=False):
    additional_str = ''
    if delete is True:
        if os.path.exists(prefix):
            shutil.rmtree(prefix)
        os.makedirs(prefix)
    folder = prefix
    if add_time is True:
        # additional_str has a form such as '170903_220351'
        additional_str += datetime.datetime.now().strftime("%y%m%d_%H%M%S")
        if add_str is not None:
            folder += '/' + additional_str + '_' + add_str
        else:
            folder += '/' + additional_str
    if delete is True:
        if os.path.exists(folder):
            shutil.rmtree(folder)
    os.makedirs(folder)
    return folder

================================================
FILE: torchplus/train/fastai_optim.py
================================================
from collections import Iterable, defaultdict
from copy import deepcopy
from itertools import chain

import torch
from torch import nn
from torch._utils import _unflatten_dense_tensors
from torch.autograd import Variable
from torch.nn.utils import parameters_to_vector

bn_types = (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, )


def split_bn_bias(layer_groups):
    "Split the layers in `layer_groups` into batchnorm (`bn_types`) and non-batchnorm groups."
    split_groups = []
    for l in layer_groups:
        l1, l2 = [], []
        for c in l.children():
            if isinstance(c, bn_types):
                l2.append(c)
            else:
                l1.append(c)
        split_groups += [nn.Sequential(*l1), nn.Sequential(*l2)]
    return split_groups


def get_master(layer_groups, flat_master: bool = False):
    "Return two lists, one for the model parameters in FP16 and one for the master parameters in FP32."
    split_groups = split_bn_bias(layer_groups)
    model_params = [[
        param for param in lg.parameters() if param.requires_grad
    ] for lg in split_groups]
    if flat_master:
        master_params = []
        for lg in model_params:
            if len(lg) != 0:
                mp = parameters_to_vector([param.data.float() for param in lg])
                mp = torch.nn.Parameter(mp, requires_grad=True)
                if mp.grad is None:
                    mp.grad = mp.new(*mp.size())
                master_params.append([mp])
            else:
                master_params.append([])
        return model_params, master_params
    else:
        master_params = [[param.clone().float().detach() for param in lg]
                         for lg in model_params]
        for mp in master_params:
            for param in mp:
                param.requires_grad = True
        return model_params, master_params


def model_g2master_g(model_params, master_params,
                     flat_master: bool = False) -> None:
    "Copy the `model_params` gradients to `master_params` for the optimizer step."
    if flat_master:
        for model_group, master_group in zip(model_params, master_params):
            if len(master_group) != 0:
                master_group[0].grad.data.copy_(
                    parameters_to_vector(
                        [p.grad.data.float() for p in model_group]))
    else:
        for model_group, master_group in zip(model_params, master_params):
            for model, master in zip(model_group, master_group):
                if model.grad is not None:
                    if master.grad is None:
                        master.grad = master.data.new(*master.data.size())
                    master.grad.data.copy_(model.grad.data)
                else:
                    master.grad = None


def master2model(model_params, master_params,
                 flat_master: bool = False) -> None:
    "Copy `master_params` to `model_params`."
    if flat_master:
        for model_group, master_group in zip(model_params, master_params):
            if len(model_group) != 0:
                for model, master in zip(
                        model_group,
                        _unflatten_dense_tensors(master_group[0].data,
                                                 model_group)):
                    model.data.copy_(master)
    else:
        for model_group, master_group in zip(model_params, master_params):
            for model, master in zip(model_group, master_group):
                model.data.copy_(master.data)


def listify(p=None, q=None):
    "Make `p` listy and the same length as `q`."
    if p is None:
        p = []
    elif isinstance(p, str):
        p = [p]
    elif not isinstance(p, Iterable):
        p = [p]
    n = q if type(q) == int else len(p) if q is None else len(q)
    if len(p) == 1:
        p = p * n
    assert len(p) == n, f'List len mismatch ({len(p)} vs {n})'
    return list(p)


def trainable_params(m: nn.Module):
    "Return list of trainable params in `m`."

    res = filter(lambda p: p.requires_grad, m.parameters())
    return res


def is_tuple(x) -> bool:
    return isinstance(x, tuple)


# copy from fastai.
class OptimWrapper(torch.optim.Optimizer):
    "Basic wrapper around `opt` to simplify hyper-parameters changes."

    def __init__(self, opt, wd, true_wd: bool = False, bn_wd: bool = True):
        # super().__init__(opt.param_groups, dict())
        self.opt, self.true_wd, self.bn_wd = opt, true_wd, bn_wd
        self.opt_keys = list(self.opt.param_groups[0].keys())
        self.opt_keys.remove('params')
        self.read_defaults()
        self.wd = wd
        self.param_segs=[]
    @classmethod
    def create(cls, opt_func, lr, layer_groups, **kwargs):
        "Create an `optim.Optimizer` from `opt_func` with `lr`. Set lr on `layer_groups`."

        # param_segs=[]
        if len(layer_groups)==1:
            split_groups = split_bn_bias(layer_groups) #non-bn and bns
        else:
            split_groups = []
            for lg in layer_groups:
                split_groups+=split_bn_bias(lg)
                # param_segs.append(len(lg ))
            #
            # buf = layer_groups[0] 
            # for i in range(1,len(layer_groups)):
            #     buf+=layer_groups[i]

            # layer_groups =buf # nn.Sequential(*buf)
                
        opt = opt_func([{
            'params': trainable_params(l),
            'lr': 0
        } for l in split_groups])
        # import pdb 
        # pdb.set_trace()
        opt = cls(opt, **kwargs)
        opt.lr, opt.opt_func = listify(lr, layer_groups), opt_func
        
        # opt.param_segs=param_segs
        return opt

    def new(self, layer_groups):
        "Create a new `OptimWrapper` from `self` with another `layer_groups` but the same hyper-parameters."
        opt_func = getattr(self, 'opt_func', self.opt.__class__)
        split_groups = split_bn_bias(layer_groups)
        opt = opt_func([{
            'params': trainable_params(l),
            'lr': 0
        } for l in split_groups])
        return self.create(
            opt_func,
            self.lr,
            layer_groups,
            wd=self.wd,
            true_wd=self.true_wd,
            bn_wd=self.bn_wd)

    def __repr__(self) -> str:
        return f'OptimWrapper over {repr(self.opt)}.\nTrue weight decay: {self.true_wd}'

    # Pytorch optimizer methods
    def step(self) -> None:
        "Set weight decay and step optimizer."
        # weight decay outside of optimizer step (AdamW)
        if self.true_wd:
            for lr, wd, pg1, pg2 in zip(self._lr, self._wd,
                                        self.opt.param_groups[::2],
                                        self.opt.param_groups[1::2]):
                for p in pg1['params']:
                    p.data.mul_(1 - wd * lr)
                if self.bn_wd:
                    for p in pg2['params']:
                        p.data.mul_(1 - wd * lr)
            self.set_val('weight_decay', listify(0, self._wd))
        self.opt.step()

    def zero_grad(self) -> None:
        "Clear optimizer gradients."
        self.opt.zero_grad()

    # Passthrough to the inner opt.
    def __getstate__(self):
        return self.opt.__getstate__()

    def __setstate__(self, state):
        return self.opt.__setstate__(state)

    def state_dict(self):
        return self.opt.state_dict()

    def load_state_dict(self, state_dict):
        return self.opt.load_state_dict(state_dict)

    def add_param_group(self, param_group):
        return self.opt.add_param_group(param_group)

    def clear(self):
        "Reset the state of the inner optimizer."
        sd = self.state_dict()
        sd['state'] = {}
        self.load_state_dict(sd)

    @property
    def param_groups(self):
        return self.opt.param_groups

    @property
    def defaults(self):
        return self.opt.defaults

    @property
    def state(self):
        return self.opt.state

    # Hyperparameters as properties
    @property
    def lr(self) -> float:
        return self._lr[-1]

    @lr.setter
    def lr(self, val: float) -> None:
        self._lr = self.set_val('lr', listify(val, self._lr))

    @property
    def mom(self) -> float:
        return self._mom[-1]

    @mom.setter
    def mom(self, val: float) -> None:
        if 'momentum' in self.opt_keys:
            self.set_val('momentum', listify(val, self._mom))
        elif 'betas' in self.opt_keys:
            self.set_val('betas', (listify(val, self._mom), self._beta))
        self._mom = listify(val, self._mom)

    @property
    def beta(self) -> float:
        return None if self._beta is None else self._beta[-1]

    @beta.setter
    def beta(self, val: float) -> None:
        "Set beta (or alpha as makes sense for given optimizer)."
        if val is None:
            return
        if 'betas' in self.opt_keys:
            self.set_val('betas', (self._mom, listify(val, self._beta)))
        elif 'alpha' in self.opt_keys:
            self.set_val('alpha', listify(val, self._beta))
        self._beta = listify(val, self._beta)

    @property
    def wd(self) -> float:
        return self._wd[-1]

    @wd.setter
    def wd(self, val: float) -> None:
        "Set weight decay."
        if not self.true_wd:
            self.set_val(
                'weight_decay', listify(val, self._wd), bn_groups=self.bn_wd)
        self._wd = listify(val, self._wd)

    # Helper functions
    def read_defaults(self) -> None:
        "Read the values inside the optimizer for the hyper-parameters."
        self._beta = None
        if 'lr' in self.opt_keys:
            self._lr = self.read_val('lr')
        if 'momentum' in self.opt_keys:
            self._mom = self.read_val('momentum')
        if 'alpha' in self.opt_keys:
            self._beta = self.read_val('alpha')
        if 'betas' in self.opt_keys:
            self._mom, self._beta = self.read_val('betas')
        if 'weight_decay' in self.opt_keys:
            self._wd = self.read_val('weight_decay')

    def set_val(self, key: str, val, bn_groups: bool = True):
        "Set `val` inside the optimizer dictionary at `key`."
        if is_tuple(val):
            val = [(v1, v2) for v1, v2 in zip(*val)]
        for v, pg1, pg2 in zip(val, self.opt.param_groups[::2],
                               self.opt.param_groups[1::2]):
            pg1[key] = v
            if bn_groups:
                pg2[key] = v
        return val

    def read_val(self, key: str):
        "Read a hyperparameter `key` in the optimizer dictionary."
        val = [pg[key] for pg in self.opt.param_groups[::2]]
        if is_tuple(val[0]):
            val = [o[0] for o in val], [o[1] for o in val]
        return val


class FastAIMixedOptim(OptimWrapper):
    @classmethod
    def create(cls,
               opt_func,
               lr,
               layer_groups,
               model,
               flat_master=False,
               loss_scale=512.0,
               **kwargs):
        "Create an `optim.Optimizer` from `opt_func` with `lr`. Set lr on `layer_groups`."
        opt = OptimWrapper.create(opt_func, lr, layer_groups, **kwargs)
        opt.model_params, opt.master_params = get_master(
            layer_groups, flat_master)
        opt.flat_master = flat_master
        opt.loss_scale = loss_scale
        opt.model = model
        # Changes the optimizer so that the optimization step is done in FP32.
        # opt = self.learn.opt
        mom, wd, beta = opt.mom, opt.wd, opt.beta
        lrs = [lr for lr in opt._lr for _ in range(2)]
        opt_params = [{
            'params': mp,
            'lr': lr
        } for mp, lr in zip(opt.master_params, lrs)]
        opt.opt = opt_func(opt_params)
        opt.mom, opt.wd, opt.beta = mom, wd, beta
        return opt

    def step(self):
        model_g2master_g(self.model_params, self.master_params,
                         self.flat_master)
        for group in self.master_params:
            for param in group:
                param.grad.div_(self.loss_scale)
        super(FastAIMixedOptim, self).step()
        self.model.zero_grad()
        # Update the params from master to model.
        master2model(self.model_params, self.master_params, self.flat_master)


================================================
FILE: torchplus/train/learning_schedules.py
================================================
"""PyTorch edition of TensorFlow learning schedule in tensorflow object
detection API. 
"""
import numpy as np
from torch.optim.optimizer import Optimizer
class _LRSchedulerStep(object):
    def __init__(self, optimizer, last_step=-1):
        if not isinstance(optimizer, Optimizer):
            raise TypeError('{} is not an Optimizer'.format(
                type(optimizer).__name__))
        self.optimizer = optimizer
        if last_step == -1:
            for group in optimizer.param_groups:
                group.setdefault('initial_lr', group['lr'])
        else:
            for i, group in enumerate(optimizer.param_groups):
                if 'initial_lr' not in group:
                    raise KeyError(
                        "param 'initial_lr' is not specified "
                        "in param_groups[{}] when resuming an optimizer".
                        format(i))
        self.base_lrs = list(
            map(lambda group: group['initial_lr'], optimizer.param_groups))
        self.step(last_step + 1)
        self.last_step = last_step

    """
    def get_lr(self):
        raise NotImplementedError
    """

    def get_lr(self):
        ret = [self._get_lr_per_group(base_lr) for base_lr in self.base_lrs]
        return ret

    def _get_lr_per_group(self, base_lr):
        raise NotImplementedError

    def step(self, step=None):
        if step is None:
            step = self.last_step + 1
        self.last_step = step
        for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
            param_group['lr'] = lr


class Constant(_LRSchedulerStep):
    def __init__(self, optimizer, last_step=-1):
        super().__init__(optimizer, last_step)

    def _get_lr_per_group(self, base_lr):
        return base_lr


class ManualStepping(_LRSchedulerStep):
    """Pytorch edition of manual_stepping in tensorflow.
    DON'T SUPPORT PARAM GROUPS.
    """

    def __init__(self, optimizer, boundaries, rates, last_step=-1):
        self._boundaries = boundaries
        self._num_boundaries = len(boundaries)
        self._learning_rates = rates

        if any([b < 0 for b in boundaries]) or any(
            [not isinstance(b, int) for b in boundaries]):
            raise ValueError('boundaries must be a list of positive integers')
        if any(
            [bnext <= b for bnext, b in zip(boundaries[1:], boundaries[:-1])]):
            raise ValueError(
                'Entries in boundaries must be strictly increasing.')
        if any([not isinstance(r, float) for r in rates]):
            raise ValueError('Learning rates must be floats')
        if len(rates) != len(boundaries) + 1:
            raise ValueError('Number of provided learning rates must exceed '
                             'number of boundary points by exactly 1.')
        super().__init__(optimizer, last_step)

    def _get_lr_per_group(self, base_lr):
        step = self.last_step
        ret = None
        for i, bound in enumerate(self._boundaries):
            if step > bound:
                ret = self._learning_rates[i + 1]
        if ret is not None:
            return ret
        return self._learning_rates[0]


class ExponentialDecayWithBurnin(_LRSchedulerStep):
    """Pytorch edition of manual_stepping in tensorflow.
    """

    def __init__(self,
                 optimizer,
                 learning_rate_decay_steps,
                 learning_rate_decay_factor,
                 burnin_learning_rate,
                 burnin_steps,
                 last_step=-1):
        self._decay_steps = learning_rate_decay_steps
        self._decay_factor = learning_rate_decay_factor
        self._burnin_learning_rate = burnin_learning_rate
        self._burnin_steps = burnin_steps

        super().__init__(optimizer, last_step)

    def _get_lr_per_group(self, base_lr):
        if self._burnin_learning_rate == 0:
            burnin_learning_rate = base_lr
        step = self.last_step
        post_burnin_learning_rate = (base_lr * self._decay_factor ^
                                     (step // self._decay_steps))
        if step < self._burnin_steps:
            return burnin_learning_rate
        else:
            return post_burnin_learning_rate


class ExponentialDecay(_LRSchedulerStep):
    def __init__(self,
                 optimizer,
                 learning_rate_decay_steps,
                 learning_rate_decay_factor,
                 staircase=True,
                 last_step=-1):
        self._decay_steps = learning_rate_decay_steps
        self._decay_factor = learning_rate_decay_factor
        self._staircase = staircase

        super().__init__(optimizer, last_step)

    def _get_lr_per_group(self, base_lr):
        step = self.last_step
        if self._staircase:
            post_burnin_learning_rate = base_lr * pow(self._decay_factor,
                                         (step // self._decay_steps))
        else:
            post_burnin_learning_rate = base_lr * pow(self._decay_factor,
                                         (step / self._decay_steps))

        return post_burnin_learning_rate


class CosineDecayWithWarmup(_LRSchedulerStep):
    def __init__(self,
                 optimizer,
                 total_steps,
                 warmup_learning_rate,
                 warmup_steps,
                 last_step=-1):
        if total_steps < warmup_steps:
            raise ValueError('total_steps must be larger or equal to '
                             'warmup_steps.')
        self._total_steps = total_steps
        self._warmup_learning_rate = warmup_learning_rate
        self._warmup_steps = warmup_steps

        super().__init__(optimizer, last_step)

    def _get_lr_per_group(self, base_lr):
        if base_lr < self._warmup_learning_rate:
            raise ValueError('learning_rate_base must be larger '
                             'or equal to warmup_learning_rate.')

        step = self.last_step
        learning_rate = 0.5 * base_lr * (
            1 + np.cos(np.pi *
                       (float(step) - self._warmup_steps
                        ) / float(self._total_steps - self._warmup_steps)))
        if self._warmup_steps > 0:
            slope = (base_lr - self._warmup_learning_rate) / self._warmup_steps
            pre_cosine_learning_rate = slope * float(
                step) + self._warmup_learning_rate
            if step < self._warmup_steps:
                return pre_cosine_learning_rate
            else:
                return learning_rate


class OneCycle(_LRSchedulerStep):
    def __init__(self,
                 optimizer,
                 total_steps,
                 lr_max,
                 moms,
                 div_factor=25,
                 pct_start=0.3,
                 last_step=-1):
        if total_steps < warmup_steps:
            raise ValueError('total_steps must be larger or equal to '
                             'warmup_steps.')
        self._total_steps = total_steps
        self._lr_max = lr_max
        self._moms = moms

        self._warmup_learning_rate = warmup_learning_rate
        self._warmup_steps = warmup_steps

        super().__init__(optimizer, last_step)

    def _get_lr_per_group(self, base_lr):
        if base_lr < self._warmup_learning_rate:
            raise ValueError('learning_rate_base must be larger '
                             'or equal to warmup_learning_rate.')

        step = self.last_step
        learning_rate = 0.5 * base_lr * (
            1 + np.cos(np.pi *
                       (float(step) - self._warmup_steps
                        ) / float(self._total_steps - self._warmup_steps)))
        if self._warmup_steps > 0:
            slope = (base_lr - self._warmup_learning_rate) / self._warmup_steps
            pre_cosine_learning_rate = slope * float(
                step) + self._warmup_learning_rate
            if step < self._warmup_steps:
                return pre_cosine_learning_rate
            else:
                return learning_rate


================================================
FILE: torchplus/train/learning_schedules_fastai.py
================================================
import numpy as np
import math
from functools import partial
import torch


class LRSchedulerStep(object):
    def __init__(self, fai_optimizer, total_step, lr_phases, mom_phases):
        self.optimizer = fai_optimizer
        self.total_step = total_step
        self.lr_phases = []

        for i, (start, lambda_func) in enumerate(lr_phases):
            if len(self.lr_phases) != 0:
                assert self.lr_phases[-1][0] < int(start * total_step)
            if isinstance(lambda_func, str):
                lambda_func = eval(lambda_func)
            if i < len(lr_phases) - 1:
                self.lr_phases.append((int(start * total_step),
                                       int(lr_phases[i + 1][0] * total_step),
                                       lambda_func))
            else:
                self.lr_phases.append((int(start * total_step), total_step,
                                       lambda_func))
        assert self.lr_phases[0][0] == 0
        self.mom_phases = []

        for i, (start, lambda_func) in enumerate(mom_phases):
            if len(self.mom_phases) != 0:
                assert self.mom_phases[-1][0] < int(start * total_step)
            if isinstance(lambda_func, str):
                lambda_func = eval(lambda_func)
            if i < len(mom_phases) - 1:
                self.mom_phases.append((int(start * total_step),
                                        int(mom_phases[i + 1][0] * total_step),
                                        lambda_func))
            else:
                self.mom_phases.append((int(start * total_step), total_step,
                                        lambda_func))
        if len(mom_phases) > 0:
            assert self.mom_phases[0][0] == 0

    def step(self, step):
        lrs = []
        moms = []
        for start, end, func in self.lr_phases:
            if step >= start:
                # func: lr decay function
                lrs.append(func((step - start) / (end - start)))
        if len(lrs) > 0:
            self.optimizer.lr = lrs[-1]
            # import pdb 
            # pdb.set_trace()
            # print(self.optimizer.lr,'!!!!')

        for start, end, func in self.mom_phases:
            if step >= start:
                moms.append(func((step - start) / (end - start)))
                self.optimizer.mom = func((step - start) / (end - start))
        if len(moms) > 0:
            self.optimizer.mom = moms[-1]

    @property
    def learning_rate(self):
        return self.optimizer.lr


def annealing_cos(start, end, pct):
    # print(pct, start, end)
    "Cosine anneal from `start` to `end` as pct goes from 0.0 to 1.0."
    cos_out = np.cos(np.pi * pct) + 1
    return end + (start - end) / 2 * cos_out


class OneCycle(LRSchedulerStep):
    def __init__(self, fai_optimizer, total_step, lr_max, moms, div_factor,
                 pct_start):
        self.lr_max = lr_max
        self.moms = moms
        self.div_factor = div_factor
        self.pct_start = pct_start
        a1 = int(total_step * self.pct_start)
        a2 = total_step - a1
        low_lr = self.lr_max / self.div_factor
        lr_phases = ((0, partial(annealing_cos, low_lr, self.lr_max)),
                     (self.pct_start,
                      partial(annealing_cos, self.lr_max, low_lr / 1e4))
                      )
        mom_phases = ((0, partial(annealing_cos, *self.moms)),
                      (self.pct_start, partial(annealing_cos,
                                               *self.moms[::-1])))
        fai_optimizer.lr, fai_optimizer.mom = low_lr, self.moms[0]

        super().__init__(fai_optimizer, total_step, lr_phases, mom_phases)


class ExponentialDecayWarmup(LRSchedulerStep):
    def __init__(self,
                 fai_optimizer,
                 total_step,
                 initial_learning_rate,
                 decay_length,
                 decay_factor,
                 div_factor=1,
                 pct_start=0,
                 staircase=True):
        """
        Args:
            decay_length: must in (0, 1)
        """
        assert decay_length > 0
        assert decay_length < 1
        self._decay_steps_unified = decay_length
        self._decay_factor = decay_factor
        self._staircase = staircase
        self.div_factor = div_factor
        self.pct_start = pct_start
        step = pct_start*total_step  # 0
        stage = 1
        lr_phases = [
            (0, partial(annealing_cos, initial_learning_rate/div_factor, initial_learning_rate))]
        if staircase:
            while step <= total_step:
                func = lambda p, _d=initial_learning_rate * stage: _d
                lr_phases.append((step / total_step, func))
                stage *= decay_factor
                step += int(decay_length * total_step)
        else:
            def func(p): return pow(decay_factor, (p / decay_length))
            lr_phases.append((pct_start, func))
            # lr_phases.append((step/total_step, func))
        super().__init__(fai_optimizer, total_step, lr_phases, [])


class ExponentialDecay(LRSchedulerStep):
    def __init__(self,
                 fai_optimizer,
                 total_step,
                 initial_learning_rate,
                 decay_length,
                 decay_factor,
                 staircase=True):
        """
        Args:
            decay_length: must in (0, 1)
        """
        assert decay_length > 0
        assert decay_length < 1
        self._decay_steps_unified = decay_length
        self._decay_factor = decay_factor
        self._staircase = staircase
        step = 0
        stage = 1
        lr_phases = []
        if staircase:
            while step <= total_step:
                func = lambda p, _d=initial_learning_rate * stage: _d
                lr_phases.append((step / total_step, func))
                stage *= decay_factor
                step += int(decay_length * total_step)
        else:
            def func(p): return pow(decay_factor, (p / decay_length))
            lr_phases.append((0, func))
        super().__init__(fai_optimizer, total_step, lr_phases, [])


class ManualStepping(LRSchedulerStep):
    def __init__(self, fai_optimizer, total_step, boundaries, rates):
        assert all([b > 0 and b < 1 for b in boundaries])
        assert len(boundaries) + 1 == len(rates)
        boundaries.insert(0, 0.0)
        lr_phases = []
        for start, rate in zip(boundaries, rates):
            def func(p, _d=rate): return _d
            lr_phases.append((start, func))
        super().__init__(fai_optimizer, total_step, lr_phases, [])


class FakeOptim:
    def __init__(self):
        self.lr = 0
        self.mom = 0


if __name__ == "__main__":
    import matplotlib.pyplot as plt
    opt = FakeOptim()  # 3e-3, wd=0.4, div_factor=10
    # schd = OneCycle(opt, 100, 3e-3, (0.95, 0.85), 10.0, 0.4)
    schd = ExponentialDecay(opt, 100, 3e-4, 0.1, 0.8, staircase=True)
    schd = ManualStepping(opt, 100, [0.8, 0.9], [0.001, 0.0001, 0.00005])
    lrs = []
    moms = []
    for i in range(100):
        schd.step(i)
        lrs.append(opt.lr)
        moms.append(opt.mom)

    plt.plot(lrs)
    # plt.plot(moms)
    # plt.show()
    # plt.plot(moms)
    plt.show()


================================================
FILE: torchplus/train/optim.py
================================================
from collections import defaultdict, Iterable

import torch
from copy import deepcopy
from itertools import chain
from torch.autograd import Variable

required = object()

def param_fp32_copy(params):
    param_copy = [
        param.clone().type(torch.cuda.FloatTensor).detach() for param in params
    ]
    for param in param_copy:
        param.requires_grad = True
    return param_copy

def set_grad(params, params_with_grad, scale=1.0):
    for param, param_w_grad in zip(params, params_with_grad):
        if param.grad is None:
            param.grad = torch.nn.Parameter(
                param.data.new().resize_(*param.data.size()))
        grad = param_w_grad.grad.data
        if scale is not None:
            grad /= scale
        if torch.isnan(grad).any() or torch.isinf(grad).any():
            return True # invalid grad
        param.grad.data.copy_(grad)
    return False

class MixedPrecisionWrapper(object):
    """mixed precision optimizer wrapper.
    Arguments:
        optimizer (torch.optim.Optimizer): an instance of 
            :class:`torch.optim.Optimizer`
        scale: (float): a scalar for grad scale.
        auto_scale: (bool): whether enable auto scale.
            The algorihm of auto scale is discribled in 
            http://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
    """

    def __init__(self,
                 optimizer,
                 scale=None,
                 auto_scale=True,
                 inc_factor=2.0,
                 dec_factor=0.5,
                 num_iters_be_stable=500):
        # if not isinstance(optimizer, torch.optim.Optimizer):
        #     raise ValueError("must provide a torch.optim.Optimizer")
        self.optimizer = optimizer
        if hasattr(self.optimizer, 'name'):
            self.name = self.optimizer.name  # for ckpt system
        param_groups_copy = []
        for i, group in enumerate(optimizer.param_groups):
            group_copy = {n: v for n, v in group.items() if n != 'params'}
            group_copy['params'] = param_fp32_copy(group['params'])
            param_groups_copy.append(group_copy)

        # switch param_groups, may be dangerous
        self.param_groups = optimizer.param_groups
        optimizer.param_groups = param_groups_copy
        self.grad_scale = scale
        self.auto_scale = auto_scale
        self.inc_factor = inc_factor
        self.dec_factor = dec_factor
        self.stable_iter_count = 0
        self.num_iters_be_stable = num_iters_be_stable

    def __getstate__(self):
        return self.optimizer.__getstate__()

    def __setstate__(self, state):
        return self.optimizer.__setstate__(state)

    def __repr__(self):
        return self.optimizer.__repr__()

    def state_dict(self):
        return self.optimizer.state_dict()

    def load_state_dict(self, state_dict):
        return self.optimizer.load_state_dict(state_dict)

    def zero_grad(self):
        return self.optimizer.zero_grad()

    def step(self, closure=None):
        for g, g_copy in zip(self.param_groups, self.optimizer.param_groups):
            invalid = set_grad(g_copy['params'], g['params'], self.grad_scale)
            if invalid:
                if self.grad_scale is None or self.auto_scale is False:
                    raise ValueError("nan/inf detected but auto_scale disabled.")
                self.grad_scale *= self.dec_factor
                print('scale decay to {}'.format(self.grad_scale))
                return
        if self.auto_scale is True:
            self.stable_iter_count += 1
            if self.stable_iter_count > self.num_iters_be_stable:
                if self.grad_scale is not None:
                    self.grad_scale *= self.inc_factor
                self.stable_iter_count = 0

        if closure is None:
            self.optimizer.step()
        else:
            self.optimizer.step(closure)
        for g, g_copy in zip(self.param_groups, self.optimizer.param_groups):
            for p_copy, p in zip(g_copy['params'], g['params']):
                p.data.copy_(p_copy.data)


================================================
FILE: utils/__init__.py
================================================


================================================
FILE: utils/config_io.py
================================================

from easydict import EasyDict as edict
import os
import shutil
import yaml


def mkdir_if_not_exists(path):
    """Make a directory if it does not exist.
    Args:
        path: directory to create
    """
    if not os.path.exists(path):
        os.makedirs(path)


def read_yaml(filename):
    """Load yaml file as a dictionary item
    Args:
        filename (str): .yaml file path
    Returns:
        cfg (dict): configuration
    """
    if filename is not None:
        with open(filename, 'r') as f:
            return yaml.load(f, Loader=yaml.FullLoader)
    else:
        return {}


def copy_file(src_file, tgt_file):
    """Copy a file
    Args:
        src_file (str): source file
        tgt_file (str): target file
    """
    shutil.copyfile(src_file, tgt_file)


def update_dict(dict1, dict2, intersection=False):
    """update dict1 according to dict2
    Args:
        dict1 (dict): reference dictionary
        dict2 (dict): new dictionary
    return
        dict1 (dict): updated reference dictionary
    """
    for item in dict2:
        # if dict1.get(item, -1) != -1:
        if item in dict1:
            if isinstance(dict1[item], dict):
                dict1[item] = update_dict(dict1[item], dict2[item], intersection)
            else:
                dict1[item] = dict2[item]
        else:
            if not intersection:
                dict1[item] = dict2[item]
            else:
                raise ValueError(f"Key '{item}' is in the second dict but not in the first dict!")
    #inverse check
    for item in dict1:
        if item not in dict2:
            print(f"Warning: key {item} is not given and will use the default values")
    
    return dict1


def merge_cfg(cfg_files, intersection=False):
    """merge default configuration and custom configuration
    Args:
        cfg_files (str): configuration file paths [default, custom]
    Returns:
        cfg (edict): merged EasyDict
    """
    edict_items = []
    # cfg = {}
    cfg = read_yaml(cfg_files[0])
    # for f in cfg_files:
    for f in cfg_files[1:]:
        # if f is not None:
        if os.path.exists(f):
            cfg = update_dict(cfg, read_yaml(f), intersection)
        else:
            raise ValueError(f"File {f} does not exist." )
    return edict(cfg)


def write_cfg(default, custom, f, level_cnt=0):
    """write configuration to file
    Args:
        default (dict): default configuration dictionary
        custom (dict): custom configuration dictionary
        file (TextIOWrapper)
    """
    offset_len = 100
    for item in default:
        if isinstance(default[item], dict):
            if custom.get(item, -1) == -1:
                custom[item] = {}
            line = "  "*level_cnt + item + ": "
            offset = offset_len - len(line)
            line += " "*offset + " # |"
            f.writelines(line + "\n")
            write_cfg(default[item], custom[item], f, level_cnt+1)
        else:
            line = "  " * level_cnt + item + ": "
            if custom.get(item, -1) == -1:
                if default[item] is not None:
                    line += str(default[item])
                offset = offset_len - len(line)
                line += " "*offset + " # | "
            else: 
                if custom[item] is not None:
                    line += str(custom[item])
                offset = offset_len - len(line)
                line += " "*offset + " # | "
                if custom[item] != default[item]:
                    line += str(default[item])
            f.writelines(line)
            f.writelines("\n")


def save_cfg(cfg_files, file_path):
    """Save configuration file
    Args:
        cfg_files (str): configuration file paths [default, custom]
    Returns:
        cfg (edict): merged EasyDict
    """
    # read configurations
    default = read_yaml(cfg_files[0])
    custom = read_yaml(cfg_files[1])

    # create file to be written
    f = open(file_path, 'w')

    # write header line
    line = "# " + "-"*20 + " Setup " + "-"*74
    line += "|" + "-"*10 + " Default " + "-"*20 + "\n"
    f.writelines(line)

    # write configurations
    write_cfg(default, custom, f)
    f.close()


================================================
FILE: utils/distributed_utils.py
================================================
from torch.utils.data import Sampler
import math
import os
import pdb
import torch
import torch.distributed as dist
from torch.nn import Module
import multiprocessing as mp
import numpy as np


class ParallelWrapper(Module):
    def __init__(self, net, parallel_mode='none'):
        super(ParallelWrapper, self).__init__()
        assert parallel_mode in ['dist', 'data_parallel', 'none']
        self.parallel_mode = parallel_mode
        if parallel_mode == 'none':
            self.net = net
            self.module = net
        elif parallel_mode == 'dist':
            self.net = DistModule(net)
            self.module = self.net.module
        else:
            self.net = torch.nn.DataParallel(net)
            self.module = self.net.module

    def forward(self, *inputs, **kwargs):
        return self.net.forward(*inputs, **kwargs)

    def train(self, mode=True):
        super(ParallelWrapper, self).train(mode)
        self.net.train(mode)


class DistModule(Module):
    def __init__(self, module):
        super(DistModule, self).__init__()
        self.module = module
        broadcast_params(self.module)

    def forward(self, *inputs, **kwargs):
        return self.module(*inputs, **kwargs)

    def train(self, mode=True):
        super(DistModule, self).train(mode)
        self.module.train(mode)

def gradients_multiply(model, multiplier=1):
    for param in model.parameters():
        if param.requires_grad and param.grad is not None:
            param.grad.data *= multiplier
    
def average_gradients(model):
    """ average gradients """

    # for n, param, in model.named_parameters():
    #     if 'dynamic_sigma' in n:
    #         print(param.requires_grad, param.grad.data, param.data)
    #     if param.requires_grad and param.grad is not None:
    #         dist.all_reduce(param.grad.data)

    for param in model.parameters():
        if param.requires_grad and param.grad is not None:
            dist.all_reduce(param.grad.data)
            # param.grad.data *= multiplier


def broadcast_params(model):
    """ broadcast model parameters """
    for p in model.state_dict().values():
        dist.broadcast(p, 0)


def dist_init(port):
    # os.environ["OMP_NUM_THREADS"] = "1"
    if mp.get_start_method(allow_none=True) != 'spawn':
        mp.set_start_method('spawn', force=True)
    proc_id = int(os.environ['SLURM_PROCID'])
    ntasks = int(os.environ['SLURM_NTASKS'])
    node_list = os.environ['SLURM_NODELIST']
    num_gpus = torch.cuda.device_count()
    torch.cuda.set_device(proc_id % num_gpus)

    if '[' in node_list:
        beg = node_list.find('[')
        pos1 = node_list.find('-', beg)
        if pos1 < 0:
            pos1 = 1000
        pos2 = node_list.find(',', beg)
        if pos2 < 0:
            pos2 = 1000
        node_list = node_list[:min(pos1, pos2)].replace('[', '')
    #added by dy
    # addr = node_list[8:].replace('-', '.')
    addr = node_list.replace('-', '.')
    if ',' in addr:
        addr = addr.split(',')[0]
    addr = addr[8:]
    # addr = ','.join([ad[8:] for ad in addrs ])

    print(addr)

    os.environ['MASTER_PORT'] = port
    os.environ['MASTER_ADDR'] = addr
    os.environ['WORLD_SIZE'] = str(ntasks)
    os.environ['RANK'] = str(proc_id)
    dist.init_process_group(backend='nccl')

    rank = dist.get_rank()
    world_size = dist.get_world_size()
    return rank, world_size


# from . import Sampler


class DistributedSequatialSampler(Sampler):
    """Sampler that restricts data loading to a subset of the dataset.

    It is especially useful in conjunction with
    :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each
    process can pass a DistributedSampler instance as a DataLoader sampler,
    and load a subset of the original dataset that is exclusive to it.

    .. note::
        Dataset is assumed to be of constant size.

    Arguments:
        dataset: Dataset used for sampling.
        num_replicas (optional): Number of processes participating in
            distributed training.
        rank (optional): Rank of the current process within num_replicas.
    """

    def __init__(self, dataset, num_replicas=None, rank=None):
        if num_replicas is None:
            if not dist.is_available():
                raise RuntimeError(
                    "Requires distributed package to be available")
            num_replicas = dist.get_world_size()
        if rank is None:
            if not dist.is_available():
                raise RuntimeError(
                    "Requires distributed package to be available")
            rank = dist.get_rank()
        self.dataset = dataset
        self.num_replicas = num_replicas
        self.rank = rank
        self.epoch = 0
        self.num_samples = int(
            math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
        self.total_size = self.num_samples * self.num_replicas

    def __iter__(self):
        # deterministically shuffle based on epoch
        # g = torch.Generator()
        # g.manual_seed(self.epoch)
        # indices = torch.randperm(len(self.dataset), generator=g).tolist()

        indices = list(range(len(self.dataset)))
        # add extra samples to make it evenly divisible
        indices += indices[:(self.total_size - len(indices))]
        assert len(indices) == self.total_size

        # subsample
        indices = indices[self.rank:self.total_size:self.num_replicas]
        assert len(indices) == self.num_samples

        return iter(indices)

    def __len__(self):
        return self.num_samples

    def set_epoch(self, epoch):
        self.epoch = epoch


class DistributedGivenIterationSampler(Sampler):
    def __init__(self, dataset, total_iter, batch_size, world_size=None, rank=None, last_iter=-1):
        if world_size is None:
            world_size = dist.get_world_size()  # link.get_world_size()
        if rank is None:
            rank = dist.get_rank()  # link.get_rank()
        assert rank < world_size
        self.dataset = dataset
        self.total_iter = total_iter
        self.batch_size = batch_size
        self.world_size = world_size
        self.rank = rank
        self.last_iter = last_iter

        self.total_size = self.total_iter*self.batch_size

        self.indices = self.gen_new_list()
        self.call = 0

    def __iter__(self):
        if self.call == 0:
            self.call = 1
            return iter(self.indices[(self.last_iter+1)*self.batch_size:])
        else:
            return iter(self.indices[(self.last_iter+1)*self.batch_size:])
            raise RuntimeError(
                "this sampler is not designed to be called more than once!!")

    def gen_new_list(self):

        # each process shuffle all list with same seed, and pick one piece according to rank
        # np.random.seed(0)
        np.random.seed(7)

        all_size = self.total_size * self.world_size
        indices = np.arange(len(self.dataset))
        indices = indices[:all_size]
        num_repeat = (all_size-1) // indices.shape[0] + 1
        indices = np.tile(indices, num_repeat)
        indices = indices[:all_size]

        np.random.shuffle(indices)
        beg = self.total_size * self.rank
        indices = indices[beg:beg+self.total_size]

        assert len(indices) == self.total_size

        return indices

    def __len__(self):
        # note here we do not take last iter into consideration, since __len__
        # should only be used for displaying, the correct remaining size is
        # handled by dataloader
        # return self.total_size - (self.last_iter+1)*self.batch_size
        return self.total_size

    def set_epoch(self, epoch):
        pass


class DistributedGivenIterationSamplerEpoch(Sampler):
    def __init__(self, dataset, total_iter, batch_size, world_size=None, rank=None, last_iter=-1, review_cycle=-1):
        if world_size is None:
            world_size = dist.get_world_size()  # link.get_world_size()
        if rank is None:
            rank = dist.get_rank()  # link.get_rank()
        assert rank < world_size
        self.dataset = dataset
        self.total_iter = total_iter
        self.batch_size = batch_size
        self.world_size = world_size
        self.rank = rank
        self.last_iter = last_iter

        self.total_size = self.total_iter*self.batch_size
        self.review_cycle = review_cycle # in unit of epoch

        self.indices = self.gen_new_list()
        self.call = 0
    def __iter__(self):
        if self.call == 0:
            self.call = 1
            return iter(self.indices[(self.last_iter+1)*self.batch_size:])
        else:
            return iter(self.indices[(self.last_iter+1)*self.batch_size:])
            # raise RuntimeError(
            #     "this sampler is not designed to be called more than once!!")

    def gen_new_list(self):

        # each process shuffle all list with same seed, and pick one piece according to rank
        # np.random.seed(0)
        np.random.seed(7)

        all_size = self.total_size * self.world_size
        indices = np.arange(len(self.dataset))
        indices = indices[:all_size]
        num_repeat = (all_size-1) // indices.shape[0] + 1

        # indices = np.tile(indices, num_repeat)
        indices = np.concatenate([np.random.permutation(indices) for i in range(num_repeat) ] )
        seeds = np.arange(indices.size).reshape(indices.shape)

        if self.review_cycle>0:
            assert (1/self.review_cycle)%1==0
            # review_freq = 1/1/self.review_cycle
            h = len(indices) // int(self.review_cycle*len(self.dataset))
            
            # print(indices.shape,'???!!!',  indices[:h*int(self.review_cycle*len(self.dataset) ) ].shape)
            indices = indices[:h*int(self.review_cycle*len(self.dataset) ) ].reshape([h,-1] )
            seeds = seeds[:h*int(self.review_cycle*len(self.dataset) ) ].reshape([h,-1] )

            indices = np.concatenate([indices, indices], axis=1).reshape(-1)
            seeds = np.concatenate([seeds, seeds], axis=1).reshape(-1)

        indices = indices[:all_size]
        seeds = seeds[:all_size]

        # np.random.shuffle(indices)
        beg = self.total_size * self.rank
        indices = indices[beg:beg+self.total_size]
        seeds = seeds[beg:beg+self.total_size]

        assert len(indices) == self.total_size

        # return indices
        return list(zip(list(indices), list(seeds)))

    def __len__(self):
        # note here we do not take last iter into consideration, since __len__
        # should only be used for displaying, the correct remaining size is
        # handled by dataloader
        # return self.total_size - (self.last_iter+1)*self.batch_size
        return self.total_size

    def set_epoch(self, epoch):
        pass


================================================
FILE: utils/eval_metric.py
================================================
import os
import numpy as np
from plyfile import PlyData
# from utils import icp_utils
from data.linemod import linemod_config
from thirdparty.vsd import inout
from thirdparty.nn import nn_utils
from utils.img_utils import read_depth
from thirdparty.kpconv.lib.utils import square_distance
from utils.geometric import rotation_angle
from utils.visualize import *
# from thirdparty.fps import fps_utils
import torch
import open3d as o3d
from transforms3d.quaternions import mat2quat, quat2mat, qmult
# import data.bop_ycb.ycb_config as ycb_config #import bop_ycb_class2idx, model_info

def get_ply_model(model_path, scale=1):
    ply = PlyData.read(model_path)
    data = ply.elements[0].data
    x = data['x']*scale
    y = data['y']*scale
    z = data['z']*scale
    model = np.stack([x, y, z], axis=-1)
    return model


def project(xyz, K, RT):
    """
    xyz: [N, 3]
    K: [3, 3]
    RT: [3, 4]
    """
    xyz = np.dot(xyz, RT[:, :3].T) + RT[:, 3:].T
    xyz = np.dot(xyz, K.T)
    xy = xyz[:, :2] / xyz[:, 2:]
    return xy


def find_nearest_point_idx(ref_pts, que_pts):
    assert(ref_pts.shape[1] == que_pts.shape[1] and 1 < que_pts.shape[1] <= 3)
    pn1 = ref_pts.shape[0]
    pn2 = que_pts.shape[0]
    dim = ref_pts.shape[1]

    ref_pts = np.ascontiguousarray(ref_pts[None, :, :], np.float32)
    que_pts = np.ascontiguousarray(que_pts[None, :, :], np.float32)
    idxs = np.zeros([1, pn2], np.int32)

    ref_pts_ptr = ffi.cast('float *', ref_pts.ctypes.data)
    que_pts_ptr = ffi.cast('float *', que_pts.ctypes.data)
    idxs_ptr = ffi.cast('int *', idxs.ctypes.data)
    lib.findNearestPointIdxLauncher(
        ref_pts_ptr, que_pts_ptr, idxs_ptr, 1, pn1, pn2, dim, 0)

    return idxs[0]


class LineMODEvaluator:
    def __init__(self, class_name, result_dir, icp_refine=False):

        # self.result_dir = os.path.join(result_dir, cfg.test.dataset)
        self.result_dir = os.path.join(result_dir, "LINEMOD")
        os.system('mkdir -p {}'.format(self.result_dir))


        # data_root = args['data_root']
        # cls = cfg.cls_type
        self.class_name = class_name
        self.icp_refine = icp_refine

        # model_path = os.path.join(os.path.dirname(os.path.abspath(
        #     __file__)), '../EXPDATA/LINEMOD', class_name, class_name + '.ply')
        model_path = os.path.join(os.path.dirname(os.path.abspath(
            __file__)), '../EXPDATA/LM6d_converted/models', class_name, class_name + '.ply')
        # self.model = pvnet_data_utils.get_ply_model(model_path)
        self.model = get_ply_model(model_path)
        self.diameter = linemod_config.diameters[class_name] / 100

        self.proj2d = []
        self.add = []
        self.adds = [] #force sym
        self.add2 = []
        self.add5 = []
        self.cmd5 = []

        self.icp_proj2d = []
        self.icp_add = []
        self.icp_cmd5 = []

        self.mask_ap = []
        self.pose_preds=[]

        self.height = 480
        self.width = 640

        model = inout.load_ply(model_path)
        model['pts'] = model['pts'] * 1000
        self.icp_refiner = icp_utils.ICPRefiner(
            model, (self.width, self.height)) if icp_refine else None

    def projection_2d(self, pose_pred, pose_targets, K, icp=False, threshold=5):
        model_2d_pred = project(self.model, K, pose_pred)
        model_2d_targets = project(self.model, K, pose_targets)
        proj_mean_diff = np.mean(np.linalg.norm(
            model_2d_pred - model_2d_targets, axis=-1))
        if icp:
            self.icp_proj2d.append(proj_mean_diff < threshold)
        else:
            self.proj2d.append(proj_mean_diff < threshold)

    def projection_2d_sym(self, pose_pred, pose_targets, K, threshold=5):
        model_2d_pred = project(self.model, K, pose_pred)
        model_2d_targets = project(self.model, K, pose_targets)
        proj_mean_diff=np.mean(find_nearest_point_distance(model_2d_pred,model_2d_targets))

        self.proj_mean_diffs.append(proj_mean_diff)
        self.projection_2d_recorder.append(proj_mean_diff < threshold)

    def add2_metric(self, pose_pred, pose_targets, icp=False, syn=False, percentage=0.02):
        diameter = self.diameter * percentage
        model_pred = np.dot(self.model, pose_pred[:, :3].T) + pose_pred[:, 3]
        model_targets = np.dot(
            self.model, pose_targets[:, :3].T) + pose_targets[:, 3]

        if syn:
            idxs = nn_utils.find_nearest_point_idx(model_pred, model_targets)
            # idxs = find_nearest_point_idx(model_pred, model_targets)
            mean_dist = np.mean(np.linalg.norm(
                model_pred[idxs] - model_targets, 2, 1))
        else:
            mean_dist = np.mean(np.linalg.norm(
                model_pred - model_targets, axis=-1))

        if icp:
            self.icp_add.append(mean_dist < diameter)
        else:
            self.add2.append(mean_dist < diameter)

    def add5_metric(self, pose_pred, pose_targets, icp=False, syn=False, percentage=0.05):
        diameter = self.diameter * percentage
        model_pred = np.dot(self.model, pose_pred[:, :3].T) + pose_pred[:, 3]
        model_targets = np.dot(
            self.model, pose_targets[:, :3].T) + pose_targets[:, 3]

        if syn:
            idxs = nn_utils.find_nearest_point_idx(model_pred, model_targets)
            # idxs = find_nearest_point_idx(model_pred, model_targets)
            mean_dist = np.mean(np.linalg.norm(
                model_pred[idxs] - model_targets, 2, 1))
        else:
            mean_dist = np.mean(np.linalg.norm(
                model_pred - model_targets, axis=-1))

        if icp:
            self.icp_add.append(mean_dist < diameter)
        else:
            self.add5.append(mean_dist < diameter)

    
    def add_metric(self, pose_pred, pose_targets, icp=False, syn=False, percentage=0.1):
        diameter = self.diameter * percentage
        model_pred = np.dot(self.model, pose_pred[:, :3].T) + pose_pred[:, 3]
        model_targets = np.dot(
            self.model, pose_targets[:, :3].T) + pose_targets[:, 3]

        if syn:
            idxs = nn_utils.find_nearest_point_idx(model_pred, model_targets)
            # idxs = find_nearest_point_idx(model_pred, model_targets)
            mean_dist = np.mean(np.linalg.norm(
                model_pred[idxs] - model_targets, 2, 1))
        else:
            mean_dist = np.mean(np.linalg.norm(
                model_pred - model_targets, axis=-1))

        if icp:
            self.icp_add.append(mean_dist < diameter)
        else:
            self.add.append(mean_dist < diameter)

    def cm_degree_5_metric(self, pose_pred, pose_targets, icp=False):
        translation_distance = np.linalg.norm(
            pose_pred[:, 3] - pose_targets[:, 3]) * 100
        rotation_diff = np.dot(pose_pred[:, :3], pose_targets[:, :3].T)
        trace = np.trace(rotation_diff)
        trace = trace if trace <= 3 else 3
        angular_distance = np.rad2deg(np.arccos((trace - 1.) / 2.))
        if icp:
            self.icp_cmd5.append(translation_distance <
                                 5 and angular_distance < 5)
        else:
            self.cmd5.append(translation_distance < 5 and angular_distance < 5)

    def mask_iou(self, output, batch):
        mask_pred = torch.argmax(output['seg'], dim=1)[
            0].detach().cpu().numpy()
        mask_gt = batch['mask'][0].detach().cpu().numpy()
        iou = (mask_pred & mask_gt).sum() / (mask_pred | mask_gt).sum()
        self.mask_ap.append(iou > 0.7)

    def icp_refine(self, pose_pred, anno, output, K):
        depth = read_depth(anno['depth_path'])
        mask = torch.argmax(output['seg'], dim=1)[0].detach().cpu().numpy()
        if pose_pred[2, 3] <= 0:
            return pose_pred
        depth[mask != 1] = 0
        pose_pred_tmp = pose_pred.copy()
        pose_pred_tmp[:3, 3] = pose_pred_tmp[:3, 3] * 1000

        R_refined, t_refined = self.icp_refiner.refine(
            depth, pose_pred_tmp[:3, :3], pose_pred_tmp[:3, 3], K.copy(), depth_only=True, max_mean_dist_factor=5.0)
        R_refined, _ = self.icp_refiner.refine(
            depth, R_refined, t_refined, K.copy(), no_depth=True)

        pose_pred = np.hstack((R_refined, t_refined.reshape((3, 1)) / 1000))

        return pose_pred


    def icp_refine_(self, pose, anno, output):
        depth = read_depth(anno['depth_path']).astype(np.uint16)
        mask = torch.argmax(output['seg'], dim=1)[0].detach().cpu().numpy()
        mask = mask.astype(np.int32)
        pose = pose.astype(np.float32)

        poses = np.zeros([1, 7], dtype=np.float32)
        poses[0, :4] = mat2quat(pose[:, :3])
        poses[0, 4:] = pose[:, 3]

        poses_new = np.zeros([1, 7], dtype=np.float32)
        poses_icp = np.zeros([1, 7], dtype=np.float32)

        fx = 572.41140
        fy = 573.57043
        px = 325.26110
        py = 242.04899
        zfar = 6.0
        znear = 0.25
        factor = 1000.0
        error_threshold = 0.01

        rois = np.zeros([1, 6], dtype=np.float32)
        rois[:, :] = 1

        self.icp_refiner.solveICP(mask, depth,
                                  self.height, self.width,
                                  fx, fy, px, py,
                                  znear, zfar,
                                  factor,
                                  rois.shape[0], rois,
                                  poses, poses_new, poses_icp,
                                  error_threshold
                                  )

        pose_icp = np.zeros([3, 4], dtype=np.float32)
        pose_icp[:, :3] = quat2mat(poses_icp[0, :4])
        pose_icp[:, 3] = poses_icp[0, 4:]

        return pose_icp

    def summarize(self):
        proj2d = np.mean(self.proj2d)
        add = np.mean(self.add)
        # adds = np.mean(self.adds)
        add2 = np.mean(self.add2)
        add5 = np.mean(self.add5)
        cmd5 = np.mean(self.cmd5)
        ap = np.mean(self.mask_ap)
        seq_len=len(self.add)
        print('2d projections metric: {}'.format(proj2d))
        print('ADD metric: {}'.format(add))
        print('ADD2 metric: {}'.format(add2))
        print('ADD5 metric: {}'.format(add5))
        # print('ADDS metric: {}'.format(adds))
        print('5 cm 5 degree metric: {}'.format(cmd5))
        print('mask ap70: {}'.format(ap))
        print('seq_len: {}'.format(seq_len))
        # if cfg.test.icp:
        if self.icp_refine:
            print('2d projections metric after icp: {}'.format(
                np.mean(self.icp_proj2d)))
            print('ADD metric after icp: {}'.format(np.mean(self.icp_add)))
            print('5 cm 5 degree metric after icp: {}'.format(
                np.mean(self.icp_cmd5)))
        self.proj2d = []
        self.add = []
        self.add2 = []
        self.add5 = []
        # self.adds = []
        self.cmd5 = []
        self.mask_ap = []
        self.icp_proj2d = []    
        self.icp_add = []
        self.icp_cmd5 = []
        

        #save pose predictions
        if len(self.pose_preds)> 0:
            np.save(f"{self.class_name}_pose_preds.npy",self.pose_preds)
        self.pose_preds=[]

        return {'proj2d': proj2d, 'add': add, 'add2': add2, 'add5': add5,'cmd5': cmd5, 'ap': ap, "seq_len": seq_len}


    def evaluate_rnnpose(self, preds_dict, example): # sample_correspondence_pairs=False, direct_align=False, use_cnnpose=True):
        len_src_f = example['stack_lengths'][0][0]
        # lifted_points = example['lifted_points'].squeeze(0)
        assert len( example['lifted_points']) == 1, "TODO: support bs>1"
        lifted_points = example['lifted_points'][0].squeeze(0)
        model_points = example['original_model_points'][:len_src_f]

        K = example["K"].cpu().numpy().squeeze()
        R_pred = preds_dict['Ti_pred'].G[:,0, :3,:3].squeeze().detach().cpu().numpy()
        t_pred = preds_dict['Ti_pred'].G[:,0, :3,3:].squeeze(0).detach().cpu().numpy()
        pose_pred= preds_dict['Ti_pred'].G[:,0, :3].squeeze().detach().cpu().numpy()
#             print(example['POSECNN_RT'].dtype, example['rendered_RT'].dtype, flush=True)
#             R_pred = example['POSECNN_RT'][:,:3,:3].squeeze().detach().cpu().numpy()
#             t_pred = example['POSECNN_RT'][:,:3,3:].squeeze(0).detach().cpu().numpy()
#             pose_pred= example['POSECNN_RT'][:, :3].squeeze().detach().cpu().numpy()


        pose_gt = example['original_RT'].squeeze()[:3].cpu().numpy()
        
        
        ang_err = rotation_angle(pose_gt[:3, :3], R_pred)
        trans_err = np.linalg.norm(t_pred-pose_gt[:3, -1:])  # 3x1

        if self.class_name in ['eggbox', 'glue']:
            self.add_metric(pose_pred, pose_gt, syn=True)
            self.add2_metric(pose_pred, pose_gt, syn=True)
            self.add5_metric(pose_pred, pose_gt, syn=True)
        else:
            self.add_metric(pose_pred, pose_gt)
            self.add2_metric(pose_pred, pose_gt)
            self.add5_metric(pose_pred, pose_gt)

        self.projection_2d(pose_pred, pose_gt, K=linemod_config.linemod_K)
        self.cm_degree_5_metric(pose_pred, pose_gt)
        # self.mask_iou(output, batch)

        # vis
        pc_proj_vis = vis_pointclouds_cv2((pose_gt[:3, :3]@model_points.cpu().numpy(
        ).T+pose_gt[:3, -1:]).T, example["K"].cpu().numpy().squeeze(), [480,640])
        pc_proj_vis_pred = vis_pointclouds_cv2((pose_pred[:3, :3]@model_points.cpu().numpy(
        ).T+pose_pred[:3, -1:]).T, example["K"].cpu().numpy().squeeze(), [ 480, 640])


        return {
            "ang_err": ang_err,
            "trans_err": trans_err,
            "pnp_inliers": -1,#len(inliers),
            "pc_proj_vis": pc_proj_vis,
            "pc_proj_vis_pred": pc_proj_vis_pred,
            "keypoints_2d_vis": np.zeros_like(pc_proj_vis_pred) #keypoints_2d_vis
        }

        
# class YCBEvaluator:
#     def __init__(self, class_name, result_dir, icp_refine=False):

#         self.result_dir = os.path.join(result_dir, "LINEMOD")
#         os.system('mkdir -p {}'.format(self.result_dir))

#         self.class_name = class_name
#         self.icp_refine = icp_refine
        
#         model_path = os.path.join(os.path.dirname(os.path.abspath(
#             __file__)), '../EXPDATA/BOP_YCB/models', f'obj_{ycb_config.bop_ycb_class2idx[class_name]:06d}.ply' )
#         self.model = get_ply_model(model_path, scale=0.001)
#         # self.diameter = linemod_config.diameters[class_name] / 100
#         self.diameter = ycb_config.model_info[ str(ycb_config.bop_ycb_class2idx[class_name]) ]["diameter"]*0.001  # in mm # / 1000

#         self.proj2d = []
#         self.add = []
#         self.adds=[]
#         self.cmd5 = []
#         self.add_dist=[]
#         self.adds_dist=[]

#         self.icp_proj2d = []
#         self.icp_add = []
#         self.icp_cmd5 = []

#         self.mask_ap = []
#         self.pose_preds=[]

#         self.height = 480
#         self.width = 640

#         model = inout.load_ply(model_path)
#         model['pts'] = model['pts'] * 1000
#         # self.icp_refiner = icp_utils.ICPRefiner(model, (self.width, self.height)) if cfg.test.icp else None
#         self.icp_refiner = icp_utils.ICPRefiner(
#             model, (self.width, self.height)) if icp_refine else None
#         self.direct_align_module = DirectAlignment(None)
#         # if cfg.test.icp:
#         #     self.icp_refiner = ext_.Synthesizer(os.path.realpath(model_path))
#         #     self.icp_refiner.setup(self.width, self.height)

#     def projection_2d(self, pose_pred, pose_targets, K, icp=False, threshold=5):
#         model_2d_pred = project(self.model, K, pose_pred)
#         model_2d_targets = project(self.model, K, pose_targets)
#         proj_mean_diff = np.mean(np.linalg.norm(
#             model_2d_pred - model_2d_targets, axis=-1))
#         if icp:
#             self.icp_proj2d.append(proj_mean_diff < threshold)
#         else:
#             self.proj2d.append(proj_mean_diff < threshold)

#     def projection_2d_sym(self, pose_pred, pose_targets, K, threshold=5):
#         model_2d_pred = project(self.model, K, pose_pred)
#         model_2d_targets = project(self.model, K, pose_targets)
#         proj_mean_diff=np.mean(find_nearest_point_distance(model_2d_pred,model_2d_targets))

#         self.proj_mean_diffs.append(proj_mean_diff)
#         self.projection_2d_recorder.append(proj_mean_diff < threshold)

#     def add_metric(self, pose_pred, pose_targets, icp=False, syn=False, percentage=0.1):
#         diameter = self.diameter * percentage
#         model_pred = np.dot(self.model, pose_pred[:, :3].T) + pose_pred[:, 3]
#         model_targets = np.dot(
#             self.model, pose_targets[:, :3].T) + pose_targets[:, 3]

#         if syn:
#             idxs = nn_utils.find_nearest_point_idx(model_pred, model_targets)
#             # idxs = find_nearest_point_idx(model_pred, model_targets)
#             mean_dist = np.mean(np.linalg.norm(
#                 model_pred[idxs] - model_targets, 2, 1))
#         else:
#             mean_dist = np.mean(np.linalg.norm(
#                 model_pred - model_targets, axis=-1))
#         self.add_dist.append(mean_dist)
#         if icp:
#             self.icp_add.append(mean_dist < diameter)
#         else:
#             self.add.append(mean_dist < diameter)
#     def auc_add(self, max_thresh=0.1):
#         add_dist = np.array(self.add_dist)
#         interval=0.001
#         acc=0
#         for k in range(int(max_thresh/interval)):
#             acc+= interval* np.sum( ((k+1)*interval)>=add_dist)/ add_dist.shape[0]

#         return acc/max_thresh
#     def auc_adds(self, max_thresh=0.1):
#         add_dist = np.array(self.adds_dist)
#         interval=0.001
#         acc=0
#         for k in range(int(max_thresh/interval)):
#             acc+= interval* np.sum( ((k+1)*interval)>=add_dist )/ add_dist.shape[0]

#         return acc/max_thresh
#     def adds_metric(self, pose_pred, pose_targets, icp=False, syn=False, percentage=0.1):
#         diameter = self.diameter * percentage
#         model_pred = np.dot(self.model, pose_pred[:, :3].T) + pose_pred[:, 3]
#         model_targets = np.dot(
#             self.model, pose_targets[:, :3].T) + pose_targets[:, 3]

#         if syn:
#             idxs = nn_utils.find_nearest_point_idx(model_pred, model_targets)
#             # idxs = find_nearest_point_idx(model_pred, model_targets)
#             mean_dist = np.mean(np.linalg.norm(
#                 model_pred[idxs] - model_targets, 2, 1))
#         else:
#             mean_dist = np.mean(np.linalg.norm(
#                 model_pred - model_targets, axis=-1))
#         self.adds_dist.append(mean_dist)
#         if icp:
#             self.icp_add.append(mean_dist < diameter)
#         else:
#             self.adds.append(mean_dist < diameter)

#     def cm_degree_5_metric(self, pose_pred, pose_targets, icp=False):
#         translation_distance = np.linalg.norm(
#             pose_pred[:, 3] - pose_targets[:, 3]) * 100
#         rotation_diff = np.dot(pose_pred[:, :3], pose_targets[:, :3].T)
#         trace = np.trace(rotation_diff)
#         trace = trace if trace <= 3 else 3
#         angular_distance = np.rad2deg(np.arccos((trace - 1.) / 2.))
#         if icp:
#             self.icp_cmd5.append(translation_distance <
#                                  5 and angular_distance < 5)
#         else:
#             self.cmd5.append(translation_distance < 5 and angular_distance < 5)

#     def mask_iou(self, output, batch):
#         mask_pred = torch.argmax(output['seg'], dim=1)[
#             0].detach().cpu().numpy()
#         mask_gt = batch['mask'][0].detach().cpu().numpy()
#         iou = (mask_pred & mask_gt).sum() / (mask_pred | mask_gt).sum()
#         self.mask_ap.append(iou > 0.7)

#     def icp_refine(self, pose_pred, anno, output, K):
#         depth = read_depth(anno['depth_path'])
#         mask = torch.argmax(output['seg'], dim=1)[0].detach().cpu().numpy()
#         if pose_pred[2, 3] <= 0:
#             return pose_pred
#         depth[mask != 1] = 0
#         pose_pred_tmp = pose_pred.copy()
#         pose_pred_tmp[:3, 3] = pose_pred_tmp[:3, 3] * 1000

#         R_refined, t_refined = self.icp_refiner.refine(
#             depth, pose_pred_tmp[:3, :3], pose_pred_tmp[:3, 3], K.copy(), depth_only=True, max_mean_dist_factor=5.0)
#         R_refined, _ = self.icp_refiner.refine(
#             depth, R_refined, t_refined, K.copy(), no_depth=True)

#         pose_pred = np.hstack((R_refined, t_refined.reshape((3, 1)) / 1000))

#         return pose_pred


#     def icp_refine_(self, pose, anno, output):
#         depth = read_depth(anno['depth_path']).astype(np.uint16)
#         mask = torch.argmax(output['seg'], dim=1)[0].detach().cpu().numpy()
#         mask = mask.astype(np.int32)
#         pose = pose.astype(np.float32)

#         poses = np.zeros([1, 7], dtype=np.float32)
#         poses[0, :4] = mat2quat(pose[:, :3])
#         poses[0, 4:] = pose[:, 3]

#         poses_new = np.zeros([1, 7], dtype=np.float32)
#         poses_icp = np.zeros([1, 7], dtype=np.float32)

#         fx = 572.41140
#         fy = 573.57043
#         px = 325.26110
#         py = 242.04899
#         zfar = 6.0
#         znear = 0.25
#         factor = 1000.0
#         error_threshold = 0.01

#         rois = np.zeros([1, 6], dtype=np.float32)
#         rois[:, :] = 1

#         self.icp_refiner.solveICP(mask, depth,
#                                   self.height, self.width,
#                                   fx, fy, px, py,
#                                   znear, zfar,
#                                   factor,
#                                   rois.shape[0], rois,
#                                   poses, poses_new, poses_icp,
#                                   error_threshold
#                                   )

#         pose_icp = np.zeros([3, 4], dtype=np.float32)
#         pose_icp[:, :3] = quat2mat(poses_icp[0, :4])
#         pose_icp[:, 3] = poses_icp[0, 4:]

#         return pose_icp

    
#     def summarize(self):
#         proj2d = np.mean(self.proj2d)
#         add = np.mean(self.add)
#         adds = np.mean(self.adds)
#         cmd5 = np.mean(self.cmd5)
#         ap = np.mean(self.mask_ap)
#         try:
#             auc_add=self.auc_add(max_thresh=0.1)
#         except:
#             auc_add=0
#         try:
#             auc_adds=self.auc_adds(max_thresh=0.1)
#         except:
#             auc_adds=0
#         seq_len=len(self.add)
#         print('2d projections metric: {}'.format(proj2d))
#         print('ADD metric: {}'.format(add))
#         print('AUC ADD metric: {}'.format(auc_add))
#         print('ADDS metric: {}'.format(adds))
#         print('AUC ADDS metric: {}'.format(auc_adds))
#         print('5 cm 5 degree metric: {}'.format(cmd5))
#         print('mask ap70: {}'.format(ap))
#         print('seq_len: {}'.format(seq_len))
#         # if cfg.test.icp:
#         if self.icp_refine:
#             print('2d projections metric after icp: {}'.format(
#                 np.mean(self.icp_proj2d)))
#             print('ADD metric after icp: {}'.format(np.mean(self.icp_add)))
#             print('5 cm 5 degree metric after icp: {}'.format(
#                 np.mean(self.icp_cmd5)))
#         self.proj2d = []
#         self.add = []
#         self.adds = []
#         self.cmd5 = []
#         self.mask_ap = []
#         self.icp_proj2d = []    
#         self.icp_add = []
#         self.icp_cmd5 = []
#         self.add_dist = []
#         self.adds_dist = []
        

#         #save pose predictions
#         if len(self.pose_preds)> 0:
#             np.save(f"{self.class_name}_pose_preds.npy",self.pose_preds)
#         self.pose_preds=[]

#         return {'proj2d': proj2d, 'add': add, 'adds': adds,'cmd5': cmd5, 'ap': ap, "seq_len": seq_len}

#     def evaluate_flowpose(self, preds_dict, example, sample_correspondence_pairs=False, direct_align=False, use_cnnpose=True):
#         len_src_f = example['stack_lengths'][0][0]
#         # lifted_points = example['lifted_points'].squeeze(0)
#         assert len( example['lifted_points']) == 1, "TODO: support bs>1"
#         lifted_points = example['lifted_points'][0].squeeze(0)
#         model_points = example['original_model_points'][:len_src_f]
#         K = example["K"].cpu().numpy().squeeze()

#         if not use_cnnpose: # use pnp 
#             len_src_f = example['stack_lengths'][0][0]
#             descriptors_2d = preds_dict['descriptors_2d']
#             descriptors_3d = preds_dict['descriptors_3d'][:len_src_f]

#             mask = cv2.erode(( example['depth'].detach().cpu().numpy().squeeze()>0).astype(np.uint8)*255, kernel=np.ones([3,3], np.uint8),iterations = 1)
#             mask=torch.tensor(mask, device=descriptors_3d.device)
#             ys_, xs_ = torch.nonzero(mask, as_tuple=True)

#             # ys_, xs_ = torch.nonzero(example['depth'].squeeze(), as_tuple=True)
#             descriptors_2d = descriptors_2d[:, :,
#                                             ys_, xs_].squeeze().permute([1, 0])
#             img_coods = torch.stack([xs_, ys_], dim=-1)


#             if sample_correspondence_pairs:
#                 correspondences_2d3d = example['correspondences_2d3d'].squeeze()
#                 if(correspondences_2d3d.size(0) > 256):
#                     choice = np.random.permutation(
#                         correspondences_2d3d.size(0))[:256]
#                     correspondences_2d3d = correspondences_2d3d[choice]

#                 src_idx = correspondences_2d3d[:, 0]
#                 tgt_idx = correspondences_2d3d[:, 1]
#             else:
#                 # src_idx = np.random.permutation(len(xs_))[:256]
#                 # src_idx = np.random.permutation(len(xs_))[:256]
#                 _, idx = fps_utils.farthest_point_sampling_withidx(np.stack([xs_.cpu().numpy(
#                 ), ys_.cpu().numpy(), np.zeros_like(xs_.cpu().numpy())], axis=-1), 256, False)
#                 # _, idx = fps_utils.farthest_point_sampling_withidx(np.stack([xs_.cpu().numpy(
#                 # ), ys_.cpu().numpy(), np.zeros_like(xs_.cpu().numpy())], axis=-1), 1024, False)
#                 # src_idx = np.arange(len(xs_))[::len(xs_)//256]
#                 src_idx = np.arange(len(xs_))[idx]
#                 tgt_idx = np.arange(len(model_points))

#             src_pcd, tgt_pcd = lifted_points[src_idx], model_points[tgt_idx]
#             img_coods = img_coods[src_idx]
#             src_feats, tgt_feats = descriptors_2d[src_idx], descriptors_3d[tgt_idx]

#             feats_dist = torch.sqrt(square_distance(
#                 src_feats[None, :, :], tgt_feats[None, :, :], normalised=True)).squeeze(0)

#             _, sel_idx = torch.min(feats_dist, -1)
#             K = example["K"].cpu().numpy().squeeze()
#             try:
#                 # retval, R_pred,t_pred, inliers =cv2.solvePnPRansac(tgt_pcd[sel_idx].cpu().numpy(), img_coods.cpu().numpy().astype(np.float32), K,distCoeffs=np.zeros(4),reprojectionError=1)
#                 retval, R_pred, t_pred, inliers = cv2.solvePnPRansac(tgt_pcd[sel_idx].cpu().numpy(), img_coods.cpu(
#                 ).numpy().astype(np.float32), K, distCoeffs=np.zeros(4), reprojectionError=1, iterationsCount=1000)

#                 if inliers is None:
#                     raise ValueError
#             except:
#                 # try:
#                 print("PNP RANSAC reprojectionError threshold =3")
#                 retval, R_pred, t_pred, inliers = cv2.solvePnPRansac(tgt_pcd[sel_idx].cpu().numpy(), img_coods.cpu(
#                 ).numpy().astype(np.float32), K, distCoeffs=np.zeros(4), reprojectionError=3, iterationsCount=1000)
    
#             R_pred, _ = cv2.Rodrigues(R_pred)
#             pose_pred = np.concatenate([R_pred, t_pred], axis=-1)
#         else:
#             K = example["K"].cpu().numpy().squeeze()
#             R_pred = preds_dict['Ti_pred'].G[:,0, :3,:3].squeeze().detach().cpu().numpy()
#             t_pred = preds_dict['Ti_pred'].G[:,0, :3,3:].squeeze(0).detach().cpu().numpy()
#             pose_pred= preds_dict['Ti_pred'].G[:,0, :3].squeeze().detach().cpu().numpy()
# #             print(example['POSECNN_RT'].dtype, example['rendered_RT'].dtype, flush=True)
# #             R_pred = example['POSECNN_RT'][:,:3,:3].squeeze().detach().cpu().numpy()
# #             t_pred = example['POSECNN_RT'][:,:3,3:].squeeze(0).detach().cpu().numpy()
# #             pose_pred= example['POSECNN_RT'][:, :3].squeeze().detach().cpu().numpy()


#         # pose_gt = example['RT'].squeeze()[:3].cpu().numpy()
#         pose_gt = example['original_RT'].squeeze()[:3].cpu().numpy()
        
        
#         ang_err = rotation_angle(pose_gt[:3, :3], R_pred)
#         trans_err = np.linalg.norm(t_pred-pose_gt[:3, -1:])  # 3x1

#         if self.class_name in ['024_bowl', '036_wood_block', '051_large_clamp', '052_extra_large_clamp', '061_foam_brick']:
#             self.add_metric(pose_pred, pose_gt, syn=True)
#         else:
#             self.add_metric(pose_pred, pose_gt)
#         self.adds_metric(pose_pred, pose_gt, syn=True)
#         self.projection_2d(pose_pred, pose_gt, K=linemod_config.linemod_K)
#         self.cm_degree_5_metric(pose_pred, pose_gt)

#         # self.mask_iou(output, batch)

#         # vis
#         pc_proj_vis = vis_pointclouds_cv2((pose_gt[:3, :3]@model_points.cpu().numpy(
#         ).T+pose_gt[:3, -1:]).T, example["K"].cpu().numpy().squeeze(), [480,640])
#         pc_proj_vis_pred = vis_pointclouds_cv2((pose_pred[:3, :3]@model_points.cpu().numpy(
#         ).T+pose_pred[:3, -1:]).T, example["K"].cpu().numpy().squeeze(), [ 480, 640])


#         if trans_err > 0.5:
#             print("translation err>0.5")
#             # cv2.imwrite(f'tmp/{len(self.add)}.png',
#             #             keypoints_2d_vis[..., ::-1]*255,)
#             # torch.save( example, f'tmp/{len(self.add)}.pt' )

#         return {
#             "ang_err": ang_err,
#             "trans_err": trans_err,
#             "pnp_inliers": -1,#len(inliers),
#             "pc_proj_vis": pc_proj_vis,
#             "pc_proj_vis_pred": pc_proj_vis_pred,
#             "keypoints_2d_vis": np.zeros_like(pc_proj_vis_pred) #keypoints_2d_vis
#         }


================================================
FILE: utils/furthest_point_sample.py
================================================

import numpy as np
from scipy import spatial


def fragmentation_fps(vertices, num_frags):
  """Fragmentation by the furthest point sampling algorithm.

  The fragment centers are found by iterative selection of the vertex from
  vertices that is furthest from the already selected vertices. The algorithm
  starts with the centroid of the object model which is then discarded from the
  final set of fragment centers.

  A fragment is defined by a set of points on the object model that are the
  closest to the fragment center.

  Args:
    vertices: [num_vertices, 3] ndarray with 3D vertices of the object model.
    num_frags: Number of fragments to define.

  Returns:
    [num_frags, 3] ndarray with fragment centers and [num_vertices] ndarray
    storing for each vertex the ID of the assigned fragment.
  """
  # Start with the origin of the model coordinate system.
  frag_centers = [np.array([0., 0., 0.])]

  # Calculate distances to the center from all the vertices.
  nn_index = spatial.cKDTree(frag_centers)
  nn_dists, _ = nn_index.query(vertices, k=1)
  center_inds=[]
  for _ in range(num_frags):
    # Select the furthest vertex as the next center.
    new_center_ind = np.argmax(nn_dists)
    new_center = vertices[new_center_ind]
    frag_centers.append(vertices[new_center_ind])
    center_inds.append(new_center_ind)

    # Update the distances to the nearest center.
    nn_dists[new_center_ind] = -1
    nn_dists = np.minimum(
      nn_dists, np.linalg.norm(vertices - new_center, axis=1))

  # Remove the origin.
  frag_centers.pop(0)
  frag_centers = np.array(frag_centers)

  # Assign vertices to the fragments.
  # TODO: This information can be maintained during the FPS algorithm.
  nn_index = spatial.cKDTree(frag_centers)
  _, vertex_frag_ids = nn_index.query(vertices, k=1)

#   return frag_centers, vertex_frag_ids
  return frag_centers, np.array(center_inds), vertex_frag_ids


if __name__=="__main__":
    #test
    pass


================================================
FILE: utils/geometric.py
================================================
import numpy as np 


def range_to_depth(mask, range, K):
    '''
       Transform the range image to depth image
    '''
    f=K[0,0]
    cx=K[0,2]
    cy=K[1,2]

    ys_, xs_=np.nonzero(mask)
    rngs=range[ys_,xs_]
    # xs,ys=np.asarray(xs,np.float32),np.asarray(ys,np.float32)
    xs,ys=np.asarray(xs_,np.float32)+0.5,np.asarray(ys_,np.float32)+0.5

    Zs=f*rngs/( f**2 + (cx-xs)**2 + (cy-ys)**2 )**0.5
    depth = np.zeros_like(range)
    depth[ys_,xs_] = Zs
    return  depth

def mask_depth_to_point_cloud(mask,depth,K):
    '''
        lift the depth under the mask to 3D point clouds
    '''
    ys, xs=np.nonzero(mask)
    dpts=depth[ys,xs]
    # xs,ys=np.asarray(xs,np.float32),np.asarray(ys,np.float32)
    xs,ys=np.asarray(xs,np.float32)+0.5,np.asarray(ys,np.float32)+0.5
    xys=np.concatenate([xs[:,None],ys[:,None]],1)
    xys*=dpts[:,None]
    xyds=np.concatenate([xys,dpts[:,None]],1)
    pts=np.matmul(xyds,np.linalg.inv(K).transpose())
    return pts.astype(np.float32), np.stack([xs,ys], axis=-1 )

def chordal_distance(R1,R2):
    return np.sqrt(np.sum((R1-R2)*(R1-R2))) 

def rotation_angle(R1, R2):
    return 2*np.arcsin( chordal_distance(R1,R2)/np.sqrt(8) )

def render_pointcloud(pc, T, K, render_image_size):
        """
        Args:
            T: (B,3,4) or (B,4,4)
            K: (B,3,3)
            render_image_size (tuple): (h,w)
            near (float, optional):  Defaults to 0.1.
            far (int, optional): Defaults to 6.
            mode: 'bilinear' or 'neareast'
        """

        B=T.shape[0]

        # T = self.cam_opencv2pytch3d.to(device=T.device)@T

        ## X_cam = X_world R + t
        # R = T[...,:3,:3].transpose(-1,-2)
        R = T[...,:3,:3].transpose( [0,2,1] )
        t = T[...,:3,3]

        #render depths
        # vert_depths= (self.verts@R+t).squeeze(0)[...,2:]
        X_cam= (pc@R+t)#.squeeze(0)

        x=X_cam@K.transpose([0,2,1])  #BxNx3
        depth = x[...,-1]
        x = x/x[...,-1:]

        out = np.zeros([1,1, *render_image_size], dtype=R.dtype)
        out[:, :, 
            np.round(x[0, :, 1]).astype(np.int64).clip(0, out.shape[2]-1),
            np.round(x[0, :, 0]).astype(np.int64).clip(0, out.shape[3]-1)] = depth 

        return out #1x1xHxW

================================================
FILE: utils/img_utils.py
================================================
import torch
from matplotlib import cm
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
import cv2
from PIL import Image

def read_depth(path):
    if (path[-3:] == 'dpt'):
        with open(path) as f:
            h,w = np.fromfile(f,dtype=np.uint32,count=2)
            data = np.fromfile(f,dtype=np.uint16,count=w*h)
            depth = data.reshape((h,w))
    else:
        depth = np.asarray(Image.open(path)).copy()
    return depth

def unnormalize_img(img, mean, std, in_gpu=True):
    """
    img: [3, h, w]
    """
    img = img.detach().cpu().clone()
    # img = img / 255.
    img *= torch.tensor(std).view(3, 1, 1)
    img += torch.tensor(mean).view(3, 1, 1)
    min_v = torch.min(img)
    img = (img - min_v) / (torch.max(img) - min_v)
    return img


def draw_seg_th(seg, num_cls=-1):
    """
    seg: [h, w]
    """
    r = seg.clone()
    g = seg.clone()
    b = seg.clone()
    num_cls = len(colors) if num_cls == -1 else num_cls
    seg_colors = 1 - colors[:, 0, 0]
    for l in range(num_cls):
        inds = (seg == l)
        r[inds] = int(seg_colors[l][0])
        g[inds] = int(seg_colors[l][1])
        b[inds] = int(seg_colors[l][2])
    seg = torch.stack([r, g, b], dim=0).float() / 255.
    return seg


def draw_seg_prob_th(seg_prob):
    """
    seg_prob: [num_cls, h, w]
    """
    num_cls = seg_prob.shape[0]
    seg = torch.argmax(seg_prob, dim=0).long()
    return draw_seg_th(seg, num_cls)


def draw_vertex_th(vertex):
    """
    vertex: [h, w]
    """
    min_ver = torch.min(vertex)
    max_ver = torch.max(vertex)
    vertex = (vertex - min_ver) / (max_ver - min_ver)
    vertex = cmap(vertex.detach().cpu().numpy())[..., :3]
    return torch.tensor(vertex).permute(2, 0, 1)


def visualize_coco_bbox(img, boxes):
    """
    img: [h, w, 3]
    boxes: [n, 4], [[x, y, x_max, y_max]]
    """
    _, ax = plt.subplots(1)
    ax.imshow(img)
    n = len(boxes)
    for ni in range(n):
        x, y, x_max, y_max = boxes[ni]
        ax.add_patch(patches.Polygon(xy=[[x, y], [x, y_max], [x_max, y_max], [x_max, y]], fill=False, linewidth=1, edgecolor='r'))
    plt.show()


def visualize_heatmap(img, hm):
    """
    img: [h, w, 3]
    hm: [c, h, w]
    """
    hm = np.max(hm, axis=0)
    h, w = hm.shape[:2]
    img = cv2.resize(img, dsize=(w, h), interpolation=cv2.INTER_LINEAR)
    hm = np.array([255, 255, 255]) - (hm.reshape(h, w, 1) * colors[0]).astype(np.uint8)
    ratio = 0.5
    blend = (img * ratio + hm * (1 - ratio)).astype(np.uint8)
    _, (ax1, ax2) = plt.subplots(1, 2)
    ax1.imshow(img)
    ax2.imshow(blend)
    plt.show()


def visualize_coco_img_mask(img, mask):
    _, (ax1, ax2) = plt.subplots(1, 2)
    ax1.imshow(img)
    ax2.imshow(mask)
    plt.show()


def visualize_color_aug(orig_img, aug_img):
    _, (ax1, ax2) = plt.subplots(1, 2)
    ax1.imshow(orig_img[:, :, [2, 1, 0]])
    ax2.imshow(aug_img[:, :, [2, 1, 0]])
    plt.show()


def visualize_coco_ann(coco, img, ann):
    plt.imshow(img)
    coco.showAnns(ann)
    plt.show()


def bgr_to_rgb(img):
    return img[:, :, [2, 1, 0]]


cmap = cm.get_cmap()
color_list = np.array(
    [
        0.000, 0.447, 0.741,
        0.850, 0.325, 0.098,
        0.929, 0.694, 0.125,
        0.494, 0.184, 0.556,
        0.466, 0.674, 0.188,
        0.301, 0.745, 0.933,
        0.635, 0.078, 0.184,
        0.300, 0.300, 0.300,
        0.600, 0.600, 0.600,
        1.000, 0.000, 0.000,
        1.000, 0.500, 0.000,
        0.749, 0.749, 0.000,
        0.000, 1.000, 0.000,
        0.000, 0.000, 1.000,
        0.667, 0.000, 1.000,
        0.333, 0.333, 0.000,
        0.333, 0.667, 0.000,
        0.333, 1.000, 0.000,
        0.667, 0.333, 0.000,
        0.667, 0.667, 0.000,
        0.667, 1.000, 0.000,
        1.000, 0.333, 0.000,
        1.000, 0.667, 0.000,
        1.000, 1.000, 0.000,
        0.000, 0.333, 0.500,
        0.000, 0.667, 0.500,
        0.000, 1.000, 0.500,
        0.333, 0.000, 0.500,
        0.333, 0.333, 0.500,
        0.333, 0.667, 0.500,
        0.333, 1.000, 0.500,
        0.667, 0.000, 0.500,
        0.667, 0.333, 0.500,
        0.667, 0.667, 0.500,
        0.667, 1.000, 0.500,
        1.000, 0.000, 0.500,
        1.000, 0.333, 0.500,
        1.000, 0.667, 0.500,
        1.000, 1.000, 0.500,
        0.000, 0.333, 1.000,
        0.000, 0.667, 1.000,
        0.000, 1.000, 1.000,
        0.333, 0.000, 1.000,
        0.333, 0.333, 1.000,
        0.333, 0.667, 1.000,
        0.333, 1.000, 1.000,
        0.667, 0.000, 1.000,
        0.667, 0.333, 1.000,
        0.667, 0.667, 1.000,
        0.667, 1.000, 1.000,
        1.000, 0.000, 1.000,
        1.000, 0.333, 1.000,
        1.000, 0.667, 1.000,
        0.167, 0.000, 0.000,
        0.333, 0.000, 0.000,
        0.500, 0.000, 0.000,
        0.667, 0.000, 0.000,
        0.833, 0.000, 0.000,
        1.000, 0.000, 0.000,
        0.000, 0.167, 0.000,
        0.000, 0.333, 0.000,
        0.000, 0.500, 0.000,
        0.000, 0.667, 0.000,
        0.000, 0.833, 0.000,
        0.000, 1.000, 0.000,
        0.000, 0.000, 0.167,
        0.000, 0.000, 0.333,
        0.000, 0.000, 0.500,
        0.000, 0.000, 0.667,
        0.000, 0.000, 0.833,
        0.000, 0.000, 1.000,
        0.000, 0.000, 0.000,
        0.143, 0.143, 0.143,
        0.286, 0.286, 0.286,
        0.429, 0.429, 0.429,
        0.571, 0.571, 0.571,
        0.714, 0.714, 0.714,
        0.857, 0.857, 0.857,
        1.000, 1.000, 1.000,
        0.50, 0.5, 0
    ]
).astype(np.float32)
colors = color_list.reshape((-1, 3)) * 255
colors = np.array(colors, dtype=np.uint8).reshape(len(colors), 1, 1, 3)


================================================
FILE: utils/log_tool.py
================================================
import numpy as np
from tensorboardX import SummaryWriter
import json
from pathlib import Path
import logging


def _flat_nested_json_dict(json_dict, flatted, sep=".", start=""):
    for k, v in json_dict.items():
        if isinstance(v, dict):
            _flat_nested_json_dict(v, flatted, sep, start + sep + str(k))
        else:
            flatted[start + sep + str(k)] = v


def flat_nested_json_dict(json_dict, sep=".") -> dict:
    """flat a nested json-like dict. this function make shadow copy.
    """
    flatted = {}
    for k, v in json_dict.items():
        if isinstance(v, dict):
            _flat_nested_json_dict(v, flatted, sep, str(k))
        else:
            flatted[str(k)] = v
    return flatted


def metric_to_str(metrics, sep='.'):
    flatted_metrics = flat_nested_json_dict(metrics, sep)
    metrics_str_list = []
    for k, v in flatted_metrics.items():
        if isinstance(v, float):
            metrics_str_list.append(f"{k}={v:.5}")
        elif isinstance(v, (list, tuple)):
            if v and isinstance(v[0], float):
                v_str = ', '.join([f"{e:.5}" for e in v])
                metrics_str_list.append(f"{k}=[{v_str}]")
            else:
                metrics_str_list.append(f"{k}={v}")
        else:
            metrics_str_list.append(f"{k}={v}")
    return ', '.join(metrics_str_list)


class SimpleModelLog:
    """For simple log.
    generate 4 kinds of log: 
    1. simple log.txt, all metric dicts are flattened to produce
    readable results.
    2. TensorBoard scalars and texts
    3. multi-line json file log.json.lst
    4. tensorboard_scalars.json, all scalars are stored in this file
        in tensorboard json format.
    """

    def __init__(self, model_dir, disable=False):
        self.model_dir = Path(model_dir)
        self.log_file = None
        self.log_mjson_file = None
        self.summary_writter = None
        self.metrics = []
        self._text_current_gstep = -1
        self._tb_texts = []
        self.disable = disable

        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        self.logger = logging.getLogger(__name__)

    def open(self):
        if self.disable:
            return self
        model_dir = self.model_dir
        assert model_dir.exists()
        summary_dir = model_dir / 'summary'
        summary_dir.mkdir(parents=True, exist_ok=True)

        log_mjson_file_path = model_dir / f'log.json.lst'
        if log_mjson_file_path.exists():
            with open(log_mjson_file_path, 'r') as f:
                for line in f.readlines():
                    self.metrics.append(json.loads(line))
        log_file_path = model_dir / f'log.txt'
        self.log_mjson_file = open(log_mjson_file_path, 'a')
        self.log_file = open(log_file_path, 'a')
        self.summary_writter = SummaryWriter(str(summary_dir))
        return self

    def close(self):
        if self.disable:
            return 
        assert self.summary_writter is not None
        self.log_mjson_file.close()
        self.log_file.close()
        tb_json_path = str(self.model_dir / "tensorboard_scalars.json")
        self.summary_writter.export_scalars_to_json(tb_json_path)
        self.summary_writter.close()
        self.log_mjson_file = None
        self.log_file = None
        self.summary_writter = None

    def log_text(self, text, step, tag="regular log"):
        if self.disable:
            return 
        """This function only add text to log.txt and tensorboard texts
        """
        print(text,flush=True)
        print(text, file=self.log_file,flush=True)
        if step > self._text_current_gstep and self._text_current_gstep != -1:
            total_text = '\n'.join(self._tb_texts)
            self.summary_writter.add_text(tag, total_text, global_step=step)
            self._tb_texts = []
            self._text_current_gstep = step
        else:
            self._tb_texts.append(text)
        if self._text_current_gstep == -1:
            self._text_current_gstep = step

    def log_metrics(self, metrics: dict, step):
        if self.disable:
            return 
        flatted_summarys = flat_nested_json_dict(metrics, "/")
        for k, v in flatted_summarys.items():
            if isinstance(v, (list, tuple)):
                if any([isinstance(e, str) for e in v]):
                    continue
                v_dict = {str(i): e for i, e in enumerate(v)}
                for k1, v1 in v_dict.items():
                    self.summary_writter.add_scalar(k + "/" + k1, v1, step)
            else:
                if isinstance(v, str):
                    continue
                self.summary_writter.add_scalar(k, v, step)
        log_str = metric_to_str(metrics)
        # print(log_str, flush=True)
        self.logger.info(log_str)
        print(log_str, file=self.log_file, flush=True)
        print(json.dumps(metrics), file=self.log_mjson_file, flush=True)

    def log_images(self, images: dict, step, prefix=''):
        if self.disable:
            return 
        for k, v in images.items():
            self.summary_writter.add_images(prefix+str(k), v, step)
            print(f"Summarize images {k}",flush=True)

    def log_histograms(self, vals: dict, step, prefix=''):
        if self.disable:
            return 
        for k, v in vals.items():
            self.summary_writter.add_histogram(prefix+str(k), v, step)
            print(f"Summarize histograms {k}",flush=True)

    # def log_distributions(self, vals: dict, step, prefix=''):
    #     if self.disable:
    #         return 
    #     for k, v in vals.items():
    #         self.summary_writter.histogram(prefix+str(k), v, step)
    #         print(f"Summarize histograms {k}",flush=True)

================================================
FILE: utils/pose_utils.py
================================================
"""
Copyright (C) 2018 NVIDIA Corporation.  All rights reserved.
Licensed under the CC BY-NC-SA 4.0 license (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode).
"""

import torch
from torch.nn import Module
from torch.autograd import Variable
from torch.nn.functional import pad
import numpy as np
import scipy.linalg as slin
import math
import transforms3d.quaternions as txq
import transforms3d.euler as txe
# see for formulas:
# https://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-801-machine-vision-fall-2004/readings/quaternions.pdf
# and "Quaternion and Rotation" - Yan-Bin Jia, September 18, 2016
#from IPython.core.debugger import set_trace

# PYTORCH

def pose_padding(P):
    """
    Padding 3x4 SE3 to 4x4
    Args:
        P ([type]): [description]
        dim ([type]): [description]
    """
    assert (P.shape[-2] == 3 and P.shape[-1] == 4) or (P.shape[-2] == 2 and P.shape[-1] == 3)

    pad = torch.zeros_like(P[...,:1,:] )
    pad[...,-1]=1
    return torch.cat([P,pad], dim=-2)
    
def vdot(v1, v2):
    """
    Dot product along the dim=1
    :param v1: N x d
    :param v2: N x d
    :return: N x 1
    """
    out = torch.mul(v1, v2)
    out = torch.sum(out, 1)
    return out


def normalize(x, p=2, dim=0):
    """
    Divides a tensor along a certain dim by the Lp norm
    :param x: 
    :param p: Lp norm
    :param dim: Dimension to normalize along
    :return: 
    """
    xn = x.norm(p=p, dim=dim)
    x = x / xn.unsqueeze(dim=dim)
    return x


def qmult(q1, q2):
    """
    Multiply 2 quaternions
    :param q1: Tensor N x 4
    :param q2: Tensor N x 4
    :return: quaternion product, Tensor N x 4
    """
    q1s, q1v = q1[:, :1], q1[:, 1:]
    q2s, q2v = q2[:, :1], q2[:, 1:]

    qs = q1s*q2s - vdot(q1v, q2v)
    qv = q1v.mul(q2s.expand_as(q1v)) + q2v.mul(q1s.expand_as(q2v)) +\
        torch.cross(q1v, q2v, dim=1)
    q = torch.cat((qs, qv), dim=1)

    # normalize
    q = normalize(q, dim=1)

    return q


def qinv(q):
    """
    Inverts quaternions
    :param q: N x 4
    :return: q*: N x 4 
    """
    q_inv = torch.cat((q[:, :1], -q[:, 1:]), dim=1)
    return q_inv


def qexp_t(q):
    """
    Applies exponential map to log quaternion
    :param q: N x 3
    :return: N x 4
    """
    n = torch.norm(q, p=2, dim=1, keepdim=True)
    n = torch.clamp(n, min=1e-8)
    q = q * torch.sin(n)
    q = q / n
    q = torch.cat((torch.cos(n), q), dim=1)
    return q


def qlog_t(q):
    """
    Applies the log map to a quaternion
    :param q: N x 4
    :return: N x 3
    """
    n = torch.norm(q[:, 1:], p=2, dim=1, keepdim=True)
    n = torch.clamp(n, min=1e-8)
    q = q[:, 1:] * torch.acos(torch.clamp(q[:, :1], min=-1.0, max=1.0))
    q = q / n
    return q


def qexp_t_safe(q):
    """
    Applies exponential map to log quaternion (safe implementation that does not
    maintain gradient flow)
    :param q: N x 3
    :return: N x 4
    """
    q = torch.from_numpy(np.asarray([qexp(qq) for qq in q.numpy()],
                                    dtype=np.float32))
    return q


def qlog_t_safe(q):
    """
    Applies the log map to a quaternion (safe implementation that does not
    maintain gradient flow)
    :param q: N x 4
    :return: N x 3
    """
    q = torch.from_numpy(np.asarray([qlog(qq) for qq in q.numpy()],
                                    dtype=np.float32))
    return q


def rotate_vec_by_q(t, q):
    """
    rotates vector t by quaternion q
    :param t: vector, Tensor N x 3
    :param q: quaternion, Tensor N x 4
    :return: t rotated by q: t' = t + 2*qs*(qv x t) + 2*qv x (qv x r) 
    """
    qs, qv = q[:, :1], q[:, 1:]
    b = torch.cross(qv, t, dim=1)
    c = 2 * torch.cross(qv, b, dim=1)
    b = 2 * b.mul(qs.expand_as(b))
    tq = t + b + c
    return tq


def compose_pose_quaternion(p1, p2):
    """
    pyTorch implementation
    :param p1: input pose, Tensor N x 7 
    :param p2: pose to apply, Tensor N x 7
    :return: output pose, Tensor N x 7
    all poses are translation + quaternion
    #!comments: first apply p2 and then p1 !!
    """
    p1t, p1q = p1[:, :3], p1[:, 3:]
    p2t, p2q = p2[:, :3], p2[:, 3:]
    q = qmult(p1q, p2q)
    t = p1t + rotate_vec_by_q(p2t, p1q)
    return torch.cat((t, q), dim=1)


def invert_pose_quaternion(p):
    """
    inverts the pose
    :param p: pose, Tensor N x 7
    :return: inverted pose
    """
    t, q = p[:, :3], p[:, 3:]
    q_inv = qinv(q)
    tinv = -rotate_vec_by_q(t, q_inv)
    return torch.cat((tinv, q_inv), dim=1)


def calc_vo(p0, p1):
    """
    calculates VO (in the p0 frame) from 2 poses
    :param p0: N x 7
    :param p1: N x 7
    """
    # assert p0.shape==p1.shape
    return compose_pose_quaternion(invert_pose_quaternion(p0), p1)


def calc_vo_logq(p0, p1):
    """
    VO (in the p0 frame) (logq)
    :param p0: N x 6
    :param p1: N x 6
    :return: N-1 x 6
    """
    q0 = qexp_t(p0[:, 3:])
    q1 = qexp_t(p1[:, 3:])
    vos = calc_vo(torch.cat((p0[:, :3], q0), dim=1), torch.cat((p1[:, :3], q1),
                                                               dim=1))
    vos_q = qlog_t(vos[:, 3:])
    return torch.cat((vos[:, :3], vos_q), dim=1)


def calc_vo_relative(p0, p1):
    """
    calculates VO (in the world frame) from 2 poses
    :param p0: N x 7
    :param p1: N x 7
    """
    vos_t = p1[:, :3] - p0[:, :3]
    vos_q = qmult(qinv(p0[:, 3:]), p1[:, 3:])
    return torch.cat((vos_t, vos_q), dim=1)


def calc_vo_relative_logq(p0, p1):
    """
    Calculates VO (in the world frame) from 2 poses (log q)
    :param p0: N x 6
    :param p1: N x 6
    :return:
    """
    q0 = qexp_t(p0[:, 3:])
    q1 = qexp_t(p1[:, 3:])
    vos = calc_vo_relative(torch.cat((p0[:, :3], q0), dim=1),
                           torch.cat((p1[:, :3], q1), dim=1))
    vos_q = qlog_t(vos[:, 3:])
    return torch.cat((vos[:, :3], vos_q), dim=1)


def calc_vo_relative_logq_safe(p0, p1):
    """
    Calculates VO (in the world frame) from 2 poses (log q) through numpy fns
    :param p0: N x 6
    :param p1: N x 6
    :return:
    """
    vos_t = p1[:, :3] - p0[:, :3]
    q0 = qexp_t_safe(p0[:, 3:])
    q1 = qexp_t_safe(p1[:, 3:])
    vos_q = qmult(qinv(q0), q1)
    vos_q = qlog_t_safe(vos_q)
    return torch.cat((vos_t, vos_q), dim=1)


def calc_vo_logq_safe(p0, p1):
    """
    VO in the p0 frame using numpy fns
    :param p0:
    :param p1:
    :return:
    """
    vos_t = p1[:, :3] - p0[:, :3]
    q0 = qexp_t_safe(p0[:, 3:])
    q1 = qexp_t_safe(p1[:, 3:])
    vos_t = rotate_vec_by_q(vos_t, qinv(q0))
    vos_q = qmult(qinv(q0), q1)
    vos_q = qlog_t_safe(vos_q)
    return torch.cat((vos_t, vos_q), dim=1)


def calc_vos_simple(poses):
    """
    calculate the VOs, from a list of consecutive poses
    :param poses: N x T x 7
    :return: N x (T-1) x 7
    """
    vos = []
    for p in poses:
        pvos = [p[i+1].unsqueeze(0) - p[i].unsqueeze(0)
                for i in range(len(p)-1)]
        vos.append(torch.cat(pvos, dim=0))
    vos = torch.stack(vos, dim=0)

    return vos


def calc_vos(poses):
    """
    calculate the VOs, from a list of consecutive poses (in the p0 frame)
    :param poses: N x T x 7
    :return: N x (T-1) x 7
    """
    vos = []
    for p in poses:
        pvos = [calc_vo_logq(p[i].unsqueeze(0), p[i+1].unsqueeze(0))
                for i in range(len(p)-1)]
        vos.append(torch.cat(pvos, dim=0))
    vos = torch.stack(vos, dim=0)
    return vos


def calc_vos_relative(poses):
    """
    calculate the VOs, from a list of consecutive poses (in the world frame)
    :param poses: N x T x 7
    :return: N x (T-1) x 7
    """
    vos = []
    for p in poses:
        pvos = [calc_vo_relative_logq(p[i].unsqueeze(0), p[i+1].unsqueeze(0))
                for i in range(len(p)-1)]
        vos.append(torch.cat(pvos, dim=0))
    vos = torch.stack(vos, dim=0)
    return vos


def calc_vos_safe(poses):
    """
    calculate the VOs, from a list of consecutive poses
    :param poses: N x T x 7
    :return: N x (T-1) x 7
    """
    vos = []
    for p in poses:
        pvos = [calc_vo_logq_safe(p[i].unsqueeze(0), p[i+1].unsqueeze(0))
                for i in range(len(p)-1)]
        vos.append(torch.cat(pvos, dim=0))
    vos = torch.stack(vos, dim=0)
    return vos


def calc_vos_safe_fc(poses):
    """
    calculate the VOs, from a list of consecutive poses (fully connected)
    :param poses: N x T x 7
    :return: N x TC2 x 7
    """
    vos = []
    for p in poses:
        pvos = []
        for i in range(p.size(0)):
            for j in range(i+1, p.size(0)):
                pvos.append(calc_vo_logq_safe(
                    p[i].unsqueeze(0), p[j].unsqueeze(0)))
        vos.append(torch.cat(pvos, dim=0))
    vos = torch.stack(vos, dim=0)
    return vos

# NUMPY


def qlog(q):
    """
    Applies logarithm map to q
    :param q: (4,)
    :return: (3,)
    """
    if all(q[1:] == 0):
        q = np.zeros(3)
    else:
        q = np.arccos(q[0]) * q[1:] / np.linalg.norm(q[1:])
    return q


def qexp(q):
    """
    Applies the exponential map to q
    :param q: (3,)
    :return: (4,)
    """
    n = np.linalg.norm(q)
    q = np.hstack((np.cos(n), np.sinc(n/np.pi)*q))
    return q


def process_poses(poses_in, mean_t, std_t, align_R, align_t, align_s):
    """
    processes the 1x12 raw pose from dataset by aligning and then normalizing
    :param poses_in: N x 12
    :param mean_t: 3
    :param std_t: 3
    :param align_R: 3 x 3
    :param align_t: 3
    :param align_s: 1
    :return: processed poses (translation + quaternion) N x 7
    """
    poses_out = np.zeros((len(poses_in), 6))
    poses_out[:, 0:3] = poses_in[:, [3, 7, 11]]

    # align
    for i in range(len(poses_out)):
        R = poses_in[i].reshape((3, 4))[:3, :3]
        q = txq.mat2quat(np.dot(align_R, R))
        q *= np.sign(q[0])  # constrain to hemisphere
        q = qlog(q)
        poses_out[i, 3:] = q
        t = poses_out[i, :3] - align_t
        poses_out[i, :3] = align_s * \
            np.dot(align_R, t[:, np.newaxis]).squeeze()

    # normalize translation
    poses_out[:, :3] -= mean_t
    poses_out[:, :3] /= std_t
    return poses_out


def log_quaternion_angular_error(q1, q2):
    return quaternion_angular_error(qexp(q1), qexp(q2))


def quaternion_angular_error(q1, q2):
    """
    angular error between two quaternions
    :param q1: (4, )
    :param q2: (4, )
    :return:
    """
    d = abs(np.dot(q1, q2))
    d = min(1.0, max(-1.0, d))
    theta = 2 * np.arccos(d) * 180 / np.pi
    return theta


def skew(x):
    """
    returns skew symmetric matrix from vector
    :param x: 3 x 1
    :return:
    """
    s = np.asarray([[0, -x[2], x[1]], [x[2], 0, -x[0]], [-x[1], x[0], 0]])
    return s


def dpq_q(p):
    """
    returns the jacobian of quaternion product pq w.r.t. q
    :param p: 4 x 1
    :return: 4 x 4
    """
    J = np.zeros((4, 4))
    J[0, 0] = p[0]
    J[0, 1:] = -p[1:].squeeze()
    J[1:, 0] = p[1:].squeeze()
    J[1:, 1:] = p[0] * np.eye(3) + skew(p[1:])
    return J


def dpsq_q(p):
    """
    returns the jacobian of quaternion product (p*)q w.r.t. q
    :param p: 4 x 1
    :return: 4 x 4
    """
    J = np.zeros((4, 4))
    J[0, 0] = p[0]
    J[0, 1:] = -p[1:].squeeze()
    J[1:, 0] = -p[1:].squeeze()
    J[1:, 1:] = p[0] * np.eye(3) - skew(p[1:])
    return J


def dpsq_p(q):
    """
    returns the jacobian of quaternion product (p*)q w.r.t. p
    :param q: 4 x 1
    :return: 4 x 4
    """
    J = np.zeros((4, 4))
    J[0, 0] = q[0]
    J[0, 1:] = q[1:].squeeze()
    J[1:, 0] = q[1:].squeeze()
    J[1:, 1:] = -q[0] * np.eye(3) + skew(q[1:])
    return J


def dqstq_q(q, t):
    """
    jacobian of q* t q w.r.t. q
    :param q: 4 x 1
    :param t: 3 x 1
    :return: 3 x 4
    """
    J = np.zeros((3, 4))
    J[:, :1] = q[0]*t - np.cross(q[1:], t, axis=0)
    J[:, 1:] = -np.dot(t, q[1:].T) + np.dot(t.T, q[1:])*np.eye(3) + \
        np.dot(q[1:], t.T) + q[0]*skew(t)
    J *= 2
    return J


def dqstq_t(q):
    """
    jacobian of q* t q w.r.t. t
    :param q: 4 x 1
    :return: 3 x 3
    """
    J = (q[0]*q[0] - np.dot(q[1:].T, q[1:])) * np.eye(3) + 2*np.dot(q[1:], q[1:].T) -\
        2*q[0]*skew(q[1:])
    return J


def m_rot(x):
    """
    returns Jacobian of exponential map w.r.t. manifold increment
    :param x: part of state vector affected by increment, 4 x 1
    :return: 4 x 3
    """
    # jacobian of full q wrt qm (quaternion update on manifold),
    # evaluated at qv = (0, 0, 0)
    # full q is derived using either the exponential map or q0 = sqrt(1-qm^2)
    jm = np.vstack((np.zeros((1, 3)), np.eye(3)))  # 4 x 3
    m = np.dot(dpq_q(p=x), jm)
    return m


class PoseGraph:
    def __init__(self):
        """
        implements pose graph optimization from
        "Hybrid Hessians for Optimization of Pose Graphs" - Y. LeCun et al
        and "A Tutorial on Graph-Based SLAM" - W. Burgard et al
        """
        self.N = 0
        self.z = np.zeros((0, 0))

    def jacobian(self, L_ax, L_aq, L_rx, L_rq):
        # 6 because updates for rotation are on manifold
        J = np.zeros((0, 6*self.N))

        # unary constraints
        for i in range(self.N):
            # translation constraint
            jt = np.zeros((3, J.shape[1]))
            jt[:, 6*i: 6*i+3] = np.eye(3)
            J = np.vstack((J, np.dot(L_ax, jt)))

            # rotation constraint
            jr = np.zeros((4, J.shape[1]))
            jr[:, 6*i+3: 6*i+6] = m_rot(x=self.z[7*i+3: 7*i+7])
            J = np.vstack((J, np.dot(L_aq, jr)))

        # pairwise constraints
        for i in range(self.N-1):
                # translation constraint
            jt = np.zeros((3, J.shape[1]))
            dt = dqstq_t(q=self.z[7*i+3: 7*i+7])
            # dt = np.eye(3)
            jt[:, 6*i: 6*i+3] = -dt
            jt[:, 6*(i+1): 6*(i+1)+3] = dt
            # m = m_rot(x=self.z[7*i+3 : 7*i+7])
            # a = dqstq_q(q=self.z[7*i+3 : 7*i+7],
            #             t=self.z[7*(i+1) : 7*(i+1)+3]-self.z[7*i : 7*i+3])
            # jt[:, 6*i+3 : 6*i+6] = np.dot(a, m)
            J = np.vstack((J, np.dot(L_rx, jt)))

            # rotation constraint
            jr = np.zeros((4, J.shape[1]))
            m = m_rot(x=self.z[7*i+3: 7*i+7])
            a = dpsq_p(q=self.z[7*(i+1)+3: 7*(i+1)+7])
            jr[:, 6*i+3: 6*i+6] = np.dot(a, m)
            m = m_rot(x=self.z[7*(i+1)+3: 7*(i+1)+7])
            b = dpsq_q(p=self.z[7*i+3: 7*i+7])
            jr[:, 6*(i+1)+3: 6*(i+1)+6] = np.dot(b, m)
            J = np.vstack((J, np.dot(L_rq, jr)))

        return J

    def residuals(self, poses, vos, L_ax, L_aq, L_rx, L_rq):
        """
        computes the residuals
        :param poses: N x 7
        :param vos: (N-1) x 7
        :param L_ax: 3 x 3
        :param L_aq: 4 x 4
        :param L_rx: 3 x 3
        :param L_rq: 4 x 4
        :return:
        """
        r = np.zeros((0, 1))

        # unary residuals
        L = np.zeros((7, 7))
        L[:3, :3] = L_ax
        L[3:, 3:] = L_aq
        for i in range(self.N):
            rr = self.z[7*i: 7*(i+1)] - np.reshape(poses[i], (-1, 1))
            r = np.vstack((r, np.dot(L, rr)))

        # pairwise residuals
        for i in range(self.N-1):
            # translation residual
            v = self.z[7*(i+1):7*(i+1)+3, 0]-self.z[7*i:7*i+3, 0]
            q = txq.qinverse(self.z[7*i+3:7*i+7, 0])
            rt = txq.rotate_vector(v, q)
            rt = rt[:, np.newaxis] - vos[i, :3].reshape((-1, 1))
            # rt = self.z[7*(i+1) : 7*(i+1)+3] - self.z[7*i : 7*i+3] - \
            #     vos[i, :3].reshape((-1, 1))
            r = np.vstack((r, np.dot(L_rx, rt)))

            # rotation residual
            q0 = self.z[7*i+3: 7*i+7].squeeze()
            q1 = self.z[7*(i+1)+3: 7*(i+1)+7].squeeze()
            qvo = txq.qmult(txq.qinverse(q0), q1).reshape((-1, 1))
            rq = qvo - vos[i, 3:].reshape((-1, 1))
            r = np.vstack((r, np.dot(L_rq, rq)))

        return r

    def update_on_manifold(self, x):
        """
        Updates the state vector on manifold
        :param x: manifold increment, column vector
        :return:
        """
        for i in range(self.N):
            # update translation
            t = x[6*i: 6*i+3]
            self.z[7*i: 7*i+3] += t

            # update rotation
            qm = x[6*i+3: 6*i+6]  # quaternion on the manifold
            dq = np.zeros(4)
            # method in Burgard paper
            # dq[1:] = qm.squeeze()
            # dq[0] = math.sqrt(1 - sum(np.square(qm)))  # incremental quaternion
            # method of exponential map
            n = np.linalg.norm(qm)
            dq[0] = math.cos(n)
            dq[1:] = np.sinc(n/np.pi) * qm.squeeze()
            q = self.z[7*i+3: 7*i+7].squeeze()
            q = txq.qmult(q, dq).reshape((-1, 1))
            self.z[7*i+3: 7*i+7] = q

    def optimize(self, poses, vos, sax=1, saq=1, srx=1, srq=1, n_iters=10):
        """
        run PGO, with init = poses
        :param poses:
        :param vos:
        :param sax: sigma for absolute translation
        :param saq: sigma for absolute rotation
        :param srx: sigma for relative translation
        :param srq: sigma for relative rotation
        :param n_iters:
        :return:
        """
        self.N = len(poses)
        # init state vector with the predicted poses
        self.z = np.reshape(poses.copy(), (-1, 1))

        # construct the information matrices
        L_ax = np.linalg.cholesky(np.eye(3) / sax)
        L_aq = np.linalg.cholesky(np.eye(4) / saq)
        L_rx = np.linalg.cholesky(np.eye(3) / srx)
        L_rq = np.linalg.cholesky(np.eye(4) / srq)

        for n_iter in range(n_iters):
            J = self.jacobian(L_ax.T, L_aq.T, L_rx.T, L_rq.T)
            r = self.residuals(poses.copy(), vos.copy(), L_ax.T, L_aq.T, L_rx.T,
                               L_rq.T)
            H = np.dot(J.T, J)  # hessian
            b = np.dot(J.T, r)  # residuals

            # solve Hx = -b for x
            R = slin.cholesky(H)  # H = R' R
            y = slin.solve_triangular(R.T, -b)
            x = slin.solve_triangular(R, y)

            self.update_on_manifold(x)

        return self.z.reshape((-1, 7))


class PoseGraphFC:
    def __init__(self):
        """
        implements pose graph optimization from
        "Hybrid Hessians for Optimization of Pose Graphs" - Y. LeCun et al
        and "A Tutorial on Graph-Based SLAM" - W. Burgard et al
        fully connected version
        """
        self.N = 0
        self.z = np.zeros((0, 0))

    def jacobian(self, L_ax, L_aq, L_rx, L_rq):
        # 6 because updates for rotation are on manifold
        J = np.zeros((0, 6*self.N))

        # unary constraints
        for i in range(self.N):
            # translation constraint
            jt = np.zeros((3, J.shape[1]))
            jt[:, 6*i: 6*i+3] = np.eye(3)
            J = np.vstack((J, np.dot(L_ax, jt)))

            # rotation constraint
            jr = np.zeros((4, J.shape[1]))
            jr[:, 6*i+3: 6*i+6] = m_rot(x=self.z[7*i+3: 7*i+7])
            J = np.vstack((J, np.dot(L_aq, jr)))

        # pairwise constraints
        for i in range(self.N):
            for j in range(i+1, self.N):
                # translation constraint
                jt = np.zeros((3, J.shape[1]))
                dt = dqstq_t(q=self.z[7*i+3: 7*i+7])
                # dt = np.eye(3)
                jt[:, 6*i: 6*i+3] = -dt
                jt[:, 6*j: 6*j+3] = dt
                # m = m_rot(x=self.z[7*i+3 : 7*i+7])
                # a = dqstq_q(q=self.z[7*i+3 : 7*i+7],
                #             t=self.z[7*(i+1) : 7*(i+1)+3]-self.z[7*i : 7*i+3])
                # jt[:, 6*i+3 : 6*i+6] = np.dot(a, m)
                J = np.vstack((J, np.dot(L_rx, jt)))

                # rotation constraint
                jr = np.zeros((4, J.shape[1]))
                m = m_rot(x=self.z[7*i+3: 7*i+7])
                a = dpsq_p(q=self.z[7*j+3: 7*j+7])
                jr[:, 6*i+3: 6*i+6] = np.dot(a, m)
                m = m_rot(x=self.z[7*j+3: 7*j+7])
                b = dpsq_q(p=self.z[7*i+3: 7*i+7])
                jr[:, 6*j+3: 6*j+6] = np.dot(b, m)
                J = np.vstack((J, np.dot(L_rq, jr)))

        return J

    def residuals(self, poses, vos, L_ax, L_aq, L_rx, L_rq):
        """
        computes the residuals
        :param poses: N x 7
        :param vos: (N-1) x 7
        :param L_ax: 3 x 3
        :param L_aq: 4 x 4
        :param L_rx: 3 x 3
        :param L_rq: 4 x 4
        :return: 
        """
        r = np.zeros((0, 1))

        # unary residuals
        L = np.zeros((7, 7))
        L[:3, :3] = L_ax
        L[3:, 3:] = L_aq
        for i in range(self.N):
            rr = self.z[7*i: 7*(i+1)] - np.reshape(poses[i], (-1, 1))
            r = np.vstack((r, np.dot(L, rr)))

        # pairwise residuals
        k = 0
        for i in range(self.N):
            for j in range(i+1, self.N):
                # translation residual
                v = self.z[7*j:7*j+3, 0]-self.z[7*i:7*i+3, 0]
                q = txq.qinverse(self.z[7*i+3:7*i+7, 0])
                rt = txq.rotate_vector(v, q)
                rt = rt[:, np.newaxis] - vos[k, :3].reshape((-1, 1))
                # rt = self.z[7*(i+1) : 7*(i+1)+3] - self.z[7*i : 7*i+3] - \
                #     vos[i, :3].reshape((-1, 1))
                r = np.vstack((r, np.dot(L_rx, rt)))

                # rotation residual
                q0 = self.z[7*i+3: 7*i+7].squeeze()
                q1 = self.z[7*j+3: 7*j+7].squeeze()
                qvo = txq.qmult(txq.qinverse(q0), q1).reshape((-1, 1))
                rq = qvo - vos[k, 3:].reshape((-1, 1))
                r = np.vstack((r, np.dot(L_rq, rq)))
                k += 1

        return r

    def update_on_manifold(self, x):
        """
        Updates the state vector on manifold
        :param x: manifold increment, column vector
        :return: 
        """
        for i in range(self.N):
            # update translation
            t = x[6*i: 6*i+3]
            self.z[7*i: 7*i+3] += t

            # update rotation
            qm = x[6*i+3: 6*i+6]  # quaternion on the manifold
            dq = np.zeros(4)
            # method in Burgard paper
            # dq[1:] = qm.squeeze()
            # dq[0] = math.sqrt(1 - sum(np.square(qm)))  # incremental quaternion
            # method of exponential map
            n = np.linalg.norm(qm)
            dq[0] = math.cos(n)
            dq[1:] = np.sinc(n/np.pi) * qm.squeeze()
            q = self.z[7*i+3: 7*i+7].squeeze()
            q = txq.qmult(q, dq).reshape((-1, 1))
            self.z[7*i+3: 7*i+7] = q

    def optimize(self, poses, vos, sax=1, saq=1, srx=1, srq=1, n_iters=10):
        """
        run PGO, with init = poses
        :param poses:
        :param vos:
        :param sax: sigma for absolute translation
        :param saq: sigma for absolute rotation
        :param srx: sigma for relative translation
        :param srq: sigma for relative rotation
        :param n_iters:
        :return:
        """
        self.N = len(poses)
        # init state vector with the predicted poses
        self.z = np.reshape(poses.copy(), (-1, 1))

        # construct the information matrices
        L_ax = np.linalg.cholesky(np.eye(3) / sax)
        L_aq = np.linalg.cholesky(np.eye(4) / saq)
        L_rx = np.linalg.cholesky(np.eye(3) / srx)
        L_rq = np.linalg.cholesky(np.eye(4) / srq)

        for n_iter in range(n_iters):
            J = self.jacobian(L_ax.T, L_aq.T, L_rx.T, L_rq.T)
            r = self.residuals(poses.copy(), vos.copy(), L_ax.T, L_aq.T, L_rx.T,
                               L_rq.T)
            H = np.dot(J.T, J)  # hessian
            b = np.dot(J.T, r)  # residuals

            # solve Hx = -b for x
            R = slin.cholesky(H)  # H = R' R
            y = slin.solve_triangular(R.T, -b)
            x = slin.solve_triangular(R, y)

            self.update_on_manifold(x)

        return self.z.reshape((-1, 7))


def optimize_poses(pred_poses, vos=None, fc_vos=False, target_poses=None,
                   sax=1, saq=1, srx=1, srq=1):
    """
    optimizes poses using either the VOs or the target poses (calculates VOs
    from them)
    :param pred_poses: N x 7
    :param vos: (N-1) x 7
    :param fc_vos: whether to use relative transforms between all frames in a fully
    connected manner, not just consecutive frames
    :param target_poses: N x 7
    :param: sax: covariance of pose translation (1 number)
    :param: saq: covariance of pose rotation (1 number)
    :param: srx: covariance of VO translation (1 number)
    :param: srq: covariance of VO rotation (1 number)
    :return:
    """
    pgo = PoseGraphFC() if fc_vos else PoseGraph()
    if vos is None:
        if target_poses is not None:
            # calculate the VOs (in the pred_poses frame)
            vos = np.zeros((len(target_poses)-1, 7))
            for i in range(len(vos)):
                vos[i, :3] = target_poses[i+1, :3] - target_poses[i, :3]
                q0 = target_poses[i, 3:]
                q1 = target_poses[i+1, 3:]
                vos[i, 3:] = txq.qmult(txq.qinverse(q0), q1)
        else:
            print('Specify either VO or target poses')
            return None
    optim_poses = pgo.optimize(poses=pred_poses, vos=vos, sax=sax, saq=saq,
                               srx=srx, srq=srq)
    return optim_poses


def align_3d_pts(x1, x2):
    """Align two sets of 3d points using the method of Horn (closed-form).

    Find optimal s, R, t, such that

            s*R*(x1-t) = x2

    Input:
    x1 -- first trajectory (3xn)
    x2 -- second trajectory (3xn)

    Output:
    R -- rotation matrix (3x3)
    t -- translation vector (3x1)
    s -- scale (1x1)
    written by Jinwei Gu
    """
    x1c = x1.mean(1, keepdims=True)
    x2c = x2.mean(1, keepdims=True)

    x1_zerocentered = x1 - x1c
    x2_zerocentered = x2 - x2c

    W = np.zeros((3, 3))
    r1 = 0
    r2 = 0
    for i in range(x1.shape[1]):
        a = x1_zerocentered[:, i]
        b = x2_zerocentered[:, i]
        W += np.outer(b, a)
        r1 += np.dot(a.T, a)
        r2 += np.dot(b.T, b)

    s = np.asscalar(np.sqrt(r2/r1))

    U, d, Vh = np.linalg.svd(W)
    S = np.eye(3)
    if np.linalg.det(np.dot(U, Vh)) < 0:
        S[2, 2] = -1
    R = np.dot(U, np.dot(S, Vh))
    t = x1c - (1/s) * np.dot(R.transpose(), x2c)

    # ---- align ----
    #x2a = s * np.dot(R, x1-t)
    #error = x2a - x2

    return R, t, s


def align_2d_pts(x1, x2):
    """Align two sets of 3d points using the method of Horn (closed-form).

    Find optimal s, R, t, such that

            s*R*(x1-t) = x2

    Input:
    x1 -- first trajectory (2xn)
    x2 -- second trajectory (2xn)

    Output:
    R -- rotation matrix (2x2)
    t -- translation vector (2x1)
    s -- scale (1x1)
    written by Jinwei Gu
    """
    x1c = x1.mean(1, keepdims=True)
    x2c = x2.mean(1, keepdims=True)

    x1_zerocentered = x1 - x1c
    x2_zerocentered = x2 - x2c

    W = np.zeros((2, 2))
    r1 = 0
    r2 = 0
    for i in range(x1.shape[1]):
        a = x1_zerocentered[:, i]
        b = x2_zerocentered[:, i]
        W += np.outer(b, a)
        r1 += np.dot(a.T, a)
        r2 += np.dot(b.T, b)

    s = np.asscalar(np.sqrt(r2/r1))

    U, d, Vh = np.linalg.svd(W)
    S = np.eye(2)
    if np.linalg.det(np.dot(U, Vh)) < 0:
        S[1, 1] = -1
    R = np.dot(U, np.dot(S, Vh))
    t = x1c - (1/s) * np.dot(R.transpose(), x2c)

    # ---- align ----
    #x2a = s * np.dot(R, x1-t)
    #error = x2a - x2

    return R, t, s


def align_3d_pts_noscale(x1, x2):
    """Align two sets of 3d points using the method of Horn (closed-form).

    Find optimal s, R, t, such that

            s*R*(x1-t) = x2

    Input:
    x1 -- first trajectory (3xn)
    x2 -- second trajectory (3xn)

    Output:
    R -- rotation matrix (3x3)
    t -- translation vector (3x1)
    written by Jinwei Gu
    """
    x1c = x1.mean(1, keepdims=True)
    x2c = x2.mean(1, keepdims=True)

    x1_zerocentered = x1 - x1c
    x2_zerocentered = x2 - x2c

    W = np.zeros((3, 3))
    r1 = 0
    r2 = 0
    for i in range(x1.shape[1]):
        a = x1_zerocentered[:, i]
        b = x2_zerocentered[:, i]
        W += np.outer(b, a)
        r1 += np.dot(a.T, a)
        r2 += np.dot(b.T, b)

    #s = np.asscalar(np.sqrt(r2/r1))
    s = 1

    U, d, Vh = np.linalg.svd(W)
    S = np.eye(3)
    if np.linalg.det(np.dot(U, Vh)) < 0:
        S[2, 2] = -1
    R = np.dot(U, np.dot(S, Vh))
    t = x1c - np.dot(R.transpose(), x2c)

    # ---- align ----
    #x2a = s * np.dot(R, x1-t)
    #error = x2a - x2

    return R, t, s


def align_2d_pts_noscale(x1, x2):
    """Align two sets of 3d points using the method of Horn (closed-form).

    Find optimal s, R, t, such that

            s*R*(x1-t) = x2

    Input:
    x1 -- first trajectory (2xn)
    x2 -- second trajectory (2xn)

    Output:
    R -- rotation matrix (2x2)
    t -- translation vector (2x1)
    s -- scale (1x1)
    written by Jinwei Gu
    """
    x1c = x1.mean(1, keepdims=True)
    x2c = x2.mean(1, keepdims=True)

    x1_zerocentered = x1 - x1c
    x2_zerocentered = x2 - x2c

    W = np.zeros((2, 2))
    r1 = 0
    r2 = 0
    for i in range(x1.shape[1]):
        a = x1_zerocentered[:, i]
        b = x2_zerocentered[:, i]
        W += np.outer(b, a)
        r1 += np.dot(a.T, a)
        r2 += np.dot(b.T, b)

    #s = np.asscalar(np.sqrt(r2/r1))
    s = 1

    U, d, Vh = np.linalg.svd(W)
    S = np.eye(2)
    if np.linalg.det(np.dot(U, Vh)) < 0:
        S[1, 1] = -1
    R = np.dot(U, np.dot(S, Vh))
    t = x1c - (1/s) * np.dot(R.transpose(), x2c)

    # ---- align ----
    #x2a = s * np.dot(R, x1-t)
    #error = x2a - x2

    return R, t, s


def align_camera_poses(o1, o2, R1, R2, use_rotation_constraint=True):
    """Align two sets of camera poses (R1,o1/R2,o2) using the method of Horn (closed-form).

    Find optimal s, R, t, such that

            s*R*(o1-t) = o2   (1)

            R*R1 = R2         (2)

    where R1/R2 are the camera-to-world matrices, o1/o2 are the center
    of the cameras.

    Input:
    o1 -- camera centers (3xn)
    o2 -- camera centers (3xn)
    R1 -- camera poses (camera-to-world matrices) (nx3x3)
    R2 -- camera poses (camera-to-world matrices) (nx3x3)
    use_rotation_constraint -- if False, uses only Eq(1) to solve.

    Output:
    R -- rotation matrix (3x3)
    t -- translation vector (3x1)
    s -- scale (1x1)

    Note, when use_rotation_constraint=False, it is the same problem as
    above, i.e., to align two sets of 3D points.

    When use_rotation_constraint=True, we note Eq(2) is the same
    equation as Eq(1), after we zero-center and remove the scale. So, we
    can use the same approach (SVD).
    written by Jinwei Gu
    """
    if not use_rotation_constraint:
        return align_3d_pts(o1, o2)

    o1c = o1.mean(1, keepdims=True)
    o2c = o2.mean(1, keepdims=True)
    o1_zerocentered = o1 - o1c
    o2_zerocentered = o2 - o2c

    W = np.zeros((3, 3))
    r1 = 0
    r2 = 0
    for i in range(o1.shape[1]):
        a = o1_zerocentered[:, i]
        b = o2_zerocentered[:, i]
        W += np.outer(b, a)
        r1 += np.dot(a.T, a)
        r2 += np.dot(b.T, b)

    s = np.asscalar(np.sqrt(r2/r1))

    # add rotation constraints
    for i in range(o1.shape[1]):
        d1 = np.squeeze(R1[i, :, :])
        d2 = np.squeeze(R2[i, :, :])
        for c in range(3):
            a = d1[:, c]
            b = d2[:, c]
            W += np.outer(b, a)

    U, d, Vh = np.linalg.svd(W)
    S = np.eye(3)
    if np.linalg.det(np.dot(U, Vh)) < 0:
        S[2, 2] = -1
    R = np.dot(U, np.dot(S, Vh))
    t = o1c - (1/s) * np.dot(R.transpose(), o2c)

    # ---- align ----
    #o2a = s * np.dot(R, o1-t)
    #R2a = np.dot(R, R1)

    return R, t, s


def test_align_3d_pts():
    import transforms3d.euler as txe
    N = 10
    x1 = np.random.rand(3, N)

    noise = np.random.rand(3, N)*0.01

    s = np.random.rand()
    t = np.random.rand(3, 1)
    R = txe.euler2mat(np.random.rand(), np.random.rand(), np.random.rand())
    R = R[:3, :3]

    x2 = s*np.dot(R, x1-t) + noise

    Re, te, se = align_3d_pts(x1, x2)

    print('scale ', s, se)
    print('rotation matrx ', R, Re)
    print('translation ', t, te)


def test_align_camera_poses():
    import transforms3d.euler as txe

    N = 10
    o1 = np.random.rand(3, N)

    noise = np.random.rand(3, N)*0.01

    s = np.random.rand()
    t = np.random.rand(3, 1)
    R = txe.euler2mat(np.random.rand(), np.random.rand(), np.random.rand())
    R = R[:3, :3]

    o2 = s*np.dot(R, o1-t) + noise

    R1 = np.zeros((N, 3, 3))
    R2 = np.zeros((N, 3, 3))
    for i in range(N):
        Ri = txe.euler2mat(
            np.random.rand(), np.random.rand(), np.random.rand())
        R1[i, :, :] = Ri[:3, :3]
        R2[i, :, :] = np.dot(R, Ri[:3, :3])

    Re1, te1, se1 = align_camera_poses(o1, o2, R1, R2, False)
    Re2, te2, se2 = align_camera_poses(o1, o2, R1, R2, True)

    print('scale ', s, se1, se2)
    print('rotation matrx ', R, Re1, Re2)
    print('translation ', t, te1, te2)


def pgo_test_poses():
    """
    generates test poses and vos for the various PGO implementations
    :return:
    """
    poses = np.zeros((3, 7))
    for i in range(poses.shape[0]):
        poses[i, :3] = i
        angle = math.radians(10*i)
        R = txe.euler2mat(angle, angle, angle)
        q = txq.mat2quat(R)
        poses[i, 3:] = q

    vos = np.zeros((poses.shape[0]-1, 7))
    for i in range(vos.shape[0]):
        vos[i, 0] = 1.5
        vos[i, 1] = 0.5
        vos[i, 2] = 1.0
        R = txe.euler2mat(math.radians(15), math.radians(10), math.radians(5))
        q = txq.mat2quat(R)
        vos[i, 3:] = q

    return poses, vos


def pgo_test_poses1():
    poses = np.zeros((3, 7))
    R = txe.euler2mat(0, 0, np.deg2rad(45))
    q = txq.mat2quat(R)
    poses[:, 3:] = q
    for i in range(len(poses)):
        poses[i, :3] = np.asarray([i, i, 0])

    pt = np.zeros((len(poses), 6))
    pt[:, :3] = poses[:, :3]
    for i, p in enumerate(poses):
        pt[i, 3:] = qlog(p[3:])
    pt = torch.from_numpy(pt.astype(np.float32))
    vost = calc_vos_safe_fc(pt.unsqueeze(0))[0].numpy()
    vos = np.zeros((len(vost), 7))
    vos[:, :3] = vost[:, :3]
    for i, p in enumerate(vost):
        vos[i, 3:] = qexp(p[3:])

    # perturbation
    vos[0, 0] = np.sqrt(2) - 0.5
    vos[1, 0] = np.sqrt(2) - 0.5

    return poses, vos


def print_poses(poses):
    print('translations')
    print(poses[:, :3])
    print('euler')
    for i in range(poses.shape[0]):
        a = txe.mat2euler(txq.quat2mat(poses[i, 3:]))
        print([np.rad2deg(aa) for aa in a])


def test_pgo():
    """
    Tests the full pose graph optimization implementation
    :return: bool
    """
    pred_poses, vos = pgo_test_poses1()
    print('pred poses')
    print_poses(pred_poses)
    print('vos')
    print_poses(vos)

    pgo = PoseGraph()
    optimized_poses = pgo.optimize(pred_poses, vos)

    print('optimized')
    print_poses(optimized_poses)


def test_pose_utils():
    """
    Tests the pose utils
    :return: 
    """
    TEST_COMPOSE = True
    TEST_INV = True

    def ra(_): return np.random.uniform(0, 2*math.pi)

    if TEST_COMPOSE:
        print('Testing pose composing...')
        R1 = txe.euler2mat(ra(1), ra(1), ra(1))
        t1 = np.random.rand(3)
        R2 = txe.euler2mat(ra(1), ra(1), ra(1))
        t2 = np.random.rand(3)

        # homogeneous matrix method
        R = np.dot(R1, R2)
        t = t1 + np.dot(R1, t2)
        print('From homogeneous matrices, t = ')
        print(t)
        print('R = ')
        print(R)

        # quaternion method
        q1 = txq.mat2quat(R1)
        q2 = txq.mat2quat(R2)

        p1 = torch.cat((torch.from_numpy(t1), torch.from_numpy(q1)))
        p2 = torch.cat((torch.from_numpy(t2), torch.from_numpy(q2)))
        p = compose_pose_quaternion(
            torch.unsqueeze(p1, 0), torch.unsqueeze(p2, 0))
        t = p[:, :3].numpy().squeeze()
        q = p[:, 3:].numpy().squeeze()
        print('From quaternions, t = ')
        print(t)
        print('R = ')
        print(txe.quat2mat(q))

    if TEST_INV:
        print('Testing pose inversion...')
        R = txe.euler2mat(ra(1), ra(1), ra(1))
        t = np.random.rand(3)
        T = np.eye(4)
        T[:3, :3] = R
        T[:3, -1] = t

        q = txq.mat2quat(R)
        p = torch.cat((torch.from_numpy(t), torch.from_numpy(q)))
        pinv = invert_pose_quaternion(torch.unsqueeze(p, 0))
        tinv, qinv = pinv[:, :3], pinv[:, 3:]
        Rinv = txq.quat2mat(qinv.numpy().squeeze())
        Tinv = np.eye(4)
        Tinv[:3, :3] = Rinv
        Tinv[:3, -1] = tinv.numpy().squeeze()
        print('T * T^(-1) = ')
        print(np.dot(T, Tinv))


def test_q_error():
    def ra(_): return np.random.uniform(0, 2*math.pi)
    # rotation along x axis
    a1 = ra(1)
    a2 = ra(1)
    q1 = txq.mat2quat(txe.euler2mat(a1, 0, 0))
    q2 = txq.mat2quat(txe.euler2mat(a2, 0, 0))
    a1 = np.rad2deg(a1)
    a2 = np.rad2deg(a2)
    print('Angles: {:f}, {:f}, difference = {:f}'.format(a1, a2, a1-a2))
    print('Error: {:f}'.format(quaternion_angular_error(q1, q2)))


def test_log_q_error():
    def ra(_): return np.random.uniform(0, 2*math.pi)
    # rotation along x axis
    a1 = ra(1)
    a2 = ra(1)
    q1 = txq.mat2quat(txe.euler2mat(0, a1, 0))
    q2 = txq.mat2quat(txe.euler2mat(0, a2, 0))
    # apply log map
    q1 = np.arccos(q1[0]) * q1[1:] / np.linalg.norm(q1[1:])
    q2 = np.arccos(q2[0]) * q2[1:] / np.linalg.norm(q2[1:])
    a1 = np.rad2deg(a1)
    a2 = np.rad2deg(a2)
    print('Angles: {:f}, {:f}, difference = {:f}'.format(a1, a2, a1-a2))
    print('Error: {:f}'.format(log_quaternion_angular_error(q1, q2)))


if __name__ == '__main__':
    test_pgo()
    # test_dumb_pgo()
    # test_align_camera_poses()
    # test_q_error()
    # test_log_q_error()


================================================
FILE: utils/pose_utils_np.py
================================================
"""
Copyright (C) 2018 NVIDIA Corporation.  All rights reserved.
Licensed under the CC BY-NC-SA 4.0 license (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode).
"""

import torch
from torch.nn import Module
from torch.autograd import Variable
from torch.nn.functional import pad
import numpy as np
import scipy.linalg as slin
import math
import transforms3d.quaternions as txq
import transforms3d.euler as txe
import quaternion
import bisect
# see for formulas:
# https://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-801-machine-vision-fall-2004/readings/quaternions.pdf
# and "Quaternion and Rotation" - Yan-Bin Jia, September 18, 2016
# from IPython.core.debugger import set_trace

# PYTORCH

def tq2RT(poses, square=False):
    """
    :param poses: N x 7, (t,q)
    :return: (N,3,4) 
    """
    N,_ = poses.shape
    T = poses[:,:3]
    q = quaternion.from_float_array(poses[:,3:])

    R = quaternion.as_rotation_matrix(q) #Nx3x3
    RT = np.concatenate([R,T[...,None]], axis=-1) #Nx3x4

    if square:
        padding = np.zeros([N,1,4])
        padding[:,:,-1] = 1 
        RT = np.concatenate([RT,padding], axis=1) #Nx4x4
    return RT 
def RT2tq(poses, square=False):
    """
    !!NOT TESETED!!
    :param poses: N x 3 x 4, (R|T)
    :return: (N, 7) 
    """
    N,_,_ = poses.shape
    R = poses[:,:,:3]
    T = poses[:,:,3:] # Nx3x1

    q = quaternion.as_float_array(quaternion.from_rotation_matrix(R)) #Nx4
    t= T.squeeze(-1) 

    tq = np.concatenate([t,q], axis=-1)

    return tq 

def pose_interp(poses, timestamps_in, timestamps_out, r_interp='slerp'):
    """
    :param poses: N x 7, (t,q)
    :param timestamps: (N,) 
    :param t: (K,) 
    :return: (K,) 

    """
  
    
    # assert t_interp in ['linear', 'spline']
    assert r_interp in ['slerp', 'squad']

    assert len(poses)>1
    assert len(poses) == len(timestamps_in)

    input_ts = poses[:,:3]
    input_rs= poses[:,3:] #quaternions
    timestamps_in = np.array(timestamps_in)
    #sort the inputs
    inds = np.argsort(timestamps_in)
    poses = poses[inds]
    timestamps_in = timestamps_in[inds]

    if r_interp == 'squad':
        input_rs_ = quaternion.from_float_array(input_rs) 
        output_rs = quaternion.squad( input_rs, timestamps_in, timestamps_out)
        output_rs = quaternion.as_float_array(output_rs)
    elif r_interp == 'slerp':
        output_rs = []
        for t in timestamps_out:
            input_rs_ = quaternion.from_float_array(input_rs) 
            idx = bisect.bisect_left(timestamps_in)
            output_r =  quaternion.slerp(input_rs_[idx],input_rs_[idx+1], timestamps_in[idx], timestamps_in[idx+1],t )
            output_r = quaternion.as_float_array(output_r)
            output_rs.append(output_r)

    output_ts = []
    for t in timestamps_out:
        idx = bisect_left.bisect_left(timestamps_in)
        if idx>=len(timestamps_in)-1:
            idx -= 1
        
        t1 = timestamps_in[idx]
        t2 = timestamps_in[idx+1]

        output_t = ((t-t1)*input_ts[idx+1] +  (t2-t) *input_ts[idx]) / (t2-t1)

        output_ts.append(output_t)

    output_ts =np.concatenate(output_ts, axis=0 )
    output_rs =np.concatenate(output_rs, axis=0 )

    new_pose = np.concatenate([output_ts, output_rs], axis=1) 
    return new_pose

def vdot(v1, v2):
    """
    Dot product along the dim=1
    :param v1: N x d
    :param v2: N x d
    :return: N x 1
    """
    # out = torch.mul(v1, v2)
    out = v1 * v2
    out = np.sum(out, axis=1)
    return out


def normalize(x, p=2, dim=0, eps=1e-6):
    """
    Divides a tensor along a certain dim by the Lp norm
    :param x:
    :param p: Lp norm
    :param dim: Dimension to normalize along
    :return:
    """
    # xn=x.norm(p = p, dim = dim)
    xn = np.linalg.norm(x, ord=p, axis=dim, keepdims=True)
    # x=x / xn.unsqueeze(dim = dim)
    x = x / (xn+eps)

    # x *= np.sign(x[0])  #added on 22/4/2020
    return x


def qmult(q1, q2):
    """
    Multiply 2 quaternions
    :param q1: Tensor N x 4
    :param q2: Tensor N x 4
    :return: quaternion product, Tensor N x 4
    """
    q1s, q1v = q1[:, :1], q1[:, 1:]
    q2s, q2v = q2[:, :1], q2[:, 1:]

    qs = q1s*q2s - vdot(q1v, q2v)
    # qv=q1v.mul(q2s.expand_as(q1v)) + q2v.mul(q1s.expand_as(q2v)) +
    #     torch.cross(q1v, q2v, dim = 1)
    qv = q1v*q2s + q2v*q1s + np.cross(q1v, q2v, axis=1)
    q = np.concatenate((qs, qv), axis=1)

    # normalize
    q = normalize(q, dim=1)

    return q


def qinv(q):
    """
    Inverts quaternions
    :param q: N x 4
    :return: q*: N x 4
    """
    # q_inv = torch.cat((q[:, :1], -q[:, 1:]), dim=1)
    q_inv = np.concatenate((q[:, :1], -q[:, 1:]), axis=1)
    return q_inv


def qexp_t(q):
    """
    Applies exponential map to log quaternion
    :param q: N x 3
    :return: N x 4
    """
    n = torch.norm(q, p=2, dim=1, keepdim=True)
    n = torch.clamp(n, min=1e-8)
    q = q * torch.sin(n)
    q = q / n
    q = torch.cat((torch.cos(n), q), dim=1)
    return q


def qlog_t(q):
    """
    Applies the log map to a quaternion
    :param q: N x 4
    :return: N x 3
    """
    n = torch.norm(q[:, 1:], p=2, dim=1, keepdim=True)
    n = torch.clamp(n, min=1e-8)
    q = q[:, 1:] * torch.acos(torch.clamp(q[:, :1], min=-1.0, max=1.0))
    q = q / n
    return q


def qexp_t_safe(q):
    """
    Applies exponential map to log quaternion (safe implementation that does not
    maintain gradient flow)
    :param q: N x 3
    :return: N x 4
    """
    q = torch.from_numpy(np.asarray([qexp(qq) for qq in q.numpy()],
                                    dtype=np.float32))
    return q


def qlog_t_safe(q):
    """
    Applies the log map to a quaternion (safe implementation that does not
    maintain gradient flow)
    :param q: N x 4
    :return: N x 3
    """
    q = torch.from_numpy(np.asarray([qlog(qq) for qq in q.numpy()],
                                    dtype=np.float32))
    return q


def rotate_vec_by_q(t, q):
    """
    rotates vector t by quaternion q
    :param t: vector, Tensor N x 3
    :param q: quaternion, Tensor N x 4
    :return: t rotated by q: t' = t + 2*qs*(qv x t) + 2*qv x (qv x r)
    """
    qs, qv = q[:, :1], q[:, 1:]
    # b = torch.cross(qv, t, dim=1)
    b = np.cross(qv, t, axis=1)
    # c = 2 * torch.cross(qv, b, dim=1)
    c = 2 * np.cross(qv, b, axis=1)
    # b = 2 * b.mul(qs.expand_as(b))
    b = 2 * b*qs
    tq = t + b + c
    return tq


def compose_pose_quaternion(p1, p2):
    """
    pyTorch implementation
    :param p1: input pose, Tensor N x 7
    :param p2: pose to apply, Tensor N x 7
    :return: output pose, Tensor N x 7
    all poses are translation + quaternion
    #!comments: first apply p2 and then p1 !!
    """

    p1t, p1q = p1[:, :3], p1[:, 3:]
    p2t, p2q = p2[:, :3], p2[:, 3:]

    q = qmult(p1q, p2q)
    t = p1t + rotate_vec_by_q(p2t, p1q)
    return np.concatenate((t, q), axis=1)


def invert_pose_quaternion(p):
    """
    inverts the pose
    :param p: pose, Tensor N x 7
    :return: inverted pose
    """
    t, q = p[:, :3], p[:, 3:]
    q_inv = qinv(q)
    tinv = -rotate_vec_by_q(t, q_inv)
    return np.concatenate((tinv, q_inv), axis=1)


def calc_vo(p0, p1):
    """
    calculates VO (in the p0 frame) from 2 poses
    :param p0: N x 7
    :param p1: N x 7
    """
    # assert p0.shape==p1.shape
    return compose_pose_quaternion(invert_pose_quaternion(p0), p1)


def calc_vo_logq(p0, p1):
    """
    VO (in the p0 frame) (logq)
    :param p0: N x 6
    :param p1: N x 6
    :return: N-1 x 6
    """
    q0 = qexp_t(p0[:, 3:])
    q1 = qexp_t(p1[:, 3:])
    vos = calc_vo(torch.cat((p0[:, :3], q0), dim=1), torch.cat((p1[:, :3], q1),
                                                               dim=1))
    vos_q = qlog_t(vos[:, 3:])
    return torch.cat((vos[:, :3], vos_q), dim=1)


def calc_vo_relative(p0, p1):
    """
    calculates VO (in the world frame) from 2 poses
    :param p0: N x 7
    :param p1: N x 7
    """
    vos_t = p1[:, :3] - p0[:, :3]
    vos_q = qmult(qinv(p0[:, 3:]), p1[:, 3:])
    return np.concatenate((vos_t, vos_q), axis=1)


def calc_vo_relative_logq(p0, p1):
    """
    Calculates VO (in the world frame) from 2 poses (log q)
    :param p0: N x 6
    :param p1: N x 6
    :return:
    """
    q0 = qexp_t(p0[:, 3:])
    q1 = qexp_t(p1[:, 3:])
    vos = calc_vo_relative(torch.cat((p0[:, :3], q0), dim=1),
                           torch.cat((p1[:, :3], q1), dim=1))
    vos_q = qlog_t(vos[:, 3:])
    return torch.cat((vos[:, :3], vos_q), dim=1)


def calc_vo_relative_logq_safe(p0, p1):
    """
    Calculates VO (in the world frame) from 2 poses (log q) through numpy fns
    :param p0: N x 6
    :param p1: N x 6
    :return:
    """
    vos_t = p1[:, :3] - p0[:, :3]
    q0 = qexp_t_safe(p0[:, 3:])
    q1 = qexp_t_safe(p1[:, 3:])
    vos_q = qmult(qinv(q0), q1)
    vos_q = qlog_t_safe(vos_q)
    return torch.cat((vos_t, vos_q), dim=1)


def calc_vo_logq_safe(p0, p1):
    """
    VO in the p0 frame using numpy fns
    :param p0:
    :param p1:
    :return:
    """
    vos_t = p1[:, :3] - p0[:, :3]
    q0 = qexp_t_safe(p0[:, 3:])
    q1 = qexp_t_safe(p1[:, 3:])
    vos_t = rotate_vec_by_q(vos_t, qinv(q0))
    vos_q = qmult(qinv(q0), q1)
    vos_q = qlog_t_safe(vos_q)
    return torch.cat((vos_t, vos_q), dim=1)


def calc_vos_simple(poses):
    """
    calculate the VOs, from a list of consecutive poses
    :param poses: N x T x 7
    :return: N x (T-1) x 7
    """
    vos = []
    for p in poses:
        pvos = [p[i+1].unsqueeze(0) - p[i].unsqueeze(0)
                for i in range(len(p)-1)]
        vos.append(torch.cat(pvos, dim=0))
    vos = torch.stack(vos, dim=0)

    return vos


def calc_vos(poses):
    """
    calculate the VOs, from a list of consecutive poses (in the p0 frame)
    :param poses: N x T x 7
    :return: N x (T-1) x 7
    """
    vos = []
    for p in poses:
        pvos = [calc_vo_logq(p[i].unsqueeze(0), p[i+1].unsqueeze(0))
                for i in range(len(p)-1)]
        vos.append(torch.cat(pvos, dim=0))
    vos = torch.stack(vos, dim=0)
    return vos


def calc_vos_relative(poses):
    """
    calculate the VOs, from a list of consecutive poses (in the world frame)
    :param poses: N x T x 7
    :return: N x (T-1) x 7
    """
    vos = []
    for p in poses:
        pvos = [calc_vo_relative_logq(p[i].unsqueeze(0), p[i+1].unsqueeze(0))
                for i in range(len(p)-1)]
        vos.append(torch.cat(pvos, dim=0))
    vos = torch.stack(vos, dim=0)
    return vos


def calc_vos_safe(poses):
    """
    calculate the VOs, from a list of consecutive poses
    :param poses: N x T x 7
    :return: N x (T-1) x 7
    """
    vos = []
    for p in poses:
        pvos = [calc_vo_logq_safe(p[i].unsqueeze(0), p[i+1].unsqueeze(0))
                for i in range(len(p)-1)]
        vos.append(torch.cat(pvos, dim=0))
    vos = torch.stack(vos, dim=0)
    return vos


def calc_vos_safe_fc(poses):
    """
    calculate the VOs, from a list of consecutive poses (fully connected)
    :param poses: N x T x 7
    :return: N x TC2 x 7
    """
    vos = []
    for p in poses:
        pvos = []
        for i in range(p.size(0)):
            for j in range(i+1, p.size(0)):
                pvos.append(calc_vo_logq_safe(
                    p[i].unsqueeze(0), p[j].unsqueeze(0)))
        vos.append(torch.cat(pvos, dim=0))
    vos = torch.stack(vos, dim=0)
    return vos

# NUMPY


def qlog(q):
    """
    Applies logarithm map to q
    :param q: (4,)
    :return: (3,)
    """
    if all(q[1:] == 0):
        q = np.zeros(3)
    else:
        q = np.arccos(q[0]) * q[1:] / np.linalg.norm(q[1:])
    return q


def qexp(q):
    """
    Applies the exponential map to q
    :param q: (3,)
    :return: (4,)
    """
    n = np.linalg.norm(q)
    q = np.hstack((np.cos(n), np.sinc(n/np.pi)*q))
    return q


def process_poses(poses_in, mean_t, std_t, align_R, align_t, align_s):
    """
    processes the 1x12 raw pose from dataset by aligning and then normalizing
    :param poses_in: N x 12
    :param mean_t: 3
    :param std_t: 3
    :param align_R: 3 x 3
    :param align_t: 3
    :param align_s: 1
    :return: processed poses (translation + quaternion) N x 7
    """
    poses_out = np.zeros((len(poses_in), 6))
    poses_out[:, 0:3] = poses_in[:, [3, 7, 11]]

    # align
    for i in range(len(poses_out)):
        R = poses_in[i].reshape((3, 4))[:3, :3]
        q = txq.mat2quat(np.dot(align_R, R))
        q *= np.sign(q[0])  # constrain to hemisphere
        q = qlog(q)
        poses_out[i, 3:] = q
        t = poses_out[i, :3] - align_t
        poses_out[i, :3] = align_s * \
            np.dot(align_R, t[:, np.newaxis]).squeeze()

    # normalize translation
    poses_out[:, :3] -= mean_t
    poses_out[:, :3] /= std_t
    return poses_out


def log_quaternion_angular_error(q1, q2):
    return quaternion_angular_error(qexp(q1), qexp(q2))


def quaternion_angular_error(q1, q2):
    """
    angular error between two quaternions
    :param q1: (4, )
    :param q2: (4, )
    :return:
    """
    d = abs(np.dot(q1, q2))
    d = min(1.0, max(-1.0, d))
    theta = 2 * np.arccos(d) * 180 / np.pi
    return theta


def skew(x):
    """
    returns skew symmetric matrix from vector
    :param x: 3 x 1
    :return:
    """
    s = np.asarray([[0, -x[2], x[1]], [x[2], 0, -x[0]], [-x[1], x[0], 0]])
    return s


def dpq_q(p):
    """
    returns the jacobian of quaternion product pq w.r.t. q
    :param p: 4 x 1
    :return: 4 x 4
    """
    J = np.zeros((4, 4))
    J[0, 0] = p[0]
    J[0, 1:] = -p[1:].squeeze()
    J[1:, 0] = p[1:].squeeze()
    J[1:, 1:] = p[0] * np.eye(3) + skew(p[1:])
    return J


def dpsq_q(p):
    """
    returns the jacobian of quaternion product (p*)q w.r.t. q
    :param p: 4 x 1
    :return: 4 x 4
    """
    J = np.zeros((4, 4))
    J[0, 0] = p[0]
    J[0, 1:] = -p[1:].squeeze()
    J[1:, 0] = -p[1:].squeeze()
    J[1:, 1:] = p[0] * np.eye(3) - skew(p[1:])
    return J


def dpsq_p(q):
    """
    returns the jacobian of quaternion product (p*)q w.r.t. p
    :param q: 4 x 1
    :return: 4 x 4
    """
    J = np.zeros((4, 4))
    J[0, 0] = q[0]
    J[0, 1:] = q[1:].squeeze()
    J[1:, 0] = q[1:].squeeze()
    J[1:, 1:] = -q[0] * np.eye(3) + skew(q[1:])
    return J


def dqstq_q(q, t):
    """
    jacobian of q* t q w.r.t. q
    :param q: 4 x 1
    :param t: 3 x 1
    :return: 3 x 4
    """
    J = np.zeros((3, 4))
    J[:, :1] = q[0]*t - np.cross(q[1:], t, axis=0)
    J[:, 1:] = -np.dot(t, q[1:].T) + np.dot(t.T, q[1:])*np.eye(3) + \
        np.dot(q[1:], t.T) + q[0]*skew(t)
    J *= 2
    return J


def dqstq_t(q):
    """
    jacobian of q* t q w.r.t. t
    :param q: 4 x 1
    :return: 3 x 3
    """
    J = (q[0]*q[0] - np.dot(q[1:].T, q[1:])) * np.eye(3) + 2*np.dot(q[1:], q[1:].T) -\
        2*q[0]*skew(q[1:])
    return J


def m_rot(x):
    """
    returns Jacobian of exponential map w.r.t. manifold increment
    :param x: part of state vector affected by increment, 4 x 1
    :return: 4 x 3
    """
    # jacobian of full q wrt qm (quaternion update on manifold),
    # evaluated at qv = (0, 0, 0)
    # full q is derived using either the exponential map or q0 = sqrt(1-qm^2)
    jm = np.vstack((np.zeros((1, 3)), np.eye(3)))  # 4 x 3
    m = np.dot(dpq_q(p=x), jm)
    return m


class PoseGraph:
    def __init__(self):
        """
        implements pose graph optimization from
        "Hybrid Hessians for Optimization of Pose Graphs" - Y. LeCun et al
        and "A Tutorial on Graph-Based SLAM" - W. Burgard et al
        """
        self.N = 0
        self.z = np.zeros((0, 0))

    def jacobian(self, L_ax, L_aq, L_rx, L_rq):
        # 6 because updates for rotation are on manifold
        J = np.zeros((0, 6*self.N))

        # unary constraints
        for i in range(self.N):
            # translation constraint
            jt = np.zeros((3, J.shape[1]))
            jt[:, 6*i: 6*i+3] = np.eye(3)
            J = np.vstack((J, np.dot(L_ax, jt)))

            # rotation constraint
            jr = np.zeros((4, J.shape[1]))
            jr[:, 6*i+3: 6*i+6] = m_rot(x=self.z[7*i+3: 7*i+7])
            J = np.vstack((J, np.dot(L_aq, jr)))

        # pairwise constraints
        for i in range(self.N-1):
                # translation constraint
            jt = np.zeros((3, J.shape[1]))
            dt = dqstq_t(q=self.z[7*i+3: 7*i+7])
            # dt = np.eye(3)
            jt[:, 6*i: 6*i+3] = -dt
            jt[:, 6*(i+1): 6*(i+1)+3] = dt
            # m = m_rot(x=self.z[7*i+3 : 7*i+7])
            # a = dqstq_q(q=self.z[7*i+3 : 7*i+7],
            #             t=self.z[7*(i+1) : 7*(i+1)+3]-self.z[7*i : 7*i+3])
            # jt[:, 6*i+3 : 6*i+6] = np.dot(a, m)
            J = np.vstack((J, np.dot(L_rx, jt)))

            # rotation constraint
            jr = np.zeros((4, J.shape[1]))
            m = m_rot(x=self.z[7*i+3: 7*i+7])
            a = dpsq_p(q=self.z[7*(i+1)+3: 7*(i+1)+7])
            jr[:, 6*i+3: 6*i+6] = np.dot(a, m)
            m = m_rot(x=self.z[7*(i+1)+3: 7*(i+1)+7])
            b = dpsq_q(p=self.z[7*i+3: 7*i+7])
            jr[:, 6*(i+1)+3: 6*(i+1)+6] = np.dot(b, m)
            J = np.vstack((J, np.dot(L_rq, jr)))

        return J

    def residuals(self, poses, vos, L_ax, L_aq, L_rx, L_rq):
        """
        computes the residuals
        :param poses: N x 7
        :param vos: (N-1) x 7
        :param L_ax: 3 x 3
        :param L_aq: 4 x 4
        :param L_rx: 3 x 3
        :param L_rq: 4 x 4
        :return:
        """
        r = np.zeros((0, 1))

        # unary residuals
        L = np.zeros((7, 7))
        L[:3, :3] = L_ax
        L[3:, 3:] = L_aq
        for i in range(self.N):
            rr = self.z[7*i: 7*(i+1)] - np.reshape(poses[i], (-1, 1))
            r = np.vstack((r, np.dot(L, rr)))

        # pairwise residuals
        for i in range(self.N-1):
            # translation residual
            v = self.z[7*(i+1):7*(i+1)+3, 0]-self.z[7*i:7*i+3, 0]
            q = txq.qinverse(self.z[7*i+3:7*i+7, 0])
            rt = txq.rotate_vector(v, q)
            rt = rt[:, np.newaxis] - vos[i, :3].reshape((-1, 1))
            # rt = self.z[7*(i+1) : 7*(i+1)+3] - self.z[7*i : 7*i+3] - \
            #     vos[i, :3].reshape((-1, 1))
            r = np.vstack((r, np.dot(L_rx, rt)))

            # rotation residual
            q0 = self.z[7*i+3: 7*i+7].squeeze()
            q1 = self.z[7*(i+1)+3: 7*(i+1)+7].squeeze()
            qvo = txq.qmult(txq.qinverse(q0), q1).reshape((-1, 1))
            rq = qvo - vos[i, 3:].reshape((-1, 1))
            r = np.vstack((r, np.dot(L_rq, rq)))

        return r

    def update_on_manifold(self, x):
        """
        Updates the state vector on manifold
        :param x: manifold increment, column vector
        :return:
        """
        for i in range(self.N):
            # update translation
            t = x[6*i: 6*i+3]
            self.z[7*i: 7*i+3] += t

            # update rotation
            qm = x[6*i+3: 6*i+6]  # quaternion on the manifold
            dq = np.zeros(4)
            # method in Burgard paper
            # dq[1:] = qm.squeeze()
            # dq[0] = math.sqrt(1 - sum(np.square(qm)))  # incremental quaternion
            # method of exponential map
            n = np.linalg.norm(qm)
            dq[0] = math.cos(n)
            dq[1:] = np.sinc(n/np.pi) * qm.squeeze()
            q = self.z[7*i+3: 7*i+7].squeeze()
            q = txq.qmult(q, dq).reshape((-1, 1))
            self.z[7*i+3: 7*i+7] = q

    def optimize(self, poses, vos, sax=1, saq=1, srx=1, srq=1, n_iters=10):
        """
        run PGO, with init = poses
        :param poses:
        :param vos:
        :param sax: sigma for absolute translation
        :param saq: sigma for absolute rotation
        :param srx: sigma for relative translation
        :param srq: sigma for relative rotation
        :param n_iters:
        :return:
        """
        self.N = len(poses)
        # init state vector with the predicted poses
        self.z = np.reshape(poses.copy(), (-1, 1))

        # construct the information matrices
        L_ax = np.linalg.cholesky(np.eye(3) / sax)
        L_aq = np.linalg.cholesky(np.eye(4) / saq)
        L_rx = np.linalg.cholesky(np.eye(3) / srx)
        L_rq = np.linalg.cholesky(np.eye(4) / srq)

        for n_iter in range(n_iters):
            J = self.jacobian(L_ax.T, L_aq.T, L_rx.T, L_rq.T)
            r = self.residuals(poses.copy(), vos.copy(), L_ax.T, L_aq.T, L_rx.T,
                               L_rq.T)
            H = np.dot(J.T, J)  # hessian
            b = np.dot(J.T, r)  # residuals

            # solve Hx = -b for x
            R = slin.cholesky(H)  # H = R' R
            y = slin.solve_triangular(R.T, -b)
            x = slin.solve_triangular(R, y)

            self.update_on_manifold(x)

        return self.z.reshape((-1, 7))


class PoseGraphFC:
    def __init__(self):
        """
        implements pose graph optimization from
        "Hybrid Hessians for Optimization of Pose Graphs" - Y. LeCun et al
        and "A Tutorial on Graph-Based SLAM" - W. Burgard et al
        fully connected version
        """
        self.N = 0
        self.z = np.zeros((0, 0))

    def jacobian(self, L_ax, L_aq, L_rx, L_rq):
        # 6 because updates for rotation are on manifold
        J = np.zeros((0, 6*self.N))

        # unary constraints
        for i in range(self.N):
            # translation constraint
            jt = np.zeros((3, J.shape[1]))
            jt[:, 6*i: 6*i+3] = np.eye(3)
            J = np.vstack((J, np.dot(L_ax, jt)))

            # rotation constraint
            jr = np.zeros((4, J.shape[1]))
            jr[:, 6*i+3: 6*i+6] = m_rot(x=self.z[7*i+3: 7*i+7])
            J = np.vstack((J, np.dot(L_aq, jr)))

        # pairwise constraints
        for i in range(self.N):
            for j in range(i+1, self.N):
                # translation constraint
                jt = np.zeros((3, J.shape[1]))
                dt = dqstq_t(q=self.z[7*i+3: 7*i+7])
                # dt = np.eye(3)
                jt[:, 6*i: 6*i+3] = -dt
                jt[:, 6*j: 6*j+3] = dt
                # m = m_rot(x=self.z[7*i+3 : 7*i+7])
                # a = dqstq_q(q=self.z[7*i+3 : 7*i+7],
                #             t=self.z[7*(i+1) : 7*(i+1)+3]-self.z[7*i : 7*i+3])
                # jt[:, 6*i+3 : 6*i+6] = np.dot(a, m)
                J = np.vstack((J, np.dot(L_rx, jt)))

                # rotation constraint
                jr = np.zeros((4, J.shape[1]))
                m = m_rot(x=self.z[7*i+3: 7*i+7])
                a = dpsq_p(q=self.z[7*j+3: 7*j+7])
                jr[:, 6*i+3: 6*i+6] = np.dot(a, m)
                m = m_rot(x=self.z[7*j+3: 7*j+7])
                b = dpsq_q(p=self.z[7*i+3: 7*i+7])
                jr[:, 6*j+3: 6*j+6] = np.dot(b, m)
                J = np.vstack((J, np.dot(L_rq, jr)))

        return J

    def residuals(self, poses, vos, L_ax, L_aq, L_rx, L_rq):
        """
        computes the residuals
        :param poses: N x 7
        :param vos: (N-1) x 7
        :param L_ax: 3 x 3
        :param L_aq: 4 x 4
        :param L_rx: 3 x 3
        :param L_rq: 4 x 4
        :return: 
        """
        r = np.zeros((0, 1))

        # unary residuals
        L = np.zeros((7, 7))
        L[:3, :3] = L_ax
        L[3:, 3:] = L_aq
        for i in range(self.N):
            rr = self.z[7*i: 7*(i+1)] - np.reshape(poses[i], (-1, 1))
            r = np.vstack((r, np.dot(L, rr)))

        # pairwise residuals
        k = 0
        for i in range(self.N):
            for j in range(i+1, self.N):
                # translation residual
                v = self.z[7*j:7*j+3, 0]-self.z[7*i:7*i+3, 0]
                q = txq.qinverse(self.z[7*i+3:7*i+7, 0])
                rt = txq.rotate_vector(v, q)
                rt = rt[:, np.newaxis] - vos[k, :3].reshape((-1, 1))
                # rt = self.z[7*(i+1) : 7*(i+1)+3] - self.z[7*i : 7*i+3] - \
                #     vos[i, :3].reshape((-1, 1))
                r = np.vstack((r, np.dot(L_rx, rt)))

                # rotation residual
                q0 = self.z[7*i+3: 7*i+7].squeeze()
                q1 = self.z[7*j+3: 7*j+7].squeeze()
                qvo = txq.qmult(txq.qinverse(q0), q1).reshape((-1, 1))
                rq = qvo - vos[k, 3:].reshape((-1, 1))
                r = np.vstack((r, np.dot(L_rq, rq)))
                k += 1

        return r

    def update_on_manifold(self, x):
        """
        Updates the state vector on manifold
        :param x: manifold increment, column vector
        :return: 
        """
        for i in range(self.N):
            # update translation
            t = x[6*i: 6*i+3]
            self.z[7*i: 7*i+3] += t

            # update rotation
            qm = x[6*i+3: 6*i+6]  # quaternion on the manifold
            dq = np.zeros(4)
            # method in Burgard paper
            # dq[1:] = qm.squeeze()
            # dq[0] = math.sqrt(1 - sum(np.square(qm)))  # incremental quaternion
            # method of exponential map
            n = np.linalg.norm(qm)
            dq[0] = math.cos(n)
            dq[1:] = np.sinc(n/np.pi) * qm.squeeze()
            q = self.z[7*i+3: 7*i+7].squeeze()
            q = txq.qmult(q, dq).reshape((-1, 1))
            self.z[7*i+3: 7*i+7] = q

    def optimize(self, poses, vos, sax=1, saq=1, srx=1, srq=1, n_iters=10):
        """
        run PGO, with init = poses
        :param poses:
        :param vos:
        :param sax: sigma for absolute translation
        :param saq: sigma for absolute rotation
        :param srx: sigma for relative translation
        :param srq: sigma for relative rotation
        :param n_iters:
        :return:
        """
        self.N = len(poses)
        # init state vector with the predicted poses
        self.z = np.reshape(poses.copy(), (-1, 1))

        # construct the information matrices
        L_ax = np.linalg.cholesky(np.eye(3) / sax)
        L_aq = np.linalg.cholesky(np.eye(4) / saq)
        L_rx = np.linalg.cholesky(np.eye(3) / srx)
        L_rq = np.linalg.cholesky(np.eye(4) / srq)

        for n_iter in range(n_iters):
            J = self.jacobian(L_ax.T, L_aq.T, L_rx.T, L_rq.T)
            r = self.residuals(poses.copy(), vos.copy(), L_ax.T, L_aq.T, L_rx.T,
                               L_rq.T)
            H = np.dot(J.T, J)  # hessian
            b = np.dot(J.T, r)  # residuals

            # solve Hx = -b for x
            R = slin.cholesky(H)  # H = R' R
            y = slin.solve_triangular(R.T, -b)
            x = slin.solve_triangular(R, y)

            self.update_on_manifold(x)

        return self.z.reshape((-1, 7))


def optimize_poses(pred_poses, vos=None, fc_vos=False, target_poses=None,
                   sax=1, saq=1, srx=1, srq=1):
    """
    optimizes poses using either the VOs or the target poses (calculates VOs
    from them)
    :param pred_poses: N x 7
    :param vos: (N-1) x 7
    :param fc_vos: whether to use relative transforms between all frames in a fully
    connected manner, not just consecutive frames
    :param target_poses: N x 7
    :param: sax: covariance of pose translation (1 number)
    :param: saq: covariance of pose rotation (1 number)
    :param: srx: covariance of VO translation (1 number)
    :param: srq: covariance of VO rotation (1 number)
    :return:
    """
    pgo = PoseGraphFC() if fc_vos else PoseGraph()
    if vos is None:
        if target_poses is not None:
            # calculate the VOs (in the pred_poses frame)
            vos = np.zeros((len(target_poses)-1, 7))
            for i in range(len(vos)):
                vos[i, :3] = target_poses[i+1, :3] - target_poses[i, :3]
                q0 = target_poses[i, 3:]
                q1 = target_poses[i+1, 3:]
                vos[i, 3:] = txq.qmult(txq.qinverse(q0), q1)
        else:
            print('Specify either VO or target poses')
            return None
    optim_poses = pgo.optimize(poses=pred_poses, vos=vos, sax=sax, saq=saq,
                               srx=srx, srq=srq)
    return optim_poses


def align_3d_pts(x1, x2):
    """Align two sets of 3d points using the method of Horn (closed-form).

    Find optimal s, R, t, such that

            s*R*(x1-t) = x2

    Input:
    x1 -- first trajectory (3xn)
    x2 -- second trajectory (3xn)

    Output:
    R -- rotation matrix (3x3)
    t -- translation vector (3x1)
    s -- scale (1x1)
    written by Jinwei Gu
    """
    x1c = x1.mean(1, keepdims=True)
    x2c = x2.mean(1, keepdims=True)

    x1_zerocentered = x1 - x1c
    x2_zerocentered = x2 - x2c

    W = np.zeros((3, 3))
    r1 = 0
    r2 = 0
    for i in range(x1.shape[1]):
        a = x1_zerocentered[:, i]
        b = x2_zerocentered[:, i]
        W += np.outer(b, a)
        r1 += np.dot(a.T, a)
        r2 += np.dot(b.T, b)

    s = np.asscalar(np.sqrt(r2/r1))

    U, d, Vh = np.linalg.svd(W)
    S = np.eye(3)
    if np.linalg.det(np.dot(U, Vh)) < 0:
        S[2, 2] = -1
    R = np.dot(U, np.dot(S, Vh))
    t = x1c - (1/s) * np.dot(R.transpose(), x2c)

    # ---- align ----
    # x2a = s * np.dot(R, x1-t)
    # error = x2a - x2

    return R, t, s


def align_2d_pts(x1, x2):
    """Align two sets of 3d points using the method of Horn (closed-form).

    Find optimal s, R, t, such that

            s*R*(x1-t) = x2

    Input:
    x1 -- first trajectory (2xn)
    x2 -- second trajectory (2xn)

    Output:
    R -- rotation matrix (2x2)
    t -- translation vector (2x1)
    s -- scale (1x1)
    written by Jinwei Gu
    """
    x1c = x1.mean(1, keepdims=True)
    x2c = x2.mean(1, keepdims=True)

    x1_zerocentered = x1 - x1c
    x2_zerocentered = x2 - x2c

    W = np.zeros((2, 2))
    r1 = 0
    r2 = 0
    for i in range(x1.shape[1]):
        a = x1_zerocentered[:, i]
        b = x2_zerocentered[:, i]
        W += np.outer(b, a)
        r1 += np.dot(a.T, a)
        r2 += np.dot(b.T, b)

    s = np.asscalar(np.sqrt(r2/r1))

    U, d, Vh = np.linalg.svd(W)
    S = np.eye(2)
    if np.linalg.det(np.dot(U, Vh)) < 0:
        S[1, 1] = -1
    R = np.dot(U, np.dot(S, Vh))
    t = x1c - (1/s) * np.dot(R.transpose(), x2c)

    # ---- align ----
    # x2a = s * np.dot(R, x1-t)
    # error = x2a - x2

    return R, t, s


def align_3d_pts_noscale(x1, x2):
    """Align two sets of 3d points using the method of Horn (closed-form).

    Find optimal s, R, t, such that

            s*R*(x1-t) = x2

    Input:
    x1 -- first trajectory (3xn)
    x2 -- second trajectory (3xn)

    Output:
    R -- rotation matrix (3x3)
    t -- translation vector (3x1)
    written by Jinwei Gu
    """
    x1c = x1.mean(1, keepdims=True)
    x2c = x2.mean(1, keepdims=True)

    x1_zerocentered = x1 - x1c
    x2_zerocentered = x2 - x2c

    W = np.zeros((3, 3))
    r1 = 0
    r2 = 0
    for i in range(x1.shape[1]):
        a = x1_zerocentered[:, i]
        b = x2_zerocentered[:, i]
        W += np.outer(b, a)
        r1 += np.dot(a.T, a)
        r2 += np.dot(b.T, b)

    # s = np.asscalar(np.sqrt(r2/r1))
    s = 1

    U, d, Vh = np.linalg.svd(W)
    S = np.eye(3)
    if np.linalg.det(np.dot(U, Vh)) < 0:
        S[2, 2] = -1
    R = np.dot(U, np.dot(S, Vh))
    t = x1c - np.dot(R.transpose(), x2c)

    # ---- align ----
    # x2a = s * np.dot(R, x1-t)
    # error = x2a - x2

    return R, t, s


def align_2d_pts_noscale(x1, x2):
    """Align two sets of 3d points using the method of Horn (closed-form).

    Find optimal s, R, t, such that

            s*R*(x1-t) = x2

    Input:
    x1 -- first trajectory (2xn)
    x2 -- second trajectory (2xn)

    Output:
    R -- rotation matrix (2x2)
    t -- translation vector (2x1)
    s -- scale (1x1)
    written by Jinwei Gu
    """
    x1c = x1.mean(1, keepdims=True)
    x2c = x2.mean(1, keepdims=True)

    x1_zerocentered = x1 - x1c
    x2_zerocentered = x2 - x2c

    W = np.zeros((2, 2))
    r1 = 0
    r2 = 0
    for i in range(x1.shape[1]):
        a = x1_zerocentered[:, i]
        b = x2_zerocentered[:, i]
        W += np.outer(b, a)
        r1 += np.dot(a.T, a)
        r2 += np.dot(b.T, b)

    # s = np.asscalar(np.sqrt(r2/r1))
    s = 1

    U, d, Vh = np.linalg.svd(W)
    S = np.eye(2)
    if np.linalg.det(np.dot(U, Vh)) < 0:
        S[1, 1] = -1
    R = np.dot(U, np.dot(S, Vh))
    t = x1c - (1/s) * np.dot(R.transpose(), x2c)

    # ---- align ----
    # x2a = s * np.dot(R, x1-t)
    # error = x2a - x2

    return R, t, s


def align_camera_poses(o1, o2, R1, R2, use_rotation_constraint=True):
    """Align two sets of camera poses (R1,o1/R2,o2) using the method of Horn (closed-form).

    Find optimal s, R, t, such that

            s*R*(o1-t) = o2   (1)

            R*R1 = R2         (2)

    where R1/R2 are the camera-to-world matrices, o1/o2 are the center
    of the cameras.

    Input:
    o1 -- camera centers (3xn)
    o2 -- camera centers (3xn)
    R1 -- camera poses (camera-to-world matrices) (nx3x3)
    R2 -- camera poses (camera-to-world matrices) (nx3x3)
    use_rotation_constraint -- if False, uses only Eq(1) to solve.

    Output:
    R -- rotation matrix (3x3)
    t -- translation vector (3x1)
    s -- scale (1x1)

    Note, when use_rotation_constraint=False, it is the same problem as
    above, i.e., to align two sets of 3D points.

    When use_rotation_constraint=True, we note Eq(2) is the same
    equation as Eq(1), after we zero-center and remove the scale. So, we
    can use the same approach (SVD).
    written by Jinwei Gu
    """
    if not use_rotation_constraint:
        return align_3d_pts(o1, o2)

    o1c = o1.mean(1, keepdims=True)
    o2c = o2.mean(1, keepdims=True)
    o1_zerocentered = o1 - o1c
    o2_zerocentered = o2 - o2c

    W = np.zeros((3, 3))
    r1 = 0
    r2 = 0
    for i in range(o1.shape[1]):
        a = o1_zerocentered[:, i]
        b = o2_zerocentered[:, i]
        W += np.outer(b, a)
        r1 += np.dot(a.T, a)
        r2 += np.dot(b.T, b)

    s = np.asscalar(np.sqrt(r2/r1))

    # add rotation constraints
    for i in range(o1.shape[1]):
        d1 = np.squeeze(R1[i, :, :])
        d2 = np.squeeze(R2[i, :, :])
        for c in range(3):
            a = d1[:, c]
            b = d2[:, c]
            W += np.outer(b, a)

    U, d, Vh = np.linalg.svd(W)
    S = np.eye(3)
    if np.linalg.det(np.dot(U, Vh)) < 0:
        S[2, 2] = -1
    R = np.dot(U, np.dot(S, Vh))
    t = o1c - (1/s) * np.dot(R.transpose(), o2c)

    # ---- align ----
    # o2a = s * np.dot(R, o1-t)
    # R2a = np.dot(R, R1)

    return R, t, s


def test_align_3d_pts():
    import transforms3d.euler as txe
    N = 10
    x1 = np.random.rand(3, N)

    noise = np.random.rand(3, N)*0.01

    s = np.random.rand()
    t = np.random.rand(3, 1)
    R = txe.euler2mat(np.random.rand(), np.random.rand(), np.random.rand())
    R = R[:3, :3]

    x2 = s*np.dot(R, x1-t) + noise

    Re, te, se = align_3d_pts(x1, x2)

    print('scale ', s, se)
    print('rotation matrx ', R, Re)
    print('translation ', t, te)


def test_align_camera_poses():
    import transforms3d.euler as txe

    N = 10
    o1 = np.random.rand(3, N)

    noise = np.random.rand(3, N)*0.01

    s = np.random.rand()
    t = np.random.rand(3, 1)
    R = txe.euler2mat(np.random.rand(), np.random.rand(), np.random.rand())
    R = R[:3, :3]

    o2 = s*np.dot(R, o1-t) + noise

    R1 = np.zeros((N, 3, 3))
    R2 = np.zeros((N, 3, 3))
    for i in range(N):
        Ri = txe.euler2mat(
            np.random.rand(), np.random.rand(), np.random.rand())
        R1[i, :, :] = Ri[:3, :3]
        R2[i, :, :] = np.dot(R, Ri[:3, :3])

    Re1, te1, se1 = align_camera_poses(o1, o2, R1, R2, False)
    Re2, te2, se2 = align_camera_poses(o1, o2, R1, R2, True)

    print('scale ', s, se1, se2)
    print('rotation matrx ', R, Re1, Re2)
    print('translation ', t, te1, te2)


def pgo_test_poses():
    """
    generates test poses and vos for the various PGO implementations
    :return:
    """
    poses = np.zeros((3, 7))
    for i in range(poses.shape[0]):
        poses[i, :3] = i
        angle = math.radians(10*i)
        R = txe.euler2mat(angle, angle, angle)
        q = txq.mat2quat(R)
        poses[i, 3:] = q

    vos = np.zeros((poses.shape[0]-1, 7))
    for i in range(vos.shape[0]):
        vos[i, 0] = 1.5
        vos[i, 1] = 0.5
        vos[i, 2] = 1.0
        R = txe.euler2mat(math.radians(15), math.radians(10), math.radians(5))
        q = txq.mat2quat(R)
        vos[i, 3:] = q

    return poses, vos


def pgo_test_poses1():
    poses = np.zeros((3, 7))
    R = txe.euler2mat(0, 0, np.deg2rad(45))
    q = txq.mat2quat(R)
    poses[:, 3:] = q
    for i in range(len(poses)):
        poses[i, :3] = np.asarray([i, i, 0])

    pt = np.zeros((len(poses), 6))
    pt[:, :3] = poses[:, :3]
    for i, p in enumerate(poses):
        pt[i, 3:] = qlog(p[3:])
    pt = torch.from_numpy(pt.astype(np.float32))
    vost = calc_vos_safe_fc(pt.unsqueeze(0))[0].numpy()
    vos = np.zeros((len(vost), 7))
    vos[:, :3] = vost[:, :3]
    for i, p in enumerate(vost):
        vos[i, 3:] = qexp(p[3:])

    # perturbation
    vos[0, 0] = np.sqrt(2) - 0.5
    vos[1, 0] = np.sqrt(2) - 0.5

    return poses, vos


def print_poses(poses):
    print('translations')
    print(poses[:, :3])
    print('euler')
    for i in range(poses.shape[0]):
        a = txe.mat2euler(txq.quat2mat(poses[i, 3:]))
        print([np.rad2deg(aa) for aa in a])


def test_pgo():
    """
    Tests the full pose graph optimization implementation
    :return: bool
    """
    pred_poses, vos = pgo_test_poses1()
    print('pred poses')
    print_poses(pred_poses)
    print('vos')
    print_poses(vos)

    pgo = PoseGraph()
    optimized_poses = pgo.optimize(pred_poses, vos)

    print('optimized')
    print_poses(optimized_poses)


def test_pose_utils():
    """
    Tests the pose utils
    :return: 
    """
    TEST_COMPOSE = True
    TEST_INV = True

    def ra(_): return np.random.uniform(0, 2*math.pi)

    if TEST_COMPOSE:
        print('Testing pose composing...')
        R1 = txe.euler2mat(ra(1), ra(1), ra(1))
        t1 = np.random.rand(3)
        R2 = txe.euler2mat(ra(1), ra(1), ra(1))
        t2 = np.random.rand(3)

        # homogeneous matrix method
        R = np.dot(R1, R2)
        t = t1 + np.dot(R1, t2)
        print('From homogeneous matrices, t = ')
        print(t)
        print('R = ')
        print(R)

        # quaternion method
        q1 = txq.mat2quat(R1)
        q2 = txq.mat2quat(R2)

        p1 = torch.cat((torch.from_numpy(t1), torch.from_numpy(q1)))
        p2 = torch.cat((torch.from_numpy(t2), torch.from_numpy(q2)))
        p = compose_pose_quaternion(
            torch.unsqueeze(p1, 0), torch.unsqueeze(p2, 0))
        t = p[:, :3].numpy().squeeze()
        q = p[:, 3:].numpy().squeeze()
        print('From quaternions, t = ')
        print(t)
        print('R = ')
        print(txe.quat2mat(q))

    if TEST_INV:
        print('Testing pose inversion...')
        R = txe.euler2mat(ra(1), ra(1), ra(1))
        t = np.random.rand(3)
        T = np.eye(4)
        T[:3, :3] = R
        T[:3, -1] = t

        q = txq.mat2quat(R)
        p = torch.cat((torch.from_numpy(t), torch.from_numpy(q)))
        pinv = invert_pose_quaternion(torch.unsqueeze(p, 0))
        tinv, qinv = pinv[:, :3], pinv[:, 3:]
        Rinv = txq.quat2mat(qinv.numpy().squeeze())
        Tinv = np.eye(4)
        Tinv[:3, :3] = Rinv
        Tinv[:3, -1] = tinv.numpy().squeeze()
        print('T * T^(-1) = ')
        print(np.dot(T, Tinv))


def test_q_error():
    def ra(_): return np.random.uniform(0, 2*math.pi)
    # rotation along x axis
    a1 = ra(1)
    a2 = ra(1)
    q1 = txq.mat2quat(txe.euler2mat(a1, 0, 0))
    q2 = txq.mat2quat(txe.euler2mat(a2, 0, 0))
    a1 = np.rad2deg(a1)
    a2 = np.rad2deg(a2)
    print('Angles: {:f}, {:f}, difference = {:f}'.format(a1, a2, a1-a2))
    print('Error: {:f}'.format(quaternion_angular_error(q1, q2)))


def test_log_q_error():
    def ra(_): return np.random.uniform(0, 2*math.pi)
    # rotation along x axis
    a1 = ra(1)
    a2 = ra(1)
    q1 = txq.mat2quat(txe.euler2mat(0, a1, 0))
    q2 = txq.mat2quat(txe.euler2mat(0, a2, 0))
    # apply log map
    q1 = np.arccos(q1[0]) * q1[1:] / np.linalg.norm(q1[1:])
    q2 = np.arccos(q2[0]) * q2[1:] / np.linalg.norm(q2[1:])
    a1 = np.rad2deg(a1)
    a2 = np.rad2deg(a2)
    print('Angles: {:f}, {:f}, difference = {:f}'.format(a1, a2, a1-a2))
    print('Error: {:f}'.format(log_quaternion_angular_error(q1, q2)))


if __name__ == '__main__':
    test_pgo()
    # test_dumb_pgo()
    # test_align_camera_poses()
    # test_q_error()
    # test_log_q_error()


================================================
FILE: utils/progress_bar.py
================================================
import contextlib
import enum
import math
import time

import numpy as np


def progress_str(val, *args, width=20, with_ptg=True):
    val = max(0., min(val, 1.))
    assert width > 1
    pos = round(width * val) - 1
    if with_ptg is True:
        log = '[{}%]'.format(max_point_str(val * 100.0, 4))
    log += '['
    for i in range(width):
        if i < pos:
            log += '='
        elif i == pos:
            log += '>'
        else:
            log += '.'
    log += ']'
    for arg in args:
        log += '[{}]'.format(arg)
    return log


def second_to_time_str(second, omit_hours_if_possible=True):
    second = int(second)
    m, s = divmod(second, 60)
    h, m = divmod(m, 60)
    if omit_hours_if_possible:
        if h == 0:
            return '{:02d}:{:02d}'.format(m, s)
    return '{:02d}:{:02d}:{:02d}'.format(h, m, s)


def progress_bar_iter(task_list, width=20, with_ptg=True, step_time_average=50, name=None):
    total_step = len(task_list)
    step_times = []
    start_time = 0.0
    name = '' if name is None else f"[{name}]"
    for i, task in enumerate(task_list):
        t = time.time()
        yield task
        step_times.append(time.time() - t)
        start_time += step_times[-1]
        start_time_str = second_to_time_str(start_time)
        average_step_time = np.mean(step_times[-step_time_average:]) + 1e-6
        speed_str = "{:.2f}it/s".format(1 / average_step_time)
        remain_time = (total_step - i) * average_step_time
        remain_time_str = second_to_time_str(remain_time)
        time_str = start_time_str + '>' + remain_time_str
        prog_str = progress_str(
            (i + 1) / total_step,
            speed_str,
            time_str,
            width=width,
            with_ptg=with_ptg)
        print(name + prog_str + '   ', end='\r', flush=True)
    print("")


list_bar = progress_bar_iter

def enumerate_bar(task_list, width=20, with_ptg=True, step_time_average=50, name=None):
    total_step = len(task_list)
    step_times = []
    start_time = 0.0
    name = '' if name is None else f"[{name}]"
    for i, task in enumerate(task_list):
        t = time.time()
        yield i, task
        step_times.append(time.time() - t)
        start_time += step_times[-1]
        start_time_str = second_to_time_str(start_time)
        average_step_time = np.mean(step_times[-step_time_average:]) + 1e-6
        speed_str = "{:.2f}it/s".format(1 / average_step_time)
        remain_time = (total_step - i) * average_step_time
        remain_time_str = second_to_time_str(remain_time)
        time_str = start_time_str + '>' + remain_time_str
        prog_str = progress_str(
            (i + 1) / total_step,
            speed_str,
            time_str,
            width=width,
            with_ptg=with_ptg)
        print(name + prog_str + '   ', end='\r', flush=True)
    print("")


def max_point_str(val, max_point):
    positive = bool(val >= 0.0)
    val = np.abs(val)
    if val == 0:
        point = 1
    else:
        point = max(int(np.log10(val)), 0) + 1
    fmt = "{:." + str(max(max_point - point, 0)) + "f}"
    if positive is True:
        return fmt.format(val)
    else:
        return fmt.format(-val)


class Unit(enum.Enum):
    Iter = 'iter'
    Byte = 'byte'


def convert_size(size_bytes):
    # from https://stackoverflow.com/questions/5194057/better-way-to-convert-file-sizes-in-python
    if size_bytes == 0:
        return "0B"
    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    i = int(math.floor(math.log(size_bytes, 1024)))
    p = math.pow(1024, i)
    s = round(size_bytes / p, 2)
    return s, size_name[i]


class ProgressBar:
    def __init__(self,
                 width=20,
                 with_ptg=True,
                 step_time_average=50,
                 speed_unit=Unit.Iter):
        self._width = width
        self._with_ptg = with_ptg
        self._step_time_average = step_time_average
        self._step_times = []
        self._start_time = 0.0
        self._total_size = None
        self._speed_unit = speed_unit

    def start(self, total_size):
        self._start = True
        self._step_times = []
        self._finished_sizes = []
        self._time_elapsed = 0.0
        self._current_time = time.time()
        self._total_size = total_size
        self._progress = 0

    def print_bar(self, finished_size=1, pre_string=None, post_string=None):
        self._step_times.append(time.time() - self._current_time)
        self._finished_sizes.append(finished_size)
        self._time_elapsed += self._step_times[-1]
        start_time_str = second_to_time_str(self._time_elapsed)
        time_per_size = np.array(self._step_times[-self._step_time_average:])
        time_per_size /= np.array(
            self._finished_sizes[-self._step_time_average:])
        average_step_time = np.mean(time_per_size) + 1e-6
        if self._speed_unit == Unit.Iter:
            speed_str = "{:.2f}it/s".format(1 / average_step_time)
        elif self._speed_unit == Unit.Byte:
            size, size_unit = convert_size(1 / average_step_time)
            speed_str = "{:.2f}{}/s".format(size, size_unit)
        else:
            raise ValueError("unknown speed unit")
        remain_time = (self._total_size - self._progress) * average_step_time
        remain_time_str = second_to_time_str(remain_time)
        time_str = start_time_str + '>' + remain_time_str
        prog_str = progress_str(
            (self._progress + 1) / self._total_size,
            speed_str,
            time_str,
            width=self._width,
            with_ptg=self._with_ptg)
        self._progress += finished_size
        if pre_string is not None:
            prog_str = pre_string + prog_str
        if post_string is not None:
            prog_str += post_string
        if self._progress >= self._total_size:
            print(prog_str + '   ',flush=True)
        else:
            print(prog_str + '   ', end='\r', flush=True)
        self._current_time = time.time()


================================================
FILE: utils/rand_utils.py
================================================
import numpy as np  

def truncated_normal(u, sigma, min, max, shape=None):
    """ Generate data following truncated normal distribution

    Args:
        u ([type]): mean
        sigma ([type]): var=sigma^2
        min ([type]): lower bound of the truncating range
        max ([type]): higher bound of the truncating range
        shape ([type], optional): [description]. Defaults to None.
    """

    val = min-1 
    while val<min or val>max: #iterative sampling until the first qualified data emerge
        if shape is not None:
            val = sigma*np.random.randn(shape)+u
        else:
            val = sigma*np.random.randn()+u

    assert val != min-1 

    return val
        

================================================
FILE: utils/singleton.py
================================================
# import h5py

class Singleton(type):
    _instances = {}
    def __call__(cls, *args, **kwargs):
        if cls not in cls._instances:
            cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs)
        return cls._instances[cls]


================================================
FILE: utils/timer.py
================================================
import time 
from contextlib import contextmanager


@contextmanager
def simple_timer(name=''):
    t = time.time()
    yield 
    print(f"{name} exec time: {time.time() - t}")


def singleton(class_):
    instances = {}
    def getinstance(*args, **kwargs):
        if class_ not in instances:
            instances[class_] = class_(*args, **kwargs)
        return instances[class_]
    return getinstance

@singleton
class timming(object):
    def __init__(self):
        super().__init__()
        self.items={}

    def start(self, item_name):
        if item_name not in self.items:
            self.items[item_name]={
                "end_cnt": 0,
                "avg": 0,
                "start_t": time.time(),
                "is_finished": False
            }
        else:
            assert self.items[item_name]["is_finished"]==True
            self.items[item_name].update(
                {   
                    # "start_cnt":  self.items[item_name]["start_cnt"]+1,
                    "end_cnt": self.items[item_name]["end_cnt"], #unchanged
                    "avg": self.items[item_name]["avg"], #unchanged 
                    "start_t": time.time(),
                    "is_finished": False
                }
            )

    def end(self, item_name):
        assert item_name in self.items
        t=time.time()
        interval=t-self.items[item_name]['start_t']
        self.items[item_name].update(
            {   
                "end_cnt":  self.items[item_name]["end_cnt"]+1,
                "avg": (self.items[item_name]["avg"]*self.items[item_name]["end_cnt"]+interval)/(self.items[item_name]["end_cnt"]+1) ,
                "is_finished": True
            }
        )
    def summarize(self):
        for k in self.items:
            print(f"Average time of {k} = {self.items[k]['avg']*1000} ms", f"(averaged on {self.items[k]['end_cnt']} testing cycles.") 


================================================
FILE: utils/util.py
================================================
import torch
import numpy as np
import collections


def freeze_params(params: dict, include: str = None, exclude: str = None):
    assert isinstance(params, dict)
    include_re = None
    if include is not None:
        include_re = re.compile(include)
    exclude_re = None
    if exclude is not None:
        exclude_re = re.compile(exclude)
    remain_params = []
    for k, p in params.items():
        if include_re is not None:
            if include_re.match(k) is not None:
                continue
        if exclude_re is not None:
            if exclude_re.match(k) is None:
                continue
        remain_params.append(p)
    return remain_params


def freeze_params_v2(params: dict, include: str = None, exclude: str = None):
    assert isinstance(params, dict)
    include_re = None
    if include is not None:
        include_re = re.compile(include)
    exclude_re = None
    if exclude is not None:
        exclude_re = re.compile(exclude)
    for k, p in params.items():
        if include_re is not None:
            if include_re.match(k) is not None:
                p.requires_grad = False
        if exclude_re is not None:
            if exclude_re.match(k) is None:
                p.requires_grad = False


def filter_param_dict(state_dict: dict, include: str = None, exclude: str = None):
    assert isinstance(state_dict, dict)
    include_re = None
    if include is not None:
        include_re = re.compile(include)
    exclude_re = None
    if exclude is not None:
        exclude_re = re.compile(exclude)
    res_dict = {}
    for k, p in state_dict.items():
        if include_re is not None:
            if include_re.match(k) is None:
                continue
        if exclude_re is not None:
            if exclude_re.match(k) is not None:
                continue
        res_dict[k] = p
    return res_dict

def modify_parameter_name_with_map(state_dict, parameteter_name_map=None):
    if parameteter_name_map is None:
        return state_dict
    for old,new in parameteter_name_map:
        for key in list(state_dict.keys()) :
            if old in key:
                new_key=key.replace(old, new)
                state_dict[new_key] = state_dict.pop(key)
    return state_dict

def load_pretrained_model_map_func(state_dict,parameteter_name_map = None, include:str=None, exclude:str=None):
    state_dict = filter_param_dict(state_dict, include, exclude)
    state_dict = modify_parameter_name_with_map(state_dict, parameteter_name_map)
    

def list_recursive_op(input_list, op):
    assert isinstance(input_list, list)

    for i, v in enumerate(input_list):
        if isinstance(v, list):
            input_list[i] = list_recursive_op(v, op)
        elif isinstance(v, dict):
            input_list[i] = dict_recursive_op(v, op)
        else:
            input_list[i] = op(v)

    return input_list


def dict_recursive_op(input_dict, op):
    assert isinstance(input_dict, dict)

    for k, v in input_dict.items():
        if isinstance(v, dict):
            input_dict[k] = dict_recursive_op(v, op)
        elif isinstance(v, (list,tuple) ):
            input_dict[k] = list_recursive_op(v, op)
        else:
            input_dict[k] = op(v)

    return input_dict


================================================
FILE: utils/visualize.py
================================================
import numpy as np 
import cv2 
import copy

def vis_pointclouds_cv2(pc, K, win_size, init_transform=None, color=None, img=None):
    '''
    pc: input point cloud of shape Nx3
    K: camera intrinsic of shape 3x3
    win_size: visualization window size (Wx,Wy)
    '''
    x = (K@pc.T).T  #Nx3
    
    x = x/x[:,-1:]
    x = x.astype(np.int32)
    x[:,0] = np.where((x[:,0]<0) | (x[:,0]>=win_size[1]), np.zeros_like(x[:,0]), x[:,0])
    x[:,1] = np.where((x[:,1]<0) | (x[:,1]>=win_size[0]), np.zeros_like(x[:,1]), x[:,1])

    if img is None:
        img=np.zeros(list(win_size)+[3], dtype=np.uint8)
    if color is None:
        # color = [255, 255, 0]
        color = [255, 255, 0]

    img[x[:, 1], x[:, 0]] = color
    
    img[x[pc[:,-1]<0][:,1], x[pc[:,-1]<0][:,0] ] = [255,0,0]

    return img
   
def vis_2d_keypoints_cv2(img, keypoints, color=None):
    '''
    img: input point cloud of shape HxWx3
    keypoints: Nx2 , (x,y)
    '''

    keypoints = np.around(keypoints).astype(np.int32)
    img=copy.copy(img)
    if color is None:
        color = [255, 255, 0]
    img[keypoints[:,1], keypoints[:,0]] = color

    return img
   

def get_model_corners(model):
    min_x, max_x = np.min(model[:, 0]), np.max(model[:, 0])
    min_y, max_y = np.min(model[:, 1]), np.max(model[:, 1])
    min_z, max_z = np.min(model[:, 2]), np.max(model[:, 2])
    corners_3d = np.array([
        [min_x, min_y, min_z],
        [min_x, min_y, max_z],
        [min_x, max_y, min_z],
        [min_x, max_y, max_z],
        [max_x, min_y, min_z],
        [max_x, min_y, max_z],
        [max_x, max_y, min_z],
        [max_x, max_y, max_z],
    ])
    return corners_3d

def vis_pose_box(RT,K, model, background=None,fig=None, ax=None, title='', label='', x_label='x', y_label='y', color='g', dot='-'):
    
    if fig is None:
        if background is not None:
            # dpi = float(matplotlib.rcParams['figure.dpi'])
            dpi=100
            # print(float(matplotlib.rcParams['figure.dpi']))
            fig = plt.figure(figsize=[s/dpi for s in background.shape[:2]], dpi=dpi )
        else:
            fig = plt.figure()

    if ax is None:
        ax=fig.gca()
    # ax.set_axis_off()
    corner_3d=get_model_corners(model)
    corner_2d = project(corner_3d, K, RT)

    if background is not None:
        ax.imshow(background)
    ax.add_patch(patches.Polygon(
        xy=corner_2d[[0, 1, 3, 2, 0, 4, 6, 2]], fill=False, linewidth=1, edgecolor=color))
    ax.add_patch(patches.Polygon(
        xy=corner_2d[[5, 4, 6, 7, 5, 1, 3, 7]], fill=False, linewidth=1, edgecolor=color))  

    # line,=ax.plot(x, y, dot, color=color, linewidth=1)    
    # line.set_label(label)
    # ax.set_title(title)
    # ax.set_xlabel(x_label)
    # ax.set_ylabel(y_label)
    # ax.axis('equal')
    # if label !='':
    #     ax.legend()
    return fig, ax

================================================
FILE: weights/gru_update.pth
================================================
[File too large to display: 11.9 MB]