Repository: leonardodalinky/fpsample
Branch: main
Commit: 4124a21dc664
Files: 20
Total size: 173.5 KB

Directory structure:
gitextract_4aj034v5/

├── .github/
│   └── workflows/
│       ├── release.yml
│       └── test.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CMakeLists.txt
├── LICENSE
├── README.md
├── bench/
│   └── test_bench.py
├── pyproject.toml
└── src/
    ├── _ext/
    │   ├── Interval.h
    │   ├── KDLineTree.h
    │   ├── KDNode.h
    │   ├── KDTree.h
    │   ├── KDTreeBase.h
    │   ├── Point.h
    │   └── utils.h
    ├── fpsample/
    │   └── __init__.py
    ├── lib.cpp
    ├── nanoflann.hpp
    └── wrapper.hpp

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/workflows/release.yml
================================================
name: release-upload

on:
  push:
    tags:
      - 'v*.*.*'

permissions:
  id-token: write
  contents: read

jobs:
  sdist:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install build dependencies
        run: pip install build
      - name: Build sdist
        run: python -m build --sdist
      - name: Publish package distributions to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          repository-url: https://upload.pypi.org/legacy/
          user: __token__
          password: ${{ secrets.PYPI_TOKEN }}


================================================
FILE: .github/workflows/test.yml
================================================
name: test-upload

on:
  push:
    tags:
      - 'test*'

permissions:
  id-token: write
  contents: read

jobs:
  sdist:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install build dependencies
        run: pip install build
      - name: Build sdist
        run: python -m build --sdist
      - name: Publish package distributions to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          repository-url: https://test.pypi.org/legacy/
          user: __token__
          password: ${{ secrets.TEST_PYPI_TOKEN }}


================================================
FILE: .gitignore
================================================
/target

# Byte-compiled / optimized / DLL files
__pycache__/
.pytest_cache/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
.Python
.venv/
env/
bin/
build/
develop-eggs/
dist/
eggs/
lib/
lib64/
parts/
sdist/
var/
include/
man/
venv/
*.egg-info/
.installed.cfg
*.egg

# Installer logs
pip-log.txt
pip-delete-this-directory.txt
pip-selfcheck.json

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.cache
nosetests.xml
coverage.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject

# Rope
.ropeproject

# Django stuff:
*.log
*.pot

.DS_Store

# Sphinx documentation
docs/_build/

# PyCharm
.idea/

# VSCode
.vscode/

# Pyenv
.python-version

# tmp files
tmp*
*.out

# tmp benchmark
.benchmarks/

# uv
uv.lock


================================================
FILE: .pre-commit-config.yaml
================================================
exclude: "point-e/.*"
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
  rev: v4.4.0
  hooks:
  - id: trailing-whitespace
  - id: end-of-file-fixer
  - id: name-tests-test
  - id: requirements-txt-fixer
- repo: https://github.com/pycqa/isort
  rev: 5.12.0
  hooks:
    - id: isort
      args: ["--profile", "black", "--line-length=100", "--python-version=39"]
- repo: https://github.com/psf/black
  rev: 23.7.0
  hooks:
    - id: black
      args: ["--line-length=100", "--target-version=py39"]


================================================
FILE: CMakeLists.txt
================================================
# Require CMake 3.15+ (matching scikit-build-core) Use new versions of all
# policies up to CMake 4.0
cmake_minimum_required(VERSION 3.15...4.0)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

project(
  ${SKBUILD_PROJECT_NAME}
  VERSION ${SKBUILD_PROJECT_VERSION}
  LANGUAGES CXX)

find_package(pybind11 CONFIG REQUIRED)

pybind11_add_module(_fpsample MODULE src/lib.cpp)
target_link_libraries(_fpsample PRIVATE pybind11::headers)

target_compile_definitions(_fpsample PRIVATE VERSION_INFO=${PROJECT_VERSION})
install(TARGETS _fpsample DESTINATION fpsample)


================================================
FILE: LICENSE
================================================
MIT License

Copyright (c) 2023 AyajiLin

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: README.md
================================================
# fpsample
[![pypi package version badge](https://img.shields.io/pypi/v/fpsample)](https://pypi.org/project/fpsample/)
![python version badge](https://img.shields.io/badge/python-%3E%3D3.7-blue)
[![license badge](https://img.shields.io/github/license/leonardodalinky/fpsample)](https://github.com/leonardodalinky/fpsample/blob/main/LICENSE)
[![star badge](https://img.shields.io/github/stars/leonardodalinky/fpsample?style=social)](https://github.com/leonardodalinky/fpsample)

Python efficient farthest point sampling (FPS) library, 100x faster than `numpy` implementation.

`fpsample` is coupled with `numpy` and built upon Rust pyo3 bindings. This library aims at achieving the best performance for FPS in single-threaded CPU environment.

🎉 **PyTorch version with native multithreading, batch ops, Autograd and CUDA supports is in [pytorch_fpsample](https://github.com/leonardodalinky/pytorch_fpsample).**

## Installation

### Install from PyPI

`numpy>=1.16.0` is required. Install `fpsample` using pip:

```shell
pip install -U fpsample
```

If you encounter any installation errors, please make an issue and try to compile from source.

### Build from source

Since Version 1.0.0, fpsample is built using [Scikit-build-core + pybind11](https://github.com/pybind/scikit_build_example). Therefore, you can build and install this library from source just with pip.

C++ compiler must support C++17.

CMake>=3.15 is required.

```shell
git clone https://github.com/leonardodalinky/fpsample.git
cd fpsample
pip install .
```

#### Direct porting of `QuickFPS`

See `src/warpper.hpp` and `src/_ext/` for details.

## Usage

```python
import fpsample
import numpy as np

# Generate random point cloud
pc = np.random.rand(4096, 3)
## sample 1024 points

# Vanilla FPS
fps_samples_idx = fpsample.fps_sampling(pc, 1024)

# FPS + NPDU
fps_npdu_samples_idx = fpsample.fps_npdu_sampling(pc, 1024)
## or specify the windows size
fps_npdu_samples_idx = fpsample.fps_npdu_sampling(pc, 1024, w=64)

# FPS + NPDU + KDTree
fps_npdu_kdtree_samples_idx = fpsample.fps_npdu_kdtree_sampling(pc, 1024)
## or specify the windows size
fps_npdu_kdtree_samples_idx = fpsample.fps_npdu_kdtree_sampling(pc, 1024, w=64)

# KDTree-based FPS
kdtree_fps_samples_idx = fpsample.bucket_fps_kdtree_sampling(pc, 1024)

# NOTE: Probably the best
# Bucket-based FPS or QuickFPS
kdline_fps_samples_idx = fpsample.bucket_fps_kdline_sampling(pc, 1024, h=3)
```

* `FPS`: Vanilla farthest point sampling. Implemented in Rust. Achieve the same performance as `numpy`.
* `FPS + NPDU`: Farthest point sampling with nearest-point-distance-updating (NPDU) heuristic strategy. 5x~10x faster than vanilla FPS. **Require dimensional locality and give sub-optimal answers**.
* `FPS + NPDU + KDTree`: Farthest point sampling with NPDU heuristic strategy and KDTree. 3x~8x faster than vanilla FPS. Slightly slower than `FPS + NPDU`. But **DOES NOT** require dimensional locality.
* `KDTree-based FPS`: A farthest point sampling algorithm based on KDTree. About 40~50x faster than vanilla FPS.
* `Bucket-based FPS` or `QuickFPS`: A bucket-based farthest point sampling algorithm. About 80~100x faster than vanilla FPS. Require an additional hyperparameter for the height of the KDTree. In practice, `h=3` or `h=5` is recommended for small data, `h=7` is recommended for medium data, and `h=9` for extremely large data.

> **NOTE**: 🔥 In most cases, `Bucket-based FPS` is the best choice, with proper hyperparameter setting.

### Determinism

For deterministic results, fix the first sampled point index by passing the `start_idx` parameter.
```python
kdline_fps_samples_idx = fpsample.bucket_fps_kdline_sampling(pc, 1024, h=3, start_idx=0)
```

**OR** set the random seed before calling the function.
```python
np.random.seed(42)
```

## Development

Install dependencies:
```shell
uv sync
```

## Performance
Setup:
  - CPU: Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz
  - RAM: 512 GiB
  - SYSTEM: Ubuntu 20.04.6 LTS

Run benchmark:
```shell
py.test bench/ --benchmark-columns=mean,stddev --benchmark-sort=mean
```

Results:
```
------------------------------------------------------------------------------- benchmark '1024 of 4096': 8 tests -------------------------------------------------------------------------------
Name (time in ms)                    Min                Max               Mean            StdDev             Median               IQR            Outliers       OPS            Rounds  Iterations
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
test_bucket_fps_kdline_4k_h5      1.7892 (1.0)       2.2010 (1.01)      1.8301 (1.0)      0.0273 (1.20)      1.8255 (1.0)      0.0182 (1.0)         53;31  546.4295 (1.0)         491           1
test_bucket_fps_kdline_4k_h3      2.0452 (1.14)      2.1714 (1.0)       2.0862 (1.14)     0.0227 (1.0)       2.0817 (1.14)     0.0220 (1.21)        99;34  479.3375 (0.88)        436           1
test_bucket_fps_kdline_4k_h7      3.0348 (1.70)      3.2599 (1.50)      3.1082 (1.70)     0.0391 (1.72)      3.1005 (1.70)     0.0568 (3.12)        107;4  321.7322 (0.59)        306           1
test_fps_npdu_4k                  3.9024 (2.18)      4.3511 (2.00)      3.9514 (2.16)     0.0395 (1.74)      3.9457 (2.16)     0.0283 (1.55)        29;14  253.0768 (0.46)        245           1
test_bucket_fps_kdtree_4k         6.5697 (3.67)      7.3863 (3.40)      6.7525 (3.69)     0.1150 (5.07)      6.7240 (3.68)     0.0926 (5.08)        26;11  148.0923 (0.27)        126           1
test_fps_npdu_kdtree_4k           9.4114 (5.26)      9.7609 (4.50)      9.4979 (5.19)     0.0613 (2.70)      9.4807 (5.19)     0.0362 (1.99)        14;13  105.2866 (0.19)         97           1
test_vanilla_fps_4k              12.3702 (6.91)     12.8378 (5.91)     12.5875 (6.88)     0.1840 (8.11)     12.5024 (6.85)     0.3712 (20.38)        40;0   79.4438 (0.15)         77           1
test_vanilla_fps_4k_multiple     12.4058 (6.93)     13.2153 (6.09)     12.4706 (6.81)     0.1639 (7.22)     12.4139 (6.80)     0.0268 (1.47)         7;12   80.1886 (0.15)         80           1
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

--------------------------------------------------------------------------------- benchmark '4096 of 50000': 7 tests --------------------------------------------------------------------------------
Name (time in ms)                      Min                 Max                Mean            StdDev              Median               IQR            Outliers      OPS            Rounds  Iterations
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
test_bucket_fps_kdline_50k_h7      24.7285 (1.0)       25.2791 (1.0)       25.0179 (1.0)      0.1237 (1.0)       25.0139 (1.0)      0.1184 (1.0)          10;3  39.9714 (1.0)          40           1
test_bucket_fps_kdline_50k_h5      31.7191 (1.28)      32.9791 (1.30)      32.1800 (1.29)     0.3308 (2.68)      32.0584 (1.28)     0.3701 (3.13)          6;3  31.0752 (0.78)         31           1
test_bucket_fps_kdline_50k_h3      66.7806 (2.70)      67.5433 (2.67)      67.1429 (2.68)     0.2265 (1.83)      67.0898 (2.68)     0.2654 (2.24)          5;0  14.8936 (0.37)         15           1
test_bucket_fps_kdtree_50k         82.7767 (3.35)      85.9608 (3.40)      84.5684 (3.38)     0.9575 (7.74)      84.5108 (3.38)     1.2572 (10.62)         3;0  11.8248 (0.30)         11           1
test_fps_npdu_50k                 179.8323 (7.27)     181.9054 (7.20)     180.5911 (7.22)     0.7326 (5.92)     180.4329 (7.21)     0.7742 (6.54)          2;0   5.5374 (0.14)          6           1
test_fps_npdu_kdtree_50k          253.4978 (10.25)    255.2537 (10.10)    254.2810 (10.16)    0.7408 (5.99)     253.9220 (10.15)    1.1832 (9.99)          2;0   3.9327 (0.10)          5           1
test_vanilla_fps_50k              592.6405 (23.97)    594.1419 (23.50)    593.2049 (23.71)    0.5970 (4.83)     593.2359 (23.72)    0.7815 (6.60)          1;0   1.6858 (0.04)          5           1
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

---------------------------------------------------------------------------------------- benchmark '50000 of 100000': 7 tests ---------------------------------------------------------------------------------------
Name (time in ms)                          Min                    Max                   Mean              StdDev                 Median                 IQR            Outliers     OPS            Rounds  Iterations
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
test_bucket_fps_kdline_100k_h7        232.5230 (1.0)         234.3285 (1.0)         233.3726 (1.0)        0.7632 (1.49)        233.5817 (1.0)        1.2734 (1.66)          2;0  4.2850 (1.0)           5           1
test_bucket_fps_kdtree_100k           381.3098 (1.64)        388.0433 (1.66)        384.0891 (1.65)       3.5171 (6.84)        382.9141 (1.64)       5.0501 (6.56)          1;0  2.6036 (0.61)          3           1
test_bucket_fps_kdline_100k_h9        409.0352 (1.76)        420.7832 (1.80)        414.3774 (1.78)       5.9458 (11.57)       413.3137 (1.77)       8.8110 (11.45)         1;0  2.4133 (0.56)          3           1
test_bucket_fps_kdline_100k_h5        410.1506 (1.76)        411.1764 (1.75)        410.6451 (1.76)       0.5139 (1.0)         410.6085 (1.76)       0.7693 (1.0)           1;0  2.4352 (0.57)          3           1
test_fps_npdu_kdtree_100k           3,134.7921 (13.48)     3,137.0544 (13.39)     3,135.7064 (13.44)      1.1919 (2.32)      3,135.2726 (13.42)      1.6967 (2.21)          1;0  0.3189 (0.07)          3           1
test_fps_npdu_100k                  4,325.4402 (18.60)     4,341.7247 (18.53)     4,333.3156 (18.57)      8.1554 (15.87)     4,332.7821 (18.55)     12.2134 (15.87)         1;0  0.2308 (0.05)          3           1
test_vanilla_fps_100k              14,460.2331 (62.19)    14,664.4396 (62.58)    14,561.1574 (62.39)    102.1237 (198.73)   14,558.7995 (62.33)    153.1549 (199.07)        1;0  0.0687 (0.02)          3           1
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

```

## Reference
The nearest-point-distance-updating (NPDU) heuristic strategy is proposed in the following paper:
```bibtex
@INPROCEEDINGS{Li2022adjust,
  author={Li, Jingtao and Zhou, Jian and Xiong, Yan and Chen, Xing and Chakrabarti, Chaitali},
  booktitle={2022 IEEE Workshop on Signal Processing Systems (SiPS)},
  title={An Adjustable Farthest Point Sampling Method for Approximately-sorted Point Cloud Data},
  year={2022},
  volume={},
  number={},
  pages={1-6},
  doi={10.1109/SiPS55645.2022.9919246}
}
```

Bucket-based farthest point sampling (QuickFPS) is proposed in the following paper. The implementation is based on the author's [Repo](https://github.com/hanm2019/bucket-based_farthest-point-sampling_CPU). To port the implementation to other C++ program, check [this](https://github.com/leonardodalinky/fpsample/tree/main/src/bucket_fps/_ext) for details.
```bibtex
@article{han2023quickfps,
  title={QuickFPS: Architecture and Algorithm Co-Design for Farthest Point Sampling in Large-Scale Point Clouds},
  author={Han, Meng and Wang, Liang and Xiao, Limin and Zhang, Hao and Zhang, Chenhao and Xu, Xiangrong and Zhu, Jianfeng},
  journal={IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems},
  year={2023},
  publisher={IEEE}
}
```

The KDTree-based NN search algorithm implemented by [nanoflann](https://github.com/jlblancoc/nanoflann)

Thanks to the authors for their great work.


================================================
FILE: bench/test_bench.py
================================================
import numpy as np
import pytest

import fpsample

TEST_SEED = 42
TEST_BENCHMARK_SETTINGS = {
    "4k": {"group": "1024 of 4096", "warmup": False},
    "50k": {"group": "4096 of 50000", "warmup": False},
    "100k": {"group": "50000 of 100000", "warmup": False, "min_rounds": 3},
}
TEST_CASE_SETTINGS = {
    "4k": (4096, 1024, 3),
    "50k": (50000, 4096, 3),
    "100k": (100_000, 50_000, 3),
}


def create_sample_data(n_points: int, n_dim: int = 3, seed: int = TEST_SEED):
    np.random.seed(seed)
    return np.random.rand(n_points, n_dim)


####################
#                  #
#    4k setting    #
#                  #
####################
@pytest.mark.benchmark(**TEST_BENCHMARK_SETTINGS["4k"])
def test_vanilla_fps_4k(benchmark):
    n_points, n_samples, n_dim = TEST_CASE_SETTINGS["4k"]
    pc = create_sample_data(n_points, n_dim)
    benchmark(fpsample.fps_sampling, pc, n_samples)

@pytest.mark.benchmark(**TEST_BENCHMARK_SETTINGS["4k"])
def test_vanilla_fps_4k_multiple(benchmark):
    n_points, n_samples, n_dim = TEST_CASE_SETTINGS["4k"]
    pc = create_sample_data(n_points, n_dim)
    benchmark(fpsample.fps_sampling, pc, n_samples, start_idx=[0, 23, 64, 128])


@pytest.mark.benchmark(**TEST_BENCHMARK_SETTINGS["4k"])
def test_fps_npdu_4k(benchmark):
    n_points, n_samples, n_dim = TEST_CASE_SETTINGS["4k"]
    pc = create_sample_data(n_points, n_dim)
    benchmark(fpsample.fps_npdu_sampling, pc, n_samples)


@pytest.mark.benchmark(**TEST_BENCHMARK_SETTINGS["4k"])
def test_fps_npdu_kdtree_4k(benchmark):
    n_points, n_samples, n_dim = TEST_CASE_SETTINGS["4k"]
    pc = create_sample_data(n_points, n_dim)
    benchmark(fpsample.fps_npdu_kdtree_sampling, pc, n_samples)


@pytest.mark.benchmark(**TEST_BENCHMARK_SETTINGS["4k"])
def test_bucket_fps_kdtree_4k(benchmark):
    n_points, n_samples, n_dim = TEST_CASE_SETTINGS["4k"]
    pc = create_sample_data(n_points, n_dim)
    benchmark(fpsample.bucket_fps_kdtree_sampling, pc, n_samples)


@pytest.mark.benchmark(**TEST_BENCHMARK_SETTINGS["4k"])
def test_bucket_fps_kdline_4k_h3(benchmark):
    n_points, n_samples, n_dim = TEST_CASE_SETTINGS["4k"]
    pc = create_sample_data(n_points, n_dim)
    benchmark(fpsample.bucket_fps_kdline_sampling, pc, n_samples, 3)


@pytest.mark.benchmark(**TEST_BENCHMARK_SETTINGS["4k"])
def test_bucket_fps_kdline_4k_h5(benchmark):
    n_points, n_samples, n_dim = TEST_CASE_SETTINGS["4k"]
    pc = create_sample_data(n_points, n_dim)
    benchmark(fpsample.bucket_fps_kdline_sampling, pc, n_samples, 5)


@pytest.mark.benchmark(**TEST_BENCHMARK_SETTINGS["4k"])
def test_bucket_fps_kdline_4k_h7(benchmark):
    n_points, n_samples, n_dim = TEST_CASE_SETTINGS["4k"]
    pc = create_sample_data(n_points, n_dim)
    benchmark(fpsample.bucket_fps_kdline_sampling, pc, n_samples, 7)


#####################
#                   #
#    50k setting    #
#                   #
#####################
@pytest.mark.benchmark(**TEST_BENCHMARK_SETTINGS["50k"])
def test_vanilla_fps_50k(benchmark):
    n_points, n_samples, n_dim = TEST_CASE_SETTINGS["50k"]
    pc = create_sample_data(n_points, n_dim)
    benchmark(fpsample.fps_sampling, pc, n_samples)


@pytest.mark.benchmark(**TEST_BENCHMARK_SETTINGS["50k"])
def test_fps_npdu_50k(benchmark):
    n_points, n_samples, n_dim = TEST_CASE_SETTINGS["50k"]
    pc = create_sample_data(n_points, n_dim)
    benchmark(fpsample.fps_npdu_sampling, pc, n_samples)


@pytest.mark.benchmark(**TEST_BENCHMARK_SETTINGS["50k"])
def test_fps_npdu_kdtree_50k(benchmark):
    n_points, n_samples, n_dim = TEST_CASE_SETTINGS["50k"]
    pc = create_sample_data(n_points, n_dim)
    benchmark(fpsample.fps_npdu_kdtree_sampling, pc, n_samples)


@pytest.mark.benchmark(**TEST_BENCHMARK_SETTINGS["50k"])
def test_bucket_fps_kdtree_50k(benchmark):
    n_points, n_samples, n_dim = TEST_CASE_SETTINGS["50k"]
    pc = create_sample_data(n_points, n_dim)
    benchmark(fpsample.bucket_fps_kdtree_sampling, pc, n_samples)


@pytest.mark.benchmark(**TEST_BENCHMARK_SETTINGS["50k"])
def test_bucket_fps_kdline_50k_h3(benchmark):
    n_points, n_samples, n_dim = TEST_CASE_SETTINGS["50k"]
    pc = create_sample_data(n_points, n_dim)
    benchmark(fpsample.bucket_fps_kdline_sampling, pc, n_samples, 3)


@pytest.mark.benchmark(**TEST_BENCHMARK_SETTINGS["50k"])
def test_bucket_fps_kdline_50k_h5(benchmark):
    n_points, n_samples, n_dim = TEST_CASE_SETTINGS["50k"]
    pc = create_sample_data(n_points, n_dim)
    benchmark(fpsample.bucket_fps_kdline_sampling, pc, n_samples, 5)


@pytest.mark.benchmark(**TEST_BENCHMARK_SETTINGS["50k"])
def test_bucket_fps_kdline_50k_h7(benchmark):
    n_points, n_samples, n_dim = TEST_CASE_SETTINGS["50k"]
    pc = create_sample_data(n_points, n_dim)
    benchmark(fpsample.bucket_fps_kdline_sampling, pc, n_samples, 7)


######################
#                    #
#    100k setting    #
#                    #
######################
@pytest.mark.benchmark(**TEST_BENCHMARK_SETTINGS["100k"])
def test_vanilla_fps_100k(benchmark):
    n_points, n_samples, n_dim = TEST_CASE_SETTINGS["100k"]
    pc = create_sample_data(n_points, n_dim)
    benchmark(fpsample.fps_sampling, pc, n_samples)


@pytest.mark.benchmark(**TEST_BENCHMARK_SETTINGS["100k"])
def test_fps_npdu_100k(benchmark):
    n_points, n_samples, n_dim = TEST_CASE_SETTINGS["100k"]
    pc = create_sample_data(n_points, n_dim)
    benchmark(fpsample.fps_npdu_sampling, pc, n_samples)


@pytest.mark.benchmark(**TEST_BENCHMARK_SETTINGS["100k"])
def test_fps_npdu_kdtree_100k(benchmark):
    n_points, n_samples, n_dim = TEST_CASE_SETTINGS["100k"]
    pc = create_sample_data(n_points, n_dim)
    benchmark(fpsample.fps_npdu_kdtree_sampling, pc, n_samples)


@pytest.mark.benchmark(**TEST_BENCHMARK_SETTINGS["100k"])
def test_bucket_fps_kdtree_100k(benchmark):
    n_points, n_samples, n_dim = TEST_CASE_SETTINGS["100k"]
    pc = create_sample_data(n_points, n_dim)
    benchmark(fpsample.bucket_fps_kdtree_sampling, pc, n_samples)


@pytest.mark.benchmark(**TEST_BENCHMARK_SETTINGS["100k"])
def test_bucket_fps_kdline_100k_h5(benchmark):
    n_points, n_samples, n_dim = TEST_CASE_SETTINGS["100k"]
    pc = create_sample_data(n_points, n_dim)
    benchmark(fpsample.bucket_fps_kdline_sampling, pc, n_samples, 5)


@pytest.mark.benchmark(**TEST_BENCHMARK_SETTINGS["100k"])
def test_bucket_fps_kdline_100k_h7(benchmark):
    n_points, n_samples, n_dim = TEST_CASE_SETTINGS["100k"]
    pc = create_sample_data(n_points, n_dim)
    benchmark(fpsample.bucket_fps_kdline_sampling, pc, n_samples, 7)


@pytest.mark.benchmark(**TEST_BENCHMARK_SETTINGS["100k"])
def test_bucket_fps_kdline_100k_h9(benchmark):
    n_points, n_samples, n_dim = TEST_CASE_SETTINGS["100k"]
    pc = create_sample_data(n_points, n_dim)
    benchmark(fpsample.bucket_fps_kdline_sampling, pc, n_samples, 9)


================================================
FILE: pyproject.toml
================================================
[build-system]
requires = ["scikit-build-core>=0.11", "pybind11>=2.6"]
build-backend = "scikit_build_core.build"

[project]
name = "fpsample"
version = "1.0.2"
authors = [
    { name = "Leonard Lin", email = "leonard.keilin@gmail.com" },
    { name = "guch8017", email = "guch8017@gmail.com" }
]
description = "An efficient CPU implementation of farthest point sampling (FPS) for point clouds."
readme = "README.md"
license = { file = "LICENSE" }
requires-python = ">=3.8"
classifiers = [
    "Programming Language :: Python :: Implementation :: CPython",
    "Programming Language :: Python :: Implementation :: PyPy",
    "Programming Language :: Python :: 3 :: Only",
]
dependencies = [
    "numpy>=1.16.0",
    "pybind11>=2.6",
]

[project.urls]
Repository = "https://github.com/leonardodalinky/fpsample"
Tracker = "https://github.com/leonardodalinky/fpsample/issues"

[dependency-groups]
test = ["pytest"]
dev = [
    { include-group = "test" },
    "jupyterlab>=4.3.8",
    "pytest-benchmark>=4.0.0",
]


[tool.scikit-build]
minimum-version = "build-system.requires"


================================================
FILE: src/_ext/Interval.h
================================================
//
// Created by 韩萌 on 2022/6/14.
// Refactored by AyajiLin on 2023/9/16.
//

#ifndef KD_TREE_BASED_FARTHEST_POINT_SAMPLING_INTERVAL_HPP
#define KD_TREE_BASED_FARTHEST_POINT_SAMPLING_INTERVAL_HPP

namespace quickfps {
template <typename S> class Interval {
  public:
    S low, high;
    Interval() : low(0), high(0){};
    Interval(S low, S high) : low(low), high(high){};
    Interval(const Interval &o) : low(o.low), high(o.high){};
};
} // namespace quickfps

#endif // KD_TREE_BASED_FARTHEST_POINT_SAMPLING_INTERVAL_HPP


================================================
FILE: src/_ext/KDLineTree.h
================================================
//
// Created by hanm on 22-6-15.
// Refactored by AyajiLin on 2023/9/16.
//

#ifndef FPS_CPU_KDLINETREE_H
#define FPS_CPU_KDLINETREE_H

#include <limits>
#include <vector>

#include "KDTreeBase.h"

namespace quickfps {

template <typename T, size_t DIM, typename S = T>
class KDLineTree : public KDTreeBase<T, DIM, S> {
  public:
    using typename KDTreeBase<T, DIM, S>::_Point;
    using typename KDTreeBase<T, DIM, S>::_Points;
    using typename KDTreeBase<T, DIM, S>::NodePtr;

    KDLineTree(_Points data, size_t pointSize, size_t treeHigh,
               _Points samplePoints);
    ~KDLineTree();

    std::vector<NodePtr> KDNode_list;

    size_t high_;

    _Point max_point() override;

    void update_distance(const _Point &ref_point) override;

    void sample(size_t sample_num) override;

    bool leftNode(size_t high, size_t count) const override {
        return high == this->high_ || count == 1;
    };

    void addNode(NodePtr p) override;
};

template <typename T, size_t DIM, typename S>
KDLineTree<T, DIM, S>::KDLineTree(_Points data, size_t pointSize,
                                  size_t treeHigh, _Points samplePoints)
    : KDTreeBase<T, DIM, S>(data, pointSize, samplePoints), high_(treeHigh) {
    KDNode_list.clear();
}

template <typename T, size_t DIM, typename S>
KDLineTree<T, DIM, S>::~KDLineTree() {
    KDNode_list.clear();
}

template <typename T, size_t DIM, typename S>
typename KDLineTree<T, DIM, S>::_Point KDLineTree<T, DIM, S>::max_point() {
    _Point tmpPoint;
    S max_distance = std::numeric_limits<S>::lowest();
    for (const auto &bucket : KDNode_list) {
        if (bucket->max_point.dis > max_distance) {
            max_distance = bucket->max_point.dis;
            tmpPoint = bucket->max_point;
        }
    }
    return tmpPoint;
}

template <typename T, size_t DIM, typename S>
void KDLineTree<T, DIM, S>::update_distance(const _Point &ref_point) {
    for (const auto &bucket : KDNode_list) {
        bucket->send_delay_point(ref_point);
        bucket->update_distance();
    }
}

template <typename T, size_t DIM, typename S>
void KDLineTree<T, DIM, S>::sample(size_t sample_num) {
    _Point ref_point;
    for (size_t i = 1; i < sample_num; i++) {
        ref_point = this->max_point();
        this->sample_points[i] = ref_point;
        this->update_distance(ref_point);
    }
}

template <typename T, size_t DIM, typename S>
void KDLineTree<T, DIM, S>::addNode(NodePtr p) {
    size_t nodeIdx = KDNode_list.size();
    p->idx = nodeIdx;
    KDNode_list.push_back(p);
}

} // namespace quickfps

#endif // FPS_CPU_KDLINETREE_H


================================================
FILE: src/_ext/KDNode.h
================================================
//
// Created by 韩萌 on 2022/6/14.
// Refactored by AyajiLin on 2023/9/16.
//

#ifndef KD_TREE_BASED_FARTHEST_POINT_SAMPLING_KDNODE_H
#define KD_TREE_BASED_FARTHEST_POINT_SAMPLING_KDNODE_H
#include <array>
#include <limits>
#include <queue>
#include <vector>

#include "Interval.h"
#include "Point.h"

namespace quickfps {

template <typename T, size_t DIM, typename S> class KDNode {
  public:
    using _Point = Point<T, DIM, S>;
    using _Points = _Point *;
    _Points points;
    size_t pointLeft, pointRight;
    size_t idx;

    std::array<Interval<T>, DIM> bboxs;
    std::vector<_Point> waitpoints;
    std::vector<_Point> delaypoints;
    _Point max_point;
    KDNode *left;
    KDNode *right;

    KDNode();

    KDNode(const KDNode &a);

    KDNode(const std::array<Interval<T>, DIM> &bboxs);

    void init(const _Point &ref);

    void updateMaxPoint(const _Point &lpoint, const _Point &rpoint) {
        if (lpoint.dis > rpoint.dis)
            this->max_point = lpoint;
        else
            this->max_point = rpoint;
    }

    S bound_distance(const _Point &ref_point) const;

    void send_delay_point(const _Point &point) {
        this->waitpoints.push_back(point);
    }

    void update_distance();

    void reset();

    size_t size() const;
};

template <typename T, size_t DIM, typename S>
KDNode<T, DIM, S>::KDNode()
    : points(nullptr), pointLeft(0), pointRight(0), left(nullptr),
      right(nullptr) {}

template <typename T, size_t DIM, typename S>
KDNode<T, DIM, S>::KDNode(const std::array<Interval<T>, DIM> &other_bboxs)
    : points(nullptr), pointLeft(0), pointRight(0), left(nullptr),
      right(nullptr) {
    std::copy(other_bboxs.cbegin(), other_bboxs.cend(), this->bboxs.begin());
}

template <typename T, size_t DIM, typename S>
KDNode<T, DIM, S>::KDNode(const KDNode &a)
    : points(a.points), pointLeft(a.pointLeft), pointRight(a.pointRight),
      left(a.left), right(a.right), idx(a.idx) {
    std::copy(a.bboxs.cbegin(), a.bboxs.cend(), this->bboxs.begin());
    std::copy(a.waitpoints.cbegin(), a.waitpoints.cend(),
              this->waitpoints.begin());
    std::copy(a.delaypoints.cbegin(), a.delaypoints.cend(),
              this->delaypoints.begin());
}

template <typename T, size_t DIM, typename S>
void KDNode<T, DIM, S>::init(const _Point &ref) {
    waitpoints.clear();
    delaypoints.clear();
    if (this->left && this->right) {
        this->left->init(ref);
        this->right->init(ref);
        updateMaxPoint(this->left->max_point, this->right->max_point);
    } else {
        S dis;
        S maxdis = std::numeric_limits<S>::lowest();
        for (size_t i = pointLeft; i < pointRight; i++) {
            dis = points[i].updatedistance(ref);
            if (dis > maxdis) {
                maxdis = dis;
                max_point = points[i];
            }
        }
    }
}

template <typename T, size_t DIM, typename S>
S KDNode<T, DIM, S>::bound_distance(const _Point &ref_point) const {
    S bound_dis(0);
    S dim_distance;
    for (size_t cur_dim = 0; cur_dim < DIM; cur_dim++) {
        dim_distance = 0;
        if (ref_point[cur_dim] > this->bboxs[cur_dim].high)
            dim_distance = ref_point[cur_dim] - this->bboxs[cur_dim].high;
        else if (ref_point[cur_dim] < this->bboxs[cur_dim].low)
            dim_distance = this->bboxs[cur_dim].low - ref_point[cur_dim];
        bound_dis += powi(dim_distance, 2);
    }
    return bound_dis;
}

template <typename T, size_t DIM, typename S>
void KDNode<T, DIM, S>::update_distance() {
    for (const auto &ref_point : this->waitpoints) {
        S lastmax_distance = this->max_point.dis;
        S cur_distance = this->max_point.distance(ref_point);
        // cur_distance >
        // lastmax_distance意味着当前Node的max_point不会进行更新
        if (cur_distance > lastmax_distance) {
            S boundary_distance = bound_distance(ref_point);
            if (boundary_distance < lastmax_distance)
                this->delaypoints.push_back(ref_point);
        } else {
            if (this->right && this->left) {
                if (!delaypoints.empty()) {
                    for (const auto &delay_point : delaypoints) {
                        this->left->send_delay_point(delay_point);
                        this->right->send_delay_point(delay_point);
                    }
                    delaypoints.clear();
                }
                this->left->send_delay_point(ref_point);
                this->left->update_distance();

                this->right->send_delay_point(ref_point);
                this->right->update_distance();

                updateMaxPoint(this->left->max_point, this->right->max_point);
            } else {
                S dis;
                S maxdis;
                this->delaypoints.push_back(ref_point);
                for (const auto &delay_point : delaypoints) {
                    maxdis = std::numeric_limits<S>::lowest();
                    for (size_t i = pointLeft; i < pointRight; i++) {
                        dis = points[i].updatedistance(delay_point);
                        if (dis > maxdis) {
                            maxdis = dis;
                            max_point = points[i];
                        }
                    }
                }
                this->delaypoints.clear();
            }
        }
    }
    this->waitpoints.clear();
}

template <typename T, size_t DIM, typename S> void KDNode<T, DIM, S>::reset() {
    for (size_t i = pointLeft; i < pointRight; i++) {
        points[i].reset();
    }
    this->waitpoints.clear();
    this->delaypoints.clear();
    this->max_point.reset();
    if (this->left && this->right) {
        this->left->reset();
        this->right->reset();
    }
}

template <typename T, size_t DIM, typename S>
size_t KDNode<T, DIM, S>::size() const {
    if (this->left && this->right)
        return this->left->size() + this->right->size();
    return (pointRight - pointLeft);
}

} // namespace quickfps

#endif // KD_TREE_BASED_FARTHEST_POINT_SAMPLING_KDNODE_H


================================================
FILE: src/_ext/KDTree.h
================================================
//
// Created by hanm on 22-6-15.
// Refactored by AyajiLin on 2023/9/16.
//

#ifndef FPS_CPU_KDTREE_H
#define FPS_CPU_KDTREE_H

#include "KDTreeBase.h"

namespace quickfps {

template <typename T, size_t DIM, typename S = T>
class KDTree : public KDTreeBase<T, DIM, S> {
  public:
    using typename KDTreeBase<T, DIM, S>::_Point;
    using typename KDTreeBase<T, DIM, S>::_Points;
    using typename KDTreeBase<T, DIM, S>::NodePtr;
    explicit KDTree(_Points data, size_t pointSize, _Points samplePoints);

    _Point max_point() override { return this->root_->max_point; };

    void update_distance(const _Point &ref_point) override;

    void sample(size_t sample_num) override;

    bool leftNode(size_t, size_t count) const override { return count == 1; };

    void addNode(NodePtr) override{};
};

template <typename T, size_t DIM, typename S>
KDTree<T, DIM, S>::KDTree(_Points data, size_t pointSize, _Points samplePoints)
    : KDTreeBase<T, DIM, S>(data, pointSize, samplePoints) {}

template <typename T, size_t DIM, typename S>
void KDTree<T, DIM, S>::update_distance(const _Point &ref_point) {
    this->root_->send_delay_point(ref_point);
    this->root_->update_distance();
}

template <typename T, size_t DIM, typename S>
void KDTree<T, DIM, S>::sample(size_t sample_num) {
    _Point ref_point;
    for (size_t i = 1; i < sample_num; i++) {
        ref_point = this->max_point();
        this->sample_points[i] = ref_point;
        this->update_distance(ref_point);
    }
}

} // namespace quickfps

#endif // FPS_CPU_KDTREE_H


================================================
FILE: src/_ext/KDTreeBase.h
================================================
//
// Created by 韩萌 on 2022/6/14.
// Refactored by AyajiLin on 2023/9/16.
//

#ifndef KD_TREE_BASED_FARTHEST_POINT_SAMPLING_KDTREE_H
#define KD_TREE_BASED_FARTHEST_POINT_SAMPLING_KDTREE_H

#include "KDNode.h"
#include "Point.h"
#include <algorithm>
#include <array>
#include <numeric>

namespace quickfps {

template <typename T, size_t DIM, typename S> class KDTreeBase {
  public:
    using _Point = Point<T, DIM, S>;
    using _Points = _Point *;
    using NodePtr = KDNode<T, DIM, S> *;
    using _Interval = Interval<T>;

    size_t pointSize;
    _Points sample_points;
    NodePtr root_;
    _Points points_;

  public:
    KDTreeBase(_Points data, size_t pointSize, _Points samplePoints);

    ~KDTreeBase();

    void buildKDtree();

    NodePtr get_root() const { return this->root_; };

    void init(const _Point &ref);

    virtual _Point max_point() = 0;

    virtual void sample(size_t sample_num) = 0;

  protected:
    void deleteNode(NodePtr node_p);
    virtual void addNode(NodePtr p) = 0;
    virtual bool leftNode(size_t high, size_t count) const = 0;
    virtual void update_distance(const _Point &ref_point) = 0;

    NodePtr divideTree(ssize_t left, ssize_t right,
                       const std::array<_Interval, DIM> &bboxs,
                       size_t curr_high);

    size_t planeSplit(ssize_t left, ssize_t right, size_t split_dim,
                      T split_val);

    T qSelectMedian(size_t dim, size_t left, size_t right);
    static size_t findSplitDim(const std::array<_Interval, DIM> &bboxs);
    inline std::array<_Interval, DIM> computeBoundingBox(size_t left,
                                                         size_t right);
};

template <typename T, size_t DIM, typename S>
KDTreeBase<T, DIM, S>::KDTreeBase(_Points data, size_t pointSize,
                                  _Points samplePoints)
    : pointSize(pointSize), sample_points(samplePoints), root_(nullptr),
      points_(data) {}

template <typename T, size_t DIM, typename S>
KDTreeBase<T, DIM, S>::~KDTreeBase() {
    if (root_ != nullptr)
        deleteNode(root_);
}

template <typename T, size_t DIM, typename S>
void KDTreeBase<T, DIM, S>::deleteNode(NodePtr node_p) {
    if (node_p->left)
        deleteNode(node_p->left);
    if (node_p->right)
        deleteNode(node_p->right);
    delete node_p;
}

template <typename T, size_t DIM, typename S>
void KDTreeBase<T, DIM, S>::buildKDtree() {
    size_t left = 0;
    size_t right = pointSize;
    std::array<_Interval, DIM> bboxs = this->computeBoundingBox(left, right);
    this->root_ = divideTree(left, right, bboxs, 0);
}

template <typename T, size_t DIM, typename S>
typename KDTreeBase<T, DIM, S>::NodePtr
KDTreeBase<T, DIM, S>::divideTree(ssize_t left, ssize_t right,
                                  const std::array<_Interval, DIM> &bboxs,
                                  size_t curr_high) {
    NodePtr node = new KDNode<T, DIM, S>(bboxs);

    ssize_t count = right - left;
    if (this->leftNode(curr_high, count)) {
        node->pointLeft = left;
        node->pointRight = right;
        node->points = this->points_;
        this->addNode(node);
        return node;
    } else {
        size_t split_dim = this->findSplitDim(bboxs);
        T split_val = this->qSelectMedian(split_dim, left, right);

        size_t split_delta = planeSplit(left, right, split_dim, split_val);

        std::array<_Interval, DIM> bbox_cur =
            this->computeBoundingBox(left, left + split_delta);
        node->left =
            this->divideTree(left, left + split_delta, bbox_cur, curr_high + 1);
        bbox_cur = this->computeBoundingBox(left + split_delta, right);
        node->right = this->divideTree(left + split_delta, right, bbox_cur,
                                       curr_high + 1);
        return node;
    }
}

template <typename T, size_t DIM, typename S>
size_t KDTreeBase<T, DIM, S>::planeSplit(ssize_t left, ssize_t right,
                                         size_t split_dim, T split_val) {
    ssize_t start = left;
    ssize_t end = right - 1;

    for (;;) {
        while (start <= end && points_[start].pos[split_dim] < split_val)
            ++start;
        while (start <= end && points_[end].pos[split_dim] >= split_val)
            --end;

        if (start > end)
            break;
        std::swap(points_[start], points_[end]);
        ++start;
        --end;
    }

    ssize_t lim1 = start - left;
    if (start == left)
        lim1 = 1;
    if (start == right)
        lim1 = (right - left - 1);

    return static_cast<ssize_t>(lim1);
}

template <typename T, size_t DIM, typename S>
T KDTreeBase<T, DIM, S>::qSelectMedian(size_t dim, size_t left, size_t right) {
    T sum = std::accumulate(this->points_ + left, this->points_ + right, 0.0,
                            [dim](const T &acc, const _Point &point) {
                                return acc + point.pos[dim];
                            });
    return sum / (right - left);
}

template <typename T, size_t DIM, typename S>
size_t
KDTreeBase<T, DIM, S>::findSplitDim(const std::array<_Interval, DIM> &bboxs) {
    T min_, max_;
    T span = 0;
    size_t best_dim = 0;

    for (size_t cur_dim = 0; cur_dim < DIM; cur_dim++) {
        min_ = bboxs[cur_dim].low;
        max_ = bboxs[cur_dim].high;
        T cur_span = (max_ - min_);

        if (cur_span > span) {
            best_dim = cur_dim;
            span = cur_span;
        }
    }

    return best_dim;
}

template <typename T, size_t DIM, typename S>
inline std::array<Interval<T>, DIM>
KDTreeBase<T, DIM, S>::computeBoundingBox(size_t left, size_t right) {
    T min_vals[DIM];
    T max_vals[DIM];
    std::fill(min_vals, min_vals + DIM, std::numeric_limits<T>::max());
    std::fill(max_vals, max_vals + DIM, std::numeric_limits<T>::lowest());

    for (size_t i = left; i < right; ++i) {
        const _Point &pos = points_[i];

        for (size_t cur_dim = 0; cur_dim < DIM; cur_dim++) {
            T val = pos[cur_dim];
            min_vals[cur_dim] = std::min(min_vals[cur_dim], val);
            max_vals[cur_dim] = std::max(max_vals[cur_dim], val);
        }
    }

    std::array<_Interval, DIM> bboxs;

    for (size_t cur_dim = 0; cur_dim < DIM; cur_dim++) {
        bboxs[cur_dim].low = min_vals[cur_dim];
        bboxs[cur_dim].high = max_vals[cur_dim];
    }

    return bboxs;
}

template <typename T, size_t DIM, typename S>
void KDTreeBase<T, DIM, S>::init(const _Point &ref) {
    this->sample_points[0] = ref;
    this->root_->init(ref);
}

} // namespace quickfps

#endif // KD_TREE_BASED_FARTHEST_POINT_SAMPLING_KDTREE_H


================================================
FILE: src/_ext/Point.h
================================================
//
// Created by 韩萌 on 2022/6/14.
// Refactored by AyajiLin on 2023/9/16.
//

#ifndef KD_TREE_BASED_FARTHEST_POINT_SAMPLING_POINT_H
#define KD_TREE_BASED_FARTHEST_POINT_SAMPLING_POINT_H

#include "cmath"
#include "utils.h"
#include <algorithm>
#include <limits>

namespace quickfps {

template <typename T, size_t DIM, typename S = T> class Point {
  public:
    T pos[DIM]; // x, y, z
    S dis;
    size_t id;

    Point();
    Point(const T pos[DIM], size_t id);
    Point(const T pos[DIM], size_t id, S dis);
    Point(const Point &obj);
    ~Point(){};

    bool operator<(const Point &aii) const;

    constexpr T operator[](size_t i) const { return pos[i]; }

    Point &operator=(const Point &obj) {
        std::copy(obj.pos, obj.pos + DIM, this->pos);
        this->dis = obj.dis;
        this->id = obj.id;
        return *this;
    }

    constexpr S distance(const Point &b) { return _distance(b, DIM); }

    void reset();

    S updatedistance(const Point &ref);

    S updateDistanceAndCount(const Point &ref, size_t &count);

  private:
    constexpr S _distance(const Point &b, size_t dim_left) {
        return (dim_left == 0)
                   ? 0
                   : powi((this->pos[dim_left - 1] - b.pos[dim_left - 1]), 2) +
                         _distance(b, dim_left - 1);
    }
};

template <typename T, size_t DIM, typename S>
Point<T, DIM, S>::Point() : dis(std::numeric_limits<S>::max()), id(0) {
    std::fill(pos, pos + DIM, 0);
}

template <typename T, size_t DIM, typename S>
Point<T, DIM, S>::Point(const T pos[DIM], size_t id)
    : dis(std::numeric_limits<S>::max()), id(id) {
    std::copy(pos, pos + DIM, this->pos);
}

template <typename T, size_t DIM, typename S>
Point<T, DIM, S>::Point(const T pos[DIM], size_t id, S dis) : dis(dis), id(id) {
    std::copy(pos, pos + DIM, this->pos);
}

template <typename T, size_t DIM, typename S>
Point<T, DIM, S>::Point(const Point &obj) : dis(obj.dis), id(obj.id) {
    std::copy(obj.pos, obj.pos + DIM, this->pos);
}

template <typename T, size_t DIM, typename S>
bool Point<T, DIM, S>::operator<(const Point &aii) const {
    return dis < aii.dis;
}

template <typename T, size_t DIM, typename S>
S Point<T, DIM, S>::updatedistance(const Point &ref) {
    this->dis = std::min(this->dis, this->distance(ref));
    return this->dis;
}

template <typename T, size_t DIM, typename S>
S Point<T, DIM, S>::updateDistanceAndCount(const Point &ref, size_t &count) {
    S tempDistance = this->distance(ref);
    if (tempDistance < this->dis) {
        this->dis = tempDistance;
        count++;
    }
    return this->dis;
}

template <typename T, size_t DIM, typename S> void Point<T, DIM, S>::reset() {
    this->dis = std::numeric_limits<S>::max();
}

} // namespace quickfps

#endif // KD_TREE_BASED_FARTHEST_POINT_SAMPLING_POINT_H


================================================
FILE: src/_ext/utils.h
================================================
// Refactored by AyajiLin on 2023/9/16.

/* functional  */
#ifndef KD_TREE_UTILS_HPP
#define KD_TREE_UTILS_HPP
#include <cstddef>
#include <type_traits>

namespace quickfps {
using ssize_t = std::make_signed_t<size_t>;

template <typename T>
inline constexpr T powi(const T base, const size_t exponent) {
    // (parentheses not required in next line)
    return (exponent == 0) ? 1 : (base * pow(base, exponent - 1));
}
} // namespace quickfps

#endif // KD_TREE_UTILS_HPP


================================================
FILE: src/fpsample/__init__.py
================================================
from __future__ import annotations

import warnings
from typing import List, Optional, Union

import numpy as np

from ._fpsample import (
    __doc__,
    __version__,
    _bucket_fps_kdline_sampling,
    _bucket_fps_kdtree_sampling,
    _fps_npdu_kdtree_sampling,
    _fps_npdu_sampling,
    _fps_sampling,
)


def get_start_idx(n_pts: int, start_idx: Optional[Union[int, List[int]]]) -> Union[int, np.ndarray]:
    # Random pick a start or use the given start indices
    if start_idx is None:
        start_idx = np.random.randint(low=0, high=n_pts)
    elif isinstance(start_idx, int):
        # just use the int start_idx
        start_idx = start_idx
    elif isinstance(start_idx, list):
        # convert to numpy array for rust
        # TODO: maybe better performance with fortran array?
        start_idx = np.array(start_idx, dtype=np.uint64)
    else:
        raise ValueError("start_idx should be None, int or list")
    return start_idx


def fps_sampling(
    pc: np.ndarray, n_samples: int, start_idx: Optional[Union[int, List[int]]] = None
) -> np.ndarray:
    """
    Vanilla FPS sampling.

    Args:
        pc (np.ndarray): The input point cloud of shape (n_pts, D).
        n_samples (int): Number of samples.
        start_idx (int or list[int], default=None): The starting index of sampling. If set to None, it will be randomly picked.
            If multiple start indices are given, they are used as a start set for the FPS and will all be present in the final samples.
    Returns:
        np.ndarray: The selected indices of shape (n_samples,).
    """
    assert n_samples >= 1, "n_samples should be >= 1"
    assert pc.ndim == 2
    n_pts, _ = pc.shape
    assert n_pts >= n_samples, "n_pts should be >= n_samples"
    if isinstance(start_idx, int):
        assert (
            start_idx is None or 0 <= start_idx < n_pts
        ), "start_idx should be None or 0 <= start_idx < n_pts"
    if isinstance(start_idx, list):
        assert len(start_idx) <= n_samples, "len(start_idx) should be <= n_samples"
        for idx in start_idx:
            assert 0 <= idx < n_pts, "start_idx should be None or 0 <= start_idx < n_pts"
    # best performance with fortran array
    pc = np.asfortranarray(pc, dtype=np.float32)
    # Random pick a start if not given
    start_idx = get_start_idx(n_pts, start_idx)
    return _fps_sampling(pc, n_samples, start_idx)


def fps_npdu_sampling(
    pc: np.ndarray,
    n_samples: int,
    w: Optional[int] = None,
    start_idx: Optional[Union[int, List[int]]] = None,
) -> np.ndarray:
    """
    FPS sampling with nearest-point-distance-updating (NPDU) heuristic strategy.
    **Requires dimensional locality for best samples**.

    Args:
        pc (np.ndarray): The input point cloud of shape (n_pts, D).
        n_samples (int): Number of samples.
        w (int, default=None): Windows size of local heuristic search. If set to None, it will be set to `n_pts / n_samples * 16`.
        start_idx (int or list[int], default=None): The starting index of sampling. If set to None, it will be randomly picked.
            If multiple start indices are given, they are used as a start set for the FPS and will all be present in the final samples.
    Returns:
        np.ndarray: The selected indices of shape (n_samples,).
    """
    assert n_samples >= 1, "n_samples should be >= 1"
    assert pc.ndim == 2
    n_pts, _ = pc.shape
    assert n_pts >= n_samples, "n_pts should be >= n_samples"
    assert (
        start_idx is None or 0 <= start_idx < n_pts
    ), "start_idx should be None or 0 <= start_idx < n_pts"
    if isinstance(start_idx, list):
        assert len(start_idx) <= n_samples, "len(start_idx) should be <= n_samples"
    pc = np.ascontiguousarray(pc, dtype=np.float32)
    w = w or int(n_pts / n_samples * 16)
    if w >= n_pts - 1:
        warnings.warn(f"k is too large, set to {n_pts - 1}")
        w = n_pts - 1
    # Random pick a start if not given
    start_idx = get_start_idx(n_pts, start_idx)
    return _fps_npdu_sampling(pc, n_samples, w, start_idx)


def fps_npdu_kdtree_sampling(
    pc: np.ndarray,
    n_samples: int,
    w: Optional[int] = None,
    start_idx: Optional[Union[int, List[int]]] = None,
) -> np.ndarray:
    """
    FPS sampling with nearest-point-distance-updating (NPDU) heuristic strategy.
    Using KDTree to eliminate the need of dimensional locality.
    Slower than `fps_npdu_sampling` but more robust.

    Args:
        pc (np.ndarray): The input point cloud of shape (n_pts, D).
        n_samples (int): Number of samples.
        w (int, default=None): Windows size of local heuristic search. If set to None, it will be set to `n_pts / n_samples * 16`.
        start_idx (int or list[int], default=None): The starting index of sampling. If set to None, it will be randomly picked.
            If multiple start indices are given, they are used as a start set for the FPS and will all be present in the final samples.
    Returns:
        np.ndarray: The selected indices of shape (n_samples,).
    """
    assert n_samples >= 1, "n_samples should be >= 1"
    assert pc.ndim == 2
    n_pts, _ = pc.shape
    assert n_pts >= n_samples, "n_pts should be >= n_samples"
    assert (
        start_idx is None or 0 <= start_idx < n_pts
    ), "start_idx should be None or 0 <= start_idx < n_pts"
    if isinstance(start_idx, list):
        assert len(start_idx) <= n_samples, "len(start_idx) should be <= n_samples"
    pc = np.ascontiguousarray(pc, dtype=np.float32)
    w = w or int(n_pts / n_samples * 16)
    if w >= n_pts:
        warnings.warn(f"k is too large, set to {n_pts}")
        w = n_pts
    # Random pick a start if not given
    start_idx = get_start_idx(n_pts, start_idx)
    return _fps_npdu_kdtree_sampling(pc, n_samples, w, start_idx)


def bucket_fps_kdtree_sampling(
    pc: np.ndarray, n_samples: int, start_idx: Optional[Union[int, List[int]]] = None
) -> np.ndarray:
    """
    Bucket-based FPS sampling using KDTree. Also called "QuickFPS" in the paper.

    Args:
        pc (np.ndarray): The input point cloud of shape (n_pts, D).
        n_samples (int): Number of samples.
        start_idx (int or list[int], default=None): The starting index of sampling. If set to None, it will be randomly picked.
            If multiple start indices are given, they are used as a start set for the FPS and will all be present in the final samples.
    Returns:
        np.ndarray: The selected indices of shape (n_samples,).
    """
    assert n_samples >= 1, "n_samples should be >= 1"
    assert pc.ndim == 2
    n_pts, _ = pc.shape
    assert n_pts >= n_samples, "n_pts should be >= n_samples"
    assert (
        start_idx is None or 0 <= start_idx < n_pts
    ), "start_idx should be None or 0 <= start_idx < n_pts"
    if isinstance(start_idx, list):
        assert len(start_idx) <= n_samples, "len(start_idx) should be <= n_samples"
    pc = np.ascontiguousarray(pc, dtype=np.float32)
    # Random pick a start if not given
    start_idx = get_start_idx(n_pts, start_idx)
    return _bucket_fps_kdtree_sampling(pc, n_samples, start_idx)


def bucket_fps_kdline_sampling(
    pc: np.ndarray, n_samples: int, h: int, start_idx: Optional[Union[int, List[int]]] = None
) -> np.ndarray:
    """
    Bucket-based FPS sampling using KDTree, with multiple points in each bucket. Also called "QuickFPS" in the paper.

    Args:
        pc (np.ndarray): The input point cloud of shape (n_pts, D).
        n_samples (int): Number of samples.
        h (int, default=None): Height of KDTree. The bucket size is `2**h`.
            According to the paper, for small workload, h=3 is enough;
            for medium workload, h=5 or 7 is enough; for large workload, h=9 is enough.
        start_idx (int or list[int], default=None): The starting index of sampling. If set to None, it will be randomly picked.
            If multiple start indices are given, they are used as a start set for the FPS and will all be present in the final samples.

    Returns:
        np.ndarray: The selected indices of shape (n_samples,).
    """
    assert n_samples >= 1, "n_samples should be >= 1"
    assert pc.ndim == 2
    n_pts, _ = pc.shape
    assert n_pts >= n_samples, "n_pts should be >= n_samples"
    assert h >= 1, "h should be >= 1"
    assert 2**h <= n_pts, "2**h should be <= n_pts"
    assert (
        start_idx is None or 0 <= start_idx < n_pts
    ), "start_idx should be None or 0 <= start_idx < n_pts"
    if isinstance(start_idx, list):
        assert len(start_idx) <= n_samples, "len(start_idx) should be <= n_samples"
    pc = np.ascontiguousarray(pc, dtype=np.float32)
    # Random pick a start if not given
    start_idx = get_start_idx(n_pts, start_idx)
    return _bucket_fps_kdline_sampling(pc, n_samples, h, start_idx)


__all__ = [
    "__doc__",
    "__version__",
    "fps_sampling",
    "fps_npdu_sampling",
    "fps_npdu_kdtree_sampling",
    "bucket_fps_kdtree_sampling",
    "bucket_fps_kdline_sampling",
]


================================================
FILE: src/lib.cpp
================================================
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include "nanoflann.hpp"
#include "wrapper.hpp"

#if defined(_MSC_VER)
    #include <BaseTsd.h>
    typedef SSIZE_T ssize_t;
#endif

#define STRINGIFY(x) #x
#define MACRO_STRINGIFY(x) STRINGIFY(x)

namespace py = pybind11;

class StartIndex {
public:
    enum Type { SINGLE, ARRAY } type;
    size_t single_idx;
    py::array_t<size_t> array_idx;

    StartIndex(size_t idx) : type(SINGLE), single_idx(idx) {}
    StartIndex(py::array_t<size_t> arr) : type(ARRAY), array_idx(arr) {}
};

struct PointCloud {
    size_t N, dim;
    const float* data;  // row-major: N x dim

    inline size_t kdtree_get_point_count() const { return N; }

    inline float kdtree_distance(const float* a, const size_t b_idx, size_t /*dim_*/) const {
        const float* b = data + b_idx * dim;
        float dist = 0.0f;
        for (size_t i = 0; i < dim; i++) {
            float d = a[i] - b[i];
            dist += d * d;
        }
        return dist;
    }

    inline float kdtree_get_pt(const size_t idx, int dim_) const {
        return data[idx * dim + dim_];
    }

    template <class BBOX>
    bool kdtree_get_bbox(BBOX&) const {
        return false;
    }
};

void check_py_input(
    py::array_t<float, py::array::c_style | py::array::forcecast> points,
    size_t n_samples,
    const StartIndex& start_idx,
    std::optional<size_t> max_dim = std::nullopt
) {
    if (points.ndim() != 2) {
        throw py::value_error(
            "points must be a 2D array, but got shape " + std::to_string(points.ndim())
        );
    }

    ssize_t P = points.shape(0);
    ssize_t C = points.shape(1);

    if (C == 0) {
        throw py::value_error("points must have at least one column");
    }

    if (max_dim.has_value() && C > max_dim.value()) {
        throw py::value_error(
            "points must have at most " + std::to_string(max_dim.value()) +
            " columns, but got " + std::to_string(C)
        );
    }

    if (n_samples > static_cast<size_t>(P)) {
        throw py::value_error(
            "n_samples must be less than the number of points: n_samples=" +
            std::to_string(n_samples) + ", P=" + std::to_string(P)
        );
    }

    if (start_idx.type == StartIndex::SINGLE) {
        if (start_idx.single_idx >= static_cast<size_t>(P)) {
            throw py::value_error(
                "start_idx must be less than the number of points: start_idx=" +
                std::to_string(start_idx.single_idx) + ", P=" + std::to_string(P)
            );
        }
    } else {
        auto arr = start_idx.array_idx.unchecked<1>();
        if (arr.shape(0) > n_samples) {
            throw py::value_error(
                "The number of start indices must be less than or equal to n_samples: " +
                std::to_string(arr.shape(0)) + ", n_samples=" + std::to_string(n_samples)
            );
        }
        for (ssize_t i = 0; i < arr.shape(0); ++i) {
            if (arr(i) >= static_cast<size_t>(P)) {
                throw py::value_error(
                    "All indices in start_idx must be less than the number of points: " +
                    std::to_string(arr(i)) + ", P=" + std::to_string(P)
                );
            }
        }
    }
}

py::array_t<size_t>
fps_sampling_multi_start_index(
    py::array_t<float, py::array::c_style | py::array::forcecast> points,
    size_t n_samples,
    py::array_t<size_t, py::array::c_style | py::array::forcecast> start_idx)
{
    auto pts = points.unchecked<2>();   // 2D view
    auto starts = start_idx.unchecked<1>();

    ssize_t P = pts.shape(0);
    ssize_t C = pts.shape(1);

    if (P <= 0 || C <= 0) {
        throw std::runtime_error("points must be a 2D array");
    }

    size_t res_selected_idx = 0;
    bool has_prev = false;

    std::vector<float> dist_min(P, std::numeric_limits<float>::infinity());
    std::vector<size_t> selected;
    selected.reserve(n_samples);

    size_t start_counter = 0;

    while (selected.size() < n_samples) {
        if (has_prev) {
            size_t i0 = res_selected_idx;

            for (ssize_t i = 0; i < P; i++) {
                float dist = 0.0f;
                for (ssize_t j = 0; j < C; j++) {
                    float d = pts(i, j) - pts(i0, j);
                    dist += d * d;
                }
                if (dist < dist_min[i]) {
                    dist_min[i] = dist;
                }
            }

            if (start_counter < (size_t) starts.shape(0)) {
                size_t idx = starts(start_counter);
                selected.push_back(idx);
                res_selected_idx = idx;
                start_counter++;
            } else {
                size_t max_idx = 0;
                float max_val = -1.0f;

                for (ssize_t i = 0; i < P; i++) {
                    if (dist_min[i] >= max_val) {
                        max_val = dist_min[i];
                        max_idx = i;
                    }
                }

                selected.push_back(max_idx);
                res_selected_idx = max_idx;
            }

        } else {
            size_t idx = starts(start_counter);
            selected.push_back(idx);
            res_selected_idx = idx;
            start_counter++;
            has_prev = true;
        }
    }

    py::array_t<size_t> out(selected.size());
    auto out_buf = out.mutable_unchecked<1>();
    for (size_t i = 0; i < selected.size(); i++) {
        out_buf(i) = selected[i];
    }
    return out;
}

py::array_t<size_t> fps_sampling(
    py::array_t<float, py::array::c_style | py::array::forcecast> points,
    size_t n_samples,
    size_t start_idx
) {
    auto pts = points.unchecked<2>();
    ssize_t P = pts.shape(0);
    ssize_t C = pts.shape(1);

    if (P <= 0 || C <= 0) {
        throw py::value_error("points must be a 2D array with at least one column");
    }
    if (start_idx >= static_cast<size_t>(P)) {
        throw py::value_error("start_idx out of range");
    }

    size_t res_selected_idx = start_idx;
    bool has_prev = false;
    std::vector<float> dist_min(P, std::numeric_limits<float>::infinity());

    std::vector<size_t> selected;
    selected.reserve(n_samples);

    while (selected.size() < n_samples) {
        if (has_prev) {
            size_t i0 = res_selected_idx;
            for (ssize_t i = 0; i < P; ++i) {
                float dist = 0.0f;
                for (ssize_t j = 0; j < C; ++j) {
                    float d = pts(i, j) - pts(i0, j);
                    dist += d * d;
                }
                if (dist < dist_min[i]) dist_min[i] = dist;
            }

            size_t max_idx = 0;
            float max_val = -1.0f;
            for (ssize_t i = 0; i < P; ++i) {
                if (dist_min[i] >= max_val) {
                    max_val = dist_min[i];
                    max_idx = i;
                }
            }
            selected.push_back(max_idx);
            res_selected_idx = max_idx;
        } else {
            selected.push_back(start_idx);
            res_selected_idx = start_idx;
            has_prev = true;
        }
    }

    py::array_t<size_t> out(selected.size());
    auto out_buf = out.mutable_unchecked<1>();
    for (size_t i = 0; i < selected.size(); ++i) {
        out_buf(i) = selected[i];
    }
    return out;
}

// EXPORT TO _fps_sample
py::array_t<size_t> _fps_sampling(
    py::array_t<float, py::array::c_style | py::array::forcecast> points,
    size_t n_samples,
    py::object start_idx_obj
) {
    StartIndex start_idx = [&]() -> StartIndex {
        if (py::isinstance<py::int_>(start_idx_obj)) {
            return StartIndex(start_idx_obj.cast<size_t>());
        } else if (py::isinstance<py::array_t<size_t>>(start_idx_obj)) {
            return StartIndex(start_idx_obj.cast<py::array_t<size_t>>());
        } else {
            throw py::type_error("start_idx must be int or 1D numpy array of size_t");
        }
    }();

    check_py_input(points, n_samples, start_idx, std::nullopt);

    if (start_idx.type == StartIndex::SINGLE)
        return fps_sampling(points, n_samples, start_idx.single_idx);
    else
        return fps_sampling_multi_start_index(points, n_samples, start_idx.array_idx);
}

py::array_t<size_t> fps_npdu_sampling(
    py::array_t<float, py::array::c_style | py::array::forcecast> points,
    size_t n_samples,
    size_t k,
    size_t start_idx
) {
    auto pts = points.unchecked<2>();
    ssize_t P = pts.shape(0);
    ssize_t C = pts.shape(1);

    if (P <= 0 || C <= 0)
        throw py::value_error("points must be a 2D array with at least one column");
    if (start_idx >= static_cast<size_t>(P))
        throw py::value_error("start_idx out of range");

    std::vector<float> dist_min(P, std::numeric_limits<float>::infinity());
    std::vector<size_t> selected;
    selected.reserve(n_samples);

    size_t res_selected_idx = start_idx;
    bool has_prev = false;

    while (selected.size() < n_samples) {
        if (has_prev) {
            ssize_t hw = static_cast<ssize_t>(k / 2);
            ssize_t start = static_cast<ssize_t>(res_selected_idx) - hw;
            ssize_t end   = static_cast<ssize_t>(res_selected_idx) + hw;
            if (start < 0) { end -= start; start = 0; }
            if (end >= P) { start = std::max(start - (end - P + 1), ssize_t(0)); end = P - 1; }

            for (ssize_t i = start; i <= end; ++i) {
                float dist = 0.0f;
                for (ssize_t j = 0; j < C; ++j) {
                    float d = pts(i, j) - pts(res_selected_idx, j);
                    dist += d*d;
                }
                if (dist < dist_min[i]) dist_min[i] = dist;
            }

            size_t max_idx = 0;
            float max_val = -1.0f;
            for (ssize_t i = 0; i < P; ++i) {
                if (dist_min[i] > max_val) { max_val = dist_min[i]; max_idx = i; }
            }
            selected.push_back(max_idx);
            res_selected_idx = max_idx;

        } else {
            for (ssize_t i = 0; i < P; ++i) {
                float dist = 0.0f;
                for (ssize_t j = 0; j < C; ++j) {
                    float d = pts(i,j) - pts(start_idx,j);
                    dist += d*d;
                }
                if (dist < dist_min[i]) dist_min[i] = dist;
            }
            selected.push_back(start_idx);
            res_selected_idx = start_idx;
            has_prev = true;
        }
    }

    py::array_t<size_t> out(selected.size());
    auto out_buf = out.mutable_unchecked<1>();
    for (size_t i = 0; i < selected.size(); ++i)
        out_buf(i) = selected[i];

    return out;
}

// EXPORT TO _fps_npdu_sample
py::array_t<size_t> fps_npdu_sampling_py(
    py::array_t<float, py::array::c_style | py::array::forcecast> points,
    size_t n_samples,
    size_t k,
    py::object start_idx_obj
) {
    StartIndex start_idx = [&]() -> StartIndex {
        if (py::isinstance<py::int_>(start_idx_obj))
            return StartIndex(start_idx_obj.cast<size_t>());
        else if (py::isinstance<py::array_t<size_t>>(start_idx_obj))
            return StartIndex(start_idx_obj.cast<py::array_t<size_t>>());
        else
            throw py::type_error("start_idx must be int or 1D numpy array of size_t");
    }();

    check_py_input(points, n_samples, start_idx);

    if (start_idx.type == StartIndex::SINGLE)
        return fps_npdu_sampling(points, n_samples, k, start_idx.single_idx);
    else {
        PyErr_SetString(PyExc_NotImplementedError, "Array of start indices not implemented yet");
        throw py::error_already_set();
    }
}

// EXPORT TO _fps_npdu_kdtree_sample
py::array_t<size_t> fps_npdu_kdtree_sampling_py(
    py::array_t<float, py::array::c_style | py::array::forcecast> points,
    size_t n_samples,
    size_t k,
    py::object start_idx_obj
) {
    StartIndex start_idx = [&]() -> StartIndex {
        if (py::isinstance<py::int_>(start_idx_obj))
            return StartIndex(start_idx_obj.cast<size_t>());
        else if (py::isinstance<py::array_t<size_t>>(start_idx_obj))
            return StartIndex(start_idx_obj.cast<py::array_t<size_t>>());
        else
            throw py::type_error("start_idx must be int or 1D numpy array of size_t");
    }();

    check_py_input(points, n_samples, start_idx);

    if (start_idx.type == StartIndex::ARRAY) {
        PyErr_SetString(PyExc_NotImplementedError, "Array of start indices not implemented yet");
        throw py::error_already_set();
    }

    auto pts = points.unchecked<2>();
    ssize_t P = pts.shape(0);
    ssize_t C = pts.shape(1);

    if (P <= 0 || C <= 0)
        throw py::value_error("points must be a 2D array with at least one column");

    PointCloud cloud;
    cloud.N = static_cast<size_t>(P);
    cloud.dim = static_cast<size_t>(C);
    cloud.data = points.data();

    using KDTree = nanoflann::KDTreeSingleIndexAdaptor<nanoflann::L2_Simple_Adaptor<float, PointCloud>, PointCloud, -1>;
    KDTree index(static_cast<int>(C), cloud, nanoflann::KDTreeSingleIndexAdaptorParams(10));
    index.buildIndex();

    std::vector<float> dist_min(P, std::numeric_limits<float>::infinity());
    std::vector<size_t> selected;
    selected.reserve(n_samples);

    size_t res_selected_idx = start_idx.single_idx;
    bool has_prev = false;

    size_t k_use = std::min<size_t>(k, static_cast<size_t>(P));
    std::vector<size_t> ret_indexes(k_use);
    std::vector<float> out_dists(k_use);

    while (selected.size() < n_samples) {
        if (has_prev) {
            std::vector<float> query(static_cast<size_t>(C));
            for (ssize_t d = 0; d < C; ++d) query[d] = pts(res_selected_idx, d);

            nanoflann::KNNResultSet<float> resultSet(static_cast<int>(k_use));
            resultSet.init(ret_indexes.data(), out_dists.data());
            nanoflann::SearchParameters params;
            index.findNeighbors(resultSet, query.data(), params);

            for (size_t idx_i = 0; idx_i < k_use; ++idx_i) {
                size_t nb = ret_indexes[idx_i];
                float dist = 0.0f;
                for (ssize_t d = 0; d < C; ++d) {
                    float diff = pts(nb, d) - pts(res_selected_idx, d);
                    dist += diff * diff;
                }
                if (dist < dist_min[nb]) dist_min[nb] = dist;
            }

            size_t max_idx = 0;
            float max_val = -1.0f;
            for (ssize_t i = 0; i < P; ++i) {
                if (dist_min[i] > max_val) { max_val = dist_min[i]; max_idx = i; }
            }

            selected.push_back(max_idx);
            res_selected_idx = max_idx;
        } else {
            for (ssize_t i = 0; i < P; ++i) {
                float dist = 0.0f;
                for (ssize_t j = 0; j < C; ++j) {
                    float d = pts(i,j) - pts(res_selected_idx,j);
                    dist += d*d;
                }
                if (dist < dist_min[i]) dist_min[i] = dist;
            }
            selected.push_back(res_selected_idx);
            has_prev = true;
        }
    }

    py::array_t<size_t> out(selected.size());
    auto out_buf = out.mutable_unchecked<1>();
    for (size_t i = 0; i < selected.size(); ++i) out_buf(i) = selected[i];

    return out;
}

py::array_t<size_t> bucket_fps_kdtree_sampling_py(
    py::array_t<float, py::array::c_style | py::array::forcecast> points,
    size_t n_samples,
    py::object start_idx_obj
) {
    StartIndex start_idx = [&]() -> StartIndex {
        if (py::isinstance<py::int_>(start_idx_obj))
            return StartIndex(start_idx_obj.cast<size_t>());
        else if (py::isinstance<py::array_t<size_t>>(start_idx_obj))
            return StartIndex(start_idx_obj.cast<py::array_t<size_t>>());
        else
            throw py::type_error("start_idx must be int or 1D numpy array of size_t");
    }();

    if (start_idx.type == StartIndex::ARRAY) {
        PyErr_SetString(PyExc_NotImplementedError, "Array of start indices not implemented yet");
        throw py::error_already_set();
    }

    check_py_input(points, n_samples, start_idx);

    if (points.ndim() != 2) {
        throw py::value_error("points must be a 2D float32 array");
    }
    ssize_t P = points.shape(0);
    ssize_t C = points.shape(1);

    if (start_idx.single_idx >= static_cast<size_t>(P)) {
        throw py::value_error("start_idx out of range");
    }
    if (n_samples == 0 || n_samples > static_cast<size_t>(P)) {
        throw py::value_error("n_samples must be in [1, num_points]");
    }

    auto buf = points.unchecked<2>();

    py::array_t<size_t> out(n_samples);
    size_t* out_ptr = static_cast<size_t*>(out.mutable_data());

    int ret = bucket_fps_kdtree(
        buf.data(0,0),                       // raw_data
        static_cast<size_t>(P),              // n_points
        static_cast<size_t>(C),              // dim
        n_samples,                           // n_samples
        start_idx.single_idx,                // start_idx
        out_ptr                              // output buffer
    );

    if (ret != 0) {
        throw std::runtime_error("bucket_fps_kdtree failed with error code " + std::to_string(ret));
    }

    return out;
}

py::array_t<size_t> bucket_fps_kdline_sampling_py(
    py::array_t<float, py::array::c_style | py::array::forcecast> points,
    size_t n_samples,
    size_t height,
    py::object start_idx_obj
) {
    StartIndex start_idx = [&]() -> StartIndex {
        if (py::isinstance<py::int_>(start_idx_obj))
            return StartIndex(start_idx_obj.cast<size_t>());
        else if (py::isinstance<py::array_t<size_t>>(start_idx_obj))
            return StartIndex(start_idx_obj.cast<py::array_t<size_t>>());
        else
            throw py::type_error("start_idx must be int or 1D numpy array of size_t");
    }();

    if (start_idx.type == StartIndex::ARRAY) {
        PyErr_SetString(PyExc_NotImplementedError, "Array of start indices not implemented yet");
        throw py::error_already_set();
    }

    if (points.ndim() != 2) {
        throw py::value_error("points must be a 2D float32 array");
    }

    ssize_t P = points.shape(0);
    ssize_t C = points.shape(1);

    if (start_idx.single_idx >= static_cast<size_t>(P)) {
        throw py::value_error("start_idx out of range");
    }
    if (n_samples == 0 || n_samples > static_cast<size_t>(P)) {
        throw py::value_error("n_samples must be in [1, num_points]");
    }
    if (height == 0) {
        throw py::value_error("height must be >= 1");
    }

    auto buf = points.unchecked<2>();

    py::array_t<size_t> out(n_samples);
    size_t* out_ptr = static_cast<size_t*>(out.mutable_data());

    int ret = bucket_fps_kdline(
        buf.data(0,0),                        // raw_data
        static_cast<size_t>(P),               // n_points
        static_cast<size_t>(C),               // dim
        n_samples,                            // n_samples
        start_idx.single_idx,                 // start_idx
        height,                               // window height
        out_ptr                               // output buffer
    );

    if (ret != 0) {
        throw std::runtime_error("bucket_fps_kdline failed with error code " + std::to_string(ret));
    }

    return out;
}

PYBIND11_MODULE(_fpsample, m, py::mod_gil_not_used(), py::multiple_interpreters::per_interpreter_gil()) {
    m.doc() = R"pbdoc(
        Python efficient farthest point sampling (FPS) library
        -----------------------

        .. currentmodule:: fpsample

        .. autosummary::
           :toctree: _generate

           _fps_sampling
           _fps_npdu_sampling
           _fps_npdu_kdtree_sampling
           _bucket_fps_kdtree_sampling
           _bucket_fps_kdline_sampling
    )pbdoc";

    m.def("_fps_sampling", &_fps_sampling, R"pbdoc(
            Farthest Point Sampling (FPS)
            Args:
                points (np.ndarray[float32, 2D]): N x C point array.
                n_samples (int): number of samples to pick.
                start_idx (int or np.ndarray[int32, 1D]): initial index or indices to start FPS.
            Returns:
                np.ndarray[int32]: sampled point indices.
    )pbdoc");

    m.def("_fps_npdu_sampling", &fps_npdu_sampling_py, R"pbdoc(
            FPS with Nearest Point Distance Update
            Args:
                points (np.ndarray[float32, 2D]): N x C point array.
                n_samples (int): number of samples to pick.
                k (int): number of neighbors for local update.
                start_idx (int): initial index to start FPS.
            Returns:
                np.ndarray[int32]: sampled point indices.
    )pbdoc");

    m.def("_fps_npdu_kdtree_sampling", &fps_npdu_kdtree_sampling_py, R"pbdoc(
            FPS with Nearest Point Distance Update using KD-tree acceleration
            Args:
                points (np.ndarray[float32, 2D]): N x C point array.
                n_samples (int): number of samples to pick.
                k (int): number of neighbors for local update.
                start_idx (int): initial index to start FPS.
            Returns:
                np.ndarray[int32]: sampled point indices.
    )pbdoc");

    m.def("_bucket_fps_kdtree_sampling",
      &bucket_fps_kdtree_sampling_py,
      R"pbdoc(
          Bucket FPS sampling using KD-tree acceleration.
          Args:
              points (np.ndarray[float32, 2D]): N x C point array.
              n_samples (int): number of samples to pick.
              start_idx (int): initial index to start FPS.
          Returns:
              np.ndarray[int32]: sampled point indices.
      )pbdoc");

m.def("_bucket_fps_kdline_sampling",
      &bucket_fps_kdline_sampling_py,
      R"pbdoc(
          Bucket FPS sampling using KD-line local window update.
          Args:
              points (np.ndarray[float32, 2D]): N x C point array.
              n_samples (int): number of samples to pick.
              height (int): window size around selected point.
              start_idx (int): first index.
          Returns:
              np.ndarray[int32]: sampled point indices.
      )pbdoc");

#ifdef VERSION_INFO
    m.attr("__version__") = MACRO_STRINGIFY(VERSION_INFO);
#else
    m.attr("__version__") = "dev";
#endif
}


================================================
FILE: src/nanoflann.hpp
================================================
/***********************************************************************
 * Software License Agreement (BSD License)
 *
 * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
 * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
 * Copyright 2011-2025  Jose Luis Blanco (joseluisblancoc@gmail.com).
 *   All rights reserved.
 *
 * THE BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *************************************************************************/

/** \mainpage nanoflann C++ API documentation
 *  nanoflann is a C++ header-only library for building KD-Trees, mostly
 *  optimized for 2D or 3D point clouds.
 *
 *  nanoflann does not require compiling or installing, just an
 *  #include <nanoflann.hpp> in your code.
 *
 *  See:
 *   - [Online README](https://github.com/jlblancoc/nanoflann)
 *   - [C++ API documentation](https://jlblancoc.github.io/nanoflann/)
 */

#pragma once

#include <algorithm>
#include <array>
#include <atomic>
#include <cassert>
#include <cmath>  // for abs()
#include <cstdint>
#include <cstdlib>  // for abs()
#include <functional>  // std::reference_wrapper
#include <future>
#include <istream>
#include <limits>  // std::numeric_limits
#include <ostream>
#include <stack>
#include <stdexcept>
#include <unordered_set>
#include <vector>

/** Library version: 0xMmP (M=Major,m=minor,P=patch) */
#define NANOFLANN_VERSION 0x180

// Avoid conflicting declaration of min/max macros in Windows headers
#if !defined(NOMINMAX) && \
    (defined(_WIN32) || defined(_WIN32_) || defined(WIN32) || defined(_WIN64))
#define NOMINMAX
#ifdef max
#undef max
#undef min
#endif
#endif
// Avoid conflicts with X11 headers
#ifdef None
#undef None
#endif

namespace nanoflann
{
/** @addtogroup nanoflann_grp nanoflann C++ library for KD-trees
 *  @{ */

/** the PI constant (required to avoid MSVC missing symbols) */
template <typename T>
T pi_const()
{
    return static_cast<T>(3.14159265358979323846);
}

/**
 * Traits if object is resizable and assignable (typically has a resize | assign
 * method)
 */
template <typename T, typename = int>
struct has_resize : std::false_type
{
};

template <typename T>
struct has_resize<T, decltype((void)std::declval<T>().resize(1), 0)>
    : std::true_type
{
};

template <typename T, typename = int>
struct has_assign : std::false_type
{
};

template <typename T>
struct has_assign<T, decltype((void)std::declval<T>().assign(1, 0), 0)>
    : std::true_type
{
};

/**
 * Free function to resize a resizable object
 */
template <typename Container>
inline typename std::enable_if<has_resize<Container>::value, void>::type resize(
    Container& c, const size_t nElements)
{
    c.resize(nElements);
}

/**
 * Free function that has no effects on non resizable containers (e.g.
 * std::array) It raises an exception if the expected size does not match
 */
template <typename Container>
inline typename std::enable_if<!has_resize<Container>::value, void>::type
    resize(Container& c, const size_t nElements)
{
    if (nElements != c.size())
        throw std::logic_error("Try to change the size of a std::array.");
}

/**
 * Free function to assign to a container
 */
template <typename Container, typename T>
inline typename std::enable_if<has_assign<Container>::value, void>::type assign(
    Container& c, const size_t nElements, const T& value)
{
    c.assign(nElements, value);
}

/**
 * Free function to assign to a std::array
 */
template <typename Container, typename T>
inline typename std::enable_if<!has_assign<Container>::value, void>::type
    assign(Container& c, const size_t nElements, const T& value)
{
    for (size_t i = 0; i < nElements; i++) c[i] = value;
}

/** operator "<" for std::sort() */
struct IndexDist_Sorter
{
    /** PairType will be typically: ResultItem<IndexType,DistanceType> */
    template <typename PairType>
    bool operator()(const PairType& p1, const PairType& p2) const
    {
        return p1.second < p2.second;
    }
};

/**
 * Each result element in RadiusResultSet. Note that distances and indices
 * are named `first` and `second` to keep backward-compatibility with the
 * `std::pair<>` type used in the past. In contrast, this structure is ensured
 * to be `std::is_standard_layout` so it can be used in wrappers to other
 * languages.
 * See: https://github.com/jlblancoc/nanoflann/issues/166
 */
template <typename IndexType = size_t, typename DistanceType = double>
struct ResultItem
{
    ResultItem() = default;
    ResultItem(const IndexType index, const DistanceType distance)
        : first(index), second(distance)
    {
    }

    IndexType    first;  //!< Index of the sample in the dataset
    DistanceType second;  //!< Distance from sample to query point
};

/** @addtogroup result_sets_grp Result set classes
 *  @{ */

/** Result set for KNN searches (N-closest neighbors) */
template <
    typename _DistanceType, typename _IndexType = size_t,
    typename _CountType = size_t>
class KNNResultSet
{
   public:
    using DistanceType = _DistanceType;
    using IndexType    = _IndexType;
    using CountType    = _CountType;

   private:
    IndexType*    indices;
    DistanceType* dists;
    CountType     capacity;
    CountType     count;

   public:
    explicit KNNResultSet(CountType capacity_)
        : indices(nullptr), dists(nullptr), capacity(capacity_), count(0)
    {
    }

    void init(IndexType* indices_, DistanceType* dists_)
    {
        indices = indices_;
        dists   = dists_;
        count   = 0;
    }

    CountType size() const { return count; }
    bool      empty() const { return count == 0; }
    bool      full() const { return count == capacity; }

    /**
     * Called during search to add an element matching the criteria.
     * @return true if the search should be continued, false if the results are
     * sufficient
     */
    bool addPoint(DistanceType dist, IndexType index)
    {
        CountType i;
        for (i = count; i > 0; --i)
        {
            /** If defined and two points have the same distance, the one with
             *  the lowest-index will be returned first. */
#ifdef NANOFLANN_FIRST_MATCH
            if ((dists[i - 1] > dist) ||
                ((dist == dists[i - 1]) && (indices[i - 1] > index)))
            {
#else
            if (dists[i - 1] > dist)
            {
#endif
                if (i < capacity)
                {
                    dists[i]   = dists[i - 1];
                    indices[i] = indices[i - 1];
                }
            }
            else
                break;
        }
        if (i < capacity)
        {
            dists[i]   = dist;
            indices[i] = index;
        }
        if (count < capacity) count++;

        // tell caller that the search shall continue
        return true;
    }

    //! Returns the worst distance among found solutions if the search result is
    //! full, or the maximum possible distance, if not full yet.
    DistanceType worstDist() const
    {
        return (count < capacity || !count)
                   ? std::numeric_limits<DistanceType>::max()
                   : dists[count - 1];
    }

    void sort()
    {
        // already sorted
    }
};

/** Result set for RKNN searches (N-closest neighbors with a maximum radius) */
template <
    typename _DistanceType, typename _IndexType = size_t,
    typename _CountType = size_t>
class RKNNResultSet
{
   public:
    using DistanceType = _DistanceType;
    using IndexType    = _IndexType;
    using CountType    = _CountType;

   private:
    IndexType*    indices;
    DistanceType* dists;
    CountType     capacity;
    CountType     count;
    DistanceType  maximumSearchDistanceSquared;

   public:
    explicit RKNNResultSet(
        CountType capacity_, DistanceType maximumSearchDistanceSquared_)
        : indices(nullptr),
          dists(nullptr),
          capacity(capacity_),
          count(0),
          maximumSearchDistanceSquared(maximumSearchDistanceSquared_)
    {
    }

    void init(IndexType* indices_, DistanceType* dists_)
    {
        indices = indices_;
        dists   = dists_;
        count   = 0;
        if (capacity) dists[capacity - 1] = maximumSearchDistanceSquared;
    }

    CountType size() const { return count; }
    bool      empty() const { return count == 0; }
    bool      full() const { return count == capacity; }

    /**
     * Called during search to add an element matching the criteria.
     * @return true if the search should be continued, false if the results are
     * sufficient
     */
    bool addPoint(DistanceType dist, IndexType index)
    {
        CountType i;
        for (i = count; i > 0; --i)
        {
            /** If defined and two points have the same distance, the one with
             *  the lowest-index will be returned first. */
#ifdef NANOFLANN_FIRST_MATCH
            if ((dists[i - 1] > dist) ||
                ((dist == dists[i - 1]) && (indices[i - 1] > index)))
            {
#else
            if (dists[i - 1] > dist)
            {
#endif
                if (i < capacity)
                {
                    dists[i]   = dists[i - 1];
                    indices[i] = indices[i - 1];
                }
            }
            else
                break;
        }
        if (i < capacity)
        {
            dists[i]   = dist;
            indices[i] = index;
        }
        if (count < capacity) count++;

        // tell caller that the search shall continue
        return true;
    }

    //! Returns the worst distance among found solutions if the search result is
    //! full, or the maximum possible distance, if not full yet.
    DistanceType worstDist() const
    {
        return (count < capacity || !count) ? maximumSearchDistanceSquared
                                            : dists[count - 1];
    }

    void sort()
    {
        // already sorted
    }
};

/**
 * A result-set class used when performing a radius based search.
 */
template <typename _DistanceType, typename _IndexType = size_t>
class RadiusResultSet
{
   public:
    using DistanceType = _DistanceType;
    using IndexType    = _IndexType;

   public:
    const DistanceType radius;

    std::vector<ResultItem<IndexType, DistanceType>>& m_indices_dists;

    explicit RadiusResultSet(
        DistanceType                                      radius_,
        std::vector<ResultItem<IndexType, DistanceType>>& indices_dists)
        : radius(radius_), m_indices_dists(indices_dists)
    {
        init();
    }

    void init() { clear(); }
    void clear() { m_indices_dists.clear(); }

    size_t size() const { return m_indices_dists.size(); }
    size_t empty() const { return m_indices_dists.empty(); }

    bool full() const { return true; }

    /**
     * Called during search to add an element matching the criteria.
     * @return true if the search should be continued, false if the results are
     * sufficient
     */
    bool addPoint(DistanceType dist, IndexType index)
    {
        if (dist < radius) m_indices_dists.emplace_back(index, dist);
        return true;
    }

    DistanceType worstDist() const { return radius; }

    /**
     * Find the worst result (farthest neighbor) without copying or sorting
     * Pre-conditions: size() > 0
     */
    ResultItem<IndexType, DistanceType> worst_item() const
    {
        if (m_indices_dists.empty())
            throw std::runtime_error(
                "Cannot invoke RadiusResultSet::worst_item() on "
                "an empty list of results.");
        auto it = std::max_element(
            m_indices_dists.begin(), m_indices_dists.end(), IndexDist_Sorter());
        return *it;
    }

    void sort()
    {
        std::sort(
            m_indices_dists.begin(), m_indices_dists.end(), IndexDist_Sorter());
    }
};

/** @} */

/** @addtogroup loadsave_grp Load/save auxiliary functions
 * @{ */
template <typename T>
void save_value(std::ostream& stream, const T& value)
{
    stream.write(reinterpret_cast<const char*>(&value), sizeof(T));
}

template <typename T>
void save_value(std::ostream& stream, const std::vector<T>& value)
{
    size_t size = value.size();
    stream.write(reinterpret_cast<const char*>(&size), sizeof(size_t));
    stream.write(reinterpret_cast<const char*>(value.data()), sizeof(T) * size);
}

template <typename T>
void load_value(std::istream& stream, T& value)
{
    stream.read(reinterpret_cast<char*>(&value), sizeof(T));
}

template <typename T>
void load_value(std::istream& stream, std::vector<T>& value)
{
    size_t size;
    stream.read(reinterpret_cast<char*>(&size), sizeof(size_t));
    value.resize(size);
    stream.read(reinterpret_cast<char*>(value.data()), sizeof(T) * size);
}
/** @} */

/** @addtogroup metric_grp Metric (distance) classes
 * @{ */

struct Metric
{
};

/** Manhattan distance functor (generic version, optimized for
 * high-dimensionality data sets). Corresponding distance traits:
 * nanoflann::metric_L1
 *
 * \tparam T Type of the elements (e.g. double, float, uint8_t)
 * \tparam DataSource Source of the data, i.e. where the vectors are stored
 * \tparam _DistanceType Type of distance variables (must be signed)
 * \tparam IndexType Type of the arguments with which the data can be
 * accessed (e.g. float, double, int64_t, T*)
 */
template <
    class T, class DataSource, typename _DistanceType = T,
    typename IndexType = size_t>
struct L1_Adaptor
{
    using ElementType  = T;
    using DistanceType = _DistanceType;

    const DataSource& data_source;

    L1_Adaptor(const DataSource& _data_source) : data_source(_data_source) {}

    DistanceType evalMetric(
        const T* a, const IndexType b_idx, size_t size,
        DistanceType worst_dist = -1) const
    {
        DistanceType result    = DistanceType();
        const T*     last      = a + size;
        const T*     lastgroup = last - 3;
        size_t       d         = 0;

        /* Process 4 items with each loop for efficiency. */
        while (a < lastgroup)
        {
            const DistanceType diff0 =
                std::abs(a[0] - data_source.kdtree_get_pt(b_idx, d++));
            const DistanceType diff1 =
                std::abs(a[1] - data_source.kdtree_get_pt(b_idx, d++));
            const DistanceType diff2 =
                std::abs(a[2] - data_source.kdtree_get_pt(b_idx, d++));
            const DistanceType diff3 =
                std::abs(a[3] - data_source.kdtree_get_pt(b_idx, d++));
            result += diff0 + diff1 + diff2 + diff3;
            a += 4;
            if ((worst_dist > 0) && (result > worst_dist)) { return result; }
        }
        /* Process last 0-3 components.  Not needed for standard vector lengths.
         */
        while (a < last)
        {
            result += std::abs(*a++ - data_source.kdtree_get_pt(b_idx, d++));
        }
        return result;
    }

    template <typename U, typename V>
    DistanceType accum_dist(const U a, const V b, const size_t) const
    {
        return std::abs(a - b);
    }
};

/** **Squared** Euclidean distance functor (generic version, optimized for
 * high-dimensionality data sets). Corresponding distance traits:
 * nanoflann::metric_L2
 *
 * \tparam T Type of the elements (e.g. double, float, uint8_t)
 * \tparam DataSource Source of the data, i.e. where the vectors are stored
 * \tparam _DistanceType Type of distance variables (must be signed)
 * \tparam IndexType Type of the arguments with which the data can be
 * accessed (e.g. float, double, int64_t, T*)
 */
template <
    class T, class DataSource, typename _DistanceType = T,
    typename IndexType = size_t>
struct L2_Adaptor
{
    using ElementType  = T;
    using DistanceType = _DistanceType;

    const DataSource& data_source;

    L2_Adaptor(const DataSource& _data_source) : data_source(_data_source) {}

    DistanceType evalMetric(
        const T* a, const IndexType b_idx, size_t size,
        DistanceType worst_dist = -1) const
    {
        DistanceType result    = DistanceType();
        const T*     last      = a + size;
        const T*     lastgroup = last - 3;
        size_t       d         = 0;

        /* Process 4 items with each loop for efficiency. */
        while (a < lastgroup)
        {
            const DistanceType diff0 =
                a[0] - data_source.kdtree_get_pt(b_idx, d++);
            const DistanceType diff1 =
                a[1] - data_source.kdtree_get_pt(b_idx, d++);
            const DistanceType diff2 =
                a[2] - data_source.kdtree_get_pt(b_idx, d++);
            const DistanceType diff3 =
                a[3] - data_source.kdtree_get_pt(b_idx, d++);
            result +=
                diff0 * diff0 + diff1 * diff1 + diff2 * diff2 + diff3 * diff3;
            a += 4;
            if ((worst_dist > 0) && (result > worst_dist)) { return result; }
        }
        /* Process last 0-3 components.  Not needed for standard vector lengths.
         */
        while (a < last)
        {
            const DistanceType diff0 =
                *a++ - data_source.kdtree_get_pt(b_idx, d++);
            result += diff0 * diff0;
        }
        return result;
    }

    template <typename U, typename V>
    DistanceType accum_dist(const U a, const V b, const size_t) const
    {
        return (a - b) * (a - b);
    }
};

/** **Squared** Euclidean (L2) distance functor (suitable for low-dimensionality
 * datasets, like 2D or 3D point clouds) Corresponding distance traits:
 * nanoflann::metric_L2_Simple
 *
 * \tparam T Type of the elements (e.g. double, float, uint8_t)
 * \tparam DataSource Source of the data, i.e. where the vectors are stored
 * \tparam _DistanceType Type of distance variables (must be signed)
 * \tparam IndexType Type of the arguments with which the data can be
 * accessed (e.g. float, double, int64_t, T*)
 */
template <
    class T, class DataSource, typename _DistanceType = T,
    typename IndexType = size_t>
struct L2_Simple_Adaptor
{
    using ElementType  = T;
    using DistanceType = _DistanceType;

    const DataSource& data_source;

    L2_Simple_Adaptor(const DataSource& _data_source)
        : data_source(_data_source)
    {
    }

    DistanceType evalMetric(
        const T* a, const IndexType b_idx, size_t size) const
    {
        DistanceType result = DistanceType();
        for (size_t i = 0; i < size; ++i)
        {
            const DistanceType diff =
                a[i] - data_source.kdtree_get_pt(b_idx, i);
            result += diff * diff;
        }
        return result;
    }

    template <typename U, typename V>
    DistanceType accum_dist(const U a, const V b, const size_t) const
    {
        return (a - b) * (a - b);
    }
};

/** SO2 distance functor
 *  Corresponding distance traits: nanoflann::metric_SO2
 *
 * \tparam T Type of the elements (e.g. double, float, uint8_t)
 * \tparam DataSource Source of the data, i.e. where the vectors are stored
 * \tparam _DistanceType Type of distance variables (must be signed) (e.g.
 * float, double) orientation is constrained to be in [-pi, pi]
 * \tparam IndexType Type of the arguments with which the data can be
 * accessed (e.g. float, double, int64_t, T*)
 */
template <
    class T, class DataSource, typename _DistanceType = T,
    typename IndexType = size_t>
struct SO2_Adaptor
{
    using ElementType  = T;
    using DistanceType = _DistanceType;

    const DataSource& data_source;

    SO2_Adaptor(const DataSource& _data_source) : data_source(_data_source) {}

    DistanceType evalMetric(
        const T* a, const IndexType b_idx, size_t size) const
    {
        return accum_dist(
            a[size - 1], data_source.kdtree_get_pt(b_idx, size - 1), size - 1);
    }

    /** Note: this assumes that input angles are already in the range [-pi,pi]
     */
    template <typename U, typename V>
    DistanceType accum_dist(const U a, const V b, const size_t) const
    {
        DistanceType result = DistanceType();
        DistanceType PI     = pi_const<DistanceType>();
        result              = b - a;
        if (result > PI)
            result -= 2 * PI;
        else if (result < -PI)
            result += 2 * PI;
        return result;
    }
};

/** SO3 distance functor (Uses L2_Simple)
 *  Corresponding distance traits: nanoflann::metric_SO3
 *
 * \tparam T Type of the elements (e.g. double, float, uint8_t)
 * \tparam DataSource Source of the data, i.e. where the vectors are stored
 * \tparam _DistanceType Type of distance variables (must be signed) (e.g.
 * float, double)
 * \tparam IndexType Type of the arguments with which the data can be
 * accessed (e.g. float, double, int64_t, T*)
 */
template <
    class T, class DataSource, typename _DistanceType = T,
    typename IndexType = size_t>
struct SO3_Adaptor
{
    using ElementType  = T;
    using DistanceType = _DistanceType;

    L2_Simple_Adaptor<T, DataSource, DistanceType, IndexType>
        distance_L2_Simple;

    SO3_Adaptor(const DataSource& _data_source)
        : distance_L2_Simple(_data_source)
    {
    }

    DistanceType evalMetric(
        const T* a, const IndexType b_idx, size_t size) const
    {
        return distance_L2_Simple.evalMetric(a, b_idx, size);
    }

    template <typename U, typename V>
    DistanceType accum_dist(const U a, const V b, const size_t idx) const
    {
        return distance_L2_Simple.accum_dist(a, b, idx);
    }
};

/** Metaprogramming helper traits class for the L1 (Manhattan) metric */
struct metric_L1 : public Metric
{
    template <class T, class DataSource, typename IndexType = size_t>
    struct traits
    {
        using distance_t = L1_Adaptor<T, DataSource, T, IndexType>;
    };
};
/** Metaprogramming helper traits class for the L2 (Euclidean) **squared**
 * distance metric */
struct metric_L2 : public Metric
{
    template <class T, class DataSource, typename IndexType = size_t>
    struct traits
    {
        using distance_t = L2_Adaptor<T, DataSource, T, IndexType>;
    };
};
/** Metaprogramming helper traits class for the L2_simple (Euclidean)
 * **squared** distance metric */
struct metric_L2_Simple : public Metric
{
    template <class T, class DataSource, typename IndexType = size_t>
    struct traits
    {
        using distance_t = L2_Simple_Adaptor<T, DataSource, T, IndexType>;
    };
};
/** Metaprogramming helper traits class for the SO3_InnerProdQuat metric */
struct metric_SO2 : public Metric
{
    template <class T, class DataSource, typename IndexType = size_t>
    struct traits
    {
        using distance_t = SO2_Adaptor<T, DataSource, T, IndexType>;
    };
};
/** Metaprogramming helper traits class for the SO3_InnerProdQuat metric */
struct metric_SO3 : public Metric
{
    template <class T, class DataSource, typename IndexType = size_t>
    struct traits
    {
        using distance_t = SO3_Adaptor<T, DataSource, T, IndexType>;
    };
};

/** @} */

/** @addtogroup param_grp Parameter structs
 * @{ */

enum class KDTreeSingleIndexAdaptorFlags
{
    None                  = 0,
    SkipInitialBuildIndex = 1
};

inline std::underlying_type<KDTreeSingleIndexAdaptorFlags>::type operator&(
    KDTreeSingleIndexAdaptorFlags lhs, KDTreeSingleIndexAdaptorFlags rhs)
{
    using underlying =
        typename std::underlying_type<KDTreeSingleIndexAdaptorFlags>::type;
    return static_cast<underlying>(lhs) & static_cast<underlying>(rhs);
}

/**  Parameters (see README.md) */
struct KDTreeSingleIndexAdaptorParams
{
    KDTreeSingleIndexAdaptorParams(
        size_t                        _leaf_max_size = 10,
        KDTreeSingleIndexAdaptorFlags _flags =
            KDTreeSingleIndexAdaptorFlags::None,
        unsigned int _n_thread_build = 1)
        : leaf_max_size(_leaf_max_size),
          flags(_flags),
          n_thread_build(_n_thread_build)
    {
    }

    size_t                        leaf_max_size;
    KDTreeSingleIndexAdaptorFlags flags;
    unsigned int                  n_thread_build;
};

/** Search options for KDTreeSingleIndexAdaptor::findNeighbors() */
struct SearchParameters
{
    SearchParameters(float eps_ = 0, bool sorted_ = true)
        : eps(eps_), sorted(sorted_)
    {
    }

    float eps;  //!< search for eps-approximate neighbours (default: 0)
    bool  sorted;  //!< only for radius search, require neighbours sorted by
                  //!< distance (default: true)
};
/** @} */

/** @addtogroup memalloc_grp Memory allocation
 * @{ */

/**
 * Pooled storage allocator
 *
 * The following routines allow for the efficient allocation of storage in
 * small chunks from a specified pool.  Rather than allowing each structure
 * to be freed individually, an entire pool of storage is freed at once.
 * This method has two advantages over just using malloc() and free().  First,
 * it is far more efficient for allocating small objects, as there is
 * no overhead for remembering all the information needed to free each
 * object or consolidating fragmented memory.  Second, the decision about
 * how long to keep an object is made at the time of allocation, and there
 * is no need to track down all the objects to free them.
 *
 */
class PooledAllocator
{
    static constexpr size_t WORDSIZE  = 16;  // WORDSIZE must >= 8
    static constexpr size_t BLOCKSIZE = 8192;

    /* We maintain memory alignment to word boundaries by requiring that all
        allocations be in multiples of the machine wordsize.  */
    /* Size of machine word in bytes.  Must be power of 2. */
    /* Minimum number of bytes requested at a time from the system.  Must be
     * multiple of WORDSIZE. */

    using Size = size_t;

    Size  remaining_ = 0;  //!< Number of bytes left in current block of storage
    void* base_ = nullptr;  //!< Pointer to base of current block of storage
    void* loc_  = nullptr;  //!< Current location in block to next allocate

    void internal_init()
    {
        remaining_   = 0;
        base_        = nullptr;
        usedMemory   = 0;
        wastedMemory = 0;
    }

   public:
    Size usedMemory   = 0;
    Size wastedMemory = 0;

    /**
        Default constructor. Initializes a new pool.
     */
    PooledAllocator() { internal_init(); }

    /**
     * Destructor. Frees all the memory allocated in this pool.
     */
    ~PooledAllocator() { free_all(); }

    /** Frees all allocated memory chunks */
    void free_all()
    {
        while (base_ != nullptr)
        {
            // Get pointer to prev block
            void* prev = *(static_cast<void**>(base_));
            ::free(base_);
            base_ = prev;
        }
        internal_init();
    }

    /**
     * Returns a pointer to a piece of new memory of the given size in bytes
     * allocated from the pool.
     */
    void* malloc(const size_t req_size)
    {
        /* Round size up to a multiple of wordsize.  The following expression
            only works for WORDSIZE that is a power of 2, by masking last bits
           of incremented size to zero.
         */
        const Size size = (req_size + (WORDSIZE - 1)) & ~(WORDSIZE - 1);

        /* Check whether a new block must be allocated.  Note that the first
           word of a block is reserved for a pointer to the previous block.
         */
        if (size > remaining_)
        {
            wastedMemory += remaining_;

            /* Allocate new storage. */
            const Size blocksize =
                size > BLOCKSIZE ? size + WORDSIZE : BLOCKSIZE + WORDSIZE;

            // use the standard C malloc to allocate memory
            void* m = ::malloc(blocksize);
            if (!m) { throw std::bad_alloc(); }

            /* Fill first word of new block with pointer to previous block. */
            static_cast<void**>(m)[0] = base_;
            base_                     = m;

            remaining_ = blocksize - WORDSIZE;
            loc_       = static_cast<char*>(m) + WORDSIZE;
        }
        void* rloc = loc_;
        loc_       = static_cast<char*>(loc_) + size;
        remaining_ -= size;

        usedMemory += size;

        return rloc;
    }

    /**
     * Allocates (using this pool) a generic type T.
     *
     * Params:
     *     count = number of instances to allocate.
     * Returns: pointer (of type T*) to memory buffer
     */
    template <typename T>
    T* allocate(const size_t count = 1)
    {
        T* mem = static_cast<T*>(this->malloc(sizeof(T) * count));
        return mem;
    }
};
/** @} */

/** @addtogroup nanoflann_metaprog_grp Auxiliary metaprogramming stuff
 * @{ */

/** Used to declare fixed-size arrays when DIM>0, dynamically-allocated vectors
 * when DIM=-1. Fixed size version for a generic DIM:
 */
template <int32_t DIM, typename T>
struct array_or_vector
{
    using type = std::array<T, DIM>;
};
/** Dynamic size version */
template <typename T>
struct array_or_vector<-1, T>
{
    using type = std::vector<T>;
};

/** @} */

/** kd-tree base-class
 *
 * Contains the member functions common to the classes KDTreeSingleIndexAdaptor
 * and KDTreeSingleIndexDynamicAdaptor_.
 *
 * \tparam Derived The name of the class which inherits this class.
 * \tparam DatasetAdaptor The user-provided adaptor, which must be ensured to
 *         have a lifetime equal or longer than the instance of this class.
 * \tparam Distance The distance metric to use, these are all classes derived
 * from nanoflann::Metric
 * \tparam DIM Dimensionality of data points (e.g. 3 for 3D points)
 * \tparam IndexType Type of the arguments with which the data can be
 * accessed (e.g. float, double, int64_t, T*)
 */
template <
    class Derived, typename Distance, class DatasetAdaptor, int32_t DIM = -1,
    typename index_t = uint32_t>
class KDTreeBaseClass
{
   public:
    /** Frees the previously-built index. Automatically called within
     * buildIndex(). */
    void freeIndex(Derived& obj)
    {
        obj.pool_.free_all();
        obj.root_node_           = nullptr;
        obj.size_at_index_build_ = 0;
    }

    using ElementType  = typename Distance::ElementType;
    using DistanceType = typename Distance::DistanceType;
    using IndexType    = index_t;

    /**
     *  Array of indices to vectors in the dataset_.
     */
    std::vector<IndexType> vAcc_;

    using Offset    = typename decltype(vAcc_)::size_type;
    using Size      = typename decltype(vAcc_)::size_type;
    using Dimension = int32_t;

    /*---------------------------
     * Internal Data Structures
     * --------------------------*/
    struct Node
    {
        /** Union used because a node can be either a LEAF node or a non-leaf
         * node, so both data fields are never used simultaneously */
        union
        {
            struct leaf
            {
                Offset left, right;  //!< Indices of points in leaf node
            } lr;
            struct nonleaf
            {
                Dimension divfeat;  //!< Dimension used for subdivision.
                /// The values used for subdivision.
                DistanceType divlow, divhigh;
            } sub;
        } node_type;

        /** Child nodes (both=nullptr mean its a leaf node) */
        Node *child1 = nullptr, *child2 = nullptr;
    };

    using NodePtr      = Node*;
    using NodeConstPtr = const Node*;

    struct Interval
    {
        ElementType low, high;
    };

    NodePtr root_node_ = nullptr;

    Size leaf_max_size_ = 0;

    /// Number of thread for concurrent tree build
    Size n_thread_build_ = 1;
    /// Number of current points in the dataset
    Size size_ = 0;
    /// Number of points in the dataset when the index was built
    Size      size_at_index_build_ = 0;
    Dimension dim_                 = 0;  //!< Dimensionality of each data point

    /** Define "BoundingBox" as a fixed-size or variable-size container
     * depending on "DIM" */
    using BoundingBox = typename array_or_vector<DIM, Interval>::type;

    /** Define "distance_vector_t" as a fixed-size or variable-size container
     * depending on "DIM" */
    using distance_vector_t = typename array_or_vector<DIM, DistanceType>::type;

    /** The KD-tree used to find neighbours */
    BoundingBox root_bbox_;

    /**
     * Pooled memory allocator.
     *
     * Using a pooled memory allocator is more efficient
     * than allocating memory directly when there is a large
     * number small of memory allocations.
     */
    PooledAllocator pool_;

    /** Returns number of points in dataset  */
    Size size(const Derived& obj) const { return obj.size_; }

    /** Returns the length of each point in the dataset */
    Size veclen(const Derived& obj) const { return DIM > 0 ? DIM : obj.dim_; }

    /// Helper accessor to the dataset points:
    ElementType dataset_get(
        const Derived& obj, IndexType element, Dimension component) const
    {
        return obj.dataset_.kdtree_get_pt(element, component);
    }

    /**
     * Computes the index memory usage
     * Returns: memory used by the index
     */
    Size usedMemory(const Derived& obj) const
    {
        return obj.pool_.usedMemory + obj.pool_.wastedMemory +
               obj.dataset_.kdtree_get_point_count() *
                   sizeof(IndexType);  // pool memory and vind array memory
    }

    /**
     * Compute the minimum and maximum element values in the specified dimension
     */
    void computeMinMax(
        const Derived& obj, Offset ind, Size count, Dimension element,
        ElementType& min_elem, ElementType& max_elem) const
    {
        min_elem = dataset_get(obj, vAcc_[ind], element);
        max_elem = min_elem;
        for (Offset i = 1; i < count; ++i)
        {
            ElementType val = dataset_get(obj, vAcc_[ind + i], element);
            if (val < min_elem) min_elem = val;
            if (val > max_elem) max_elem = val;
        }
    }

    /**
     * Create a tree node that subdivides the list of vecs from vind[first]
     * to vind[last].  The routine is called recursively on each sublist.
     *
     * @param left index of the first vector
     * @param right index of the last vector
     * @param bbox bounding box used as input for splitting and output for
     * parent node
     */
    NodePtr divideTree(
        Derived& obj, const Offset left, const Offset right, BoundingBox& bbox)
    {
        assert(left < obj.dataset_.kdtree_get_point_count());

        NodePtr node = obj.pool_.template allocate<Node>();  // allocate memory
        const auto dims = (DIM > 0 ? DIM : obj.dim_);

        /* If too few exemplars remain, then make this a leaf node. */
        if ((right - left) <= static_cast<Offset>(obj.leaf_max_size_))
        {
            node->child1 = node->child2 = nullptr; /* Mark as leaf node. */
            node->node_type.lr.left     = left;
            node->node_type.lr.right    = right;

            // compute bounding-box of leaf points
            for (Dimension i = 0; i < dims; ++i)
            {
                bbox[i].low  = dataset_get(obj, obj.vAcc_[left], i);
                bbox[i].high = dataset_get(obj, obj.vAcc_[left], i);
            }
            for (Offset k = left + 1; k < right; ++k)
            {
                for (Dimension i = 0; i < dims; ++i)
                {
                    const auto val = dataset_get(obj, obj.vAcc_[k], i);
                    if (bbox[i].low > val) bbox[i].low = val;
                    if (bbox[i].high < val) bbox[i].high = val;
                }
            }
        }
        else
        {
            /* Determine the index, dimension and value for split plane */
            Offset       idx;
            Dimension    cutfeat;
            DistanceType cutval;
            middleSplit_(obj, left, right - left, idx, cutfeat, cutval, bbox);

            node->node_type.sub.divfeat = cutfeat;

            /* Recurse on left */
            BoundingBox left_bbox(bbox);
            left_bbox[cutfeat].high = cutval;
            node->child1 = this->divideTree(obj, left, left + idx, left_bbox);

            /* Recurse on right */
            BoundingBox right_bbox(bbox);
            right_bbox[cutfeat].low = cutval;
            node->child2 = this->divideTree(obj, left + idx, right, right_bbox);

            node->node_type.sub.divlow  = left_bbox[cutfeat].high;
            node->node_type.sub.divhigh = right_bbox[cutfeat].low;

            for (Dimension i = 0; i < dims; ++i)
            {
                bbox[i].low  = std::min(left_bbox[i].low, right_bbox[i].low);
                bbox[i].high = std::max(left_bbox[i].high, right_bbox[i].high);
            }
        }

        return node;
    }

    /**
     * Create a tree node that subdivides the list of vecs from vind[first] to
     * vind[last] concurrently.  The routine is called recursively on each
     * sublist.
     *
     * @param left index of the first vector
     * @param right index of the last vector
     * @param bbox bounding box used as input for splitting and output for
     * parent node
     * @param thread_count count of std::async threads
     * @param mutex mutex for mempool allocation
     */
    NodePtr divideTreeConcurrent(
        Derived& obj, const Offset left, const Offset right, BoundingBox& bbox,
        std::atomic<unsigned int>& thread_count, std::mutex& mutex)
    {
        std::unique_lock<std::mutex> lock(mutex);
        NodePtr node = obj.pool_.template allocate<Node>();  // allocate memory
        lock.unlock();

        const auto dims = (DIM > 0 ? DIM : obj.dim_);

        /* If too few exemplars remain, then make this a leaf node. */
        if ((right - left) <= static_cast<Offset>(obj.leaf_max_size_))
        {
            node->child1 = node->child2 = nullptr; /* Mark as leaf node. */
            node->node_type.lr.left     = left;
            node->node_type.lr.right    = right;

            // compute bounding-box of leaf points
            for (Dimension i = 0; i < dims; ++i)
            {
                bbox[i].low  = dataset_get(obj, obj.vAcc_[left], i);
                bbox[i].high = dataset_get(obj, obj.vAcc_[left], i);
            }
            for (Offset k = left + 1; k < right; ++k)
            {
                for (Dimension i = 0; i < dims; ++i)
                {
                    const auto val = dataset_get(obj, obj.vAcc_[k], i);
                    if (bbox[i].low > val) bbox[i].low = val;
                    if (bbox[i].high < val) bbox[i].high = val;
                }
            }
        }
        else
        {
            /* Determine the index, dimension and value for split plane */
            Offset       idx;
            Dimension    cutfeat;
            DistanceType cutval;
            middleSplit_(obj, left, right - left, idx, cutfeat, cutval, bbox);

            node->node_type.sub.divfeat = cutfeat;

            std::future<NodePtr> right_future;

            /* Recurse on right concurrently, if possible */

            BoundingBox right_bbox(bbox);
            right_bbox[cutfeat].low = cutval;
            if (++thread_count < n_thread_build_)
            {
                /* Concurrent thread for right recursion */

                right_future = std::async(
                    std::launch::async, &KDTreeBaseClass::divideTreeConcurrent,
                    this, std::ref(obj), left + idx, right,
                    std::ref(right_bbox), std::ref(thread_count),
                    std::ref(mutex));
            }
            else { --thread_count; }

            /* Recurse on left in this thread */

            BoundingBox left_bbox(bbox);
            left_bbox[cutfeat].high = cutval;
            node->child1            = this->divideTreeConcurrent(
                           obj, left, left + idx, left_bbox, thread_count, mutex);

            if (right_future.valid())
            {
                /* Block and wait for concurrent right from above */

                node->child2 = right_future.get();
                --thread_count;
            }
            else
            {
                /* Otherwise, recurse on right in this thread */

                node->child2 = this->divideTreeConcurrent(
                    obj, left + idx, right, right_bbox, thread_count, mutex);
            }

            node->node_type.sub.divlow  = left_bbox[cutfeat].high;
            node->node_type.sub.divhigh = right_bbox[cutfeat].low;

            for (Dimension i = 0; i < dims; ++i)
            {
                bbox[i].low  = std::min(left_bbox[i].low, right_bbox[i].low);
                bbox[i].high = std::max(left_bbox[i].high, right_bbox[i].high);
            }
        }

        return node;
    }

    void middleSplit_(
        const Derived& obj, const Offset ind, const Size count, Offset& index,
        Dimension& cutfeat, DistanceType& cutval, const BoundingBox& bbox)
    {
        const auto  dims     = (DIM > 0 ? DIM : obj.dim_);
        const auto  EPS      = static_cast<DistanceType>(0.00001);
        ElementType max_span = bbox[0].high - bbox[0].low;
        for (Dimension i = 1; i < dims; ++i)
        {
            ElementType span = bbox[i].high - bbox[i].low;
            if (span > max_span) { max_span = span; }
        }
        ElementType max_spread = -1;
        cutfeat                = 0;
        ElementType min_elem = 0, max_elem = 0;
        for (Dimension i = 0; i < dims; ++i)
        {
            ElementType span = bbox[i].high - bbox[i].low;
            if (span >= (1 - EPS) * max_span)
            {
                ElementType min_elem_, max_elem_;
                computeMinMax(obj, ind, count, i, min_elem_, max_elem_);
                ElementType spread = max_elem_ - min_elem_;
                if (spread > max_spread)
                {
                    cutfeat    = i;
                    max_spread = spread;
                    min_elem   = min_elem_;
                    max_elem   = max_elem_;
                }
            }
        }
        // split in the middle
        DistanceType split_val = (bbox[cutfeat].low + bbox[cutfeat].high) / 2;

        if (split_val < min_elem)
            cutval = min_elem;
        else if (split_val > max_elem)
            cutval = max_elem;
        else
            cutval = split_val;

        Offset lim1, lim2;
        planeSplit(obj, ind, count, cutfeat, cutval, lim1, lim2);

        if (lim1 > count / 2)
            index = lim1;
        else if (lim2 < count / 2)
            index = lim2;
        else
            index = count / 2;
    }

    /**
     *  Subdivide the list of points by a plane perpendicular on the axis
     * corresponding to the 'cutfeat' dimension at 'cutval' position.
     *
     *  On return:
     *  dataset[ind[0..lim1-1]][cutfeat] < cutval
     *  dataset[ind[lim1..lim2-1]][cutfeat] == cutval
     *  dataset[ind[lim2..count]][cutfeat] > cutval
     */
    void planeSplit(
        const Derived& obj, const Offset ind, const Size count,
        const Dimension cutfeat, const DistanceType& cutval, Offset& lim1,
        Offset& lim2)
    {
        /* First pass.
         * Determine lim1 with all values less than cutval to the left.
         */
        Offset left  = 0;
        Offset right = count - 1;
        for (;;)
        {
            while (left <= right &&
                   dataset_get(obj, vAcc_[ind + left], cutfeat) < cutval)
                ++left;
            while (right && left <= right &&
                   dataset_get(obj, vAcc_[ind + right], cutfeat) >= cutval)
                --right;
            if (left > right || !right)
                break;  // "!right" was added to support unsigned Index types
            std::swap(vAcc_[ind + left], vAcc_[ind + right]);
            ++left;
            --right;
        }
        /* Second pass
         * Determine lim2 with all values greater than cutval to the right
         * The middle is used for balancing the tree
         */
        lim1  = left;
        right = count - 1;
        for (;;)
        {
            while (left <= right &&
                   dataset_get(obj, vAcc_[ind + left], cutfeat) <= cutval)
                ++left;
            while (right && left <= right &&
                   dataset_get(obj, vAcc_[ind + right], cutfeat) > cutval)
                --right;
            if (left > right || !right)
                break;  // "!right" was added to support unsigned Index types
            std::swap(vAcc_[ind + left], vAcc_[ind + right]);
            ++left;
            --right;
        }
        lim2 = left;
    }

    DistanceType computeInitialDistances(
        const Derived& obj, const ElementType* vec,
        distance_vector_t& dists) const
    {
        assert(vec);
        DistanceType dist = DistanceType();

        for (Dimension i = 0; i < (DIM > 0 ? DIM : obj.dim_); ++i)
        {
            if (vec[i] < obj.root_bbox_[i].low)
            {
                dists[i] =
                    obj.distance_.accum_dist(vec[i], obj.root_bbox_[i].low, i);
                dist += dists[i];
            }
            if (vec[i] > obj.root_bbox_[i].high)
            {
                dists[i] =
                    obj.distance_.accum_dist(vec[i], obj.root_bbox_[i].high, i);
                dist += dists[i];
            }
        }
        return dist;
    }

    static void save_tree(
        const Derived& obj, std::ostream& stream, const NodeConstPtr tree)
    {
        save_value(stream, *tree);
        if (tree->child1 != nullptr) { save_tree(obj, stream, tree->child1); }
        if (tree->child2 != nullptr) { save_tree(obj, stream, tree->child2); }
    }

    static void load_tree(Derived& obj, std::istream& stream, NodePtr& tree)
    {
        tree = obj.pool_.template allocate<Node>();
        load_value(stream, *tree);
        if (tree->child1 != nullptr) { load_tree(obj, stream, tree->child1); }
        if (tree->child2 != nullptr) { load_tree(obj, stream, tree->child2); }
    }

    /**  Stores the index in a binary file.
     *   IMPORTANT NOTE: The set of data points is NOT stored in the file, so
     * when loading the index object it must be constructed associated to the
     * same source of data points used while building it. See the example:
     * examples/saveload_example.cpp \sa loadIndex  */
    void saveIndex(const Derived& obj, std::ostream& stream) const
    {
        save_value(stream, obj.size_);
        save_value(stream, obj.dim_);
        save_value(stream, obj.root_bbox_);
        save_value(stream, obj.leaf_max_size_);
        save_value(stream, obj.vAcc_);
        if (obj.root_node_) save_tree(obj, stream, obj.root_node_);
    }

    /**  Loads a previous index from a binary file.
     *   IMPORTANT NOTE: The set of data points is NOT stored in the file, so
     * the index object must be constructed associated to the same source of
     * data points used while building the index. See the example:
     * examples/saveload_example.cpp \sa loadIndex  */
    void loadIndex(Derived& obj, std::istream& stream)
    {
        load_value(stream, obj.size_);
        load_value(stream, obj.dim_);
        load_value(stream, obj.root_bbox_);
        load_value(stream, obj.leaf_max_size_);
        load_value(stream, obj.vAcc_);
        load_tree(obj, stream, obj.root_node_);
    }
};

/** @addtogroup kdtrees_grp KD-tree classes and adaptors
 * @{ */

/** kd-tree static index
 *
 * Contains the k-d trees and other information for indexing a set of points
 * for nearest-neighbor matching.
 *
 *  The class "DatasetAdaptor" must provide the following interface (can be
 * non-virtual, inlined methods):
 *
 *  \code
 *   // Must return the number of data poins
 *   size_t kdtree_get_point_count() const { ... }
 *
 *
 *   // Must return the dim'th component of the idx'th point in the class:
 *   T kdtree_get_pt(const size_t idx, const size_t dim) const { ... }
 *
 *   // Optional bounding-box computation: return false to default to a standard
 * bbox computation loop.
 *   //   Return true if the BBOX was already computed by the class and returned
 * in "bb" so it can be avoided to redo it again.
 *   //   Look at bb.size() to find out the expected dimensionality (e.g. 2 or 3
 * for point clouds) template <class BBOX> bool kdtree_get_bbox(BBOX &bb) const
 *   {
 *      bb[0].low = ...; bb[0].high = ...;  // 0th dimension limits
 *      bb[1].low = ...; bb[1].high = ...;  // 1st dimension limits
 *      ...
 *      return true;
 *   }
 *
 *  \endcode
 *
 * \tparam DatasetAdaptor The user-provided adaptor, which must be ensured to
 *         have a lifetime equal or longer than the instance of this class.
 * \tparam Distance The distance metric to use: nanoflann::metric_L1,
 * nanoflann::metric_L2, nanoflann::metric_L2_Simple, etc. \tparam DIM
 * Dimensionality of data points (e.g. 3 for 3D points) \tparam IndexType Will
 * be typically size_t or int
 */
template <
    typename Distance, class DatasetAdaptor, int32_t DIM = -1,
    typename index_t = uint32_t>
class KDTreeSingleIndexAdaptor
    : public KDTreeBaseClass<
          KDTreeSingleIndexAdaptor<Distance, DatasetAdaptor, DIM, index_t>,
          Distance, DatasetAdaptor, DIM, index_t>
{
   public:
    /** Deleted copy constructor*/
    explicit KDTreeSingleIndexAdaptor(
        const KDTreeSingleIndexAdaptor<
            Distance, DatasetAdaptor, DIM, index_t>&) = delete;

    /** The data source used by this index */
    const DatasetAdaptor& dataset_;

    const KDTreeSingleIndexAdaptorParams indexParams;

    Distance distance_;

    using Base = typename nanoflann::KDTreeBaseClass<
        nanoflann::KDTreeSingleIndexAdaptor<
            Distance, DatasetAdaptor, DIM, index_t>,
        Distance, DatasetAdaptor, DIM, index_t>;

    using Offset    = typename Base::Offset;
    using Size      = typename Base::Size;
    using Dimension = typename Base::Dimension;

    using ElementType  = typename Base::ElementType;
    using DistanceType = typename Base::DistanceType;
    using IndexType    = typename Base::IndexType;

    using Node    = typename Base::Node;
    using NodePtr = Node*;

    using Interval = typename Base::Interval;

    /** Define "BoundingBox" as a fixed-size or variable-size container
     * depending on "DIM" */
    using BoundingBox = typename Base::BoundingBox;

    /** Define "distance_vector_t" as a fixed-size or variable-size container
     * depending on "DIM" */
    using distance_vector_t = typename Base::distance_vector_t;

    /**
     * KDTree constructor
     *
     * Refer to docs in README.md or online in
     * https://github.com/jlblancoc/nanoflann
     *
     * The KD-Tree point dimension (the length of each point in the datase, e.g.
     * 3 for 3D points) is determined by means of:
     *  - The \a DIM template parameter if >0 (highest priority)
     *  - Otherwise, the \a dimensionality parameter of this constructor.
     *
     * @param inputData Dataset with the input features. Its lifetime must be
     *  equal or longer than that of the instance of this class.
     * @param params Basically, the maximum leaf node size
     *
     * Note that there is a variable number of optional additional parameters
     * which will be forwarded to the metric class constructor. Refer to example
     * `examples/pointcloud_custom_metric.cpp` for a use case.
     *
     */
    template <class... Args>
    explicit KDTreeSingleIndexAdaptor(
        const Dimension dimensionality, const DatasetAdaptor& inputData,
        const KDTreeSingleIndexAdaptorParams& params, Args&&... args)
        : dataset_(inputData),
          indexParams(params),
          distance_(inputData, std::forward<Args>(args)...)
    {
        init(dimensionality, params);
    }

    explicit KDTreeSingleIndexAdaptor(
        const Dimension dimensionality, const DatasetAdaptor& inputData,
        const KDTreeSingleIndexAdaptorParams& params = {})
        : dataset_(inputData), indexParams(params), distance_(inputData)
    {
        init(dimensionality, params);
    }

   private:
    void init(
        const Dimension                       dimensionality,
        const KDTreeSingleIndexAdaptorParams& params)
    {
        Base::size_                = dataset_.kdtree_get_point_count();
        Base::size_at_index_build_ = Base::size_;
        Base::dim_                 = dimensionality;
        if (DIM > 0) Base::dim_ = DIM;
        Base::leaf_max_size_ = params.leaf_max_size;
        if (params.n_thread_build > 0)
        {
            Base::n_thread_build_ = params.n_thread_build;
        }
        else
        {
            Base::n_thread_build_ =
                std::max(std::thread::hardware_concurrency(), 1u);
        }

        if (!(params.flags &
              KDTreeSingleIndexAdaptorFlags::SkipInitialBuildIndex))
        {
            // Build KD-tree:
            buildIndex();
        }
    }

   public:
    /**
     * Builds the index
     */
    void buildIndex()
    {
        Base::size_                = dataset_.kdtree_get_point_count();
        Base::size_at_index_build_ = Base::size_;
        init_vind();
        this->freeIndex(*this);
        Base::size_at_index_build_ = Base::size_;
        if (Base::size_ == 0) return;
        computeBoundingBox(Base::root_bbox_);
        // construct the tree
        if (Base::n_thread_build_ == 1)
        {
            Base::root_node_ =
                this->divideTree(*this, 0, Base::size_, Base::root_bbox_);
        }
        else
        {
#ifndef NANOFLANN_NO_THREADS
            std::atomic<unsigned int> thread_count(0u);
            std::mutex                mutex;
            Base::root_node_ = this->divideTreeConcurrent(
                *this, 0, Base::size_, Base::root_bbox_, thread_count, mutex);
#else /* NANOFLANN_NO_THREADS */
            throw std::runtime_error("Multithreading is disabled");
#endif /* NANOFLANN_NO_THREADS */
        }
    }

    /** \name Query methods
     * @{ */

    /**
     * Find set of nearest neighbors to vec[0:dim-1]. Their indices are stored
     * inside the result object.
     *
     * Params:
     *     result = the result object in which the indices of the
     * nearest-neighbors are stored vec = the vector for which to search the
     * nearest neighbors
     *
     * \tparam RESULTSET Should be any ResultSet<DistanceType>
     * \return  True if the requested neighbors could be found.
     * \sa knnSearch, radiusSearch
     *
     * \note If L2 norms are used, all returned distances are actually squared
     *       distances.
     */
    template <typename RESULTSET>
    bool findNeighbors(
        RESULTSET& result, const ElementType* vec,
        const SearchParameters& searchParams = {}) const
    {
        assert(vec);
        if (this->size(*this) == 0) return false;
        if (!Base::root_node_)
            throw std::runtime_error(
                "[nanoflann] findNeighbors() called before building the "
                "index.");
        float epsError = 1 + searchParams.eps;

        // fixed or variable-sized container (depending on DIM)
        distance_vector_t dists;
        // Fill it with zeros.
        auto zero = static_cast<typename RESULTSET::DistanceType>(0);
        assign(dists, (DIM > 0 ? DIM : Base::dim_), zero);
        DistanceType dist = this->computeInitialDistances(*this, vec, dists);
        searchLevel(result, vec, Base::root_node_, dist, dists, epsError);

        if (searchParams.sorted) result.sort();

        return result.full();
    }

    /**
     * Find all points contained within the specified bounding box. Their
     * indices are stored inside the result object.
     *
     * Params:
     *     result = the result object in which the indices of the points
     *              within the bounding box are stored
     *     bbox = the bounding box defining the search region
     *
     * \tparam RESULTSET Should be any ResultSet<DistanceType>
     * \return  Number of points found within the bounding box.
     * \sa findNeighbors, knnSearch, radiusSearch
     *
     * \note The search is inclusive - points on the boundary are included.
     */
    template <typename RESULTSET>
    Size findWithinBox(RESULTSET& result, const BoundingBox& bbox) const
    {
        if (this->size(*this) == 0) return 0;
        if (!Base::root_node_)
            throw std::runtime_error(
                "[nanoflann] findWithinBox() called before building the "
                "index.");

        std::stack<NodePtr> stack;
        stack.push(Base::root_node_);

        while (!stack.empty())
        {
            const NodePtr node = stack.top();
            stack.pop();

            // If this is a leaf node, then do check and return.
            if (!node->child1)  // (if one node is nullptr, both are)
            {
                for (Offset i = node->node_type.lr.left;
                     i < node->node_type.lr.right; ++i)
                {
                    if (contains(bbox, Base::vAcc_[i]))
                    {
                        if (!result.addPoint(0, Base::vAcc_[i]))
                        {
                            // the resultset doesn't want to receive any more
                            // points, we're done searching!
                            return result.size();
                        }
                    }
                }
            }
            else
            {
                const int  idx        = node->node_type.sub.divfeat;
                const auto low_bound  = node->node_type.sub.divlow;
                const auto high_bound = node->node_type.sub.divhigh;

                if (bbox[idx].low <= low_bound) stack.push(node->child1);
                if (bbox[idx].high >= high_bound) stack.push(node->child2);
            }
        }

        return result.size();
    }

    /**
     * Find the "num_closest" nearest neighbors to the \a query_point[0:dim-1].
     * Their indices and distances are stored in the provided pointers to
     * array/vector.
     *
     * \sa radiusSearch, findNeighbors
     * \return Number `N` of valid points in the result set.
     *
     * \note If L2 norms are used, all returned distances are actually squared
     *       distances.
     *
     * \note Only the first `N` entries in `out_indices` and `out_distances`
     *       will be valid. Return is less than `num_closest` only if the
     *       number of elements in the tree is less than `num_closest`.
     */
    Size knnSearch(
        const ElementType* query_point, const Size num_closest,
        IndexType* out_indices, DistanceType* out_distances) const
    {
        nanoflann::KNNResultSet<DistanceType, IndexType> resultSet(num_closest);
        resultSet.init(out_indices, out_distances);
        findNeighbors(resultSet, query_point);
        return resultSet.size();
    }

    /**
     * Find all the neighbors to \a query_point[0:dim-1] within a maximum
     * radius. The output is given as a vector of pairs, of which the first
     * element is a point index and the second the corresponding distance.
     * Previous contents of \a IndicesDists are cleared.
     *
     *  If searchParams.sorted==true, the output list is sorted by ascending
     * distances.
     *
     *  For a better performance, it is advisable to do a .reserve() on the
     * vector if you have any wild guess about the number of expected matches.
     *
     *  \sa knnSearch, findNeighbors, radiusSearchCustomCallback
     * \return The number of points within the given radius (i.e. indices.size()
     * or dists.size() )
     *
     * \note If L2 norms are used, search radius and all returned distances
     *       are actually squared distances.
     */
    Size radiusSearch(
        const ElementType* query_point, const DistanceType& radius,
        std::vector<ResultItem<IndexType, DistanceType>>& IndicesDists,
        const SearchParameters& searchParams = {}) const
    {
        RadiusResultSet<DistanceType, IndexType> resultSet(
            radius, IndicesDists);
        const Size nFound =
            radiusSearchCustomCallback(query_point, resultSet, searchParams);
        return nFound;
    }

    /**
     * Just like radiusSearch() but with a custom callback class for each point
     * found in the radius of the query. See the source of RadiusResultSet<> as
     * a start point for your own classes. \sa radiusSearch
     */
    template <class SEARCH_CALLBACK>
    Size radiusSearchCustomCallback(
        const ElementType* query_point, SEARCH_CALLBACK& resultSet,
        const SearchParameters& searchParams = {}) const
    {
        findNeighbors(resultSet, query_point, searchParams);
        return resultSet.size();
    }

    /**
     * Find the first N neighbors to \a query_point[0:dim-1] within a maximum
     * radius. The output is given as a vector of pairs, of which the first
     * element is a point index and the second the corresponding distance.
     * Previous contents of \a IndicesDists are cleared.
     *
     * \sa radiusSearch, findNeighbors
     * \return Number `N` of valid points in the result set.
     *
     * \note If L2 norms are used, all returned distances are actually squared
     *       distances.
     *
     * \note Only the first `N` entries in `out_indices` and `out_distances`
     *       will be valid. Return is less than `num_closest` only if the
     *       number of elements in the tree is less than `num_closest`.
     */
    Size rknnSearch(
        const ElementType* query_point, const Size num_closest,
        IndexType* out_indices, DistanceType* out_distances,
        const DistanceType& radius) const
    {
        nanoflann::RKNNResultSet<DistanceType, IndexType> resultSet(
            num_closest, radius);
        resultSet.init(out_indices, out_distances);
        findNeighbors(resultSet, query_point);
        return resultSet.size();
    }

    /** @} */

   public:
    /** Make sure the auxiliary list \a vind has the same size than the
     * current dataset, and re-generate if size has changed. */
    void init_vind()
    {
        // Create a permutable array of indices to the input vectors.
        Base::size_ = dataset_.kdtree_get_point_count();
        if (Base::vAcc_.size() != Base::size_) Base::vAcc_.resize(Base::size_);
        for (IndexType i = 0; i < static_cast<IndexType>(Base::size_); i++)
            Base::vAcc_[i] = i;
    }

    void computeBoundingBox(BoundingBox& bbox)
    {
        const auto dims = (DIM > 0 ? DIM : Base::dim_);
        resize(bbox, dims);
        if (dataset_.kdtree_get_bbox(bbox))
        {
            // Done! It was implemented in derived class
        }
        else
        {
            const Size N = dataset_.kdtree_get_point_count();
            if (!N)
                throw std::runtime_error(
                    "[nanoflann] computeBoundingBox() called but "
                    "no data points found.");
            for (Dimension i = 0; i < dims; ++i)
            {
                bbox[i].low = bbox[i].high =
                    this->dataset_get(*this, Base::vAcc_[0], i);
            }
            for (Offset k = 1; k < N; ++k)
            {
                for (Dimension i = 0; i < dims; ++i)
                {
                    const auto val =
                        this->dataset_get(*this, Base::vAcc_[k], i);
                    if (val < bbox[i].low) bbox[i].low = val;
                    if (val > bbox[i].high) bbox[i].high = val;
                }
            }
        }
    }

    bool contains(const BoundingBox& bbox, IndexType idx) const
    {
        const auto dims = (DIM > 0 ? DIM : Base::dim_);
        for (Dimension i = 0; i < dims; ++i)
        {
            const auto point = this->dataset_.kdtree_get_pt(idx, i);
            if (point < bbox[i].low || point > bbox[i].high) return false;
        }
        return true;
    }

    /**
     * Performs an exact search in the tree starting from a node.
     * \tparam RESULTSET Should be any ResultSet<DistanceType>
     * \return true if the search should be continued, false if the results are
     * sufficient
     */
    template <class RESULTSET>
    bool searchLevel(
        RESULTSET& result_set, const ElementType* vec, const NodePtr node,
        DistanceType mindist, distance_vector_t& dists,
        const float epsError) const
    {
        // If this is a leaf node, then do check and return.
        if (!node->child1)  // (if one node is nullptr, both are)
        {
            for (Offset i = node->node_type.lr.left;
                 i < node->node_type.lr.right; ++i)
            {
                const IndexType accessor = Base::vAcc_[i];  // reorder... : i;
                DistanceType    dist     = distance_.evalMetric(
                           vec, accessor, (DIM > 0 ? DIM : Base::dim_));
                if (dist < result_set.worstDist())
                {
                    if (!result_set.addPoint(dist, Base::vAcc_[i]))
                    {
                        // the resultset doesn't want to receive any more
                        // points, we're done searching!
                        return false;
                    }
                }
            }
            return true;
        }

        /* Which child branch should be taken first? */
        Dimension    idx   = node->node_type.sub.divfeat;
        ElementType  val   = vec[idx];
        DistanceType diff1 = val - node->node_type.sub.divlow;
        DistanceType diff2 = val - node->node_type.sub.divhigh;

        NodePtr      bestChild;
        NodePtr      otherChild;
        DistanceType cut_dist;
        if ((diff1 + diff2) < 0)
        {
            bestChild  = node->child1;
            otherChild = node->child2;
            cut_dist =
                distance_.accum_dist(val, node->node_type.sub.divhigh, idx);
        }
        else
        {
            bestChild  = node->child2;
            otherChild = node->child1;
            cut_dist =
                distance_.accum_dist(val, node->node_type.sub.divlow, idx);
        }

        /* Call recursively to search next level down. */
        if (!searchLevel(result_set, vec, bestChild, mindist, dists, epsError))
        {
            // the resultset doesn't want to receive any more points, we're done
            // searching!
            return false;
        }

        DistanceType dst = dists[idx];
        mindist          = mindist + cut_dist - dst;
        dists[idx]       = cut_dist;
        if (mindist * epsError <= result_set.worstDist())
        {
            if (!searchLevel(
                    result_set, vec, otherChild, mindist, dists, epsError))
            {
                // the resultset doesn't want to receive any more points, we're
                // done searching!
                return false;
            }
        }
        dists[idx] = dst;
        return true;
    }

   public:
    /**  Stores the index in a binary file.
     *   IMPORTANT NOTE: The set of data points is NOT stored in the file, so
     * when loading the index object it must be constructed associated to the
     * same source of data points used while building it. See the example:
     * examples/saveload_example.cpp \sa loadIndex  */
    void saveIndex(std::ostream& stream) const
    {
        Base::saveIndex(*this, stream);
    }

    /**  Loads a previous index from a binary file.
     *   IMPORTANT NOTE: The set of data points is NOT stored in the file, so
     * the index object must be constructed associated to the same source of
     * data points used while building the index. See the example:
     * examples/saveload_example.cpp \sa loadIndex  */
    void loadIndex(std::istream& stream) { Base::loadIndex(*this, stream); }

};  // class KDTree

/** kd-tree dynamic index
 *
 * Contains the k-d trees and other information for indexing a set of points
 * for nearest-neighbor matching.
 *
 * The class "DatasetAdaptor" must provide the following interface (can be
 * non-virtual, inlined methods):
 *
 *  \code
 *   // Must return the number of data poins
 *   size_t kdtree_get_point_count() const { ... }
 *
 *   // Must return the dim'th component of the idx'th point in the class:
 *   T kdtree_get_pt(const size_t idx, const size_t dim) const { ... }
 *
 *   // Optional bounding-box computation: return false to default to a standard
 * bbox computation loop.
 *   //   Return true if the BBOX was already computed by the class and returned
 * in "bb" so it can be avoided to redo it again.
 *   //   Look at bb.size() to find out the expected dimensionality (e.g. 2 or 3
 * for point clouds) template <class BBOX> bool kdtree_get_bbox(BBOX &bb) const
 *   {
 *      bb[0].low = ...; bb[0].high = ...;  // 0th dimension limits
 *      bb[1].low = ...; bb[1].high = ...;  // 1st dimension limits
 *      ...
 *      return true;
 *   }
 *
 *  \endcode
 *
 * \tparam DatasetAdaptor The user-provided adaptor (see comments above).
 * \tparam Distance The distance metric to use: nanoflann::metric_L1,
 * nanoflann::metric_L2, nanoflann::metric_L2_Simple, etc.
 * \tparam DIM Dimensionality of data points (e.g. 3 for 3D points)
 * \tparam IndexType Type of the arguments with which the data can be
 * accessed (e.g. float, double, int64_t, T*)
 */
template <
    typename Distance, class DatasetAdaptor, int32_t DIM = -1,
    typename IndexType = uint32_t>
class KDTreeSingleIndexDynamicAdaptor_
    : public KDTreeBaseClass<
          KDTreeSingleIndexDynamicAdaptor_<
              Distance, DatasetAdaptor, DIM, IndexType>,
          Distance, DatasetAdaptor, DIM, IndexType>
{
   public:
    /**
     * The dataset used by this index
     */
    const DatasetAdaptor& dataset_;  //!< The source of our data

    KDTreeSingleIndexAdaptorParams index_params_;

    std::vector<int>& treeIndex_;

    Distance distance_;

    using Base = typename nanoflann::KDTreeBaseClass<
        nanoflann::KDTreeSingleIndexDynamicAdaptor_<
            Distance, DatasetAdaptor, DIM, IndexType>,
        Distance, DatasetAdaptor, DIM, IndexType>;

    using ElementType  = typename Base::ElementType;
    using DistanceType = typename Base::DistanceType;

    using Offset    = typename Base::Offset;
    using Size      = typename Base::Size;
    using Dimension = typename Base::Dimension;

    using Node    = typename Base::Node;
    using NodePtr = Node*;

    using Interval = typename Base::Interval;
    /** Define "BoundingBox" as a fixed-size or variable-size container
     * depending on "DIM" */
    using BoundingBox = typename Base::BoundingBox;

    /** Define "distance_vector_t" as a fixed-size or variable-size container
     * depending on "DIM" */
    using distance_vector_t = typename Base::distance_vector_t;

    /**
     * KDTree constructor
     *
     * Refer to docs in README.md or online in
     * https://github.com/jlblancoc/nanoflann
     *
     * The KD-Tree point dimension (the length of each point in the datase, e.g.
     * 3 for 3D points) is determined by means of:
     *  - The \a DIM template parameter if >0 (highest priority)
     *  - Otherwise, the \a dimensionality parameter of this constructor.
     *
     * @param inputData Dataset with the input features. Its lifetime must be
     *  equal or longer than that of the instance of this class.
     * @param params Basically, the maximum leaf node size
     */
    KDTreeSingleIndexDynamicAdaptor_(
        const Dimension dimensionality, const DatasetAdaptor& inputData,
        std::vector<int>&                     treeIndex,
        const KDTreeSingleIndexAdaptorParams& params =
            KDTreeSingleIndexAdaptorParams())
        : dataset_(inputData),
          index_params_(params),
          treeIndex_(treeIndex),
          distance_(inputData)
    {
        Base::size_                = 0;
        Base::size_at_index_build_ = 0;
        for (auto& v : Base::root_bbox_) v = {};
        Base::dim_ = dimensionality;
        if (DIM > 0) Base::dim_ = DIM;
        Base::leaf_max_size_ = params.leaf_max_size;
        if (params.n_thread_build > 0)
        {
            Base::n_thread_build_ = params.n_thread_build;
        }
        else
        {
            Base::n_thread_build_ =
                std::max(std::thread::hardware_concurrency(), 1u);
        }
    }

    /** Explicitly default the copy constructor */
    KDTreeSingleIndexDynamicAdaptor_(
        const KDTreeSingleIndexDynamicAdaptor_& rhs) = default;

    /** Assignment operator definiton */
    KDTreeSingleIndexDynamicAdaptor_ operator=(
        const KDTreeSingleIndexDynamicAdaptor_& rhs)
    {
        KDTreeSingleIndexDynamicAdaptor_ tmp(rhs);
        std::swap(Base::vAcc_, tmp.Base::vAcc_);
        std::swap(Base::leaf_max_size_, tmp.Base::leaf_max_size_);
        std::swap(index_params_, tmp.index_params_);
        std::swap(treeIndex_, tmp.treeIndex_);
        std::swap(Base::size_, tmp.Base::size_);
        std::swap(Base::size_at_index_build_, tmp.Base::size_at_index_build_);
        std::swap(Base::root_node_, tmp.Base::root_node_);
        std::swap(Base::root_bbox_, tmp.Base::root_bbox_);
        std::swap(Base::pool_, tmp.Base::pool_);
        return *this;
    }

    /**
     * Builds the index
     */
    void buildIndex()
    {
        Base::size_ = Base::vAcc_.size();
        this->freeIndex(*this);
        Base::size_at_index_build_ = Base::size_;
        if (Base::size_ == 0) return;
        computeBoundingBox(Base::root_bbox_);
        // construct the tree
        if (Base::n_thread_build_ == 1)
        {
            Base::root_node_ =
                this->divideTree(*this, 0, Base::size_, Base::root_bbox_);
        }
        else
        {
#ifndef NANOFLANN_NO_THREADS
            std::atomic<unsigned int> thread_count(0u);
            std::mutex                mutex;
            Base::root_node_ = this->divideTreeConcurrent(
                *this, 0, Base::size_, Base::root_bbox_, thread_count, mutex);
#else /* NANOFLANN_NO_THREADS */
            throw std::runtime_error("Multithreading is disabled");
#endif /* NANOFLANN_NO_THREADS */
        }
    }

    /** \name Query methods
     * @{ */

    /**
     * Find set of nearest neighbors to vec[0:dim-1]. Their indices are stored
     * inside the result object.
     * This is the core search function, all others are wrappers around this
     * one.
     *
     * \param result The result object in which the indices of the
     *               nearest-neighbors are stored.
     * \param vec    The vector of the query point for which to search the
     *               nearest neighbors.
     * \param searchParams Optional parameters for the search.
     *
     * \tparam RESULTSET Should be any ResultSet<DistanceType>
     * \return True if the requested neighbors could be found.
     *
     * \sa knnSearch(), radiusSearch(), radiusSearchCustomCallback()
     *
     * \note If L2 norms are used, all returned distances are actually squared
     *       distances.
     */
    template <typename RESULTSET>
    bool findNeighbors(
        RESULTSET& result, const ElementType* vec,
        const SearchParameters& searchParams = {}) const
    {
        assert(vec);
        if (this->size(*this) == 0) return false;
        if (!Base::root_node_) return false;
        float epsError = 1 + searchParams.eps;

        // fixed or variable-sized container (depending on DIM)
        distance_vector_t dists;
        // Fill it with zeros.
        assign(
            dists, (DIM > 0 ? DIM : Base::dim_),
            static_cast<typename distance_vector_t::value_type>(0));
        DistanceType dist = this->computeInitialDistances(*this, vec, dists);
        searchLevel(result, vec, Base::root_node_, dist, dists, epsError);
        return result.full();
    }

    /**
     * Find the "num_closest" nearest neighbors to the \a query_point[0:dim-1].
     * Their indices are stored inside the result object. \sa radiusSearch,
     * findNeighbors
     * \return Number `N` of valid points in
     * the result set.
     *
     * \note If L2 norms are used, all returned distances are actually squared
     *       distances.
     *
     * \note Only the first `N` entries in `out_indices` and `out_distances`
     *       will be valid. Return may be less than `num_closest` only if the
     *       number of elements in the tree is less than `num_closest`.
     */
    Size knnSearch(
        const ElementType* query_point, const Size num_closest,
        IndexType* out_indices, DistanceType* out_distances,
        const SearchParameters& searchParams = {}) const
    {
        nanoflann::KNNResultSet<DistanceType, IndexType> resultSet(num_closest);
        resultSet.init(out_indices, out_distances);
        findNeighbors(resultSet, query_point, searchParams);
        return resultSet.size();
    }

    /**
     * Find all the neighbors to \a query_point[0:dim-1] within a maximum
     * radius. The output is given as a vector of pairs, of which the first
     * element is a point index and the second the corresponding distance.
     * Previous contents of \a IndicesDists are cleared.
     *
     * If searchParams.sorted==true, the output list is sorted by ascending
     * distances.
     *
     * For a better performance, it is advisable to do a .reserve() on the
     * vector if you have any wild guess about the number of expected matches.
     *
     *  \sa knnSearch, findNeighbors, radiusSearchCustomCallback
     * \return The number of points within the given radius (i.e. indices.size()
     * or dists.size() )
     *
     * \note If L2 norms are used, search radius and all returned distances
     *       are actually squared distances.
     */
    Size radiusSearch(
        const ElementType* query_point, const DistanceType& radius,
        std::vector<ResultItem<IndexType, DistanceType>>& IndicesDists,
        const SearchParameters& searchParams = {}) const
    {
        RadiusResultSet<DistanceType, IndexType> resultSet(
            radius, IndicesDists);
        const size_t nFound =
            radiusSearchCustomCallback(query_point, resultSet, searchParams);
        return nFound;
    }

    /**
     * Just like radiusSearch() but with a custom callback class for each point
     * found in the radius of the query. See the source of RadiusResultSet<> as
     * a start point for your own classes. \sa radiusSearch
     */
    template <class SEARCH_CALLBACK>
    Size radiusSearchCustomCallback(
        const ElementType* query_point, SEARCH_CALLBACK& resultSet,
        const SearchParameters& searchParams = {}) const
    {
        findNeighbors(resultSet, query_point, searchParams);
        return resultSet.size();
    }

    /** @} */

   public:
    void computeBoundingBox(BoundingBox& bbox)
    {
        const auto dims = (DIM > 0 ? DIM : Base::dim_);
        resize(bbox, dims);

        if (dataset_.kdtree_get_bbox(bbox))
        {
            // Done! It was implemented in derived class
        }
        else
        {
            const Size N = Base::size_;
            if (!N)
                throw std::runtime_error(
                    "[nanoflann] computeBoundingBox() called but "
                    "no data points found.");
            for (Dimension i = 0; i < dims; ++i)
            {
                bbox[i].low = bbox[i].high =
                    this->dataset_get(*this, Base::vAcc_[0], i);
            }
            for (Offset k = 1; k < N; ++k)
            {
                for (Dimension i = 0; i < dims; ++i)
                {
                    const auto val =
                        this->dataset_get(*this, Base::vAcc_[k], i);
                    if (val < bbox[i].low) bbox[i].low = val;
                    if (val > bbox[i].high) bbox[i].high = val;
                }
            }
        }
    }

    /**
     * Performs an exact search in the tree starting from a node.
     * \tparam RESULTSET Should be any ResultSet<DistanceType>
     */
    template <class RESULTSET>
    void searchLevel(
        RESULTSET& result_set, const ElementType* vec, const NodePtr node,
        DistanceType mindist, distance_vector_t& dists,
        const float epsError) const
    {
        // If this is a leaf node, then do check and return.
        if (!node->child1)  // (if one node is nullptr, both are)
        {
            for (Offset i = node->node_type.lr.left;
                 i < node->node_type.lr.right; ++i)
            {
                const IndexType index = Base::vAcc_[i];  // reorder... : i;
                if (treeIndex_[index] == -1) continue;
                DistanceType dist = distance_.evalMetric(
                    vec, index, (DIM > 0 ? DIM : Base::dim_));
                if (dist < result_set.worstDist())
                {
                    if (!result_set.addPoint(
                            static_cast<typename RESULTSET::DistanceType>(dist),
                            static_cast<typename RESULTSET::IndexType>(
                                Base::vAcc_[i])))
                    {
                        // the resultset doesn't want to receive any more
                        // points, we're done searching!
                        return;  // false;
                    }
                }
            }
            return;
        }

        /* Which child branch should be taken first? */
        Dimension    idx   = node->node_type.sub.divfeat;
        ElementType  val   = vec[idx];
        DistanceType diff1 = val - node->node_type.sub.divlow;
        DistanceType diff2 = val - node->node_type.sub.divhigh;

        NodePtr      bestChild;
        NodePtr      otherChild;
        DistanceType cut_dist;
        if ((diff1 + diff2) < 0)
        {
            bestChild  = node->child1;
            otherChild = node->child2;
            cut_dist =
                distance_.accum_dist(val, node->node_type.sub.divhigh, idx);
        }
        else
        {
            bestChild  = node->child2;
            otherChild = node->child1;
            cut_dist =
                distance_.accum_dist(val, node->node_type.sub.divlow, idx);
        }

        /* Call recursively to search next level down. */
        searchLevel(result_set, vec, bestChild, mindist, dists, epsError);

        DistanceType dst = dists[idx];
        mindist          = mindist + cut_dist - dst;
        dists[idx]       = cut_dist;
        if (mindist * epsError <= result_set.worstDist())
        {
            searchLevel(result_set, vec, otherChild, mindist, dists, epsError);
        }
        dists[idx] = dst;
    }

   public:
    /**  Stores the index in a binary file.
     *   IMPORTANT NOTE: The set of data points is NOT stored in the file, so
     * when loading the index object it must be constructed associated to the
     * same source of data points used while building it. See the example:
     * examples/saveload_example.cpp \sa loadIndex  */
    void saveIndex(std::ostream& stream) { saveIndex(*this, stream); }

    /**  Loads a previous index from a binary file.
     *   IMPORTANT NOTE: The set of data points is NOT stored in the file, so
     * the index object must be constructed associated to the same source of
     * data points used while building the index. See the example:
     * examples/saveload_example.cpp \sa loadIndex  */
    void loadIndex(std::istream& stream) { loadIndex(*this, stream); }
};

/** kd-tree dynaimic index
 *
 * class to create multiple static index and merge their results to behave as
 * single dynamic index as proposed in Logarithmic Approach.
 *
 *  Example of usage:
 *  examples/dynamic_pointcloud_example.cpp
 *
 * \tparam DatasetAdaptor The user-provided adaptor (see comments above).
 * \tparam Distance The distance metric to use: nanoflann::metric_L1,
 * nanoflann::metric_L2, nanoflann::metric_L2_Simple, etc. \tparam DIM
 * Dimensionality of data points (e.g. 3 for 3D points) \tparam IndexType
 * Will be typically size_t or int
 */
template <
    typename Distance, class DatasetAdaptor, int32_t DIM = -1,
    typename IndexType = uint32_t>
class KDTreeSingleIndexDynamicAdaptor
{
   public:
    using ElementType  = typename Distance::ElementType;
    using DistanceType = typename Distance::DistanceType;

    using Offset = typename KDTreeSingleIndexDynamicAdaptor_<
        Distance, DatasetAdaptor, DIM>::Offset;
    using Size = typename KDTreeSingleIndexDynamicAdaptor_<
        Distance, DatasetAdaptor, DIM>::Size;
    using Dimension = typename KDTreeSingleIndexDynamicAdaptor_<
        Distance, DatasetAdaptor, DIM>::Dimension;

   protected:
    Size leaf_max_size_;
    Size treeCount_;
    Size pointCount_;

    /**
     * The dataset used by this index
     */
    const DatasetAdaptor& dataset_;  //!< The source of our data

    /** treeIndex[idx] is the index of tree in which point at idx is stored.
     * treeIndex[idx]=-1 means that point has been removed. */
    std::vector<int>        treeIndex_;
    std::unordered_set<int> removedPoints_;

    KDTreeSingleIndexAdaptorParams index_params_;

    Dimension dim_;  //!< Dimensionality of each data point

    using index_container_t = KDTreeSingleIndexDynamicAdaptor_<
        Distance, DatasetAdaptor, DIM, IndexType>;
    std::vector<index_container_t> index_;

   public:
    /** Get a const ref to the internal list of indices; the number of indices
     * is adapted dynamically as the dataset grows in size. */
    const std::vector<index_container_t>& getAllIndices() const
    {
        return index_;
    }

   private:
    /** finds position of least significant unset bit */
    int First0Bit(IndexType num)
    {
        int pos = 0;
        while (num & 1)
        {
            num = num >> 1;
            pos++;
        }
        return pos;
    }

    /** Creates multiple empty trees to handle dynamic support */
    void init()
    {
        using my_kd_tree_t = KDTreeSingleIndexDynamicAdaptor_<
            Distance, DatasetAdaptor, DIM, IndexType>;
        std::vector<my_kd_tree_t> index(
            treeCount_,
            my_kd_tree_t(dim_ /*dim*/, dataset_, treeIndex_, index_params_));
        index_ = index;
    }

   public:
    Distance distance_;

    /**
     * KDTree constructor
     *
     * Refer to docs in README.md or online in
     * https://github.com/jlblancoc/nanoflann
     *
     * The KD-Tree point dimension (the length of each point in the datase, e.g.
     * 3 for 3D points) is determined by means of:
     *  - The \a DIM template parameter if >0 (highest priority)
     *  - Otherwise, the \a dimensionality parameter of this constructor.
     *
     * @param inputData Dataset with the input features. Its lifetime must be
     *  equal or longer than that of the instance of this class.
     * @param params Basically, the maximum leaf node size
     */
    explicit KDTreeSingleIndexDynamicAdaptor(
        const int dimensionality, const DatasetAdaptor& inputData,
        const KDTreeSingleIndexAdaptorParams& params =
            KDTreeSingleIndexAdaptorParams(),
        const size_t maximumPointCount = 1000000000U)
        : dataset_(inputData), index_params_(params), distance_(inputData)
    {
        treeCount_  = static_cast<size_t>(std::log2(maximumPointCount)) + 1;
        pointCount_ = 0U;
        dim_        = dimensionality;
        treeIndex_.clear();
        if (DIM > 0) dim_ = DIM;
        leaf_max_size_ = params.leaf_max_size;
        init();
        const size_t num_initial_points = dataset_.kdtree_get_point_count();
        if (num_initial_points > 0) { addPoints(0, num_initial_points - 1); }
    }

    /** Deleted copy constructor*/
    explicit KDTreeSingleIndexDynamicAdaptor(
        const KDTreeSingleIndexDynamicAdaptor<
            Distance, DatasetAdaptor, DIM, IndexType>&) = delete;

    /** Add points to the set, Inserts all points from [start, end] */
    void addPoints(IndexType start, IndexType end)
    {
        const Size count    = end - start + 1;
        int        maxIndex = 0;
        treeIndex_.resize(treeIndex_.size() + count);
        for (IndexType idx = start; idx <= end; idx++)
        {
            const int pos           = First0Bit(pointCount_);
            maxIndex                = std::max(pos, maxIndex);
            treeIndex_[pointCount_] = pos;

            const auto it = removedPoints_.find(idx);
            if (it != removedPoints_.end())
            {
                removedPoints_.erase(it);
                treeIndex_[idx] = pos;
            }

            for (int i = 0; i < pos; i++)
            {
                for (int j = 0; j < static_cast<int>(index_[i].vAcc_.size());
                     j++)
                {
                    index_[pos].vAcc_.push_back(index_[i].vAcc_[j]);
                    if (treeIndex_[index_[i].vAcc_[j]] != -1)
                        treeIndex_[index_[i].vAcc_[j]] = pos;
                }
                index_[i].vAcc_.clear();
            }
            index_[pos].vAcc_.push_back(idx);
            pointCount_++;
        }

        for (int i = 0; i <= maxIndex; ++i)
        {
            index_[i].freeIndex(index_[i]);
            if (!index_[i].vAcc_.empty()) index_[i].buildIndex();
        }
    }

    /** Remove a point from the set (Lazy Deletion) */
    void removePoint(size_t idx)
    {
        if (idx >= pointCount_) return;
        removedPoints_.insert(idx);
        treeIndex_[idx] = -1;
    }

    /**
     * Find set of nearest neighbors to vec[0:dim-1]. Their indices are stored
     * inside the result object.
     *
     * Params:
     *     result = the result object in which the indices of the
     * nearest-neighbors are stored vec = the vector for which to search the
     * nearest neighbors
     *
     * \tparam RESULTSET Should be any ResultSet<DistanceType>
     * \return  True if the requested neighbors could be found.
     * \sa knnSearch, radiusSearch
     *
     * \note If L2 norms are used, all returned distances are actually squared
     *       distances.
     */
    template <typename RESULTSET>
    bool findNeighbors(
        RESULTSET& result, const ElementType* vec,
        const SearchParameters& searchParams = {}) const
    {
        for (size_t i = 0; i < treeCount_; i++)
        {
            index_[i].findNeighbors(result, &vec[0], searchParams);
        }
        return result.full();
    }
};

/** An L2-metric KD-tree adaptor for working with data directly stored in an
 * Eigen Matrix, without duplicating the data storage. You can select whether a
 * row or column in the matrix represents a point in the state space.
 *
 * Example of usage:
 * \code
 * Eigen::Matrix<num_t,Eigen::Dynamic,Eigen::Dynamic>  mat;
 *
 * // Fill out "mat"...
 * using my_kd_tree_t = nanoflann::KDTreeEigenMatrixAdaptor<
 *   Eigen::Matrix<num_t,Dynamic,Dynamic>>;
 *
 * const int max_leaf = 10;
 * my_kd_tree_t mat_index(mat, max_leaf);
 * mat_index.index->...
 * \endcode
 *
 *  \tparam DIM If set to >0, it specifies a compile-time fixed dimensionality
 * for the points in the data set, allowing more compiler optimizations.
 * \tparam Distance The distance metric to use: nanoflann::metric_L1,
 * nanoflann::metric_L2, nanoflann::metric_L2_Simple, etc.
 * \tparam row_major If set to true the rows of the matrix are used as the
 *         points, if set to false  the columns of the matrix are used as the
 *         points.
 */
template <
    class MatrixType, int32_t DIM = -1, class Distance = nanoflann::metric_L2,
    bool row_major = true>
struct KDTreeEigenMatrixAdaptor
{
    using self_t =
        KDTreeEigenMatrixAdaptor<MatrixType, DIM, Distance, row_major>;
    using num_t     = typename MatrixType::Scalar;
    using IndexType = typename MatrixType::Index;
    using metric_t  = typename Distance::template traits<
        num_t, self_t, IndexType>::distance_t;

    using index_t = KDTreeSingleIndexAdaptor<
        metric_t, self_t,
        row_major ? MatrixType::ColsAtCompileTime
                  : MatrixType::RowsAtCompileTime,
        IndexType>;

    index_t* index_;  //! The kd-tree index for the user to call its methods as
                      //! usual with any other FLANN index.

    using Offset    = typename index_t::Offset;
    using Size      = typename index_t::Size;
    using Dimension = typename index_t::Dimension;

    /// Constructor: takes a const ref to the matrix object with the data points
    explicit KDTreeEigenMatrixAdaptor(
        const Dimension                                 dimensionality,
        const std::reference_wrapper<const MatrixType>& mat,
        const int leaf_max_size = 10, const unsigned int n_thread_build = 1)
        : m_data_matrix(mat)
    {
        const auto dims = row_major ? mat.get().cols() : mat.get().rows();
        if (static_cast<Dimension>(dims) != dimensionality)
            throw std::runtime_error(
                "Error: 'dimensionality' must match column count in data "
                "matrix");
        if (DIM > 0 && static_cast<int32_t>(dims) != DIM)
            throw std::runtime_error(
                "Data set dimensionality does not match the 'DIM' template "
                "argument");
        index_ = new index_t(
            dims, *this /* adaptor */,
            nanoflann::KDTreeSingleIndexAdaptorParams(
                leaf_max_size, nanoflann::KDTreeSingleIndexAdaptorFlags::None,
                n_thread_build));
    }

   public:
    /** Deleted copy constructor */
    KDTreeEigenMatrixAdaptor(const self_t&) = delete;

    ~KDTreeEigenMatrixAdaptor() { delete index_; }

    const std::reference_wrapper<const MatrixType> m_data_matrix;

    /** Query for the \a num_closest closest points to a given point (entered as
     * query_point[0:dim-1]). Note that this is a short-cut method for
     * index->findNeighbors(). The user can also call index->... methods as
     * desired.
     *
     * \note If L2 norms are used, all returned distances are actually squared
     *       distances.
     */
    void query(
        const num_t* query_point, const Size num_closest,
        IndexType* out_indices, num_t* out_distances) const
    {
        nanoflann::KNNResultSet<num_t, IndexType> resultSet(num_closest);
        resultSet.init(out_indices, out_distances);
        index_->findNeighbors(resultSet, query_point);
    }

    /** @name Interface expected by KDTreeSingleIndexAdaptor
     * @{ */

    const self_t& derived() const { return *this; }
    self_t&       derived() { return *this; }

    // Must return the number of data points
    Size kdtree_get_point_count() const
    {
        if (row_major)
            return m_data_matrix.get().rows();
        else
            return m_data_matrix.get().cols();
    }

    // Returns the dim'th component of the idx'th point in the class:
    num_t kdtree_get_pt(const IndexType idx, size_t dim) const
    {
        if (row_major)
            return m_data_matrix.get().coeff(idx, IndexType(dim));
        else
            return m_data_matrix.get().coeff(IndexType(dim), idx);
    }

    // Optional bounding-box computation: return false to default to a standard
    // bbox computation loop.
    //   Return true if the BBOX was already computed by the class and returned
    //   in "bb" so it can be avoided to redo it again. Look at bb.size() to
    //   find out the expected dimensionality (e.g. 2 or 3 for point clouds)
    template <class BBOX>
    bool kdtree_get_bbox(BBOX& /*bb*/) const
    {
        return false;
    }

    /** @} */

};  // end of KDTreeEigenMatrixAdaptor
/** @} */

/** @} */  // end of grouping
}  // namespace nanoflann


================================================
FILE: src/wrapper.hpp
================================================
#include "_ext/KDLineTree.h"
#include "_ext/KDTree.h"
#include <array>
#include <memory>
#include <utility>
#include <vector>

#ifndef BUCKET_FPS_MAX_DIM
#define BUCKET_FPS_MAX_DIM 8
#endif
constexpr size_t max_dim = BUCKET_FPS_MAX_DIM;

using quickfps::KDLineTree;
using quickfps::KDTree;
using quickfps::Point;

template <typename T, size_t DIM, typename S>
std::vector<Point<T, DIM, S>> raw_data_to_points(const float *raw_data,
                                                 size_t n_points, size_t dim) {
    std::vector<Point<T, DIM, S>> points;
    points.reserve(n_points);
    for (size_t i = 0; i < n_points; i++) {
        const float *ptr = raw_data + i * dim;
        points.push_back(Point<T, DIM, S>(ptr, i));
    }
    return points;
}

template <typename T, size_t DIM, typename S = T>
void kdtree_sample(const float *raw_data, size_t n_points, size_t dim,
                   size_t n_samples, size_t start_idx,
                   size_t *sampled_point_indices) {
    auto points = raw_data_to_points<T, DIM, S>(raw_data, n_points, dim);
    std::unique_ptr<Point<T, DIM, S>[]> sampled_points(
        new Point<T, DIM, S>[n_samples]);
    KDTree<T, DIM, S> tree(points.data(), n_points, sampled_points.get());
    tree.buildKDtree();
    tree.init(points[start_idx]);
    tree.sample(n_samples);
    for (size_t i = 0; i < n_samples; i++) {
        sampled_point_indices[i] = sampled_points[i].id;
    }
}

template <typename T, size_t DIM, typename S = T>
void kdline_sample(const float *raw_data, size_t n_points, size_t dim,
                   size_t n_samples, size_t start_idx, size_t height,
                   size_t *sampled_point_indices) {
    auto points = raw_data_to_points<T, DIM, S>(raw_data, n_points, dim);
    std::unique_ptr<Point<T, DIM, S>[]> sampled_points(
        new Point<T, DIM, S>[n_samples]);
    KDLineTree<T, DIM, S> tree(points.data(), n_points, height,
                               sampled_points.get());
    tree.buildKDtree();
    tree.init(points[start_idx]);
    tree.sample(n_samples);
    for (size_t i = 0; i < n_samples; i++) {
        sampled_point_indices[i] = sampled_points[i].id;
    }
}

////////////////////////////////////////
//                                    //
//    Compile Time Function Helper    //
//                                    //
////////////////////////////////////////
using KDTreeFuncType = void (*)(const float *, size_t, size_t, size_t, size_t,
                                size_t *);
using KDLineFuncType = void (*)(const float *, size_t, size_t, size_t, size_t,
                                size_t, size_t *);

template <typename T, size_t Count, typename M, size_t... I>
constexpr std::array<T, Count> mapIndices(M &&m, std::index_sequence<I...>) {
    std::array<T, Count> result{m.template operator()<I + 1>()...};
    return result;
}

template <typename T, size_t Count, typename M>
constexpr std::array<T, Count> map(M m) {
    return mapIndices<T, Count>(m, std::make_index_sequence<Count>());
}

template <typename T, typename S = T> struct kdtree_func_helper {
    template <size_t DIM> KDTreeFuncType operator()() {
        return &kdtree_sample<T, DIM, S>;
    }
};

template <typename T, typename S = T> struct kdline_func_helper {
    template <size_t DIM> KDLineFuncType operator()() {
        return &kdline_sample<T, DIM, S>;
    }
};

/////////////////
//             //
//    C API    //
//             //
/////////////////

extern "C" {
int bucket_fps_kdtree(const float *raw_data, size_t n_points, size_t dim,
                      size_t n_samples, size_t start_idx,
                      size_t *sampled_point_indices) {
    if (dim == 0 || dim > max_dim) {
        // only support 1 to MAX_DIM dimensions
        return 1;
    } else if (start_idx >= n_points) {
        // start_idx should be smaller than n_samples
        return 2;
    }
    auto func_arr = map<KDTreeFuncType, max_dim>(kdtree_func_helper<float>{});
    func_arr[dim - 1](raw_data, n_points, dim, n_samples, start_idx,
                      sampled_point_indices);
    return 0;
}

int bucket_fps_kdline(const float *raw_data, size_t n_points, size_t dim,
                      size_t n_samples, size_t start_idx, size_t height,
                      size_t *sampled_point_indices) {
    if (dim == 0 || dim > max_dim) {
        // only support 1 to MAX_DIM dimensions
        return 1;
    } else if (start_idx >= n_points) {
        // start_idx should be smaller than n_samples
        return 2;
    }
    auto func_arr = map<KDLineFuncType, max_dim>(kdline_func_helper<float>{});
    func_arr[dim - 1](raw_data, n_points, dim, n_samples, start_idx, height,
                      sampled_point_indices);
    return 0;
}
}