Repository: seung-lab/kimimaro
Branch: master
Commit: 66f488e8ff06
Files: 37
Total size: 504.3 KB

Directory structure:
gitextract_n_5c9fyh/

├── .dockerignore
├── .github/
│   └── workflows/
│       ├── build_wheel.yml
│       └── test.yml
├── .gitignore
├── AUTHORS
├── CITATION.cff
├── ChangeLog
├── LICENSE
├── MANIFEST.in
├── README.md
├── automated_test.py
├── benchmarks/
│   ├── README.md
│   ├── benchmark.py
│   └── kimimaro.numbers
├── build_linux.sh
├── ext/
│   └── skeletontricks/
│       ├── dijkstra_invalidation.hpp
│       ├── libdivide.h
│       ├── skeletontricks.hpp
│       ├── skeletontricks.pyx
│       └── unordered_dense.hpp
├── kimimaro/
│   ├── __init__.py
│   ├── intake.py
│   ├── post.py
│   ├── sharedmemory.py
│   ├── trace.py
│   └── utility.py
├── kimimaro_cli/
│   ├── LICENSE
│   ├── __init__.py
│   └── codecs.py
├── manual_testing/
│   └── manual_test.py
├── manylinux2010.Dockerfile
├── manylinux2014.Dockerfile
├── pyproject.toml
├── requirements-dev.txt
├── requirements.txt
├── setup.py
└── tox.ini

================================================
FILE CONTENTS
================================================

================================================
FILE: .dockerignore
================================================
build
*.egg-info
benchmarks
__pycache__
manual_testing
.eggs
.git
.tox
.pytest_cache

================================================
FILE: .github/workflows/build_wheel.yml
================================================
# Builds binary wheels with cibuildwheel and uploads them as workflow
# artifacts. Runs on every pushed tag and on manual dispatch.
name: Build Wheels

on:
  workflow_dispatch:
  push:
    tags:
      - '*'
env:
  # Build identifiers that cibuildwheel should skip: PyPy, musllinux and
  # CPython 3.6 - 3.8.
  CIBW_SKIP: "pp* *-musllinux* cp36* cp37* cp38*"

jobs:
  build_wheels:
    name: Build wheels on ${{matrix.arch}} for ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-latest, windows-latest, macos-latest]
        arch: [auto]
        # One extra job in addition to the os x arch product above:
        # Linux aarch64.
        include:
          - os: ubuntu-latest
            arch: aarch64

    steps:
      - uses: actions/checkout@v4

      # QEMU is only set up for the aarch64 job.
      - name: Set up QEMU
        if: ${{ matrix.arch == 'aarch64' }}
        uses: docker/setup-qemu-action@v3

      - name: Build wheels
        uses: pypa/cibuildwheel@v3.2.0
        # to supply options, put them in 'env', like:
        env:
          CIBW_ARCHS_LINUX: ${{matrix.arch}}
          CIBW_BEFORE_BUILD: pip install numpy setuptools wheel cython
          CIBW_ARCHS_MACOS: "x86_64 arm64"

      - name: Upload built wheels
        uses: actions/upload-artifact@v4
        with:
          # os + arch keeps the artifact name distinct for every matrix job.
          name: built-wheels-${{ matrix.os }}-${{ matrix.arch }}
          path: ./wheelhouse/*.whl
          if-no-files-found: warn

================================================
FILE: .github/workflows/test.yml
================================================
# This workflow will install Python dependencies and run tests with a variety of Python versions
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: Test Suite

# Runs on pushes to master and on pull requests that target master.
on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]

jobs:
  build:

    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Versions are quoted so that "3.10" is not parsed as the float 3.1.
        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]

    steps:
    - uses: actions/checkout@v4
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}
    # Installs the build tooling and requirements, then builds a wheel from
    # the checkout and installs it.
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install cython numpy setuptools wheel build
        pip install -r requirements.txt -r requirements-dev.txt
        python -m build --wheel
        pip install dist/*.whl
    # Installs the checkout in development mode, then runs the test file,
    # stopping at the first failure (-x).
    - name: Test with pytest
      run: |
        python setup.py develop
        python -m pytest -v -x automated_test.py


================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

test.py
.DS_Store

# IntelliJ
.idea/

ext/skeletontricks/skeletontricks.cpp


================================================
FILE: AUTHORS
================================================
Jingpeng Wu <jingpeng.wu@gmail.com>
William Silversmith <william.silversmith@gmail.com>


================================================
FILE: CITATION.cff
================================================
# Citation metadata for this software in Citation File Format (CFF).
cff-version: 1.1.0
message: 'If you use this software, please cite it as below.'
authors:
  - family-names: 'Silversmith'
    given-names: 'William'
    orcid: 'https://orcid.org/0000-0002-5485-5341'
  - family-names: 'Bae'
    given-names: 'J. Alexander'
    orcid: 'https://orcid.org/0000-0002-4681-6342'
  - family-names: 'Li'
    given-names: 'Peter H.'
    orcid: 'https://orcid.org/0000-0001-6193-4454'
  - family-names: 'Wilson'
    given-names: 'A.M.'
    orcid: 'https://orcid.org/0000-0002-3822-5200'
# Quoted because the title contains ": ".
title: 'Kimimaro: Skeletonize densely labeled 3D image segmentations'
version: 3.0.0
date-released: 2021-09-29
doi: 10.5281/zenodo.5539913


================================================
FILE: ChangeLog
================================================
CHANGES
=======

2.0.2
-----

* test: faster execution for cube and solid color tests
* fix(trace): skip adding DAF if max is 0
* test: check extremely sparse images (one or two voxels with no dust threshold)
* chore: drop py35 testing add .dockerignore

2.0.1
-----

* fix(windows): use np.uintp before casting to size\_t
* fix: appveyor needs numpy installed first
* chore: new build system for binary distribution

2.0.0
-----

* fix(intake): solid color blocks were causing errors (#56)
* perf: faster somas (#55)
* fix: python3.8 compiles cpp code (#52)
* chore: update travis to use python 3.7 and 3.8
* add python3.8 test

1.6.0
-----

* feat: avocado protection (🥑) (#43)
* chore: update ChangeLog

1.5.0
-----

* chore: add skeleton for manual testing
* feat: add fill\_holes argument (#50)

1.4.2
-----

* chore: loosen networkx requirement (#49)
* Update README.md
* docs: update memory usage diagram for version 1.4.0

1.4.1
-----

* perf: switch source and target for dijkstra

1.3.3
-----

* refactor: make type of 0L clear to std::max on Windows
* Revert "fix: don't assume vertices are uint32"
* fix: don't assume vertices are uint32
* chore: update ChangeLog

1.3.2
-----

* fix: several additional algorithms required 64-bit addressable changes

1.3.1
-----

* chore: bump dijkstra requirement
* fix: 64-bit addressable \_roll\_invalidation\_cube (#42)
* docs: shout out to fill\_voids
* fix: remove unnecessary PIL import

1.3.0
-----

* docs: describe max\_paths in the function docstring
* fix: soma center was being overridden by fix\_borders
* perf: only recompute EDT for soma if some voxels were filled
* perf: use bidirectional dijkstra on somata (increases peak memory usage)

1.2.1
-----

* docs: remove non-ascii character from README.md
* docs: link back to papers using Kimimaro

1.2.0
-----

* docs: show how to use synapses\_to\_targets
* feat: facility for converting synapse centroids into targets (#37)
* refactor+perf: use new fill-voids package

1.1.0
-----

* perf: implemented flood fill based binary\_fill\_holes (#38)

1.0.4
-----

* perf: increase postprocess speed (#35)
* perf: more judicious use of consolidate in postprocess

1.0.3
-----

* docs: update ChangeLog
* fix: preserve skeleton id during postprocessing

1.0.2
-----

* fix: allow multiple invocations of a pathos process pool
* perf: skip processing if dust\_threshold larger than image

1.0.1
-----

* fix: accept any root convertible to a tuple
* fix: progress bars were disrupted in parallel feature
* docs: upload changelog

1.0.0
-----

* feat: specify extra\_targets\_before and after (#33)
* docs: fix spelling & grammar

0.7.0
-----

* docs: add parallel\_chunk\_size to README
* perf+feat: Reduce Parallel Task Starvation + Better Parallel Progress Bar (#32)
* docs: add example of join\_close\_components

0.6.0
-----

* feat: adds join\_close\_components to postprocess (#27)
* docs: link to tutorial wiki articles
* docs: add advice on tweaking parameters

0.5.4
-----

* fix: sometimes get\_mapping doesn't get everything

0.5.3
-----

* fix: object\_ids were being masked instead of mask\_excepted
* docs: show performance chart for v0.5.2

0.5.2
-----

* perf: improve performance of find\_objects 7x

0.5.1
-----

* perf: ~20x faster unique(label, return\_counts=True) (#26)
* docs: changelog update + small formatting adjustment to example

0.5.0
-----

* docs: example of how to use postprocess
* feat: import out-of-core postprocessing logic from Igneous
* docs: add object\_ids to example
* perf: improve speed of skeletontricks.get\_mapping
* fix: accept binary images of type bool
* perf: take advantage of faster segid finding if dust\_threshold == 0
* fix: compilation warning for \_roll\_invalidation\_cube
* test: add some manual visualization tests
* chore: update ChangeLog

0.4.2
-----

* release: 0.4.2
* chore: tell PyPI we're using markdown
* fix: ensure we pick max dbf close to centroid of detected somata
* chore: update ChangeLog
* docs: various corrections to the README

0.4.1
-----

* fix: add defense against setting the dust threshold lower than 1
* chore: formatting around all\_labels
* test: x and y joinability
* test: show that two 1px overlapping volumes join properly
* Update README.md
* feat: accept N-dimensional arrays with trivial dimensions above 3
* docs: add Google TEASAR run to bolster case for popularity
* fix: prevent duplicate border targets
* feat: parallel edt implementation
* fix: add support for anisotropy to distance calculations
* test: add distortion to border test
* wip: propagate anisotropy to fix\_borders calls
* fix: cuboid soma processing
* fix: bump edt to 1.2.4 to correct part of large anisotropy issue
* perf: faster masking operations with newer fastremap
* docs: encouraged the use of parallel processing in README.md
* chore: add GPLv3 classifier to setup.cfg
* chore: add ChangeLog

0.4.0
-----

* feat: parallel implementation (Cursed Seal Mode) (#10)

0.3.1
-----

* fix: INTEGER type did not include all integers

0.3.0
-----

* docs: updated credits with fix\_borders
* feat: add fix\_borders parameter & max\_paths parameter (#9)
* test+fix: remove "cd python"
* docs: add Travis CI badge
* chore: add Travis CI
* test: add basic test for skeletonizing diagonal of square and cube
* perf: improve memory consumption of object masking
* perf: introduce in\_place flag to make it safe to modify input data
* perf: use fastremap's new in\_place flag for lower memory and perf
* docs: updated credits

0.2.2
-----

* fix: accept C order arrays (#7)
* docs: reduce redundancy in example vs performance
* docs: add benchmark description
* docs: added benchmark photo
* docs: add link to citation 4
* docs: use citations 3 and 4
* docs: described "roll invalidation cube"
* docs: described algorithm in steps
* docs: describing the algorithm

0.2.1
-----

* fix: black volumes should return dict not None

0.2.0
-----

* docs: add PyPI badge
* feat: fix branching (#1)
* docs: adding sections to README

0.1.0
-----

* chore: clean up dockerfile and metadata
* docs: draft discussion of motivation and usage
* feat: export DimensionError exception (so it can be caught)
* refactor: remove path\_downsample from trace function
* docs: described parameters of skeletonize function
* chore: files required for building distributions
* wip: importing skeletonization procedure
* Initial commit


================================================
FILE: LICENSE
================================================
                    GNU GENERAL PUBLIC LICENSE
                       Version 3, 29 June 2007

 Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
 Everyone is permitted to copy and distribute verbatim copies
 of this license document, but changing it is not allowed.

                            Preamble

  The GNU General Public License is a free, copyleft license for
software and other kinds of works.

  The licenses for most software and other practical works are designed
to take away your freedom to share and change the works.  By contrast,
the GNU General Public License is intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users.  We, the Free Software Foundation, use the
GNU General Public License for most of our software; it applies also to
any other work released this way by its authors.  You can apply it to
your programs, too.

  When we speak of free software, we are referring to freedom, not
price.  Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.

  To protect your rights, we need to prevent others from denying you
these rights or asking you to surrender the rights.  Therefore, you have
certain responsibilities if you distribute copies of the software, or if
you modify it: responsibilities to respect the freedom of others.

  For example, if you distribute copies of such a program, whether
gratis or for a fee, you must pass on to the recipients the same
freedoms that you received.  You must make sure that they, too, receive
or can get the source code.  And you must show them these terms so they
know their rights.

  Developers that use the GNU GPL protect your rights with two steps:
(1) assert copyright on the software, and (2) offer you this License
giving you legal permission to copy, distribute and/or modify it.

  For the developers' and authors' protection, the GPL clearly explains
that there is no warranty for this free software.  For both users' and
authors' sake, the GPL requires that modified versions be marked as
changed, so that their problems will not be attributed erroneously to
authors of previous versions.

  Some devices are designed to deny users access to install or run
modified versions of the software inside them, although the manufacturer
can do so.  This is fundamentally incompatible with the aim of
protecting users' freedom to change the software.  The systematic
pattern of such abuse occurs in the area of products for individuals to
use, which is precisely where it is most unacceptable.  Therefore, we
have designed this version of the GPL to prohibit the practice for those
products.  If such problems arise substantially in other domains, we
stand ready to extend this provision to those domains in future versions
of the GPL, as needed to protect the freedom of users.

  Finally, every program is threatened constantly by software patents.
States should not allow patents to restrict development and use of
software on general-purpose computers, but in those that do, we wish to
avoid the special danger that patents applied to a free program could
make it effectively proprietary.  To prevent this, the GPL assures that
patents cannot be used to render the program non-free.

  The precise terms and conditions for copying, distribution and
modification follow.

                       TERMS AND CONDITIONS

  0. Definitions.

  "This License" refers to version 3 of the GNU General Public License.

  "Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.

  "The Program" refers to any copyrightable work licensed under this
License.  Each licensee is addressed as "you".  "Licensees" and
"recipients" may be individuals or organizations.

  To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy.  The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.

  A "covered work" means either the unmodified Program or a work based
on the Program.

  To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy.  Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.

  To "convey" a work means any kind of propagation that enables other
parties to make or receive copies.  Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.

  An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License.  If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.

  1. Source Code.

  The "source code" for a work means the preferred form of the work
for making modifications to it.  "Object code" means any non-source
form of a work.

  A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.

  The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form.  A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.

  The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities.  However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work.  For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.

  The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.

  The Corresponding Source for a work in source code form is that
same work.

  2. Basic Permissions.

  All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met.  This License explicitly affirms your unlimited
permission to run the unmodified Program.  The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work.  This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.

  You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force.  You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright.  Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.

  Conveying under any other circumstances is permitted solely under
the conditions stated below.  Sublicensing is not allowed; section 10
makes it unnecessary.

  3. Protecting Users' Legal Rights From Anti-Circumvention Law.

  No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.

  When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.

  4. Conveying Verbatim Copies.

  You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.

  You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.

  5. Conveying Modified Source Versions.

  You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:

    a) The work must carry prominent notices stating that you modified
    it, and giving a relevant date.

    b) The work must carry prominent notices stating that it is
    released under this License and any conditions added under section
    7.  This requirement modifies the requirement in section 4 to
    "keep intact all notices".

    c) You must license the entire work, as a whole, under this
    License to anyone who comes into possession of a copy.  This
    License will therefore apply, along with any applicable section 7
    additional terms, to the whole of the work, and all its parts,
    regardless of how they are packaged.  This License gives no
    permission to license the work in any other way, but it does not
    invalidate such permission if you have separately received it.

    d) If the work has interactive user interfaces, each must display
    Appropriate Legal Notices; however, if the Program has interactive
    interfaces that do not display Appropriate Legal Notices, your
    work need not make them do so.

  A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit.  Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.

  6. Conveying Non-Source Forms.

  You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:

    a) Convey the object code in, or embodied in, a physical product
    (including a physical distribution medium), accompanied by the
    Corresponding Source fixed on a durable physical medium
    customarily used for software interchange.

    b) Convey the object code in, or embodied in, a physical product
    (including a physical distribution medium), accompanied by a
    written offer, valid for at least three years and valid for as
    long as you offer spare parts or customer support for that product
    model, to give anyone who possesses the object code either (1) a
    copy of the Corresponding Source for all the software in the
    product that is covered by this License, on a durable physical
    medium customarily used for software interchange, for a price no
    more than your reasonable cost of physically performing this
    conveying of source, or (2) access to copy the
    Corresponding Source from a network server at no charge.

    c) Convey individual copies of the object code with a copy of the
    written offer to provide the Corresponding Source.  This
    alternative is allowed only occasionally and noncommercially, and
    only if you received the object code with such an offer, in accord
    with subsection 6b.

    d) Convey the object code by offering access from a designated
    place (gratis or for a charge), and offer equivalent access to the
    Corresponding Source in the same way through the same place at no
    further charge.  You need not require recipients to copy the
    Corresponding Source along with the object code.  If the place to
    copy the object code is a network server, the Corresponding Source
    may be on a different server (operated by you or a third party)
    that supports equivalent copying facilities, provided you maintain
    clear directions next to the object code saying where to find the
    Corresponding Source.  Regardless of what server hosts the
    Corresponding Source, you remain obligated to ensure that it is
    available for as long as needed to satisfy these requirements.

    e) Convey the object code using peer-to-peer transmission, provided
    you inform other peers where the object code and Corresponding
    Source of the work are being offered to the general public at no
    charge under subsection 6d.

  A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.

  A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling.  In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage.  For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product.  A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.

  "Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source.  The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.

  If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information.  But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).

  The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed.  Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.

  Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.

  7. Additional Terms.

  "Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law.  If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.

  When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it.  (Additional permissions may be written to require their own
removal in certain cases when you modify the work.)  You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.

  Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:

    a) Disclaiming warranty or limiting liability differently from the
    terms of sections 15 and 16 of this License; or

    b) Requiring preservation of specified reasonable legal notices or
    author attributions in that material or in the Appropriate Legal
    Notices displayed by works containing it; or

    c) Prohibiting misrepresentation of the origin of that material, or
    requiring that modified versions of such material be marked in
    reasonable ways as different from the original version; or

    d) Limiting the use for publicity purposes of names of licensors or
    authors of the material; or

    e) Declining to grant rights under trademark law for use of some
    trade names, trademarks, or service marks; or

    f) Requiring indemnification of licensors and authors of that
    material by anyone who conveys the material (or modified versions of
    it) with contractual assumptions of liability to the recipient, for
    any liability that these contractual assumptions directly impose on
    those licensors and authors.

  All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10.  If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term.  If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.

  If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.

  Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.

  8. Termination.

  You may not propagate or modify a covered work except as expressly
provided under this License.  Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).

  However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
prior to 60 days after the cessation.

  Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.

  Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License.  If your rights have been terminated and not permanently
reinstated, you do not qualify to receive new licenses for the same
material under section 10.

  9. Acceptance Not Required for Having Copies.

  You are not required to accept this License in order to receive or
run a copy of the Program.  Ancillary propagation of a covered work
occurring solely as a consequence of using peer-to-peer transmission
to receive a copy likewise does not require acceptance.  However,
nothing other than this License grants you permission to propagate or
modify any covered work.  These actions infringe copyright if you do
not accept this License.  Therefore, by modifying or propagating a
covered work, you indicate your acceptance of this License to do so.

  10. Automatic Licensing of Downstream Recipients.

  Each time you convey a covered work, the recipient automatically
receives a license from the original licensors, to run, modify and
propagate that work, subject to this License.  You are not responsible
for enforcing compliance by third parties with this License.

  An "entity transaction" is a transaction transferring control of an
organization, or substantially all assets of one, or subdividing an
organization, or merging organizations.  If propagation of a covered
work results from an entity transaction, each party to that
transaction who receives a copy of the work also receives whatever
licenses to the work the party's predecessor in interest had or could
give under the previous paragraph, plus a right to possession of the
Corresponding Source of the work from the predecessor in interest, if
the predecessor has it or can get it with reasonable efforts.

  You may not impose any further restrictions on the exercise of the
rights granted or affirmed under this License.  For example, you may
not impose a license fee, royalty, or other charge for exercise of
rights granted under this License, and you may not initiate litigation
(including a cross-claim or counterclaim in a lawsuit) alleging that
any patent claim is infringed by making, using, selling, offering for
sale, or importing the Program or any portion of it.

  11. Patents.

  A "contributor" is a copyright holder who authorizes use under this
License of the Program or a work on which the Program is based.  The
work thus licensed is called the contributor's "contributor version".

  A contributor's "essential patent claims" are all patent claims
owned or controlled by the contributor, whether already acquired or
hereafter acquired, that would be infringed by some manner, permitted
by this License, of making, using, or selling its contributor version,
but do not include claims that would be infringed only as a
consequence of further modification of the contributor version.  For
purposes of this definition, "control" includes the right to grant
patent sublicenses in a manner consistent with the requirements of
this License.

  Each contributor grants you a non-exclusive, worldwide, royalty-free
patent license under the contributor's essential patent claims, to
make, use, sell, offer for sale, import and otherwise run, modify and
propagate the contents of its contributor version.

  In the following three paragraphs, a "patent license" is any express
agreement or commitment, however denominated, not to enforce a patent
(such as an express permission to practice a patent or covenant not to
sue for patent infringement).  To "grant" such a patent license to a
party means to make such an agreement or commitment not to enforce a
patent against the party.

  If you convey a covered work, knowingly relying on a patent license,
and the Corresponding Source of the work is not available for anyone
to copy, free of charge and under the terms of this License, through a
publicly available network server or other readily accessible means,
then you must either (1) cause the Corresponding Source to be so
available, or (2) arrange to deprive yourself of the benefit of the
patent license for this particular work, or (3) arrange, in a manner
consistent with the requirements of this License, to extend the patent
license to downstream recipients.  "Knowingly relying" means you have
actual knowledge that, but for the patent license, your conveying the
covered work in a country, or your recipient's use of the covered work
in a country, would infringe one or more identifiable patents in that
country that you have reason to believe are valid.

  If, pursuant to or in connection with a single transaction or
arrangement, you convey, or propagate by procuring conveyance of, a
covered work, and grant a patent license to some of the parties
receiving the covered work authorizing them to use, propagate, modify
or convey a specific copy of the covered work, then the patent license
you grant is automatically extended to all recipients of the covered
work and works based on it.

  A patent license is "discriminatory" if it does not include within
the scope of its coverage, prohibits the exercise of, or is
conditioned on the non-exercise of one or more of the rights that are
specifically granted under this License.  You may not convey a covered
work if you are a party to an arrangement with a third party that is
in the business of distributing software, under which you make payment
to the third party based on the extent of your activity of conveying
the work, and under which the third party grants, to any of the
parties who would receive the covered work from you, a discriminatory
patent license (a) in connection with copies of the covered work
conveyed by you (or copies made from those copies), or (b) primarily
for and in connection with specific products or compilations that
contain the covered work, unless you entered into that arrangement,
or that patent license was granted, prior to 28 March 2007.

  Nothing in this License shall be construed as excluding or limiting
any implied license or other defenses to infringement that may
otherwise be available to you under applicable patent law.

  12. No Surrender of Others' Freedom.

  If conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License.  If you cannot convey a
covered work so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you may
not convey it at all.  For example, if you agree to terms that obligate you
to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program.

  13. Use with the GNU Affero General Public License.

  Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU Affero General Public License into a single
combined work, and to convey the resulting work.  The terms of this
License will continue to apply to the part which is the covered work,
but the special requirements of the GNU Affero General Public License,
section 13, concerning interaction through a network will apply to the
combination as such.

  14. Revised Versions of this License.

  The Free Software Foundation may publish revised and/or new versions of
the GNU General Public License from time to time.  Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.

  Each version is given a distinguishing version number.  If the
Program specifies that a certain numbered version of the GNU General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation.  If the Program does not specify a version number of the
GNU General Public License, you may choose any version ever published
by the Free Software Foundation.

  If the Program specifies that a proxy can decide which future
versions of the GNU General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.

  Later license versions may give you additional or different
permissions.  However, no additional obligations are imposed on any
author or copyright holder as a result of your choosing to follow a
later version.

  15. Disclaimer of Warranty.

  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.

  16. Limitation of Liability.

  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.

  17. Interpretation of Sections 15 and 16.

  If the disclaimer of warranty and limitation of liability provided
above cannot be given local legal effect according to their terms,
reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.

                     END OF TERMS AND CONDITIONS

            How to Apply These Terms to Your New Programs

  If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.

  To do so, attach the following notices to the program.  It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.

    <one line to give the program's name and a brief idea of what it does.>
    Copyright (C) <year>  <name of author>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.

Also add information on how to contact you by electronic and paper mail.

  If the program does terminal interaction, make it output a short
notice like this when it starts in an interactive mode:

    <program>  Copyright (C) <year>  <name of author>
    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
    This is free software, and you are welcome to redistribute it
    under certain conditions; type `show c' for details.

The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License.  Of course, your program's commands
might be different; for a GUI interface, you would use an "about box".

  You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU GPL, see
<https://www.gnu.org/licenses/>.

  The GNU General Public License does not permit incorporating your program
into proprietary programs.  If your program is a subroutine library, you
may consider it more useful to permit linking proprietary applications with
the library.  If this is what you want to do, use the GNU Lesser General
Public License instead of this License.  But first, please read
<https://www.gnu.org/licenses/why-not-lgpl.html>.


================================================
FILE: MANIFEST.in
================================================
recursive-include ext *
include LICENSE

================================================
FILE: README.md
================================================
[![PyPI version](https://badge.fury.io/py/kimimaro.svg)](https://badge.fury.io/py/kimimaro)  

# Kimimaro: Skeletonize Densely Labeled Images

```bash
# Produce SWC files from volumetric images.
kimimaro forge labels.npy --progress # writes to ./kimimaro_out/
kimimaro view kimimaro_out/10.swc
```

Rapidly skeletonize all non-zero labels in 2D and 3D numpy arrays using a TEASAR-derived method. The returned list of skeletons is in the format used by [cloud-volume](https://github.com/seung-lab/cloud-volume/wiki/Advanced-Topic:-Skeletons). A skeleton is a stick figure 1D representation of a 2D or 3D object that consists of a graph of vertices linked by edges. A skeleton where the vertices also carry a distance to the nearest boundary they were extracted from is called a "Medial Axis Transform", which Kimimaro provides.

Skeletons are a compact representation that can be used to visualize objects, trace the connectivity of an object, or otherwise analyze the object's geometry. Kimimaro was designed for use with high resolution neurons extracted from electron microscopy data via AI segmentation, but it can be applied to many different fields.  

On an Apple Silicon M1 arm64 chip (Firestorm cores 3.2 GHz max frequency), this package processed a 512x512x100 volume with 333 labels in 20 seconds. It processed a 512x512x512 volume (`connectomics.npy`) with 2124 labels in 187 seconds.

<p style="font-style: italic;" align="center">
<img height=512 width=512 src="https://raw.githubusercontent.com/seung-lab/kimimaro/master/mass_skeletonization.png" alt="A Densely Labeled Volume Skeletonized with Kimimaro" /><br>
Fig. 1: A Densely Labeled Volume Skeletonized with Kimimaro
</p>

## `pip` Installation 

If a binary is available for your platform:

```bash
pip install kimimaro 
# installs additional libraries to accelerate some
# operations like join_close_components
pip install "kimimaro[accel]"
# Makes the kimimaro view command work
pip install "kimimaro[view]"
# Enables TIFF generation on the CLI
pip install "kimimaro[tif]"
# Enables reading NIBABEL, NRRD, TIFF, CRACKLE on the CLI
pip install "kimimaro[all_formats]"
# Install all optional dependencies
pip install "kimimaro[all]"
```

Otherwise, you'll also need a C++ compiler:

```bash
sudo apt-get install python3-dev g++ # ubuntu linux
```

## Example

<p style="font-style: italic;" align="center">
<img height=512 src="https://raw.githubusercontent.com/seung-lab/kimimaro/master/kimimaro_512x512x512_benchmark.png" alt="A Densely Labeled Volume Skeletonized with Kimimaro" /><br>
Fig. 2: Memory Usage on a 512x512x512 Densely Labeled Volume (`connectomics.npy`)
</p>

Figure 2 shows the memory usage and processing time (~390 seconds, about 6.5 minutes) required when Kimimaro 1.4.0 was applied to a 512x512x512 cutout, *labels*, from a connectomics dataset, `connectomics.npy`, containing 2124 connected components. The different sections of the algorithm are depicted. Roughly, the preamble runs for about half a minute, skeletonization for about six minutes, and finalization within seconds. The peak memory usage was about 4.5 GB. The code below was used to process *labels*. The processing of the glia was truncated due to a combination of *fix_borders* and *max_paths*.

Kimimaro has come a long way. Version 0.2.1 took over 15 minutes and had a Preamble run time twice as long on the same dataset.

On a Macbook Pro M3, the same settings now complete in 94 seconds (1.6 minutes) on version 5.4.0. With xs3d 1.11.0, cross section analysis takes 215 seconds (3.6 minutes).

### Python Interface

```python
# LISTING 1: Producing Skeletons from a labeled image.

import kimimaro

# To obtain this 512 MB segmentation sample volume:
# pip install crackle-codec 

import crackle
labels = crackle.load("benchmarks/connectomics.npy.ckl.gz") 

skels = kimimaro.skeletonize(
  labels, 
  teasar_params={
    "scale": 1.5, 
    "const": 300, # physical units
    "pdrf_scale": 100000,
    "pdrf_exponent": 4,
    "soma_acceptance_threshold": 3500, # physical units
    "soma_detection_threshold": 750, # physical units
    "soma_invalidation_const": 300, # physical units
    "soma_invalidation_scale": 2,
    "max_paths": 300, # default None
  },
  # object_ids=[ ... ], # process only the specified labels
  # extra_targets_before=[ (27,33,100), (44,45,46) ], # target points in voxels
  # extra_targets_after=[ (27,33,100), (44,45,46) ], # target points in voxels
  dust_threshold=1000, # skip connected components with fewer than this many voxels
  anisotropy=(16,16,40), # default (1,1,1)
  fix_branching=True, # default True
  fix_borders=True, # default True
  fill_holes=False, # default False
  fix_avocados=False, # default False
  progress=True, # default False, show progress bar
  parallel=1, # <= 0 all cpu, 1 single process, 2+ multiprocess
  parallel_chunk_size=100, # how many skeletons to process before updating progress bar
)

# LISTING 2: Combining skeletons produced from 
#            adjacent or overlapping images.

import kimimaro
from osteoid import Skeleton

skels = ... # a set of skeletons produced from the same label id
skel = Skeleton.simple_merge(skels).consolidate()
skel = kimimaro.postprocess(
  skel, 
  dust_threshold=1000, # physical units
  tick_threshold=3500 # physical units
)

# LISTING 3: Adding cross sectional area to skeletons
# Cross section planes are defined by normal vectors. Those
# vectors come from the difference between adjacent vertices.
skels = ... # one or more skeletons produced from a single image
skels = kimimaro.cross_sectional_area(
  labels, skels, 
  anisotropy=(16,16,40), 
  smoothing_window=5, # rolling average window of plane normals
  progress=True,
)
skel = skels[0]
skel.cross_sectional_area # array of cross sectional areas
skel.cross_sectional_area_contacts # non-zero contacted the image border

# Split input skeletons into connected components and
# then join the two nearest vertices within `radius` distance
# of each other until there is only a single connected component
# or no pairs of points nearer than `radius` exist. 
# Fuse all remaining components into a single skeleton.
skel = kimimaro.join_close_components([skel1, skel2], radius=1500) # 1500 units threshold
skel = kimimaro.join_close_components([skel1, skel2], radius=None) # no threshold

# Given synapse centroids (in voxels) and the SWC integer label you'd 
# like to assign (e.g. for pre-synaptic and post-synaptic) this finds the 
# nearest voxel to the centroid for that label.
# Input: { label: [ ((x,y,z), swc_label), ... ] }
# Returns: { (x,y,z): swc_label, ... }
extra_targets = kimimaro.synapses_to_targets(labels, synapses)


# LISTING 4: Drawing a centerline between
#   preselected points on a binary image.
#   This is a much simpler option for when
#   you know exactly what you want, but may
#   be less efficient for large scale processing.

skel = kimimaro.connect_points(
  labels == 67301298,
  start=(3, 215, 202), 
  end=(121, 426, 227),
  anisotropy=(32,32,40),
)

# LISTING 5: Using skeletons to oversegment existing
#  segmentations for integration into proofreading systems 
#  that rely on merging atomic labels. oversegmented_labels 
#  is returned numbered from 1. skels is a copy returned 
#  with the property skel.segments that associates a label
#  to each vertex (labels will not be unique if downsampling 
#  is used)
oversegmented_labels, skels = kimimaro.oversegment(
  labels, skels, 
  anisotropy=(32,32,40), 
  downsample=10,
)
```

`connectomics.npy` is multilabel connectomics data derived from pinky40, a 2018 experimental automated segmentation of ~1.5 million cubic micrometers of mouse visual cortex. It is an early predecessor to the now public pinky100_v185 segmentation that can be found at https://microns-explorer.org/phase1 You will need to run `lzma -d connectomics.npy.lzma` to obtain the 512x512x512 uint32 volume at 32x32x40 nm<sup>3</sup> resolution.  

### CLI Interface

The CLI supports producing skeletons from a single image as SWCs and viewing the resulting SWC files one at a time. By default, the SWC files are written to `./kimimaro_out/$LABEL.swc`.

Here's an equivalent example to the code above.

```bash
kimimaro forge labels.npy --scale 4 --const 10 --soma-detect 1100 --soma-accept 3500 --soma-scale 1 --soma-const 300 --anisotropy 16,16,40 --fix-borders --progress 
```

Visualize your data:

```bash
kimimaro view 1241241.swc # visualize skeleton
kimimaro view labels.npy # visualize segmentation
```

It can also convert binary image skeletons produced by thinning algorithms into SWC files and back. This can be helpful for comparing different skeletonization algorithms or even just using their results.

```bash
kimimaro swc from binary_image.tiff # -> binary_image.swc
kimimaro swc to --format tiff binary_image.swc # -> binary_image.tiff or npy
```

## Tweaking `kimimaro.skeletonize` Parameters

This algorithm works by finding a root point on a 3D object and then serially tracing paths via Dijkstra's shortest path algorithm through a penalty field to the most distant unvisited point. After each pass, there is a sphere (really a circumscribing cube) that expands around each vertex in the current path that marks part of the object as visited.  

For a visual tutorial on the basics of the skeletonization procedure, check out this wiki article: [A Pictorial Guide to TEASAR Skeletonization](https://github.com/seung-lab/kimimaro/wiki/A-Pictorial-Guide-to-TEASAR-Skeletonization)

For more detailed information, [read below](https://github.com/seung-lab/kimimaro#ii-skeletonization) or the [TEASAR paper](https://ieeexplore.ieee.org/abstract/document/883951/) (though we [deviate from TEASAR](https://github.com/seung-lab/kimimaro#teasar-derived-algorithm) in a few places). [1]

### `scale` and `const`

Usually, the most important parameters to tweak are `scale` and `const` which control the radius of this invalidation sphere according to the equation `r(x,y,z) = scale * DBF(x,y,z) + const` where the dimensions are physical (e.g. nanometers, i.e. corrected for anisotropy). `DBF(x,y,z)` is the physical distance from the shape boundary at that point.  

Check out this [wiki article](https://github.com/seung-lab/kimimaro/wiki/Intuition-for-Setting-Parameters-const-and-scale) to help refine your intuition.

### `anisotropy`

Represents the physical dimension of each voxel. For example, a connectomics dataset might be scanned with an electron microscope at 4nm x 4nm per pixel and stacked in slices 40nm thick. i.e. `anisotropy=(4,4,40)`. You can use any units so long as you are consistent.

### `dust_threshold`

This threshold culls connected components that are smaller than this many voxels.  

### `extra_targets_after` and `extra_targets_before`  

`extra_targets_after` provides additional voxel targets to trace to after the morphological tracing algorithm completes. For example, you might add known synapse locations to the skeleton.   

`extra_targets_before` is the same as `extra_targets_after` except that the additional targets are front-loaded and the paths that they cover are invalidated. This may affect the results of subsequent morphological tracing.

### `max_paths`  

Limits the number of paths that can be drawn for the given label. Certain cells, such as glia, that may not be important for the current analysis may be expensive to process and can be aborted early.  

### `pdrf_scale` and `pdrf_exponent`

The `pdrf_scale` and `pdrf_exponent` represent parameters to the penalty equation that takes the euclidean distance field (**D**) and augments it so that cutting closer to the border is very penalized to make dijkstra take paths that are more centered.   

P<sub>r</sub> = `pdrf_scale` * (1 - **D** / max(**D**)) <sup>`pdrf_exponent`</sup> + (directional gradient < 1.0).  

The default settings should work fairly well, but under large anisotropies or with cavernous morphologies, it's possible that you might need to tweak it. If you see the skeleton go haywire inside a large area, it could be a collapse of floating point precision.  

### `soma_acceptance_threshold` and `soma_detection_threshold`

We process somas specially because they do not have a tubular geometry and instead should be represented in a hub and spoke manner. `soma_acceptance_threshold` is the physical radius (e.g. in nanometers) beyond which we classify a connected component of the image as containing a soma. The distance transform's output is depressed by holes in the label, which are frequently produced by segmentation algorithms on somata. We can fill them, but the hole filling algorithm we use is slow so we would like to only apply it occasionally. Therefore, we set a lower threshold, the `soma_detection_threshold`, beyond which we fill the holes and retest the soma against `soma_acceptance_threshold`.  

### `soma_invalidation_scale` and `soma_invalidation_const`   

Once we have classified a region as a soma, we fix the root of the skeletonization algorithm at one of the points of maximum distance from the boundary (usually there is only one). We then mark as visited all voxels around that point in a spherical radius described by `r(x,y,z) = soma_invalidation_scale * DBF(x,y,z) + soma_invalidation_const` where DBF(x,y,z) is the physical distance from the shape boundary at that point. If done correctly, this can prevent skeletons from being drawn to the boundaries of the soma, and instead pulls the skeletons mainly into the processes extending from the cell body.  

### `fix_borders`

This feature makes it easier to connect the skeletons of adjacent image volumes that do not fit in RAM. If enabled, skeletons will be deterministically drawn to the approximate center of the 2D contact area of each place where the shape contacts the border. This can affect the performance of the operation positively or negatively depending on the shape and number of contacts.  

### `fix_branching`  

You'll probably never want to disable this, but base TEASAR is infamous for forking the skeleton at branch points way too early. This option makes it preferential to fork at a more reasonable place at a significant performance penalty. 

### `fill_holes`

_Warning: This will remove input labels that are deemed to be holes._

If your segmentation contains artifacts that cause holes to appear in labels, you can preprocess the entire image to eliminate background holes and holes caused by entirely contained inclusions. This option adds a moderate amount of additional processing time at the beginning (perhaps ~30%). 

### `fix_avocados`

Avocados are segmentations of cell somata that classify the nucleus separately from the cytoplasm. This is a common problem in automatic segmentations due to the visual similarity of a cell membrane and a nuclear membrane combined with insufficient context.  

Skeletonizing an avocado results in a poor skeletonization of the cell soma that will disconnect the nucleus and usually results in too many paths traced around the nucleus. Setting `fix_avocados=True` attempts to detect and fix these problems. Currently we handle non-avocados, avocados, cells with inclusions, and nested avocados. You can see examples [here](https://github.com/seung-lab/kimimaro/pull/43).

### `progress`

Show a progress bar once the skeletonization phase begins.

### `parallel`  

Use a pool of processors to skeletonize faster. Each process allocatable task is the skeletonization of one connected component (so it won't help with a single label that takes a long time to skeletonize). This option also affects the speed of the initial euclidean distance transform, which is parallel enabled and is the most expensive part of the Preamble (described below).  

### `parallel_chunk_size`  

This only applies when using parallel. This sets the number of skeletons a subprocess will extract before returning control to the main thread, updating the progress bar, and acquiring a new task. If this value is set too low (e.g. < 10-20) the cost of interprocess communication can become significant and even dominant. If it is set too high, task starvation may occur for the other subprocesses if a subprocess gets a particularly hard skeleton and they complete quickly. Progress bar updates will be infrequent if the value is too high as well.  

The actual chunk size used will be `min(parallel_chunk_size, len(cc_labels) // parallel)`. `cc_labels` represents the number of connected components in the sample.  

### Performance Tips

- If you only need a few labels skeletonized, pass in `object_ids` to bypass processing all the others. If `object_ids` contains only a single label, the masking operation will run faster.
- Larger TEASAR parameters scale and const require processing larger invalidation regions per path.
- Set `pdrf_exponent` to a small power of two (e.g. 1, 2, 4, 8, 16) for a small speedup.
- If you are willing to sacrifice the improved branching behavior, you can set `fix_branching=False` for a moderate 1.1x to 1.5x speedup (assuming your TEASAR parameters and data allow branching).
- If your dataset contains important cells (that may in fact be the seat of consciousness) but they take significant processing power to analyze, you can save them to savor for later by setting `max_paths` to some reasonable level which will abort and proceed to the next label after the algorithm detects that at least that many paths will be needed.
- Parallel distributes work across connected components and is generally a good idea if you have the cores and memory. Not only does it make single runs proceed faster, but you can also practically use a much larger context; that improves soma processing as they are less likely to be cut off. The Preamble of the algorithm (detailed below) is still single threaded at the moment, so task latency increases with size. 
- If `parallel_chunk_size` is set very low (e.g. < 10) during parallel operation, interprocess communication can become a significant overhead. Try raising this value.  

## Motivation

The connectomics field commonly generates very large densely labeled volumes of neural tissue. Skeletons are one dimensional representations of two or three dimensional objects. They have many uses, a few of which are visualization of neurons, calculating global topological features, rapidly measuring electrical distances between objects, and imposing tree structures on neurons (useful for computation and user interfaces). There are several ways to compute skeletons and a few ways to define them [4]. After some experimentation, we found that the TEASAR [1] approach gave fairly good results. Other approaches include topological thinning ("onion peeling") and finding the centerline described by maximally inscribed spheres. Ignacio Arganda-Carreras, an alumnus of the Seung Lab, wrote a topological thinning plugin for Fiji called [Skeletonize3d](https://imagej.net/Skeletonize3D). 

There are several implementations of TEASAR used in the connectomics field [3][5], however it is commonly understood that implementations of TEASAR are slow and can use tens of gigabytes of memory. Our goal to skeletonize all labels in a petavoxel scale image quickly made clear that existing sparse implementations are impractical. While adapting a sparse approach to a cloud pipeline, we noticed that there are inefficiencies in repeated evaluation of the Euclidean Distance Transform (EDT), the repeated evaluation of the connected components algorithm, in the construction of the graph used by Dijkstra's algorithm where the edges are implied by the spatial relationships between voxels, in the memory cost, quadratic in the number of voxels, of representing a graph that is implicit in the image, in the unnecessarily large data type used to represent relatively small cutouts, and in the repeated downloading of overlapping regions. We also found that the naive implementation of TEASAR's "rolling invalidation ball" unnecessarily reevaluated large numbers of voxels in a way that could be loosely characterized as quadratic in the skeleton path length.   

We further found that commodity implementations of the EDT supported only binary images. We were unable to find any available Python or C++ libraries for performing Dijkstra's shortest path on an image. Commodity implementations of connected components algorithms for images supported only binary images. Therefore, several libraries were devised to remedy these deficits (see Related Projects). 

## Why TEASAR?

TEASAR: Tree-structure Extraction Algorithm for Accurate and Robust skeletons, a 2000 paper by M. Sato and others [1], is a member of a family of algorithms that transform two and three dimensional structures into a one dimensional "skeleton" embedded in that higher dimension. One might conceive of a skeleton as extracting a stick figure drawing from a binary image. This problem is more difficult than it might seem. There are different situations one must consider when making such a drawing. For example, a stick drawing of a banana might merely be a curved centerline and a drawing of a doughnut might be a closed loop. In our case of analyzing neurons, sometimes we want the skeleton to include spines, short protrusions from dendrites that usually have synapses attached, and sometimes we want only to characterize the run length of the main trunk of a neurite.  

Additionally, data quality issues can be challenging as well. If one is skeletonizing a 2D image of a doughnut, but the angle were sufficiently declinated from the ring's orthogonal axis, would it even be possible to perform this task accurately? In a 3D case, if there are breaks or mergers in the labeling of a neuron, will the algorithm function sensibly? These issues are common in both manual and automatic image segmentations.

In our problem domain of skeletonizing neurons from anisotropic voxel labels, our chosen algorithm should produce tree structures, handle fine or coarse detail extraction depending on the circumstances, handle voxel anisotropy, and be reasonably efficient in CPU and memory usage. TEASAR fulfills these criteria. Notably, TEASAR doesn't guarantee the centeredness of the skeleton within the shape, but it makes an effort. The basic TEASAR algorithm is known to cut corners around turns and branch too early. A 2001 paper by members of the original TEASAR team describes a method for reducing the early branching issue on page 204, section 4.2.2. [2]

## TEASAR Derived Algorithm

We implemented TEASAR but made several deviations from the published algorithm in order to improve path centeredness, increase performance, handle bulging cell somas, and enable efficient chunked evaluation of large images. We opted not to implement the gradient vector field step from [2] as our implementation is already quite fast. The paper claims a reduction of 70-85% in input voxels, so it might be worth investigating.  

In order to work with images that contain many labels, our general strategy is to perform as many actions as possible in such a way that all labels are treated in a single pass. Several of the component algorithms (e.g. connected components, euclidean distance transform) in our implementation can take several seconds per pass, so it is important that they not be run hundreds or thousands of times. A large part of the engineering contribution of this package lies in the efficiency of these operations which reduce the runtime from the scale of hours to minutes.  

Given a 3D labeled voxel array, *I*, with N >= 0 labels, and an ordered triple describing voxel anisotropy *A*, our algorithm can be divided into three phases, the preamble, skeletonization, and finalization in that order.

### I. Preamble

The Preamble takes a 3D image containing *N* labels and efficiently generates the connected components, distance transform, and bounding boxes needed by the skeletonization phase.

1. To enhance performance, if *N* is 0 return an empty set of skeletons.
2. Label the M connected components, *I<sub>cc</sub>*, of *I*.
3. To save memory, renumber the connected components in order from 1 to *M*. Adjust the data type of the new image to the smallest uint type that will contain *M* and overwrite *I<sub>cc</sub>*.
4. Generate a mapping of the renumbered *I<sub>cc</sub>* to *I* to assign meaningful labels to skeletons later on and delete *I* to save memory.
5. Compute *E*, the multi-label anisotropic Euclidean Distance Transform of *I<sub>cc</sub>* given *A*. *E* treats all interlabel edges as transform edges, but not the boundaries of the image. Black pixels are considered background.
6. Gather a list, *L<sub>cc</sub>* of unique labels from *I<sub>cc</sub>* and threshold which ones to process based on the number of voxels they represent to remove "dust".
7. In one pass, compute the list of bounding boxes, *B*, corresponding to each label in *L<sub>cc</sub>*.

### II. Skeletonization 

In this phase, we extract the tree structured skeleton from each connected component label. Below, we reference variables defined in the Preamble. For clarity, we omit the soma specific processing and hold `fix_branching=True`. 

For each label *l* in *L<sub>cc</sub>* and *B*...

1. Extract *I<sub>l</sub>*, the cropped binary image tightly enclosing *l* from *I<sub>cc</sub>* using *B<sub>l</sub>*
2. Using *I<sub>l</sub>* and *B<sub>l</sub>*, extract *E<sub>l</sub>* from *E*. *E<sub>l</sub>* is the cropped tightly enclosed EDT of *l*. This is much faster than recomputing the EDT for each binary image.
3. Find an arbitrary foreground voxel and using that point as a source, compute the anisotropic euclidean distance field for *I<sub>l</sub>*. The coordinate of the maximum value is now "the root" *r*.
4. From *r*, compute the euclidean distance field and save it as the distance from root field *D<sub>r</sub>*.
5. Compute the penalized distance from root field *P<sub>r</sub>* = `pdrf_scale` * ((1 - *E<sub>l</sub>* / max(*E<sub>l</sub>*)) ^ `pdrf_exponent`) + *D<sub>r</sub>* / max(*D<sub>r</sub>*). 
6. While *I<sub>l</sub>* contains foreground voxels:
    1. Identify a target coordinate, *t*, as the foreground voxel with maximum distance in *D<sub>r</sub>* from *r*.
    2. Draw the shortest path *p* from *r* to *t* considering the voxel values in *P<sub>r</sub>* as edge weights.
    3. For each vertex *v* in *p*, extend an invalidation cube of physical side length computed as `scale` * *E<sub>l</sub>*(*v*) + `const` and convert any foreground pixels in *I<sub>l</sub>* that overlap with these cubes to background pixels.
    4. (Only if `fix_branching=True`) For each vertex coordinate *v* in *p*, set *P<sub>r</sub>*(*v*) = 0.
    5. Append *p* to a list of paths for this label.
7. Using *E<sub>l</sub>*, extract the distance to the nearest boundary each vertex in the skeleton represents.
8. For each raw skeleton extracted from *I<sub>l</sub>*, translate the vertices by *B<sub>l</sub>* to correct for the translation the cropping operation induced.
9. Multiply the vertices by the anisotropy *A* to place them in physical space.

If soma processing is considered, we modify the root (*r*) search process as follows:  

1. If max(*E<sub>l</sub>*) > `soma_detection_threshold`...
    1. Fill topological holes in *I<sub>l</sub>*. Soma are large regions that often have dust from imperfect automatic labeling methods.
    2. Recompute *E<sub>l</sub>* from this cleaned up image.
    3. If max(*E<sub>l</sub>*) > `soma_acceptance_threshold`, divert to soma processing mode.
2. If in soma processing mode, continue, else go to step 3 in the algorithm above.
3. Set *r* to the coordinate corresponding to max(*E<sub>l</sub>*)
4. Create an invalidation sphere of physical radius `soma_invalidation_scale` * max(*E<sub>l</sub>*) + `soma_invalidation_const` and erase foreground voxels from *I<sub>l</sub>* contained within it. This helps prevent errant paths from being drawn all over the soma.
5. Continue from step 4 in the above algorithm.

### III. Finalization

In the final phase, we agglomerate the disparate connected component skeletons into single skeletons and assign their labels corresponding to the input image. This step is artificially broken out compared to how intermingled its implementation is with skeletonization, but it's conceptually separate.

## Deviations from TEASAR

There were several places where we took a different approach than called for by the TEASAR authors.

### Using DAF for Targets, PDRF for Pathfinding

The original TEASAR algorithm defines the Penalized Distance from Root voxel Field (PDRF, *P<sub>r</sub>* above) as:

```
PDRF = 5000 * (1 - DBF / max(DBF))^16 + DAF
```

DBF is the Distance from Boundary Field (*E<sub>l</sub>* above) and DAF is the Distance from Any voxel Field (*D<sub>r</sub>* above).  

We found the addition of the DAF tended to perturb the skeleton path from the centerline better described by the inverted DBF alone. We also found it helpful to modify the constant and exponent to tune cornering behavior. Initially, we completely stripped out the addition of the DAF from the PDRF, but this introduced a different kind of problem. The exponentiation of the PDRF caused floating point values to collapse in wide open spaces. This made the skeletons go crazy as they traced out a path described by floating point errors.  

The DAF provides a very helpful gradient to follow between the root and the target voxel, we just don't want that gradient to knock the path off the centerline. Therefore, in light of the fact that the PDRF base field is very large, we add the normalized DAF which is just enough to overwhelm floating point errors and provide direction in wide tubes and bulges.  

The original paper also called for selecting targets using the max(PDRF) foreground values. However, this is a bit strange since the PDRF values are dominated by boundary effects rather than a pure distance metric. Therefore, we select targets from the max(DAF) foreground value.

### Zero Weighting Previous Paths (`fix_branching=True`)

The 2001 skeletonization paper [2] called for correcting early forking by computing a DAF using already computed path vertices as field sources. This allows Dijkstra's algorithm to trace the existing path cost free and diverge from it at a closer point to the target.  

As we have strongly deemphasized the role of the DAF in dijkstra path finding, computing this field is unnecessary and we only need to set the PDRF to zero along the path of existing skeletons to achieve this effect. This saves us an expensive repeated DAF calculation per path.  

However, we still incur a substantial cost for taking this approach because we had been computing a dijkstra "parental field" that recorded the shortest path to the root from every foreground voxel. We then used this saved result to rapidly compute all paths. However, as this zero weighting modification makes successive calculations dependent upon previous ones, we need to compute Dijkstra's algorithm anew for each path.

### Non-Overlapped Chunked Processing (`fix_borders=True`)

When processing large volumes, a sensible approach for mass producing skeletons is to chunk the volume, process the chunks independently, and merge the resulting skeleton fragments at the end. However, this is complicated by the "edge effect" induced by a loss of context which makes it impossible to expect the endpoints of skeleton fragments produced by adjacent chunks to align. In contrast, it is easy to join mesh fragments because the vertices of the edge of mesh fragments lie at predictable identical locations given one pixel of overlap.  

Previously, we had used 50% overlap to join adjacent skeleton fragments which increased the compute cost of skeletonizing a large volume by eight times. However, if we could force skeletons to lie at predictable locations on the border, we could use single pixel overlap and copy the simple mesh joining approach. As an (incorrect but useful) intuition for how one might go about this, consider computing the centroid of each connected component on each border plane and adding that as a required path target. This would guarantee that both sides of the plane connect at the same pixel. However, the centroid may not lie inside of non-convex hulls so we have to be more sophisticated and select some real point inside of the shape.

To this end, we again repurpose the euclidean distance transform and apply it to each of the six planes of connected components and select the maximum value as a mandatory target. This works well for many types of objects that contact a single plane and have a single maximum. However, we must treat the corners of the box and shapes that have multiple maxima.  

To handle shapes that contact multiple sides of the box, we simply assign targets to all connected components. If this introduces a cycle in post-processing, we already have cycle removing code to handle it in Igneous. If it introduces tiny useless appendages, we also have code to handle this.  

If a shape has multiple distance transform maxima, it is important to choose the same pixel without needing to communicate between spatially adjacent tasks which may run at different times on different machines. Additionally, the same plane on adjacent tasks has the coordinate system flipped. One simple approach might be to pick the coordinate with minimum x and y (or some other coordinate based criterion) in one of the coordinate frames, but this requires tracking the flips on all six planes and is annoying. Instead, we use a series of coordinate-free topology based filters which is both more fun, effort efficient, and picks something reasonable looking. A valid criticism of this approach is that it will fail on a perfectly symmetrical object, but these objects are rare in biological data.  

We apply a series of filters and pick the point based on the first filter it passes:

1. The voxel closest to the centroid of the current label.
2. The voxel closest to the centroid of the image plane.
3. Closest to a corner of the plane.
4. Closest to an edge of the plane.
5. The previously found maxima.

It is important that filter #1 be based on the shape of the label so that kinks are minimized for convex hulls. For example, originally we used only filters two thru five, but this caused skeletons for neurites located away from the center of a chunk to suddenly jink towards the center of the chunk at chunk boundaries.

## Related Projects

Several classic algorithms had to be specially tuned to make this module possible.  

1. [edt](https://github.com/seung-lab/euclidean-distance-transform-3d): A single pass, multi-label anisotropy supporting euclidean distance transform implementation. 
2. [dijkstra3d](https://github.com/seung-lab/dijkstra3d): Dijkstra's shortest-path algorithm defined on 26-connected 3D images. This avoids the time cost of edge generation and wasted memory of a graph representation.
3. [connected-components-3d](https://github.com/seung-lab/connected-components-3d): A connected components implementation defined on 26-connected 3D images with multiple labels.
4. [fastremap](https://github.com/seung-lab/fastremap): Allows high speed renumbering of labels from 1 in a 3D array in order to reduce memory consumption caused by unnecessarily large 32 and 64-bit labels.
5. [fill_voids](https://github.com/seung-lab/fill_voids): High speed binary_fill_holes.
6. [xs3d](https://github.com/seung-lab/cross-section): Cross section analysis of 3D images.

This module was originally designed to be used with CloudVolume and Igneous. 

1. [CloudVolume](https://github.com/seung-lab/cloud-volume): Serverless client for reading and writing petascale chunked images of neural tissue, meshes, and skeletons.
2. [Igneous](https://github.com/seung-lab/igneous/tree/master/igneous): Distributed computation for visualizing connectomics datasets.  

Some of the TEASAR modifications used in this package were first demonstrated by Alex Bae.

1. [skeletonization](https://github.com/seung-lab/skeletonization): Python implementation of modified TEASAR for sparse labels.

## Credits  

Alex Bae developed the precursor skeletonization package and several modifications to TEASAR that we use in this package. Alex also developed the postprocessing approach used for stitching skeletons using 50% overlap. Will Silversmith adapted these techniques for mass production, refined several basic algorithms for handling thousands of labels at once, and rewrote them into the Kimimaro package. Will added trickle DAF, zero weighted previously explored paths, and fixing borders to the algorithm. A.M. Wilson and Will designed the nucleus/soma "avocado" fuser. Forrest Collman added parameter flexibility and helped tune DAF computation performance. Sven Dorkenwald and Forrest both provided helpful discussions and feedback. Peter Li redesigned the target selection algorithm to avoid bilinear performance on complex cells. 

## Acknowledgments  

We are grateful to our partners in the Seung Lab, the Allen Institute for Brain Science, and the Baylor College of Medicine for providing the data and problems necessitating this library.

This research was supported by the Intelligence Advanced Research Projects Activity (IARPA) via Department of Interior/ Interior Business Center (DoI/IBC) contract number D16PC0005, NIH/NIMH (U01MH114824, U01MH117072, RF1MH117815), NIH/NINDS (U19NS104648, R01NS104926), NIH/NEI (R01EY027036), and ARO (W911NF-12-1-0594). The U.S. Government is authorized to reproduce and distribute reprints for Governmental purposes notwithstanding any copyright annotation thereon. Disclaimer: The views and conclusions contained herein are those of the authors and should not be interpreted as necessarily representing the official policies or endorsements, either expressed or implied, of IARPA, DoI/IBC, or the U.S. Government. We are grateful for assistance from Google, Amazon, and Intel.

## Papers Using Kimimaro

Please cite Kimimaro using the CITATION.cff file located in this repository.

The list below is not comprehensive. It is sourced from collaborators or found through internet searches, and inclusion does not constitute an endorsement except to the extent that the authors used Kimimaro for their work.

1. A.M. Wilson, R. Schalek, A. Suissa-Peleg, T.R. Jones, S. Knowles-Barley, H. Pfister, J.M. Lichtman. "Developmental Rewiring between Cerebellar Climbing Fibers and Purkinje Cells Begins with Positive Feedback Synapse Addition". Cell Reports. Vol. 29, Iss. 9, November 2019. Pgs. 2849-2861.e6 doi: 10.1016/j.celrep.2019.10.081  ([link](https://www.cell.com/cell-reports/fulltext/S2211-1247(19)31403-2))
2. S. Dorkenwald, N.L. Turner, T. Macrina, K. Lee, R. Lu, J. Wu, A.L. Bodor, A.A. Bleckert, D. Brittain, N. Kemnitz, W.M. Silversmith, D. Ih, J. Zung, A. Zlateski, I. Tartavull, S. Yu, S. Popovych, W. Wong, M. Castro, C. S. Jordan, A.M. Wilson, E. Froudarakis, J. Buchanan, M. Takeno, R. Torres, G. Mahalingam, F. Collman, C. Schneider-Mizell, D.J. Bumbarger, Y. Li, L. Becker, S. Suckow, J. Reimer, A.S. Tolias, N. Ma<span>&ccedil;</span>arico da Costa, R. C. Reid, H.S. Seung. "Binary and analog variation of synapses between cortical pyramidal neurons". bioRxiv. December 2019. doi: 10.1101/2019.12.29.890319 ([link](https://www.biorxiv.org/content/10.1101/2019.12.29.890319v1.full))  
3. N.L. Turner, T. Macrina, J.A. Bae, R. Yang, A.M. Wilson, C. Schneider-Mizell, K. Lee, R. Lu, J. Wu, A.L. Bodor, A.A. Bleckert, D. Brittain, E. Froudarakis, S. Dorkenwald, F. Collman, N. Kemnitz, D. Ih, W.M. Silversmith, J. Zung, A. Zlateski, I. Tartavull, S. Yu, S. Popovych, S. Mu, W. Wong, C.S. Jordan, M. Castro, J. Buchanan, D.J. Bumbarger, M. Takeno, R. Torres, G. Mahalingam, L. Elabbady, Y. Li, E. Cobos, P. Zhou, S. Suckow, L. Becker, L. Paninski, F. Polleux, J. Reimer, A.S. Tolias, R.C. Reid, N. Ma<span>&ccedil;</span>arico da Costa, H.S. Seung. "Multiscale and multimodal reconstruction of cortical structure and function".
bioRxiv. October 2020; doi: 10.1101/2020.10.14.338681 ([link](https://www.biorxiv.org/content/10.1101/2020.10.14.338681v3))
4. P.H. Li, L.F. Lindsey, M. Januszewski, Z. Zheng, A.S. Bates, I. Taisz, M. Tyka, M. Nichols, F. Li, E. Perlman, J. Maitin-Shepard, T. Blakely, L. Leavitt, G. S.X.E. Jefferis, D. Bock, V. Jain. "Automated Reconstruction of a Serial-Section EM Drosophila Brain with Flood-Filling Networks and Local Realignment". bioRxiv. October 2020. doi: 10.1101/605634  ([link](https://www.biorxiv.org/content/10.1101/605634v3))

## References 

1. M. Sato, I. Bitter, M.A. Bender, A.E. Kaufman, and M. Nakajima. "TEASAR: Tree-structure Extraction Algorithm for Accurate and Robust Skeletons". Proc. 8th Pacific Conf. on Computer Graphics and Applications. Oct. 2000. doi: 10.1109/PCCGA.2000.883951 ([link](https://ieeexplore.ieee.org/abstract/document/883951/))
2. I. Bitter, A.E. Kaufman, and M. Sato. "Penalized-distance volumetric skeleton algorithm". IEEE Transactions on Visualization and Computer Graphics Vol. 7, Iss. 3, Jul-Sep 2001. doi: 10.1109/2945.942688 ([link](https://ieeexplore.ieee.org/abstract/document/942688/))
3. T. Zhao, S. Plaza. "Automatic Neuron Type Identification by Neurite Localization in the Drosophila Medulla". Sept. 2014. arXiv:1409.1892 \[q-bio.NC\] ([link](https://arxiv.org/abs/1409.1892))
4. A. Tagliasacchi, T. Delame, M. Spagnuolo, N. Amenta, A. Telea. "3D Skeletons: A State-of-the-Art Report". May 2016. Computer Graphics Forum. Vol. 35, Iss. 2. doi: 10.1111/cgf.12865 ([link](https://onlinelibrary.wiley.com/doi/full/10.1111/cgf.12865))
5. P. Li, L. Lindsey, M. Januszewski, Z. Zheng, A. Bates, I. Taisz, M. Tyka, M. Nichols, F. Li, E. Perlman, J. Maitin-Shepard, T. Blakely, L. Leavitt, G. Jefferis, D. Bock, V. Jain. "Automated Reconstruction of a Serial-Section EM Drosophila Brain with Flood-Filling Networks and Local Realignment". April 2019. bioRxiv. doi: 10.1101/605634 ([link](https://www.biorxiv.org/content/10.1101/605634v1))
6. M.M. McKerns, L. Strand, T. Sullivan, A. Fang, M.A.G. Aivazis, "Building a framework for predictive science", Proceedings of the 10th Python in Science Conference, 2011; http://arxiv.org/pdf/1202.1056
7. Michael McKerns and Michael Aivazis, "pathos: a framework for heterogeneous computing", 2010- ; http://trac.mystic.cacr.caltech.edu/project/pathos


================================================
FILE: automated_test.py
================================================
import pytest

import edt
import numpy as np
from osteoid import Skeleton

import kimimaro.intake
import kimimaro.post
import kimimaro.skeletontricks
from kimimaro.utility import moving_average, cross_sectional_area

@pytest.fixture
def connectomics_data():
  """Fixture: load the crackle-compressed connectomics benchmark volume."""
  # Imported lazily so only tests that request this fixture need crackle.
  import crackle
  volume = crackle.load("benchmarks/connectomics.npy.ckl.gz")
  return volume

def test_empty_image():
  """An all-background volume must produce no skeletons."""
  background = np.zeros((256, 256, 256), dtype=bool)
  result = kimimaro.skeletonize(background, fix_borders=True)
  assert len(result) == 0

def test_very_sparse_image():
  """Three foreground voxels (one adjacent pair, one isolated) give one skeleton."""
  labels = np.zeros((64, 64, 64), dtype=bool)
  for voxel in ((5, 5, 5), (6, 5, 5), (20, 20, 20)):
    labels[voxel] = True

  skels = kimimaro.skeletonize(labels, dust_threshold=0)

  # single voxels don't get skeletonized
  assert len(skels) == 1

def test_solid_image():
  """A completely filled volume is skeletonized as a single object."""
  solid = np.ones((128, 128, 128), dtype=bool)
  result = kimimaro.skeletonize(solid, fix_borders=True)
  assert len(result) == 1

def test_binary_image():
  """A thin boolean slab with two corner columns cleared yields one skeleton."""
  slab = np.ones((256, 256, 3), dtype=bool)
  # Two-index assignment clears the whole z column at each corner.
  for corner in ((-1, 0), (0, -1)):
    slab[corner] = 0

  result = kimimaro.skeletonize(slab, fix_borders=False)

  assert len(result) == 1

@pytest.mark.parametrize('fill_holes', (True, False))
def test_square(fill_holes):
  """
  A 1000x1000 square with two opposite corners cleared must produce a
  single 1000-vertex path of length 999 * sqrt(2), for both corner pairs.
  """
  teasar_params = {
    "scale": 1.5, 
    "const": 300,
    "pdrf_scale": 100000,
    "pdrf_exponent": 4,
    "soma_acceptance_threshold": 3500,
    "soma_detection_threshold": 750,
    "soma_invalidation_const": 300,
    "soma_invalidation_scale": 2
  }

  # Each entry names the two opposite corners to zero out.
  corner_pairs = (
    ((-1, 0), (0, -1)),
    ((0, 0), (-1, -1)),
  )

  for first_corner, second_corner in corner_pairs:
    labels = np.ones((1000, 1000), dtype=np.uint8)
    labels[first_corner] = 0
    labels[second_corner] = 0

    skels = kimimaro.skeletonize(
      labels,
      teasar_params=teasar_params,
      fix_borders=False,
      fill_holes=fill_holes,
    )

    assert len(skels) == 1

    skel = skels[1]
    assert skel.vertices.shape[0] == 1000
    assert skel.edges.shape[0] == 999
    assert abs(skel.cable_length() - 999 * np.sqrt(2)) < 0.001
    assert skel.space == 'physical'

def test_cube():
  """
  A 128^3 cube with two opposite corners cleared must produce a single
  128-vertex path of length 127 * sqrt(3).
  """
  cube = np.ones((128, 128, 128), dtype=np.uint8)
  for corner in ((0, 0, 0), (-1, -1, -1)):
    cube[corner] = 0

  skels = kimimaro.skeletonize(cube, fix_borders=False)
  assert len(skels) == 1

  path = skels[1]
  n_vertices = path.vertices.shape[0]
  n_edges = path.edges.shape[0]
  assert n_vertices == 128
  assert n_edges == 127
  assert abs(path.cable_length() - 127 * np.sqrt(3)) < 0.001
  assert path.space == 'physical'

def test_find_border_targets():
  """A 255x255 square inside a 257x257 frame has its border target at the center."""
  plane = np.zeros((257, 257), dtype=np.uint8)
  plane[1:-1, 1:-1] = 1

  distance = edt.edt(plane)
  found = kimimaro.skeletontricks.find_border_targets(
    distance, plane.astype(np.uint32), wx=100, wy=100
  )

  assert len(found) == 1
  assert found[1] == (128, 128)

def test_fix_borders_z():
  """
  With fix_borders on and anisotropic voxels, a prism spanning the z axis
  skeletonizes (in voxel space) to a straight line at x = y = 129.
  """
  volume = np.zeros((256, 256, 256), dtype=np.uint8)
  volume[64:196, 64:196, :] = 128

  options = dict(
    teasar_params={
      'const': 250,
      'scale': 10,
      'pdrf_exponent': 4,
      'pdrf_scale': 100000,
    },
    anisotropy=(40,32,20),
    object_ids=None,
    dust_threshold=1000,
    progress=True,
    fix_branching=True,
    in_place=False,
    fix_borders=True,
  )
  skel = kimimaro.skeletonize(volume, **options)[128]

  # Output starts in physical units; convert before comparing voxel indices.
  assert skel.space == 'physical'
  skel = skel.voxel_space()

  expected = (129, 129, np.arange(256))
  for axis, value in enumerate(expected):
    assert np.all(skel.vertices[:,axis] == value)
  assert skel.space == 'voxel'

def test_fix_borders_x():
  """With fix_borders on, a prism spanning the x axis gives a line at y = z = 129."""
  volume = np.zeros((256, 256, 256), dtype=np.uint8)
  volume[:, 64:196, 64:196] = 128

  options = dict(
    teasar_params={
      'const': 250,
      'scale': 10,
      'pdrf_exponent': 4,
      'pdrf_scale': 100000,
    },
    anisotropy=(1,1,1),
    object_ids=None,
    dust_threshold=1000,
    progress=True,
    fix_branching=True,
    in_place=False,
    fix_borders=True,
  )
  skel = kimimaro.skeletonize(volume, **options)[128]

  xs, ys, zs = (skel.vertices[:,axis] for axis in range(3))
  assert np.all(xs == np.arange(256))
  assert np.all(ys == 129)
  assert np.all(zs == 129)

def test_fix_borders_y():
  """With fix_borders on, a prism spanning the y axis gives a line at x = z = 129."""
  volume = np.zeros((256, 256, 256), dtype=np.uint8)
  volume[64:196, :, 64:196] = 128

  options = dict(
    teasar_params={
      'const': 250,
      'scale': 10,
      'pdrf_exponent': 4,
      'pdrf_scale': 100000,
    },
    anisotropy=(1,1,1),
    object_ids=None,
    dust_threshold=1000,
    progress=True,
    fix_branching=True,
    in_place=False,
    fix_borders=True,
  )
  skel = kimimaro.skeletonize(volume, **options)[128]

  xs, ys, zs = (skel.vertices[:,axis] for axis in range(3))
  assert np.all(xs == 129)
  assert np.all(ys == np.arange(256))
  assert np.all(zs == 129)

def test_extra_targets():
  """
  Supplying an extra target near the corner must change the skeleton size:
  extra_targets_after gives more vertices than both the baseline run and
  the extra_targets_before run.
  """
  labels = np.zeros((256, 256, 1), dtype=np.uint8)
  labels[64:196, 64:196, :] = 128

  shared_options = dict(
    teasar_params={
      'const': 250,
      'scale': 10,
      'pdrf_exponent': 4,
      'pdrf_scale': 100000,
    },
    anisotropy=(1,1,1),
    object_ids=None,
    dust_threshold=1000,
    progress=True,
    fix_branching=True,
    in_place=False,
    fix_borders=True,
  )

  def trace(**extra):
    # Skeletonize with the shared options plus any per-run overrides.
    return kimimaro.skeletonize(labels, **shared_options, **extra)[128]

  corner_target = [ (65, 65, 0) ]

  baseline = trace()
  with_after = trace(extra_targets_after=corner_target)
  assert baseline.vertices.size < with_after.vertices.size

  with_before = trace(extra_targets_before=corner_target)
  assert with_before.vertices.size < with_after.vertices.size


def test_parallel():
  """Four quadrant-shaped labels processed with parallel=2 give four skeletons."""
  labels = np.zeros((256, 256, 128), dtype=np.uint8)

  # Assign labels 1..4 to the four xy quadrants, across all z.
  halves = (slice(0, 128), slice(128, 256))
  label = 1
  for x_half in halves:
    for y_half in halves:
      labels[x_half, y_half, :] = label
      label += 1

  options = dict(
    teasar_params={
      'const': 250,
      'scale': 10,
      'pdrf_exponent': 4,
      'pdrf_scale': 100000,
    },
    anisotropy=(1,1,1),
    object_ids=None,
    dust_threshold=1000,
    progress=True,
    fix_branching=True,
    in_place=False,
    fix_borders=True,
    parallel=2,
  )
  skels = kimimaro.skeletonize(labels, **options)

  assert len(skels) == 4

def test_dimensions():
  """
  1D, 2D, 3D and 4D-with-singleton-channel inputs are accepted; a 4D input
  with two channels must raise kimimaro.DimensionError.
  """
  accepted_shapes = ((10,), (10,10), (10,10,10), (10,10,10,1))
  for shape in accepted_shapes:
    kimimaro.skeletonize(np.zeros(shape, dtype=np.uint8))

  rejected = np.zeros((10,10,10,2), dtype=np.uint8)
  with pytest.raises(kimimaro.DimensionError):
    kimimaro.skeletonize(rejected)

@pytest.mark.parametrize('axis', ('x','y'))
def test_joinability(axis):
  """
  Skeletonize two z-overlapping halves of a slab and merge them. With
  fix_borders=True the merged skeleton must be one connected component,
  and it must differ from the merge produced with fix_borders=False.
  """
  def trace(volume, fix_borders):
    return kimimaro.skeletonize(
      volume,
      teasar_params={
        'const': 10,
        'scale': 10,
        'pdrf_exponent': 4,
        'pdrf_scale': 100000,
      }, 
      anisotropy=(1,1,1),
      object_ids=None, 
      dust_threshold=0, 
      progress=True, 
      fix_branching=True, 
      in_place=False, 
      fix_borders=fix_borders,
      parallel=1,
    )

  if axis == 'x':
    slab = np.s_[ 32:160, :, : ]
  elif axis == 'y':
    slab = np.s_[ :, 32:160, : ]

  labels = np.zeros((256, 256, 20), dtype=np.uint8)
  labels[slab] = 1

  def stitch_halves(fix_borders):
    # The halves share z-plane 9; shift the upper half back into place.
    lower = trace(labels[:,:,:10], fix_borders)[1]
    upper = trace(labels[:,:,9:], fix_borders)[1]
    upper.vertices[:,2] += 9
    return lower.merge(upper)

  skels_fb = stitch_halves(True)
  assert len(skels_fb.components()) == 1

  skels = stitch_halves(False)
  # Usually this results in 2 connected components,
  # but random variation in how fp is handled can 
  # result in a merge near the tails.
  assert not Skeleton.equivalent(skels, skels_fb)

def test_find_cycle():
  """find_cycle returns a closed vertex walk for graphs with one or two loops."""
  find_cycle = kimimaro.skeletontricks.find_cycle

  # Triangle 0-1-2 with two dangling branches.
  triangle = np.array([
    [0, 1],
    [1, 2],
    [2, 0],
    [2, 3],
    [2, 4]
  ], dtype=np.int32)
  cycle = find_cycle(triangle)
  assert np.all(cycle == np.array([0, 2, 1, 0]))

  # One longer loop through 2-3-4-10-11-12 plus a tail.
  single_loop = np.array([
    [0, 1],
    [1, 2],
    [2, 3],
    [3, 4], [4, 10], [10, 11], [11, 12], [12, 2],
    [4, 5],
    [5, 6],
    [6, 7],
  ], dtype=np.int32)
  cycle = find_cycle(single_loop)
  expected = np.array([ 2, 12, 11, 10, 4, 3, 2 ])
  assert np.all(cycle == expected)

  # two loops
  double_loop = np.array([
    [0, 1], [0, 20], [20, 21], [21, 22], [22, 23], [23, 21],
    [1, 2],
    [2, 3],
    [3, 4],
    [4, 5],
    [5, 6],
    [6, 7], [7, 10], [10, 11], [11, 6]
  ], dtype=np.int32)
  cycle = find_cycle(double_loop)

  # Either loop is an acceptable answer.
  first_loop = np.array([ 21, 23, 22, 21 ])
  second_loop = np.array([ 6, 11, 10, 7, 6 ])
  assert np.all(cycle == first_loop) or np.all(cycle == second_loop)


def test_join_close_components_simple():
  """Two segments 9 units apart are joined for radius >= 9 and not for 8.5."""
  vertices = [ (0,0,0), (1,0,0), (10,0,0), (11, 0, 0) ]
  skel = Skeleton(
    vertices,
    edges=[ (0,1), (2,3) ],
    radii=[ 0, 1, 2, 3 ],
    vertex_types=[ 0, 1, 2, 3 ],
    segid=1337,
  )
  assert len(skel.components()) == 2

  unlimited = kimimaro.join_close_components(skel, radius=np.inf)
  assert len(unlimited.components()) == 1

  exact = kimimaro.join_close_components(skel, radius=9)
  assert len(exact.components()) == 1
  assert np.all(exact.edges == [[0,1], [1,2], [2,3]])

  too_short = kimimaro.join_close_components(skel, radius=8.5)
  assert len(too_short.components()) == 2

def test_join_close_components_complex():
  """Four disjoint segments are fused into one component with the expected edges."""
  along_x = [ (0,0,0), (1,0,0), (4,0,0), (6,0,0), (20,0,0), (21, 0, 0) ]
  along_z = [ (0,0,5), (0,0,10) ]
  skel = Skeleton(
    along_x + along_z,
    edges=[ (0,1), (2,3), (4,5), (6,7) ],
  )
  assert len(skel.components()) == 4

  joined = kimimaro.join_close_components(skel, radius=np.inf)
  assert len(joined.components()) == 1

  expected_edges = [[0,1], [0,3], [1,2], [3,4], [4,5], [5,6], [6,7]]
  assert np.all(joined.edges == expected_edges)

def test_join_close_components_by_radius():
  """
  restrict_by_radius only allows a join when the vertex radii are large
  enough; check large radii first, then three small-radius configurations.
  """
  skel = Skeleton(
    [ (0,0,0), (1,0,0), (5,0,0), (11, 0, 0) ],
    edges=[ (0,1), (2,3) ],
    radii=[ 100, 100, 100, 100 ],
    vertex_types=[ 0, 1, 2, 3 ],
    segid=1337,
  )

  joined_edges = [[0,1], [1,2], [2,3]]
  separate_edges = [[0,1], [2,3]]

  # With radii of 100 the join happens whether or not the restriction is on.
  for restrict in (False, True):
    res = kimimaro.join_close_components(skel, restrict_by_radius=restrict)
    assert len(res.components()) == 1
    assert np.all(res.edges == joined_edges)

  # (radii, expected component count, expected edges)
  cases = (
    ([1,1,1,1], 2, separate_edges),
    ([1,0.9,3,1], 2, separate_edges),
    ([1,1,3,1], 1, joined_edges),
  )
  for radii, n_components, edges in cases:
    skel.radii = np.array(radii, dtype=np.float32)
    res = kimimaro.join_close_components(skel, restrict_by_radius=True)
    assert len(res.components()) == n_components
    assert np.all(res.edges == edges)


def test_fill_all_holes():
  """Random interior labels enclosed by labels 1 and 8 are all filled in."""
  labels = np.zeros((64, 32, 32), dtype=np.uint32)

  # Two solid shells: label 1 in the first half along x, label 8 in the second.
  labels[:32] = 1
  labels[32:] = 8

  # Replace each shell's interior with random labels (1-7 and 8-10).
  labels[1:31,1:31,1:31] = np.random.randint(low=1, high=8, size=(30, 30, 30))
  labels[33:63,1:31,1:31] = np.random.randint(low=8, high=11, size=(30, 30, 30))

  before = set(np.unique(labels))
  assert before == set([1,2,3,4,5,6,7,8,9,10])

  filled = kimimaro.intake.fill_all_holes(labels)

  after = set(np.unique(filled))
  assert after == set([1,8])

def test_fix_avocados():
  """
  engage_avocado_protection merges nested labels into their enclosing label
  and renumbers: seven input labels become labels 1-4 plus background.
  """
  labels = np.zeros((256, 256, 256), dtype=np.uint32)

  # fake clipped avocado
  labels[:50, :40, :30] = 1 
  labels[:25, :20, :25] = 2

  # double avocado
  labels[50:100, 40:100, 30:80] = 3
  labels[60:90, 50:90, 40:70] = 4
  labels[60:70, 51:89, 41:69] = 5

  # not an avocado
  labels[200:,200:,200:] = 6 # not a pit
  labels[150:200,200:,200:] = 7 # not a fruit

  def distance_transform(lbls):
    return edt.edt(lbls)

  dt = distance_transform(labels)
  identity_remap = { label: label for label in range(1, 8) }

  labels, dbf, remapping = kimimaro.intake.engage_avocado_protection(
    labels, dt, identity_remap,
    soma_detection_threshold=1, 
    edtfn=distance_transform, 
    progress=True
  )

  assert set(np.unique(labels)) == set([0, 1, 2, 3, 4])

  # (region, label every voxel in that region must now carry)
  expectations = (
    (np.s_[:50, :40, :30], 1),
    (np.s_[50:100, 40:100, 30:80], 2),
    (np.s_[150:200,200:,200:], 3),
    (np.s_[200:,200:,200:], 4),
  )
  for region, value in expectations:
    assert np.all(labels[region] == value)


def test_cross_sectional_area():
  """
  A straight skeleton through the middle of a 100x3x3 solid bar must report
  a cross sectional area of 9 at each of its 100 vertices.
  """
  # NOTE(review): another function named test_cross_sectional_area is defined
  # later in this module and rebinds the name, so pytest never collects this
  # one. Consider giving one of the two a distinct name.
  labels = np.ones((100,3,3), dtype=bool, order="F")

  n_vertices = labels.shape[0]
  vertices = np.array([ [x,1,1] for x in range(n_vertices) ])
  edges = np.array([ [x,x+1] for x in range(n_vertices - 1) ])

  skel = Skeleton(vertices, edges, segid=1)
  skel = kimimaro.cross_sectional_area(labels, skel, smoothing_window=5)

  # Previously written as `len(skel.cross_sectional_area == 100)`, which took
  # the length of a boolean array and was truthy for any non-empty result.
  assert len(skel.cross_sectional_area) == 100
  assert np.all(skel.cross_sectional_area == 9)


def test_moving_average():
  """
  moving_average must be the identity for window 1 and for constant data,
  handle empty input, and match hand-computed values for windows 2 and 3
  while returning an output as long as the input.
  """
  # Empty input is returned unchanged for any window.
  empty = np.array([])
  for window in (1, 2):
    assert np.all(moving_average(empty, window) == empty)

  # A window of 1 leaves odd-length, even-length and spiky data untouched.
  identity_inputs = (
    np.array([1,1,1,1,1,1,1,1,1,1,1]),
    np.array([1,1,1,1,1,1,1,1,1,1,1,1]),
    np.array([1,1,1,1,1,10,1,1,1,1,1]),
  )
  for data in identity_inputs:
    assert np.all(moving_average(data, 1) == data)

  # Constant data is a fixed point of a wider window as well.
  data = np.array([1,1,1,1,1,1,1,1,1,1,1])
  assert np.all(moving_average(data, 2) == data)

  data = np.array([0,1,1,1,1,1,1,1,1,1,0])
  ans = np.array([
    0,0.5,1,1,1,1,1,1,1,1,0.5
  ])
  assert np.all(moving_average(data, 2) == ans)

  data = np.array([0,1,1,1,1,1,1,1,1,1,0])
  ans = np.array([
    1/3,1/3,2/3,1,1,1,1,1,1,1,2/3
  ])
  res = moving_average(data, 3)
  assert np.all(res == ans)
  # Check the function's output length, not the hand-written answer's.
  assert len(res) == len(data)

def test_no_fix_branching(connectomics_data):
  """Smoke test: skeletonizing one z-slice with fix_branching=False must not raise."""
  z_slice = connectomics_data[:,:,100]
  kimimaro.skeletonize(z_slice, fix_branching=False)


def test_remove_row():
  """
  remove_row drops the [1,2] and [2,1] rows when asked to remove [1,2],
  and returns an empty result for an empty input.
  """
  to_remove = np.array([[1,2]])

  rows = np.array([
    [0,1],
    [1,2],
    [2,1],
    [2,2],
    [2,3],
    [3,4],
  ])
  kept = kimimaro.post.remove_row(rows, to_remove)
  assert np.all(kept == np.array([[0,1],[2,2],[2,3],[3,4]]))

  no_rows = np.array([ [] ])
  kept = kimimaro.post.remove_row(no_rows, to_remove)
  assert np.all(kept == np.array([]))

def test_cross_sectional_area():
  """
  Compare cross sectional areas computed at every vertex (step=1) against
  every tenth vertex (step=10) on a solid 100^3 cube: sampled vertices must
  agree, skipped vertices are zero in the step=10 result, and skeleton
  terminals are always sampled.
  """
  # NOTE(review): an earlier function in this module has the same name and is
  # shadowed by this definition, so pytest only runs this one.
  labels = np.ones([100,100,100], dtype=np.uint8)
  teasar_params = { "pdrf_exponent": 16 }
  skel = kimimaro.skeletonize(labels, teasar_params=teasar_params)[1]

  xsa_1 = cross_sectional_area(labels, skel, step=1).cross_sectional_area
  xsa_10 = cross_sectional_area(labels, skel, step=10).cross_sectional_area

  skipped = xsa_10 == 0
  sampled = xsa_10 > 0
  assert np.all(xsa_1[skipped] != xsa_10[skipped])
  assert np.all(xsa_1[sampled] == xsa_10[sampled])
  assert np.any(xsa_1 == 10000)

  terminals = skel.terminals()
  assert np.all(xsa_10[terminals] > 0)

  # NOTE(review): this tolerates an AssertionError for step=-1 but does not
  # require one; if the call returns normally the test still passes.
  try:
    cross_sectional_area(labels, skel, step=-1)
  except AssertionError:
    pass
  
def test_postprocess():
  """
  postprocess on a skeleton whose first and last segments form a loop must
  leave only the two loop-free segments.
  """
  along_x = [ (0,0,0), (1,0,0), (4,0,0), (6,0,0), (20,0,0), (21, 0, 0) ]
  along_z = [ (0,0,5), (0,0,10) ]
  # Edges (0,7) and (1,6) close a cycle through vertices 0, 1, 6 and 7.
  skel = Skeleton(
    along_x + along_z,
    edges=[ (0,1), (2,3), (4,5), (6,7), (0,7), (1,6) ],
  )

  res_skel = kimimaro.post.postprocess(skel, dust_threshold=0, tick_threshold=0)

  expected = Skeleton(
    [ (4,0,0), (6,0,0), (20,0,0), (21, 0, 0) ],
    edges=[ (0,1), (2,3) ],
  )

  assert Skeleton.equivalent(res_skel, expected)


================================================
FILE: benchmarks/README.md
================================================
Benchmarks
==========

To open `connectomics.npy.ckl.gz` you must use [`crackle-codec`](https://github.com/seung-lab/crackle).

Except where noted, these benchmarks were executed on a 2.8 GHz Dual-Core Intel Core i7 with 1600 MHz DDR3 RAM. The data source used was `connectomics.npy` which can be found in this repository. `connectomics.npy` is a 32-bit 512x512x512 cutout of mouse visual cortex at 16nm x 16nm x 40nm resolution that contains 2124 connected components including a partial cell body and a large glia fragment.

Below, we compare the run time and peak memory usage of Kimimaro across many versions that contained performance-significant updates. Due to the annoying length of each run, each value represents a single run, so there is some random perturbation around the true mean that can obscure the value of small improvements. Version 0.4.2 can be considered the first "feature complete" version that includes quality improvements like fix_branches, fix_borders, and a reasonable root selected for the cell body.

<p style="font-style: italic;" align="center">
<img height=512 src="https://raw.githubusercontent.com/seung-lab/kimimaro/master/benchmarks/kimimaro-execution-time-by-version.png" alt="Kimimaro Execution Time by Version on connectomics.npy" /><br>
Fig. 1: Kimimaro Execution Time by Version on `connectomics.npy`
</p>

<p style="font-style: italic;" align="center">
<img height=512 src="https://raw.githubusercontent.com/seung-lab/kimimaro/master/benchmarks/kimimaro-peak-memory-usage-by-version.png" alt="Kimimaro Peak Memory Usage by Version on connectomics.npy" /><br>
Fig. 2: Kimimaro Peak Memory Usage by Version on `connectomics.npy`
</p>

<p style="font-style: italic;" align="center">
<img height=512 src="https://raw.githubusercontent.com/seung-lab/kimimaro/master/benchmarks/kimimaro-memory-profiles-0.1.0-3.0.0.png" alt="Kimimaro Memory Profile Versions 0.3.1 vs. 3.0.0" /><br>
Fig. 3: Kimimaro Memory Profile Versions (blue) 0.3.1 (black) 3.0.0. The first hump on the left is processing a soma. The second hump is a glia.
</p>


================================================
FILE: benchmarks/benchmark.py
================================================
import time
import numpy as np
import kimimaro
import crackle
import pickle

# Benchmark script: times skeletonization of the connectomics cutout and then
# the cross sectional area computation on the resulting skeletons.
# The input path is relative, so run this from the benchmarks directory.
labels = crackle.load("connectomics.npy.ckl.gz")

# --- Stage 1: skeletonization ---
s = time.time()
skels = kimimaro.skeletonize(
  labels, 
  teasar_params={
    'scale': 1.5,
    'const': 300, # physical units
    'pdrf_exponent': 4,
    'pdrf_scale': 100000,
    'soma_detection_threshold': 1100, # physical units
    'soma_acceptance_threshold': 3500, # physical units
    'soma_invalidation_scale': 1.0,
    'soma_invalidation_const': 300, # physical units
    # 'max_paths': 50, # default None
  },
  # object_ids=[ ], # process only the specified labels
  # extra_targets_before=[ (27,33,100), (44,45,46) ], # target points in voxels
  # extra_targets_after=[ (27,33,100), (44,45,46) ], # target points in voxels
  # dust_threshold=1000, # skip connected components with fewer than this many voxels
  anisotropy=(16,16,40), # voxel size in physical units (x,y,z), not a boolean
  # fix_branching=True, # default True
  # fix_borders=True, # default True
  # fill_holes=False, # default False
  # fix_avocados=False, # default False
  progress=True, # default False, show progress bar
  # parallel=1, # <= 0 all cpu, 1 single process, 2+ multiprocess
  # parallel_chunk_size=100, # how many skeletons to process before updating progress bar
)
# Elapsed wall-clock seconds for skeletonization (unformatted).
print(time.time() - s)

# Optional: cache the skeletons so stage 2 can be benchmarked on its own.
# with open("skels.pkl", "wb") as f:
#   pickle.dump(skels, f)

# with open("skels.pkl", "rb") as f:
#   skels = pickle.load(f)

# --- Stage 2: cross sectional area along every skeleton ---
s = time.time()
skels = kimimaro.cross_sectional_area(
  labels, skels,
  anisotropy=(16,16,40),
  smoothing_window=7,
  progress=True,
  step=1,
)
# Elapsed wall-clock seconds for the cross section pass.
print(f"{time.time() - s:.3f}s")

================================================
FILE: build_linux.sh
================================================
#!/bin/bash
# Build the manylinux2010 and manylinux2014 images, then copy the wheels each
# image produced in its wheelhouse directory into ./dist on the host.

# Stop at the first failure so a failed `docker build` is never followed by a
# `docker run` against a stale image with the same tag.
set -euo pipefail

# Some dependencies don't support manylinux1
docker build . -f manylinux2010.Dockerfile --tag seunglab/kimimaro:manylinux2010
docker build . -f manylinux2014.Dockerfile --tag seunglab/kimimaro:manylinux2014

# Quote the volume spec so a working directory containing spaces still mounts.
docker run -v "$PWD/dist:/output" seunglab/kimimaro:manylinux2010 /bin/bash -c "cp -r wheelhouse/* /output"
docker run -v "$PWD/dist:/output" seunglab/kimimaro:manylinux2014 /bin/bash -c "cp -r wheelhouse/* /output"

================================================
FILE: ext/skeletontricks/dijkstra_invalidation.hpp
================================================
/*
 * This file is part of Kimimaro.
 * 
 * Kimimaro is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * Kimimaro is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with Kimimaro.  If not, see <https://www.gnu.org/licenses/>.
 *
 * 
 * This algorithm is derived from dijkstra3d: 
 * https://github.com/seung-lab/dijkstra3d
 *
 * Author: William Silversmith
 * Affiliation: Seung Lab, Princeton University
 * Date: May 2024
 */

#ifndef DIJKSTRA_INVALIDATION_HPP
#define DIJKSTRA_INVALIDATION_HPP

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <functional>
#include <memory>
#include <queue>
#include <stdexcept>
#include <vector>

#include "./libdivide.h"

#define NHOOD_SIZE 26

namespace dijkstra_invalidation {

// Length of a 2D diagonal step given the two axis weights ("_s" = "square"):
// sqrt(wa^2 + wb^2).
inline float _s(const float wa, const float wb) {
  const float sum_of_squares = wa * wa + wb * wb;
  return std::sqrt(sum_of_squares);
}

// Length of a 3D diagonal step given the three axis weights ("_c" = "cube"):
// sqrt(wa^2 + wb^2 + wc^2).
inline float _c(const float wa, const float wb, const float wc) {
  const float sum_of_squares = wa * wa + wb * wb + wc * wc;
  return std::sqrt(sum_of_squares);
}

// Validate a neighborhood connectivity value.
//
// Throws std::runtime_error unless connectivity is 6, 18, or 26.
//
// Declared inline because this function is defined in a header: without it,
// including the header from more than one translation unit produces
// multiple-definition link errors.
inline void connectivity_check(int connectivity) {
  const bool supported = (
    connectivity == 6 || connectivity == 18 || connectivity == 26
  );
  if (!supported) {
    throw std::runtime_error("Only 6, 18, and 26 connectivities are supported.");
  }
}

// Fill neighborhood[0..5] with the linear index offsets of the six
// face-adjacent voxels of (x,y,z) in an sx * sy * sz volume laid out with x
// varying fastest. A slot is 0 when that neighbor would fall outside the
// volume. Slot order: -x, +x, -y, +y, -z, +z.
void compute_neighborhood_helper_6(
  int *neighborhood, 
  const int x, const int y, const int z,
  const uint64_t sx, const uint64_t sy, const uint64_t sz
) {
  const int row_stride = static_cast<int>(sx);
  const int plane_stride = static_cast<int>(sx * sy);

  const int last_x = static_cast<int>(sx) - 1;
  const int last_y = static_cast<int>(sy) - 1;
  const int last_z = static_cast<int>(sz) - 1;

  // 6-hood
  neighborhood[0] = (x > 0) ? -1 : 0; // -x
  neighborhood[1] = (x < last_x) ? 1 : 0; // +x
  neighborhood[2] = (y > 0) ? -row_stride : 0; // -y
  neighborhood[3] = (y < last_y) ? row_stride : 0; // +y
  neighborhood[4] = (z > 0) ? -plane_stride : 0; // -z
  neighborhood[5] = (z < last_z) ? plane_stride : 0; // +z
}

// Fill neighborhood[0..17]: the six face offsets (via the 6-hood helper)
// followed by the twelve edge-diagonal offsets. Each diagonal is the sum of
// its two face offsets, or 0 when either face neighbor is unavailable.
void compute_neighborhood_helper_18(
  int *neighborhood, 
  const int x, const int y, const int z,
  const uint64_t sx, const uint64_t sy, const uint64_t sz
) {
  // 6-hood
  compute_neighborhood_helper_6(neighborhood, x,y,z, sx,sy,sz);

  // Combine two face slots into one diagonal offset.
  const auto diagonal = [neighborhood](const int a, const int b) -> int {
    const int da = neighborhood[a];
    const int db = neighborhood[b];
    return (da && db) ? (da + db) : 0;
  };

  // 18-hood

  // xy diagonals
  neighborhood[6] = diagonal(0, 2); // -x,-y
  neighborhood[7] = diagonal(0, 3); // -x,+y
  neighborhood[8] = diagonal(1, 2); // +x,-y
  neighborhood[9] = diagonal(1, 3); // +x,+y

  // yz diagonals
  neighborhood[10] = diagonal(2, 4); // -y,-z
  neighborhood[11] = diagonal(2, 5); // -y,+z
  neighborhood[12] = diagonal(3, 4); // +y,-z
  neighborhood[13] = diagonal(3, 5); // +y,+z

  // xz diagonals
  neighborhood[14] = diagonal(0, 4); // -x,-z
  neighborhood[15] = diagonal(0, 5); // -x,+z
  neighborhood[16] = diagonal(1, 4); // +x,-z
  neighborhood[17] = diagonal(1, 5); // +x,+z
}

// Fill neighborhood[0..25]: the 18-hood offsets (via the 18-hood helper)
// followed by the eight cube-corner offsets. A corner offset is the sum of
// its three face offsets, or 0 when any of the three face neighbors is
// unavailable.
//
// The x-axis face is part of the guard. Without it, a voxel on an x boundary
// (face offset 0) would get a non-zero "corner" slot that is really the
// offset of the yz edge-diagonal neighbor, so that neighbor would be listed
// twice and the second listing would be treated as a corner step.
void compute_neighborhood_helper_26(
  int *neighborhood, 
  const int x, const int y, const int z,
  const uint64_t sx, const uint64_t sy, const uint64_t sz
) {
  compute_neighborhood_helper_18(neighborhood, x,y,z, sx,sy,sz);

  // Combine three face slots (x face, y face, z face) into a corner offset.
  const auto corner = [neighborhood](const int xi, const int yi, const int zi) -> int {
    const int dx = neighborhood[xi];
    const int dy = neighborhood[yi];
    const int dz = neighborhood[zi];
    return (dx && dy && dz) ? (dx + dy + dz) : 0;
  };

  // 26-hood

  // Now the eight corners of the cube
  neighborhood[18] = corner(0, 2, 4); // -x,-y,-z
  neighborhood[19] = corner(1, 2, 4); // +x,-y,-z
  neighborhood[20] = corner(0, 3, 4); // -x,+y,-z
  neighborhood[21] = corner(0, 2, 5); // -x,-y,+z
  neighborhood[22] = corner(1, 3, 4); // +x,+y,-z
  neighborhood[23] = corner(1, 2, 5); // +x,-y,+z
  neighborhood[24] = corner(0, 3, 5); // -x,+y,+z
  neighborhood[25] = corner(1, 3, 5); // +x,+y,+z
}

// Compute the linear index offsets of the neighbors of voxel (x,y,z) in an
// sx * sy * sz volume.
//
// neighborhood: output array. Slots [0, 6), [0, 18) or [0, 26) are written
//   for connectivity 6, 18 or 26 respectively (any other value is treated as
//   6). A slot of 0 means "no neighbor in that direction".
// connectivity: 6 (faces), 18 (faces + edges) or 26 (faces + edges + corners).
// voxel_connectivity_graph: optional per-voxel bitfield, indexed by
//   x + sx * (y + sy * z). When non-NULL, a neighbor slot is zeroed if its
//   bit is clear for this voxel.
//
// Only the slots the selected helper wrote are masked. Applying `*=` to the
// remaining slots would read values this function never initialized.
inline void compute_neighborhood(
  int *neighborhood, 
  const int x, const int y, const int z,
  const uint64_t sx, const uint64_t sy, const uint64_t sz,
  const int connectivity = 26, const uint32_t* voxel_connectivity_graph = NULL) {

  if (connectivity == 26) {
    compute_neighborhood_helper_26(neighborhood, x, y, z, sx, sy, sz);
  }
  else if (connectivity == 18) {
    compute_neighborhood_helper_18(neighborhood, x, y, z, sx, sy, sz);
  }
  else {
    compute_neighborhood_helper_6(neighborhood, x, y, z, sx, sy, sz);
  }

  if (voxel_connectivity_graph == NULL) {
    return;
  }

  uint64_t loc = x + sx * (y + sy * z);
  uint32_t graph = voxel_connectivity_graph[loc];

  // graph conventions are defined here:
  // https://github.com/seung-lab/connected-components-3d/blob/3.2.0/cc3d_graphs.hpp#L73-L92

  // 6-hood
  neighborhood[0] *= ((graph & 0b000010) > 0); // -x
  neighborhood[1] *= ((graph & 0b000001) > 0); // +x
  neighborhood[2] *= ((graph & 0b001000) > 0); // -y
  neighborhood[3] *= ((graph & 0b000100) > 0); // +y
  neighborhood[4] *= ((graph & 0b100000) > 0); // -z
  neighborhood[5] *= ((graph & 0b010000) > 0); // +z

  // Slots 6 and up were only written for connectivity 18 or 26.
  if (connectivity != 18 && connectivity != 26) {
    return;
  }

  // 18-hood

  // xy diagonals
  neighborhood[6] *= ((graph & 0b1000000000) > 0); // up-left -x,-y
  neighborhood[7] *= ((graph & 0b0010000000) > 0); // up-right -x,+y
  neighborhood[8] *= ((graph & 0b0100000000) > 0); // down-left +x,-y
  neighborhood[9] *= ((graph & 0b0001000000) > 0); // down-right +x,+y

  // yz diagonals
  neighborhood[10] *= ((graph & 0b100000000000000000) > 0); // up-left -y,-z
  neighborhood[11] *= ((graph & 0b000010000000000000) > 0); // up-right -y,+z
  neighborhood[12] *= ((graph & 0b010000000000000000) > 0); // down-left +y,-z
  neighborhood[13] *= ((graph & 0b000001000000000000) > 0); // down-right +y,+z

  // xz diagonals
  neighborhood[14] *= ((graph & 0b001000000000000000) > 0); // up-left, -x,-z
  neighborhood[15] *= ((graph & 0b000000100000000000) > 0); // up-right, -x,+z
  neighborhood[16] *= ((graph & 0b000100000000000000) > 0); // down-left +x,-z
  neighborhood[17] *= ((graph & 0b000000010000000000) > 0); // down-right +x,+z

  // Slots 18 and up were only written for connectivity 26.
  if (connectivity != 26) {
    return;
  }

  // 26-hood

  // Now the eight corners of the cube
  neighborhood[18] *= ((graph & 0b10000000000000000000000000) > 0); // -x,-y,-z
  neighborhood[19] *= ((graph & 0b01000000000000000000000000) > 0); // +x,-y,-z
  neighborhood[20] *= ((graph & 0b00100000000000000000000000) > 0); // -x,+y,-z
  neighborhood[21] *= ((graph & 0b00001000000000000000000000) > 0); // -x,-y,+z
  neighborhood[22] *= ((graph & 0b00010000000000000000000000) > 0); // +x,+y,-z
  neighborhood[23] *= ((graph & 0b00000100000000000000000000) > 0); // +x,-y,+z
  neighborhood[24] *= ((graph & 0b00000010000000000000000000) > 0); // -x,+y,+z
  neighborhood[25] *= ((graph & 0b00000001000000000000000000) > 0); // +x,+y,+z
}

// Issues nine HEDLEYX_PREFETCH hints for `field`: one at element (loc - 1)
// and one at that same position shifted by every combination of
// {0, +sx, -sx} and {0, +sxy, -sxy}.
//
// The macro expands to plain statements and reads `sx` and `sxy` from the
// scope where it is expanded, so both names must be visible at the use site.
// The trailing `0, 1` arguments are forwarded to HEDLEYX_PREFETCH unchanged.
// NOTE(review): presumably each hint starts at x - 1 so that one fetched line
// covers the x-1, x, x+1 neighbors of a row -- confirm against the cache-line
// size assumptions of the caller.
#define DIJKSTRA_3D_PREFETCH_26WAY(field, loc) \
  HEDLEYX_PREFETCH(reinterpret_cast<char*>(&field[(loc) - 1]), 0, 1); \
  HEDLEYX_PREFETCH(reinterpret_cast<char*>(&field[(loc) + sxy - 1]), 0, 1); \
  HEDLEYX_PREFETCH(reinterpret_cast<char*>(&field[(loc) - sxy - 1]), 0, 1); \
  HEDLEYX_PREFETCH(reinterpret_cast<char*>(&field[(loc) + sxy + sx - 1]), 0, 1); \
  HEDLEYX_PREFETCH(reinterpret_cast<char*>(&field[(loc) + sxy - sx - 1]), 0, 1); \
  HEDLEYX_PREFETCH(reinterpret_cast<char*>(&field[(loc) - sxy + sx - 1]), 0, 1); \
  HEDLEYX_PREFETCH(reinterpret_cast<char*>(&field[(loc) - sxy - sx - 1]), 0, 1); \
  HEDLEYX_PREFETCH(reinterpret_cast<char*>(&field[(loc) + sx - 1]), 0, 1); \
  HEDLEYX_PREFETCH(reinterpret_cast<char*>(&field[(loc) - sx - 1]), 0, 1);

// One entry of the priority queue used by _roll_invalidation_ball.
// It pairs a voxel with the source it was reached from so that the distance
// limit of that source travels along with the wavefront.
class HeapDistanceNode {
public:
  float dist;             // priority key: distance of `value` from `original_loc`
  uint64_t original_loc;  // linear index of the source this entry descends from
  uint64_t value;         // linear index of the voxel this entry refers to
  float max_dist;         // distance limit associated with `original_loc`

  // Zero-initialized node.
  HeapDistanceNode()
    : dist(0), original_loc(0), value(0), max_dist(0) {}

  // d: distance key, o_loc: source index, val: voxel index,
  // mx_dist: distance limit of the source.
  HeapDistanceNode(float d, uint64_t o_loc, uint64_t val, float mx_dist)
    : dist(d), original_loc(o_loc), value(val), max_dist(mx_dist) {}

  // Member-wise copy. Defaulted so that the implicitly generated copy
  // assignment is not deprecated and the type stays trivially copyable.
  HeapDistanceNode(const HeapDistanceNode &h) = default;
};

// Ordering for std::priority_queue that puts the node with the SMALLEST
// `dist` on top (min-heap).
//
// The comparison is strict (`>`): the standard Compare requirements demand a
// strict weak ordering, i.e. comp(a, a) must be false. A non-strict `>=`
// violates that and makes the heap algorithms formally undefined (and trips
// the irreflexivity checks of debug-mode standard libraries).
struct HeapDistanceNodeCompare {
  bool operator()(const HeapDistanceNode &t1, const HeapDistanceNode &t2) const {
    return t1.dist > t2.dist;
  }
};

int64_t _roll_invalidation_ball(
  uint8_t* field, // really a boolean field
  const uint64_t sx, const uint64_t sy, const uint64_t sz, 
  const float wx, const float wy, const float wz, 
  const std::vector<uint64_t> &sources,
  const std::vector<float> &max_distances,
  const int connectivity = 26, 
  const uint32_t* voxel_connectivity_graph = NULL
) {

  const uint64_t sxy = sx * sy;

  const libdivide::divider<uint64_t> fast_sx(sx); 
  const libdivide::divider<uint64_t> fast_sxy(sxy); 

  const bool power_of_two = !((sx & (sx - 1)) || (sy & (sy - 1))); 
  const int xshift = std::log2(sx); // must use log2 here, not lg/lg2 to avoid fp errors
  const int yshift = std::log2(sy);

  connectivity_check(connectivity);

  int neighborhood[NHOOD_SIZE] = {};

  std::priority_queue<
    HeapDistanceNode, std::vector<HeapDistanceNode>, HeapDistanceNodeCompare
  > queue;

  for (uint64_t i = 0; i < sources.size(); i++) {
    queue.emplace(0.0, sources[i], sources[i], max_distances[i]);
  }

  uint64_t loc;
  uint64_t neighboridx;

  int64_t x, y, z;
  int64_t orig_x, orig_y, orig_z;

  int64_t invalidated = 0;

  auto xyzfn = [=](uint64_t l, int64_t& x, int64_t& y, int64_t& z) {
    if (power_of_two) {
      z = l >> (xshift + yshift);
      y = (l - (z << (xshift + yshift))) >> xshift;
      x = l - ((y + (z << yshift)) << xshift);
    }
    else {
      z = l / fast_sxy;
      y = (l - (z * sxy)) / fast_sx;
      x = l - sx * (y + z * sy);
    }
  };

  while (!queue.empty()) {
    const float max_dist = queue.top().max_dist;
    const uint64_t original_loc = queue.top().original_loc;
    loc = queue.top().value;
    queue.pop();

    if (!field[loc]) {
      continue;
    }

    field[loc] = 0;
    invalidated++;

    xyzfn(loc, x, y, z);
    xyzfn(original_loc, orig_x, orig_y, orig_z);
    compute_neighborhood(neighborhood, x, y, z, sx, sy, sz, connectivity, voxel_connectivity_graph);

    for (int i = 0; i < connectivity; i++) {
      if (neighborhood[i] == 0) {
        continue;
      }

      neighboridx = loc + neighborhood[i];
      if (field[neighboridx] == 0) {
        continue;
      }

      xyzfn(neighboridx, x, y, z);
      float new_dist = _c(
        wx * static_cast<float>(x - orig_x), 
        wy * static_cast<float>(y - orig_y), 
        wz * static_cast<float>(z - orig_z)
      );

      if (new_dist < max_dist) { 
        queue.emplace(new_dist, original_loc, neighboridx, max_dist);
      }
    }
  }

  return invalidated;
}

};

#undef NHOOD_SIZE
#undef DIJKSTRA_3D_PREFETCH_26WAY

#endif


================================================
FILE: ext/skeletontricks/libdivide.h
================================================
// libdivide.h - Optimized integer division
// https://libdivide.com
//
// Copyright (C) 2010 - 2022 ridiculous_fish, <libdivide@ridiculousfish.com>
// Copyright (C) 2016 - 2022 Kim Walisch, <kim.walisch@gmail.com>
//
// libdivide is dual-licensed under the Boost or zlib licenses.
// You may use libdivide under the terms of either of these.
// See LICENSE.txt for more details.

#ifndef LIBDIVIDE_H
#define LIBDIVIDE_H

// *** Version numbers are auto generated - do not edit ***
#define LIBDIVIDE_VERSION "5.2.0"
#define LIBDIVIDE_VERSION_MAJOR 5
#define LIBDIVIDE_VERSION_MINOR 2
#define LIBDIVIDE_VERSION_PATCH 0

#include <stdint.h>

#if !defined(__AVR__) && __STDC_HOSTED__ != 0
#include <stdio.h>
#include <stdlib.h>
#endif

#if defined(_MSC_VER) && (defined(__cplusplus) && (__cplusplus >= 202002L)) || \
    (defined(_MSVC_LANG) && (_MSVC_LANG >= 202002L))
#include <limits.h>
#include <type_traits>
#define LIBDIVIDE_VC_CXX20
#endif

#if defined(LIBDIVIDE_SSE2)
#include <emmintrin.h>
#endif

#if defined(LIBDIVIDE_AVX2) || defined(LIBDIVIDE_AVX512)
#include <immintrin.h>
#endif

#if defined(LIBDIVIDE_NEON)
#include <arm_neon.h>
#endif

// Clang-cl prior to Visual Studio 2022 doesn't include __umulh/__mulh intrinsics
#if defined(_MSC_VER) && (!defined(__clang__) || _MSC_VER > 1930) && \
    (defined(_M_X64) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC))
#define LIBDIVIDE_MULH_INTRINSICS
#endif

#if defined(_MSC_VER)
#if defined(LIBDIVIDE_MULH_INTRINSICS) || !defined(__clang__)
#include <intrin.h>
#endif
#ifndef __clang__
#pragma warning(push)
// 4146: unary minus operator applied to unsigned type, result still unsigned
#pragma warning(disable : 4146)

// 4204: nonstandard extension used : non-constant aggregate initializer
#pragma warning(disable : 4204)
#endif
#define LIBDIVIDE_VC
#endif

#if !defined(__has_builtin)
#define __has_builtin(x) 0
#endif

#if defined(__SIZEOF_INT128__)
#define HAS_INT128_T
// clang-cl on Windows does not yet support 128-bit division
#if !(defined(__clang__) && defined(LIBDIVIDE_VC))
#define HAS_INT128_DIV
#endif
#endif

#if defined(__x86_64__) || defined(_M_X64)
#define LIBDIVIDE_X86_64
#endif

#if defined(__i386__)
#define LIBDIVIDE_i386
#endif

#if defined(__GNUC__) || defined(__clang__)
#define LIBDIVIDE_GCC_STYLE_ASM
#endif

#if defined(__cplusplus) || defined(LIBDIVIDE_VC)
#define LIBDIVIDE_FUNCTION __FUNCTION__
#else
#define LIBDIVIDE_FUNCTION __func__
#endif

// Set up forced inlining if possible.
// We need both the attribute and keyword to avoid "might not be inlineable" warnings.
#ifdef __has_attribute
#if __has_attribute(always_inline)
#define LIBDIVIDE_INLINE __attribute__((always_inline)) inline
#endif
#endif
#ifndef LIBDIVIDE_INLINE
#ifdef _MSC_VER
#define LIBDIVIDE_INLINE __forceinline
#else
#define LIBDIVIDE_INLINE inline
#endif
#endif

#if defined(__AVR__) || __STDC_HOSTED__ == 0
#define LIBDIVIDE_ERROR(msg)
#else
#define LIBDIVIDE_ERROR(msg)                                                                     \
    do {                                                                                         \
        fprintf(stderr, "libdivide.h:%d: %s(): Error: %s\n", __LINE__, LIBDIVIDE_FUNCTION, msg); \
        abort();                                                                                 \
    } while (0)
#endif

#if defined(LIBDIVIDE_ASSERTIONS_ON) && !defined(__AVR__) && __STDC_HOSTED__ != 0
#define LIBDIVIDE_ASSERT(x)                                                           \
    do {                                                                              \
        if (!(x)) {                                                                   \
            fprintf(stderr, "libdivide.h:%d: %s(): Assertion failed: %s\n", __LINE__, \
                LIBDIVIDE_FUNCTION, #x);                                              \
            abort();                                                                  \
        }                                                                             \
    } while (0)
#else
#define LIBDIVIDE_ASSERT(x)
#endif

#ifdef __cplusplus

// For constexpr zero initialization, c++11 might handle things ok,
// but just limit to at least c++14 to ensure we don't break anyone's code:

// Use https://en.cppreference.com/w/cpp/feature_test#cpp_constexpr
#if defined(__cpp_constexpr) && (__cpp_constexpr >= 201304L)
#define LIBDIVIDE_CONSTEXPR constexpr LIBDIVIDE_INLINE

// Supposedly, MSVC might not implement feature test macros right:
// https://stackoverflow.com/questions/49316752/feature-test-macros-not-working-properly-in-visual-c
// so check that _MSVC_LANG corresponds to at least c++14, and _MSC_VER corresponds to at least VS
// 2017 15.0 (for extended constexpr support:
// https://learn.microsoft.com/en-us/cpp/overview/visual-cpp-language-conformance?view=msvc-170)
#elif (defined(_MSC_VER) && _MSC_VER >= 1910) && (defined(_MSVC_LANG) && _MSVC_LANG >= 201402L)
#define LIBDIVIDE_CONSTEXPR constexpr LIBDIVIDE_INLINE

#else
#define LIBDIVIDE_CONSTEXPR LIBDIVIDE_INLINE
#endif

namespace libdivide {
#endif

#if defined(_MSC_VER) && !defined(__clang__)
// Stand-in for the GCC/Clang __builtin_clz (count leading zero bits of a
// 32-bit value), compiled only for MSVC without clang (see the enclosing #if).
//
// With LIBDIVIDE_VC_CXX20 the function is constexpr: during constant
// evaluation a plain loop scans from the most significant bit downwards and
// returns the bit width when x has no bit set. At run time one of the
// target-specific intrinsics below is used instead.
// NOTE(review): the _BitScanReverse path leaves `r` unspecified for x == 0,
// matching the "undefined for 0" contract of the builtin -- confirm callers
// never pass 0.
#if defined(LIBDIVIDE_VC_CXX20)
static LIBDIVIDE_CONSTEXPR int __builtin_clz(unsigned x) {
    if (std::is_constant_evaluated()) {
        for (int i = 0; i < sizeof(x) * CHAR_BIT; ++i) {
            if (x >> (sizeof(x) * CHAR_BIT - 1 - i)) return i;
        }
        return sizeof(x) * CHAR_BIT;
    }
#else
static LIBDIVIDE_INLINE int __builtin_clz(unsigned x) {
#endif
#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC)
    return (int)_CountLeadingZeros(x);
#elif defined(__AVX2__) || defined(__LZCNT__)
    return (int)_lzcnt_u32(x);
#else
    // _BitScanReverse yields the index of the highest set bit; XOR with 31
    // turns that index into the number of leading zeros.
    unsigned long r;
    _BitScanReverse(&r, x);
    return (int)(r ^ 31);
#endif
}

// Stand-in for the GCC/Clang __builtin_clzll (count leading zero bits of a
// 64-bit value), compiled only for MSVC without clang (see the enclosing #if).
//
// With LIBDIVIDE_VC_CXX20 the function is constexpr: during constant
// evaluation a loop scans from the most significant bit downwards and returns
// the bit width when x has no bit set. At run time it uses a 64-bit intrinsic
// where one exists and otherwise combines two 32-bit counts.
#if defined(LIBDIVIDE_VC_CXX20)
static LIBDIVIDE_CONSTEXPR int __builtin_clzll(unsigned long long x) {
    if (std::is_constant_evaluated()) {
        for (int i = 0; i < sizeof(x) * CHAR_BIT; ++i) {
            if (x >> (sizeof(x) * CHAR_BIT - 1 - i)) return i;
        }
        return sizeof(x) * CHAR_BIT;
    }
#else
static LIBDIVIDE_INLINE int __builtin_clzll(unsigned long long x) {
#endif
#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC)
    return (int)_CountLeadingZeros64(x);
#elif defined(_WIN64)
#if defined(__AVX2__) || defined(__LZCNT__)
    return (int)_lzcnt_u64(x);
#else
    // Index of the highest set bit XOR 63 == number of leading zeros.
    unsigned long r;
    _BitScanReverse64(&r, x);
    return (int)(r ^ 63);
#endif
#else
    // No 64-bit intrinsic here: count in the high word if it is nonzero,
    // otherwise count in the low word and add the 32 zero bits above it.
    int l = __builtin_clz((unsigned)x) + 32;
    int h = __builtin_clz((unsigned)(x >> 32));
    return !!((unsigned)(x >> 32)) ? h : l;
#endif
}
#endif // defined(_MSC_VER) && !defined(__clang__)

// pack divider structs to prevent compilers from padding.
// This reduces memory usage by up to 43% when using a large
// array of libdivide dividers and improves performance
// by up to 10% because of reduced memory bandwidth.
//
// Every divider below has the same two members:
//   magic - the precomputed multiplier (0 selects the pure shift path in the
//           branching variants)
//   more  - packed shift amount and flag bits; the layout is described in the
//           'Explanation of the "more" field' comment further down
// The width and signedness of `magic` match the integer type being divided.
#pragma pack(push, 1)

// Branching dividers, produced by the libdivide_*_gen functions.

struct libdivide_u16_t {
    uint16_t magic;
    uint8_t more;
};

struct libdivide_s16_t {
    int16_t magic;
    uint8_t more;
};

struct libdivide_u32_t {
    uint32_t magic;
    uint8_t more;
};

struct libdivide_s32_t {
    int32_t magic;
    uint8_t more;
};

struct libdivide_u64_t {
    uint64_t magic;
    uint8_t more;
};

struct libdivide_s64_t {
    int64_t magic;
    uint8_t more;
};

// Branchfree dividers, produced by the libdivide_*_branchfree_gen functions.
// They are distinct types with the same layout as their branching
// counterparts.

struct libdivide_u16_branchfree_t {
    uint16_t magic;
    uint8_t more;
};

struct libdivide_s16_branchfree_t {
    int16_t magic;
    uint8_t more;
};

struct libdivide_u32_branchfree_t {
    uint32_t magic;
    uint8_t more;
};

struct libdivide_s32_branchfree_t {
    int32_t magic;
    uint8_t more;
};

struct libdivide_u64_branchfree_t {
    uint64_t magic;
    uint8_t more;
};

struct libdivide_s64_branchfree_t {
    int64_t magic;
    uint8_t more;
};

#pragma pack(pop)

// Explanation of the "more" field:
//
// * Bits 0-5 is the shift value (for shift path or mult path).
// * Bit 6 is the add indicator for mult path.
// * Bit 7 is set if the divisor is negative. We use bit 7 as the negative
//   divisor indicator so that we can efficiently use sign extension to
//   create a bitmask with all bits set to 1 (if the divisor is negative)
//   or 0 (if the divisor is positive).
//
// u32: [0-4] shift value
//      [5] ignored
//      [6] add indicator
//      magic number of 0 indicates shift path
//
// s32: [0-4] shift value
//      [5] ignored
//      [6] add indicator
//      [7] indicates negative divisor
//      magic number of 0 indicates shift path
//
// u64: [0-5] shift value
//      [6] add indicator
//      magic number of 0 indicates shift path
//
// s64: [0-5] shift value
//      [6] add indicator
//      [7] indicates negative divisor
//      magic number of 0 indicates shift path
//
// In s32 and s64 branchfull modes, the magic number is negated according to
// whether the divisor is negated. In branchfree strategy, it is not negated.

// Bit masks for decoding the "more" byte of a divider.
enum {
    // Bits 0-4: shift amount of 16-bit dividers.
    LIBDIVIDE_16_SHIFT_MASK = 0x1F,
    // Bits 0-4: shift amount of 32-bit dividers.
    LIBDIVIDE_32_SHIFT_MASK = 0x1F,
    // Bits 0-5: shift amount of 64-bit dividers.
    LIBDIVIDE_64_SHIFT_MASK = 0x3F,
    // Bit 6: the multiply path needs the extra "add" step.
    LIBDIVIDE_ADD_MARKER = 0x40,
    // Bit 7: the divisor is negative (signed dividers only).
    LIBDIVIDE_NEGATIVE_DIVISOR = 0x80
};

static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_s16_gen(int16_t d);
static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_u16_gen(uint16_t d);
static LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_s32_gen(int32_t d);
static LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_u32_gen(uint32_t d);
static LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_s64_gen(int64_t d);
static LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_u64_gen(uint64_t d);

static LIBDIVIDE_INLINE struct libdivide_s16_branchfree_t libdivide_s16_branchfree_gen(int16_t d);
static LIBDIVIDE_INLINE struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d);
static LIBDIVIDE_INLINE struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d);
static LIBDIVIDE_INLINE struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d);
static LIBDIVIDE_INLINE struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d);
static LIBDIVIDE_INLINE struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d);

static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw(
    int16_t numer, int16_t magic, uint8_t more);
static LIBDIVIDE_INLINE int16_t libdivide_s16_do(
    int16_t numer, const struct libdivide_s16_t *denom);
static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw(
    uint16_t numer, uint16_t magic, uint8_t more);
static LIBDIVIDE_INLINE uint16_t libdivide_u16_do(
    uint16_t numer, const struct libdivide_u16_t *denom);
static LIBDIVIDE_INLINE int32_t libdivide_s32_do_raw(
    int32_t numer, int32_t magic, uint8_t more);
static LIBDIVIDE_INLINE int32_t libdivide_s32_do(
    int32_t numer, const struct libdivide_s32_t *denom);
static LIBDIVIDE_INLINE uint32_t libdivide_u32_do_raw(
    uint32_t numer, uint32_t magic, uint8_t more);
static LIBDIVIDE_INLINE uint32_t libdivide_u32_do(
    uint32_t numer, const struct libdivide_u32_t *denom);
static LIBDIVIDE_INLINE int64_t libdivide_s64_do_raw(
    int64_t numer, int64_t magic, uint8_t more);
static LIBDIVIDE_INLINE int64_t libdivide_s64_do(
    int64_t numer, const struct libdivide_s64_t *denom);
static LIBDIVIDE_INLINE uint64_t libdivide_u64_do_raw(
    uint64_t numer, uint64_t magic, uint8_t more);
static LIBDIVIDE_INLINE uint64_t libdivide_u64_do(
    uint64_t numer, const struct libdivide_u64_t *denom);

static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_do(
    int16_t numer, const struct libdivide_s16_branchfree_t *denom);
static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_do(
    uint16_t numer, const struct libdivide_u16_branchfree_t *denom);
static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_do(
    int32_t numer, const struct libdivide_s32_branchfree_t *denom);
static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_do(
    uint32_t numer, const struct libdivide_u32_branchfree_t *denom);
static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_do(
    int64_t numer, const struct libdivide_s64_branchfree_t *denom);
static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_do(
    uint64_t numer, const struct libdivide_u64_branchfree_t *denom);

static LIBDIVIDE_INLINE int16_t libdivide_s16_recover(const struct libdivide_s16_t *denom);
static LIBDIVIDE_INLINE uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom);
static LIBDIVIDE_INLINE int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom);
static LIBDIVIDE_INLINE uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom);
static LIBDIVIDE_INLINE int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom);
static LIBDIVIDE_INLINE uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom);

static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_recover(
    const struct libdivide_s16_branchfree_t *denom);
static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_recover(
    const struct libdivide_u16_branchfree_t *denom);
static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_recover(
    const struct libdivide_s32_branchfree_t *denom);
static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_recover(
    const struct libdivide_u32_branchfree_t *denom);
static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_recover(
    const struct libdivide_s64_branchfree_t *denom);
static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_recover(
    const struct libdivide_u64_branchfree_t *denom);

//////// Internal Utility Functions

// Returns the upper 16 bits of the full 32-bit product x * y.
static LIBDIVIDE_INLINE uint16_t libdivide_mullhi_u16(uint16_t x, uint16_t y) {
    const uint32_t product = (uint32_t)x * (uint32_t)y;
    return (uint16_t)(product >> 16);
}

// Returns the upper 16 bits of the full signed 32-bit product x * y.
static LIBDIVIDE_INLINE int16_t libdivide_mullhi_s16(int16_t x, int16_t y) {
    const int32_t product = (int32_t)x * (int32_t)y;
    // The right shift has to be arithmetic so the sign carries into the result.
    return (int16_t)(product >> 16);
}

// Returns the upper 32 bits of the full 64-bit product x * y.
static LIBDIVIDE_INLINE uint32_t libdivide_mullhi_u32(uint32_t x, uint32_t y) {
    const uint64_t product = (uint64_t)x * (uint64_t)y;
    return (uint32_t)(product >> 32);
}

// Returns the upper 32 bits of the full signed 64-bit product x * y.
static LIBDIVIDE_INLINE int32_t libdivide_mullhi_s32(int32_t x, int32_t y) {
    const int64_t product = (int64_t)x * (int64_t)y;
    // The right shift has to be arithmetic so the sign carries into the result.
    return (int32_t)(product >> 32);
}

// Returns the upper 64 bits of the full 128-bit product x * y.
// Uses the __umulh intrinsic or a native 128-bit type when available and
// otherwise assembles the result from 32-bit partial products.
static LIBDIVIDE_INLINE uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) {
#if defined(LIBDIVIDE_MULH_INTRINSICS)
    return __umulh(x, y);
#elif defined(HAS_INT128_T)
    const __uint128_t wide = (__uint128_t)x * (__uint128_t)y;
    return (uint64_t)(wide >> 64);
#else
    // With x = x_hi:x_lo and y = y_hi:y_lo (32-bit halves) the product is
    // x_lo*y_lo + ((x_lo*y_hi + x_hi*y_lo) << 32) + (x_hi*y_hi << 64).
    const uint32_t low_mask = 0xFFFFFFFF;
    const uint32_t x_lo = (uint32_t)(x & low_mask);
    const uint32_t x_hi = (uint32_t)(x >> 32);
    const uint32_t y_lo = (uint32_t)(y & low_mask);
    const uint32_t y_hi = (uint32_t)(y >> 32);

    // Carry out of the lowest partial product.
    const uint32_t lo_lo_carry = libdivide_mullhi_u32(x_lo, y_lo);

    // First cross term plus that carry; cannot overflow 64 bits.
    const uint64_t cross_a = x_hi * (uint64_t)y_lo + lo_lo_carry;
    // Second cross term plus the low half of the first.
    const uint64_t cross_b = (cross_a & low_mask) + x_lo * (uint64_t)y_hi;

    return x_hi * (uint64_t)y_hi + (cross_a >> 32) + (cross_b >> 32);
#endif
}

// Returns the upper 64 bits of the full signed 128-bit product x * y.
// Uses the __mulh intrinsic or a native 128-bit type when available and
// otherwise assembles the result from 32-bit partial products.
static LIBDIVIDE_INLINE int64_t libdivide_mullhi_s64(int64_t x, int64_t y) {
#if defined(LIBDIVIDE_MULH_INTRINSICS)
    return __mulh(x, y);
#elif defined(HAS_INT128_T)
    const __int128_t wide = (__int128_t)x * (__int128_t)y;
    return (int64_t)(wide >> 64);
#else
    // Low halves are treated as unsigned digits, high halves keep the sign:
    // x*y = x_lo*y_lo + ((x_lo*y_hi + x_hi*y_lo) << 32) + (x_hi*y_hi << 64).
    const uint32_t low_mask = 0xFFFFFFFF;
    const uint32_t x_lo = (uint32_t)(x & low_mask);
    const uint32_t y_lo = (uint32_t)(y & low_mask);
    const int32_t x_hi = (int32_t)(x >> 32);
    const int32_t y_hi = (int32_t)(y >> 32);

    // Carry out of the lowest (unsigned) partial product.
    const uint32_t lo_lo_carry = libdivide_mullhi_u32(x_lo, y_lo);

    const int64_t cross_a = x_hi * (int64_t)y_lo + lo_lo_carry;
    const int64_t cross_b = x_lo * (int64_t)y_hi + (cross_a & low_mask);

    return x_hi * (int64_t)y_hi + (cross_a >> 32) + (cross_b >> 32);
#endif
}

// Counts the leading zero bits of a 16-bit value.
// The portable fallback returns 16 for val == 0.
static LIBDIVIDE_INLINE int16_t libdivide_count_leading_zeros16(uint16_t val) {
#if defined(__AVR__)
    // On the AVR 8-bit architecture __builtin_clz() works on a int16_t,
    // so its result can be returned directly.
    return __builtin_clz(val);
#elif defined(__GNUC__) || __has_builtin(__builtin_clz) || defined(_MSC_VER)
    // __builtin_clz takes an unsigned int; subtract the 16 bits that lie above
    // a 16-bit value (assumes unsigned int is 32 bits wide here).
    const int wide_count = __builtin_clz(val);
    return (int16_t)(wide_count - 16);
#else
    if (val == 0) return 16;

    // Slide a 4-bit window down from the top, one nibble at a time, until it
    // overlaps a set bit.
    uint16_t window = 0xFU << 12;
    int16_t zeros = 4;
    for (; (val & window) == 0; window >>= 4) {
        zeros += 4;
    }
    // Then narrow the window from the low side, taking back one count per
    // step while it still covers a set bit.
    for (; val & window; window <<= 1) {
        zeros -= 1;
    }
    return zeros;
#endif
}

// Counts the leading zero bits of a 32-bit value.
// The portable fallback returns 32 for val == 0.
static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros32(uint32_t val) {
#if defined(__AVR__)
    // AVR: use the `long` flavor of the builtin for a 32-bit operand.
    return __builtin_clzl(val);
#elif defined(__GNUC__) || __has_builtin(__builtin_clz) || defined(_MSC_VER)
    // Compiler builtin (or the MSVC stand-in defined earlier in this header).
    return __builtin_clz(val);
#else
    if (val == 0) return 32;

    // Slide an 8-bit window down from the top, one byte at a time, until it
    // overlaps a set bit.
    uint32_t window = 0xFFU << 24;
    int32_t zeros = 8;
    for (; (val & window) == 0; window >>= 8) {
        zeros += 8;
    }
    // Then narrow the window from the low side, taking back one count per
    // step while it still covers a set bit.
    for (; val & window; window <<= 1) {
        zeros -= 1;
    }
    return zeros;
#endif
}

// Counts the leading zero bits of a 64-bit value.
static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros64(uint64_t val) {
#if defined(__GNUC__) || __has_builtin(__builtin_clzll) || defined(_MSC_VER)
    // Compiler builtin (or the MSVC stand-in defined earlier in this header).
    return __builtin_clzll(val);
#else
    // Split into 32-bit words and delegate to the 32-bit counter.
    const uint32_t upper = val >> 32;
    if (upper != 0) {
        return libdivide_count_leading_zeros32(upper);
    }
    const uint32_t lower = val & 0xFFFFFFFF;
    return 32 + libdivide_count_leading_zeros32(lower);
#endif
}

// libdivide_32_div_16_to_16: divides the 32-bit unsigned value whose high
// half is u1 and low half is u0 by the 16-bit unsigned value v.
// The caller must guarantee that the quotient fits in 16 bits.
// Returns the quotient and stores the remainder in *r.
static LIBDIVIDE_INLINE uint16_t libdivide_32_div_16_to_16(
    uint16_t u1, uint16_t u0, uint16_t v, uint16_t *r) {
    const uint32_t dividend = ((uint32_t)u1 << 16) | u0;
    const uint16_t quotient = (uint16_t)(dividend / v);
    *r = (uint16_t)(dividend - quotient * (uint32_t)v);
    return quotient;
}

// libdivide_64_div_32_to_32: divides the 64-bit unsigned value whose high
// half is u1 and low half is u0 by the 32-bit unsigned value v.
// The caller must guarantee that the quotient fits in 32 bits.
// Returns the quotient and stores the remainder in *r.
static LIBDIVIDE_INLINE uint32_t libdivide_64_div_32_to_32(
    uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) {
#if (defined(LIBDIVIDE_i386) || defined(LIBDIVIDE_X86_64)) && defined(LIBDIVIDE_GCC_STYLE_ASM)
    // x86 `divl` divides edx:eax by the operand, leaving the quotient in eax
    // and the remainder in edx.
    uint32_t quotient;
    __asm__("divl %[v]" : "=a"(quotient), "=d"(*r) : [v] "r"(v), "a"(u0), "d"(u1));
    return quotient;
#else
    const uint64_t dividend = ((uint64_t)u1 << 32) | u0;
    const uint32_t quotient = (uint32_t)(dividend / v);
    *r = (uint32_t)(dividend - quotient * (uint64_t)v);
    return quotient;
#endif
}

// libdivide_128_div_64_to_64: divides a 128-bit uint {numhi, numlo} by a 64-bit uint {den}. The
// result must fit in 64 bits. Returns the quotient directly and the remainder in *r
//
// numhi, numlo: high and low 64 bits of the dividend
// den:          divisor
// r:            receives the remainder. The inline-asm path writes through r
//               unconditionally, so r must not be NULL there; only the portable
//               path tolerates a NULL r.
//
// In the portable path, numhi >= den (quotient overflow or den == 0) is
// reported by returning ~0 and storing ~0 in *r.
static LIBDIVIDE_INLINE uint64_t libdivide_128_div_64_to_64(
    uint64_t numhi, uint64_t numlo, uint64_t den, uint64_t *r) {
    // N.B. resist the temptation to use __uint128_t here.
    // In LLVM compiler-rt, it performs a 128/128 -> 128 division which is many times slower than
    // necessary. In gcc it's better but still slower than the divlu implementation, perhaps because
    // it's not LIBDIVIDE_INLINEd.
#if defined(LIBDIVIDE_X86_64) && defined(LIBDIVIDE_GCC_STYLE_ASM)
    // x86-64 `div` divides rdx:rax by the operand, leaving the quotient in
    // rax and the remainder in rdx.
    uint64_t result;
    __asm__("div %[v]" : "=a"(result), "=d"(*r) : [v] "r"(den), "a"(numlo), "d"(numhi));
    return result;
#else
    // We work in base 2**32.
    // A uint32 holds a single digit. A uint64 holds two digits.
    // Our numerator is conceptually [num3, num2, num1, num0].
    // Our denominator is [den1, den0].
    const uint64_t b = ((uint64_t)1 << 32);

    // The high and low digits of our computed quotient.
    uint32_t q1;
    uint32_t q0;

    // The normalization shift factor.
    int shift;

    // The high and low digits of our denominator (after normalizing).
    // Also the low 2 digits of our numerator (after normalizing).
    uint32_t den1;
    uint32_t den0;
    uint32_t num1;
    uint32_t num0;

    // A partial remainder.
    uint64_t rem;

    // The estimated quotient, and its corresponding remainder (unrelated to true remainder).
    uint64_t qhat;
    uint64_t rhat;

    // Variables used to correct the estimated quotient.
    uint64_t c1;
    uint64_t c2;

    // Check for overflow and divide by 0.
    if (numhi >= den) {
        if (r) *r = ~0ull;
        return ~0ull;
    }

    // Determine the normalization factor. We multiply den by this, so that its leading digit is at
    // least half b. In binary this means just shifting left by the number of leading zeros, so that
    // there's a 1 in the MSB.
    // We also shift numer by the same amount. This cannot overflow because numhi < den.
    // The expression (-shift & 63) is the same as (64 - shift), except it avoids the UB of shifting
    // by 64. The funny bitwise 'and' ensures that numlo does not get shifted into numhi if shift is
    // 0. clang 11 has an x86 codegen bug here: see LLVM bug 50118. The sequence below avoids it.
    shift = libdivide_count_leading_zeros64(den);
    den <<= shift;
    numhi <<= shift;
    numhi |= (numlo >> (-shift & 63)) & (uint64_t)(-(int64_t)shift >> 63);
    numlo <<= shift;

    // Extract the low digits of the numerator and both digits of the denominator.
    num1 = (uint32_t)(numlo >> 32);
    num0 = (uint32_t)(numlo & 0xFFFFFFFFu);
    den1 = (uint32_t)(den >> 32);
    den0 = (uint32_t)(den & 0xFFFFFFFFu);

    // We wish to compute q1 = [n3 n2 n1] / [d1 d0].
    // Estimate q1 as [n3 n2] / [d1], and then correct it.
    // Note while qhat may be 2 digits, q1 is always 1 digit.
    qhat = numhi / den1;
    rhat = numhi % den1;
    c1 = qhat * den0;
    c2 = rhat * b + num1;
    // The estimate is high by at most 2; step it down accordingly.
    if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1;
    q1 = (uint32_t)qhat;

    // Compute the true (partial) remainder.
    rem = numhi * b + num1 - q1 * den;

    // We wish to compute q0 = [rem1 rem0 n0] / [d1 d0].
    // Estimate q0 as [rem1 rem0] / [d1] and correct it.
    qhat = rem / den1;
    rhat = rem % den1;
    c1 = qhat * den0;
    c2 = rhat * b + num0;
    if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1;
    q0 = (uint32_t)qhat;

    // Return remainder if requested.
    // The shift undoes the normalization applied to the operands above.
    if (r) *r = (rem * b + num0 - q0 * den) >> shift;
    return ((uint64_t)q1 << 32) | q0;
#endif
}

#if !(defined(HAS_INT128_T) && \
      defined(HAS_INT128_DIV))

// Shifts the 128-bit value held in {*u1 (high word), *u0 (low word)} in place:
// left by signed_shift when it is positive, right by -signed_shift when it is
// negative. A shift of 0 leaves both words untouched, which also keeps the
// (64 - shift) sub-shifts below from ever shifting by 64.
static LIBDIVIDE_INLINE void libdivide_u128_shift(
    uint64_t *u1, uint64_t *u0, int32_t signed_shift) {
    if (signed_shift > 0) {
        const uint32_t left = signed_shift;
        // Bits leaving the top of the low word enter the bottom of the high word.
        *u1 = (*u1 << left) | (*u0 >> (64 - left));
        *u0 <<= left;
    } else if (signed_shift < 0) {
        const uint32_t right = -signed_shift;
        // Bits leaving the bottom of the high word enter the top of the low word.
        *u0 = (*u0 >> right) | (*u1 << (64 - right));
        *u1 >>= right;
    }
}

#endif

// Computes a 128 / 128 -> 64 bit division, with a 128 bit remainder.
//
// u_hi, u_lo: high and low 64 bits of the dividend u
// v_hi, v_lo: high and low 64 bits of the divisor v
// r_hi, r_lo: receive the high and low 64 bits of the remainder; both are
//             written unconditionally, so neither may be NULL
//
// Returns the low 64 bits of the quotient; the caller is expected to pass
// operands whose quotient fits in 64 bits (the portable path asserts this
// when LIBDIVIDE_ASSERT is active).
static LIBDIVIDE_INLINE uint64_t libdivide_128_div_128_to_64(
    uint64_t u_hi, uint64_t u_lo, uint64_t v_hi, uint64_t v_lo, uint64_t *r_hi, uint64_t *r_lo) {
#if defined(HAS_INT128_T) && defined(HAS_INT128_DIV)
    // Native 128-bit arithmetic is available: assemble both operands and let
    // the compiler do the division.
    __uint128_t ufull = u_hi;
    __uint128_t vfull = v_hi;
    ufull = (ufull << 64) | u_lo;
    vfull = (vfull << 64) | v_lo;
    uint64_t res = (uint64_t)(ufull / vfull);
    __uint128_t remainder = ufull - (vfull * res);
    *r_lo = (uint64_t)remainder;
    *r_hi = (uint64_t)(remainder >> 64);
    return res;
#else
    // Adapted from "Unsigned Doubleword Division" in Hacker's Delight
    // We want to compute u / v
    typedef struct {
        uint64_t hi;
        uint64_t lo;
    } u128_t;
    u128_t u = {u_hi, u_lo};
    u128_t v = {v_hi, v_lo};

    if (v.hi == 0) {
        // divisor v is a 64 bit value, so we just need one 128/64 division
        // Note that we are simpler than Hacker's Delight here, because we know
        // the quotient fits in 64 bits whereas Hacker's Delight demands a full
        // 128 bit quotient
        *r_hi = 0;
        return libdivide_128_div_64_to_64(u.hi, u.lo, v.lo, r_lo);
    }
    // Here v >= 2**64
    // We know that v.hi != 0, so count leading zeros is OK
    // We have 0 <= n <= 63
    uint32_t n = libdivide_count_leading_zeros64(v.hi);

    // Normalize the divisor so its MSB is 1
    u128_t v1t = v;
    libdivide_u128_shift(&v1t.hi, &v1t.lo, n);
    uint64_t v1 = v1t.hi;  // i.e. v1 = v1t >> 64

    // To ensure no overflow
    u128_t u1 = u;
    libdivide_u128_shift(&u1.hi, &u1.lo, -1);

    // Get quotient from divide unsigned insn.
    uint64_t rem_ignored;
    uint64_t q1 = libdivide_128_div_64_to_64(u1.hi, u1.lo, v1, &rem_ignored);

    // Undo normalization and division of u by 2.
    u128_t q0 = {0, q1};
    libdivide_u128_shift(&q0.hi, &q0.lo, n);
    libdivide_u128_shift(&q0.hi, &q0.lo, -63);

    // Make q0 correct or too small by 1
    // Equivalent to `if (q0 != 0) q0 = q0 - 1;`
    if (q0.hi != 0 || q0.lo != 0) {
        q0.hi -= (q0.lo == 0);  // borrow
        q0.lo -= 1;
    }

    // Now q0 is correct.
    // Compute q0 * v as q0v
    // = (q0.hi << 64 + q0.lo) * (v.hi << 64 + v.lo)
    // = (q0.hi * v.hi << 128) + (q0.hi * v.lo << 64) +
    //   (q0.lo * v.hi <<  64) + q0.lo * v.lo)
    // Each term is 128 bit
    // High half of full product (upper 128 bits!) are dropped
    u128_t q0v = {0, 0};
    q0v.hi = q0.hi * v.lo + q0.lo * v.hi + libdivide_mullhi_u64(q0.lo, v.lo);
    q0v.lo = q0.lo * v.lo;

    // Compute u - q0v as u_q0v
    // This is the remainder
    u128_t u_q0v = u;
    u_q0v.hi -= q0v.hi + (u.lo < q0v.lo);  // second term is borrow
    u_q0v.lo -= q0v.lo;

    // Check if u_q0v >= v
    // This checks if our remainder is larger than the divisor
    if ((u_q0v.hi > v.hi) || (u_q0v.hi == v.hi && u_q0v.lo >= v.lo)) {
        // Increment q0
        q0.lo += 1;
        q0.hi += (q0.lo == 0);  // carry

        // Subtract v from remainder
        u_q0v.hi -= v.hi + (u_q0v.lo < v.lo);
        u_q0v.lo -= v.lo;
    }

    *r_hi = u_q0v.hi;
    *r_lo = u_q0v.lo;

    // The quotient is expected to fit in the low word.
    LIBDIVIDE_ASSERT(q0.hi == 0);
    return q0.lo;
#endif
}

////////// UINT16

// Builds the magic number and "more" byte for dividing 16-bit unsigned
// values by d.
//
// d:          the divisor; 0 is reported through LIBDIVIDE_ERROR
// branchfree: nonzero when the result is destined for the branchfree
//             division routine, which always takes the add path and has a
//             hardcoded right shift by 1
static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_internal_u16_gen(
    uint16_t d, int branchfree) {
    if (d == 0) {
        LIBDIVIDE_ERROR("divider must be != 0");
    }

    struct libdivide_u16_t result;
    const uint8_t log2_floor = (uint8_t)(15 - libdivide_count_leading_zeros16(d));

    if ((d & (d - 1)) == 0) {
        // Power of 2: no multiplication needed, magic == 0 marks the shift path.
        // The unsigned branchfree divider shifts right by 1 unconditionally, so
        // its stored shift is one less (and its recovery adds the 1 back).
        result.magic = 0;
        result.more = (uint8_t)(log2_floor - (branchfree != 0));
        return result;
    }

    // Start from floor(2**(16 + log2_floor) / d) and its remainder.
    uint16_t rem;
    uint16_t proposed_m = libdivide_32_div_16_to_16((uint16_t)1 << log2_floor, 0, d, &rem);

    LIBDIVIDE_ASSERT(rem > 0 && rem < d);
    const uint16_t e = d - rem;

    // The smaller power is sufficient if e < 2**log2_floor, but only the
    // branching divider is allowed to take advantage of it.
    const int smaller_power_works = !branchfree && (e < ((uint16_t)1 << log2_floor));

    uint8_t more = log2_floor;
    if (!smaller_power_works) {
        // General 17-bit algorithm: we need (2**power) / d but only have
        // (2**(power-1)) / d and its remainder. Double both and correct for
        // the doubled remainder. Overflow of proposed_m is expected here.
        const uint16_t twice_rem = rem + rem;
        proposed_m += proposed_m;
        if (twice_rem >= d || twice_rem < rem) proposed_m += 1;
        more = log2_floor | LIBDIVIDE_ADD_MARKER;
    }

    // The shift would in general be ceil_log_2_d. With the smaller power we
    // subtract one because of that smaller power; with the larger power we
    // subtract one because the add indicator accounts for it. Either way the
    // floor value is the right shift to store.
    result.magic = 1 + proposed_m;
    result.more = more;
    return result;
}

// Builds a branchfull unsigned 16-bit divider for d by delegating to the
// shared generator with the branchfree flag cleared.
static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_u16_gen(uint16_t d) {
    const int branchfree = 0;
    return libdivide_internal_u16_gen(d, branchfree);
}

static LIBDIVIDE_INLINE struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d) {
    if (d == 1) {
        LIBDIVIDE_ERROR("branchfree divider must be != 1");
    }
    struct libdivide_u16_t tmp = libdivide_internal_u16_gen(d, 1);
    struct libdivide_u16_branchfree_t ret = {
        tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_16_SHIFT_MASK)};
    return ret;
}

// Divides numer using a divider passed by value (magic, more).
// libdivide_u16_do takes a const pointer, which cannot be used with a
// compile time constant libdivide_u16_t: it would generate a warning about
// taking the address of a temporary. Hence this overload.
static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw(uint16_t numer, uint16_t magic, uint8_t more) {
    // A zero magic number marks a power-of-2 divisor: a plain shift suffices.
    if (magic == 0) {
        return numer >> more;
    }

    const uint16_t hi = libdivide_mullhi_u16(numer, magic);
    if ((more & LIBDIVIDE_ADD_MARKER) == 0) {
        // All upper bits of `more` are 0 here, so no masking is required.
        return hi >> more;
    }

    // Add-marker path: combine numer and hi as ((numer - hi) / 2) + hi
    // before applying the masked shift.
    const uint16_t avg = ((numer - hi) >> 1) + hi;
    return avg >> (more & LIBDIVIDE_16_SHIFT_MASK);
}

// Divides numer by the divisor encoded in *denom (branchfull variant).
static LIBDIVIDE_INLINE uint16_t libdivide_u16_do(uint16_t numer, const struct libdivide_u16_t *denom) {
    const uint16_t magic = denom->magic;
    const uint8_t more = denom->more;
    return libdivide_u16_do_raw(numer, magic, more);
}

// Divides numer by the divisor encoded in *denom without branching: the
// high half of numer * magic is combined with numer, then shifted by the
// stored shift amount.
static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_do(
    uint16_t numer, const struct libdivide_u16_branchfree_t *denom) {
    const uint16_t hi = libdivide_mullhi_u16(numer, denom->magic);
    // The hardcoded right shift by 1 lives here.
    const uint16_t sum = ((numer - hi) >> 1) + hi;
    return sum >> denom->more;
}

// Recovers the divisor that a branchfull unsigned 16-bit divider was
// generated from, using the stored magic number and `more` byte.
static LIBDIVIDE_INLINE uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;

    if (!denom->magic) {
        // Zero magic marks a power of 2: the divisor is 2^shift.
        return (uint16_t)1 << shift;
    } else if (!(more & LIBDIVIDE_ADD_MARKER)) {
        // We compute q = n/d = n*m / 2^(16 + shift)
        // Therefore we have d = 2^(16 + shift) / m
        // We need to ceil it.
        // We know d is not a power of 2, so m is not a power of 2,
        // so we can just add 1 to the floor
        uint16_t hi_dividend = (uint16_t)1 << shift;
        uint16_t rem_ignored;
        return 1 + libdivide_32_div_16_to_16(hi_dividend, 0, denom->magic, &rem_ignored);
    } else {
        // Here we wish to compute d = 2^(16+shift+1)/(m+2^16).
        // Notice (m + 2^16) is a 17 bit number. Use 32 bit division for now
        // Also note that shift may be as high as 15, so shift + 1 will
        // overflow. So we have to compute it as 2^(16+shift)/(m+2^16), and
        // then double the quotient and remainder.
        uint32_t half_n = (uint32_t)1 << (16 + shift);
        uint32_t d = ((uint32_t)1 << 16) | denom->magic;
        // Note that the quotient is guaranteed <= 16 bits, but the remainder
        // may need 17!
        uint16_t half_q = (uint16_t)(half_n / d);
        uint32_t rem = half_n % d;
        // We computed 2^(16+shift)/(m+2^16)
        // Need to double it, and then add 1 to the quotient if doubling the
        // remainder would increase the quotient.
        // Note that rem<<1 cannot overflow, since rem < d and d is 17 bits
        uint16_t full_q = half_q + half_q + ((rem << 1) >= d);

        // We rounded down in gen (hence +1)
        return full_q + 1;
    }
}

// Recovers the divisor that a branchfree unsigned 16-bit divider was
// generated from.
static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_recover(const struct libdivide_u16_branchfree_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;

    if (!denom->magic) {
        // Power of 2: the branchfree generator stored the shift reduced by 1
        // (to compensate for the hardcoded shift in the division), so add
        // it back here.
        return (uint16_t)1 << (shift + 1);
    } else {
        // Here we wish to compute d = 2^(16+shift+1)/(m+2^16).
        // Notice (m + 2^16) is a 17 bit number. Use 32 bit division for now
        // Also note that shift may be as high as 15, so shift + 1 will
        // overflow. So we have to compute it as 2^(16+shift)/(m+2^16), and
        // then double the quotient and remainder.
        uint32_t half_n = (uint32_t)1 << (16 + shift);
        uint32_t d = ((uint32_t)1 << 16) | denom->magic;
        // Note that the quotient is guaranteed <= 16 bits, but the remainder
        // may need 17!
        uint16_t half_q = (uint16_t)(half_n / d);
        uint32_t rem = half_n % d;
        // We computed 2^(16+shift)/(m+2^16)
        // Need to double it, and then add 1 to the quotient if doubling the
        // remainder would increase the quotient.
        // Note that rem<<1 cannot overflow, since rem < d and d is 17 bits
        uint16_t full_q = half_q + half_q + ((rem << 1) >= d);

        // We rounded down in gen (hence +1)
        return full_q + 1;
    }
}

////////// UINT32

// Computes the magic number and `more` byte for dividing uint32_t values by
// d. A zero d is reported through LIBDIVIDE_ERROR. When branchfree is
// nonzero, the general 33-bit algorithm is always used for divisors that are
// not a power of 2, and the shift stored for a power of 2 is reduced by 1.
static LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_internal_u32_gen(
    uint32_t d, int branchfree) {
    if (d == 0) {
        LIBDIVIDE_ERROR("divider must be != 0");
    }

    struct libdivide_u32_t result;
    uint32_t floor_log_2_d = 31 - libdivide_count_leading_zeros32(d);

    // Power of 2
    if ((d & (d - 1)) == 0) {
        // We need to subtract 1 from the shift value in case of an unsigned
        // branchfree divider because there is a hardcoded right shift by 1
        // in its division algorithm. Because of this we also need to add back
        // 1 in its recovery algorithm.
        result.magic = 0;
        result.more = (uint8_t)(floor_log_2_d - (branchfree != 0));
    } else {
        uint8_t more;
        uint32_t rem, proposed_m;
        // (1 << (32 + floor_log_2_d)) / d
        proposed_m = libdivide_64_div_32_to_32((uint32_t)1 << floor_log_2_d, 0, d, &rem);

        LIBDIVIDE_ASSERT(rem > 0 && rem < d);
        const uint32_t e = d - rem;

        // This power works if e < 2**floor_log_2_d.
        if (!branchfree && (e < ((uint32_t)1 << floor_log_2_d))) {
            // This power works
            more = (uint8_t)floor_log_2_d;
        } else {
            // We have to use the general 33-bit algorithm.  We need to compute
            // (2**power) / d. However, we already have (2**(power-1))/d and
            // its remainder.  By doubling both, and then correcting the
            // remainder, we can compute the larger division.
            // don't care about overflow here - in fact, we expect it
            proposed_m += proposed_m;
            const uint32_t twice_rem = rem + rem;
            // The second test catches twice_rem having wrapped around.
            if (twice_rem >= d || twice_rem < rem) proposed_m += 1;
            more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);
        }
        result.magic = 1 + proposed_m;
        result.more = more;
        // result.more's shift should in general be ceil_log_2_d. But if we
        // used the smaller power, we subtract one from the shift because we're
        // using the smaller power. If we're using the larger power, we
        // subtract one from the shift because it's taken care of by the add
        // indicator. So floor_log_2_d happens to be correct in both cases.
    }
    return result;
}

// Builds a branchfull unsigned 32-bit divider for d by delegating to the
// shared generator with the branchfree flag cleared.
static LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_u32_gen(uint32_t d) {
    const int branchfree = 0;
    return libdivide_internal_u32_gen(d, branchfree);
}

static LIBDIVIDE_INLINE struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d) {
    if (d == 1) {
        LIBDIVIDE_ERROR("branchfree divider must be != 1");
    }
    struct libdivide_u32_t tmp = libdivide_internal_u32_gen(d, 1);
    struct libdivide_u32_branchfree_t ret = {
        tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_32_SHIFT_MASK)};
    return ret;
}

// Divides numer using a divider passed by value (magic, more) rather than
// through a pointer to a libdivide_u32_t.
static LIBDIVIDE_INLINE uint32_t libdivide_u32_do_raw(uint32_t numer, uint32_t magic, uint8_t more) {
    // A zero magic number marks a power-of-2 divisor: a plain shift suffices.
    if (magic == 0) {
        return numer >> more;
    }

    const uint32_t hi = libdivide_mullhi_u32(numer, magic);
    if ((more & LIBDIVIDE_ADD_MARKER) == 0) {
        // All upper bits of `more` are 0 here, so no masking is required.
        return hi >> more;
    }

    // Add-marker path: combine numer and hi as ((numer - hi) / 2) + hi
    // before applying the masked shift.
    const uint32_t avg = ((numer - hi) >> 1) + hi;
    return avg >> (more & LIBDIVIDE_32_SHIFT_MASK);
}

// Divides numer by the divisor encoded in *denom (branchfull variant).
static LIBDIVIDE_INLINE uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) {
    const uint32_t magic = denom->magic;
    const uint8_t more = denom->more;
    return libdivide_u32_do_raw(numer, magic, more);
}

// Divides numer by the divisor encoded in *denom without branching: the
// high half of numer * magic is combined with numer, then shifted by the
// stored shift amount.
static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_do(
    uint32_t numer, const struct libdivide_u32_branchfree_t *denom) {
    const uint32_t hi = libdivide_mullhi_u32(numer, denom->magic);
    // The hardcoded right shift by 1 lives here.
    const uint32_t sum = ((numer - hi) >> 1) + hi;
    return sum >> denom->more;
}

// Recovers the divisor that a branchfull unsigned 32-bit divider was
// generated from, using the stored magic number and `more` byte.
static LIBDIVIDE_INLINE uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;

    if (!denom->magic) {
        // Zero magic marks a power of 2: the divisor is 2^shift.
        return (uint32_t)1 << shift;
    } else if (!(more & LIBDIVIDE_ADD_MARKER)) {
        // We compute q = n/d = n*m / 2^(32 + shift)
        // Therefore we have d = 2^(32 + shift) / m
        // We need to ceil it.
        // We know d is not a power of 2, so m is not a power of 2,
        // so we can just add 1 to the floor
        uint32_t hi_dividend = (uint32_t)1 << shift;
        uint32_t rem_ignored;
        return 1 + libdivide_64_div_32_to_32(hi_dividend, 0, denom->magic, &rem_ignored);
    } else {
        // Here we wish to compute d = 2^(32+shift+1)/(m+2^32).
        // Notice (m + 2^32) is a 33 bit number. Use 64 bit division for now
        // Also note that shift may be as high as 31, so shift + 1 will
        // overflow. So we have to compute it as 2^(32+shift)/(m+2^32), and
        // then double the quotient and remainder.
        uint64_t half_n = (uint64_t)1 << (32 + shift);
        uint64_t d = ((uint64_t)1 << 32) | denom->magic;
        // Note that the quotient is guaranteed <= 32 bits, but the remainder
        // may need 33!
        uint32_t half_q = (uint32_t)(half_n / d);
        uint64_t rem = half_n % d;
        // We computed 2^(32+shift)/(m+2^32)
        // Need to double it, and then add 1 to the quotient if doubling the
        // remainder would increase the quotient.
        // Note that rem<<1 cannot overflow, since rem < d and d is 33 bits
        uint32_t full_q = half_q + half_q + ((rem << 1) >= d);

        // We rounded down in gen (hence +1)
        return full_q + 1;
    }
}

// Recovers the divisor that a branchfree unsigned 32-bit divider was
// generated from.
static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;

    if (!denom->magic) {
        // Power of 2: the branchfree generator stored the shift reduced by 1
        // (to compensate for the hardcoded shift in the division), so add
        // it back here.
        return (uint32_t)1 << (shift + 1);
    } else {
        // Here we wish to compute d = 2^(32+shift+1)/(m+2^32).
        // Notice (m + 2^32) is a 33 bit number. Use 64 bit division for now
        // Also note that shift may be as high as 31, so shift + 1 will
        // overflow. So we have to compute it as 2^(32+shift)/(m+2^32), and
        // then double the quotient and remainder.
        uint64_t half_n = (uint64_t)1 << (32 + shift);
        uint64_t d = ((uint64_t)1 << 32) | denom->magic;
        // Note that the quotient is guaranteed <= 32 bits, but the remainder
        // may need 33!
        uint32_t half_q = (uint32_t)(half_n / d);
        uint64_t rem = half_n % d;
        // We computed 2^(32+shift)/(m+2^32)
        // Need to double it, and then add 1 to the quotient if doubling the
        // remainder would increase the quotient.
        // Note that rem<<1 cannot overflow, since rem < d and d is 33 bits
        uint32_t full_q = half_q + half_q + ((rem << 1) >= d);

        // We rounded down in gen (hence +1)
        return full_q + 1;
    }
}

////////// UINT64

// Computes the magic number and `more` byte for dividing uint64_t values by
// d. A zero d is reported through LIBDIVIDE_ERROR. When branchfree is
// nonzero, the general 65-bit algorithm is always used for divisors that are
// not a power of 2, and the shift stored for a power of 2 is reduced by 1.
static LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_internal_u64_gen(
    uint64_t d, int branchfree) {
    if (d == 0) {
        LIBDIVIDE_ERROR("divider must be != 0");
    }

    struct libdivide_u64_t result;
    uint32_t floor_log_2_d = 63 - libdivide_count_leading_zeros64(d);

    // Power of 2
    if ((d & (d - 1)) == 0) {
        // We need to subtract 1 from the shift value in case of an unsigned
        // branchfree divider because there is a hardcoded right shift by 1
        // in its division algorithm. Because of this we also need to add back
        // 1 in its recovery algorithm.
        result.magic = 0;
        result.more = (uint8_t)(floor_log_2_d - (branchfree != 0));
    } else {
        uint64_t proposed_m, rem;
        uint8_t more;
        // (1 << (64 + floor_log_2_d)) / d
        proposed_m = libdivide_128_div_64_to_64((uint64_t)1 << floor_log_2_d, 0, d, &rem);

        LIBDIVIDE_ASSERT(rem > 0 && rem < d);
        const uint64_t e = d - rem;

        // This power works if e < 2**floor_log_2_d.
        if (!branchfree && e < ((uint64_t)1 << floor_log_2_d)) {
            // This power works
            more = (uint8_t)floor_log_2_d;
        } else {
            // We have to use the general 65-bit algorithm.  We need to compute
            // (2**power) / d. However, we already have (2**(power-1))/d and
            // its remainder. By doubling both, and then correcting the
            // remainder, we can compute the larger division.
            // don't care about overflow here - in fact, we expect it
            proposed_m += proposed_m;
            const uint64_t twice_rem = rem + rem;
            // The second test catches twice_rem having wrapped around.
            if (twice_rem >= d || twice_rem < rem) proposed_m += 1;
            more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);
        }
        result.magic = 1 + proposed_m;
        result.more = more;
        // result.more's shift should in general be ceil_log_2_d. But if we
        // used the smaller power, we subtract one from the shift because we're
        // using the smaller power. If we're using the larger power, we
        // subtract one from the shift because it's taken care of by the add
        // indicator. So floor_log_2_d happens to be correct in both cases,
        // which is why we do it outside of the if statement.
    }
    return result;
}

// Builds a branchfull unsigned 64-bit divider for d by delegating to the
// shared generator with the branchfree flag cleared.
static LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_u64_gen(uint64_t d) {
    const int branchfree = 0;
    return libdivide_internal_u64_gen(d, branchfree);
}

static LIBDIVIDE_INLINE struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d) {
    if (d == 1) {
        LIBDIVIDE_ERROR("branchfree divider must be != 1");
    }
    struct libdivide_u64_t tmp = libdivide_internal_u64_gen(d, 1);
    struct libdivide_u64_branchfree_t ret = {
        tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_64_SHIFT_MASK)};
    return ret;
}

// Divides numer using a divider passed by value (magic, more) rather than
// through a pointer to a libdivide_u64_t.
static LIBDIVIDE_INLINE uint64_t libdivide_u64_do_raw(uint64_t numer, uint64_t magic, uint8_t more) {
    // A zero magic number marks a power-of-2 divisor: a plain shift suffices.
    if (magic == 0) {
        return numer >> more;
    }

    const uint64_t hi = libdivide_mullhi_u64(numer, magic);
    if ((more & LIBDIVIDE_ADD_MARKER) == 0) {
        // All upper bits of `more` are 0 here, so no masking is required.
        return hi >> more;
    }

    // Add-marker path: combine numer and hi as ((numer - hi) / 2) + hi
    // before applying the masked shift.
    const uint64_t avg = ((numer - hi) >> 1) + hi;
    return avg >> (more & LIBDIVIDE_64_SHIFT_MASK);
}

// Divides numer by the divisor encoded in *denom (branchfull variant).
static LIBDIVIDE_INLINE uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) {
    const uint64_t magic = denom->magic;
    const uint8_t more = denom->more;
    return libdivide_u64_do_raw(numer, magic, more);
}

// Divides numer by the divisor encoded in *denom without branching: the
// high half of numer * magic is combined with numer, then shifted by the
// stored shift amount.
static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_do(
    uint64_t numer, const struct libdivide_u64_branchfree_t *denom) {
    const uint64_t hi = libdivide_mullhi_u64(numer, denom->magic);
    // The hardcoded right shift by 1 lives here.
    const uint64_t sum = ((numer - hi) >> 1) + hi;
    return sum >> denom->more;
}

// Recovers the divisor that a branchfull unsigned 64-bit divider was
// generated from, using the stored magic number and `more` byte.
static LIBDIVIDE_INLINE uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;

    if (!denom->magic) {
        // Zero magic marks a power of 2: the divisor is 2^shift.
        return (uint64_t)1 << shift;
    } else if (!(more & LIBDIVIDE_ADD_MARKER)) {
        // We compute q = n/d = n*m / 2^(64 + shift)
        // Therefore we have d = 2^(64 + shift) / m
        // We need to ceil it.
        // We know d is not a power of 2, so m is not a power of 2,
        // so we can just add 1 to the floor
        uint64_t hi_dividend = (uint64_t)1 << shift;
        uint64_t rem_ignored;
        return 1 + libdivide_128_div_64_to_64(hi_dividend, 0, denom->magic, &rem_ignored);
    } else {
        // Here we wish to compute d = 2^(64+shift+1)/(m+2^64).
        // Notice (m + 2^64) is a 65 bit number. This gets hairy. See
        // libdivide_u32_recover for more on what we do here.
        // TODO: do something better than 128 bit math

        // Full n is a (potentially) 129 bit value
        // half_n is a 128 bit value
        // Compute the hi half of half_n. Low half is 0.
        uint64_t half_n_hi = (uint64_t)1 << shift, half_n_lo = 0;
        // d is a 65 bit value. The high bit is always set to 1.
        const uint64_t d_hi = 1, d_lo = denom->magic;
        // Note that the quotient is guaranteed <= 64 bits,
        // but the remainder may need 65!
        uint64_t r_hi, r_lo;
        uint64_t half_q =
            libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo);
        // We computed 2^(64+shift)/(m+2^64)
        // Double the remainder ('dr') and check if that is larger than d
        // Note that d is a 65 bit value, so r_hi is small (at most 1) and
        // r_hi + r_hi cannot overflow
        uint64_t dr_lo = r_lo + r_lo;
        uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo);  // last term is carry
        int dr_exceeds_d = (dr_hi > d_hi) || (dr_hi == d_hi && dr_lo >= d_lo);
        uint64_t full_q = half_q + half_q + (dr_exceeds_d ? 1 : 0);
        // We rounded down in gen (hence +1)
        return full_q + 1;
    }
}

// Recovers the divisor that a branchfree unsigned 64-bit divider was
// generated from.
static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;

    if (!denom->magic) {
        // Power of 2: the branchfree generator stored the shift reduced by 1
        // (to compensate for the hardcoded shift in the division), so add
        // it back here.
        return (uint64_t)1 << (shift + 1);
    } else {
        // Here we wish to compute d = 2^(64+shift+1)/(m+2^64).
        // Notice (m + 2^64) is a 65 bit number. This gets hairy. See
        // libdivide_u32_recover for more on what we do here.
        // TODO: do something better than 128 bit math

        // Full n is a (potentially) 129 bit value
        // half_n is a 128 bit value
        // Compute the hi half of half_n. Low half is 0.
        uint64_t half_n_hi = (uint64_t)1 << shift, half_n_lo = 0;
        // d is a 65 bit value. The high bit is always set to 1.
        const uint64_t d_hi = 1, d_lo = denom->magic;
        // Note that the quotient is guaranteed <= 64 bits,
        // but the remainder may need 65!
        uint64_t r_hi, r_lo;
        uint64_t half_q =
            libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo);
        // We computed 2^(64+shift)/(m+2^64)
        // Double the remainder ('dr') and check if that is larger than d
        // Note that d is a 65 bit value, so r_hi is small (at most 1) and
        // r_hi + r_hi cannot overflow
        uint64_t dr_lo = r_lo + r_lo;
        uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo);  // last term is carry
        int dr_exceeds_d = (dr_hi > d_hi) || (dr_hi == d_hi && dr_lo >= d_lo);
        uint64_t full_q = half_q + half_q + (dr_exceeds_d ? 1 : 0);
        // We rounded down in gen (hence +1)
        return full_q + 1;
    }
}

////////// SINT16

// Computes the magic number and `more` byte for dividing int16_t values by
// d. A zero d is reported through LIBDIVIDE_ERROR. When branchfree is
// nonzero, the larger power (with LIBDIVIDE_ADD_MARKER) is always used for
// divisors that are not +/- a power of 2, and the magic number is never
// negated.
static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_internal_s16_gen(
    int16_t d, int branchfree) {
    if (d == 0) {
        LIBDIVIDE_ERROR("divider must be != 0");
    }

    struct libdivide_s16_t result;

    // If d is a power of 2, or negative a power of 2, we have to use a shift.
    // This is especially important because the magic algorithm fails for -1.
    // To check if d is a power of 2 or its inverse, it suffices to check
    // whether its absolute value has exactly one bit set. This works even for
    // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set
    // and is a power of 2.
    uint16_t ud = (uint16_t)d;
    uint16_t absD = (d < 0) ? -ud : ud;
    uint16_t floor_log_2_d = 15 - libdivide_count_leading_zeros16(absD);
    // check if exactly one bit is set,
    // don't care if absD is 0 since that's divide by zero
    if ((absD & (absD - 1)) == 0) {
        // Branchfree and normal paths are exactly the same
        result.magic = 0;
        result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0));
    } else {
        LIBDIVIDE_ASSERT(floor_log_2_d >= 1);

        uint8_t more;
        // the dividend here is 2**(floor_log_2_d + 15), so the low 16 bit word
        // is 0 and the high word is 2**(floor_log_2_d - 1)
        uint16_t rem, proposed_m;
        proposed_m = libdivide_32_div_16_to_16((uint16_t)1 << (floor_log_2_d - 1), 0, absD, &rem);
        const uint16_t e = absD - rem;

        // We are going to start with a power of floor_log_2_d - 1.
        // This works if e < 2**floor_log_2_d.
        if (!branchfree && e < ((uint16_t)1 << floor_log_2_d)) {
            // This power works
            more = (uint8_t)(floor_log_2_d - 1);
        } else {
            // We need to go one higher. This should not make proposed_m
            // overflow, but it will make it negative when interpreted as an
            // int16_t.
            proposed_m += proposed_m;
            const uint16_t twice_rem = rem + rem;
            if (twice_rem >= absD || twice_rem < rem) proposed_m += 1;
            more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);
        }

        proposed_m += 1;
        int16_t magic = (int16_t)proposed_m;

        // Mark if we are negative. Note we only negate the magic number in the
        // branchfull case.
        if (d < 0) {
            more |= LIBDIVIDE_NEGATIVE_DIVISOR;
            if (!branchfree) {
                magic = -magic;
            }
        }

        result.more = more;
        result.magic = magic;
    }
    return result;
}

// Builds a branchfull signed 16-bit divider for d by delegating to the
// shared generator with the branchfree flag cleared.
static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_s16_gen(int16_t d) {
    const int branchfree = 0;
    return libdivide_internal_s16_gen(d, branchfree);
}

static LIBDIVIDE_INLINE struct libdivide_s16_branchfree_t libdivide_s16_branchfree_gen(int16_t d) {
    struct libdivide_s16_t tmp = libdivide_internal_s16_gen(d, 1);
    struct libdivide_s16_branchfree_t result = {tmp.magic, tmp.more};
    return result;
}

// Divides numer using a divider passed by value (magic, more).
// libdivide_s16_do takes a const pointer, which cannot be used with a
// compile time constant libdivide_s16_t: it would generate a warning about
// taking the address of a temporary. Hence this overload.
static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw(int16_t numer, int16_t magic, uint8_t more) {
    const uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;

    if (magic == 0) {
        // Shift path (zero magic). neg_mask is all ones when the top bit of
        // `more` is set, otherwise 0.
        const uint16_t neg_mask = (int8_t)more >> 7;
        const uint16_t round_mask = ((uint16_t)1 << shift) - 1;
        // Bias negative numerators by 2**shift - 1 before shifting.
        const uint16_t biased = numer + ((numer >> 15) & round_mask);
        int16_t quot = (int16_t)biased;
        quot >>= shift;
        // Conditionally negate: (x ^ -1) - (-1) == -x, (x ^ 0) - 0 == x.
        quot = (quot ^ neg_mask) - neg_mask;
        return quot;
    }

    uint16_t acc = (uint16_t)libdivide_mullhi_s16(numer, magic);
    if (more & LIBDIVIDE_ADD_MARKER) {
        // must be arithmetic shift and then sign extend
        const int16_t neg_mask = (int8_t)more >> 7;
        // acc += (more < 0 ? -numer : numer)
        // cast required to avoid UB
        acc += ((uint16_t)numer ^ neg_mask) - neg_mask;
    }
    int16_t quot = (int16_t)acc;
    quot >>= shift;
    // Add 1 when the shifted value is negative.
    quot += (quot < 0);
    return quot;
}

// Divides numer by the divisor encoded in *denom (branchfull variant).
static LIBDIVIDE_INLINE int16_t libdivide_s16_do(int16_t numer, const struct libdivide_s16_t *denom) {
    const int16_t magic = denom->magic;
    const uint8_t more = denom->more;
    return libdivide_s16_do_raw(numer, magic, more);
}

// Divides numer by the divisor encoded in *denom using straight-line code
// (no branches on the divider's contents).
static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_do(int16_t numer, const struct libdivide_s16_branchfree_t *denom) {
    const uint8_t more = denom->more;
    const int16_t magic = denom->magic;
    const uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
    // must be arithmetic shift and then sign extend
    const int16_t neg_mask = (int8_t)more >> 7;

    int16_t quot = libdivide_mullhi_s16(numer, magic);
    quot += numer;

    // A non-negative quot needs no correction. A negative quot gets
    // (2**shift)-1 added when d is a power of 2, or (2**shift) when it is
    // not a power of 2.
    const uint16_t pow2 = (magic == 0);
    const uint16_t quot_sign = (uint16_t)(quot >> 15);
    quot += quot_sign & (((uint16_t)1 << shift) - pow2);

    // Arithmetic right shift, then negate if needed.
    quot >>= shift;
    quot = (quot ^ neg_mask) - neg_mask;

    return quot;
}

// Recovers the divisor that a signed 16-bit divider was generated from,
// using the stored magic number and `more` byte.
static LIBDIVIDE_INLINE int16_t libdivide_s16_recover(const struct libdivide_s16_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
    if (!denom->magic) {
        // Power of 2 case: |d| == 2**shift. The negation is done on the
        // unsigned value before converting back to int16_t.
        uint16_t absD = (uint16_t)1 << shift;
        if (more & LIBDIVIDE_NEGATIVE_DIVISOR) {
            absD = -absD;
        }
        return (int16_t)absD;
    } else {
        // Unsigned math is much easier
        // We negate the magic number only in the branchfull case, and we don't
        // know which case we're in. However we have enough information to
        // determine the correct sign of the magic number. The divisor was
        // negative if LIBDIVIDE_NEGATIVE_DIVISOR is set. If ADD_MARKER is set,
        // the magic number's sign is opposite that of the divisor.
        // We want to compute the positive magic number.
        int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR);
        int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0;

        // No second magic == 0 test is needed here: this branch is only
        // reached when denom->magic is nonzero.
        uint16_t d = (uint16_t)(magic_was_negated ? -denom->magic : denom->magic);
        uint32_t n = (uint32_t)1 << (16 + shift);  // this shift cannot exceed 30
        uint16_t q = (uint16_t)(n / d);
        int16_t result = (int16_t)q;
        // The quotient is floored, so add 1 to get the divisor back.
        result += 1;
        return negative_divisor ? -result : result;
    }
}

// Recovers the divisor of a branchfree signed 16-bit divider by copying its
// fields into a branchfull struct and reusing libdivide_s16_recover.
static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_recover(const struct libdivide_s16_branchfree_t *denom) {
    const struct libdivide_s16_t branchfull = {denom->magic, denom->more};
    const int16_t divisor = libdivide_s16_recover(&branchfull);
    return divisor;
}

////////// SINT32

// Computes the magic number and `more` byte for dividing int32_t values by
// d. A zero d is reported through LIBDIVIDE_ERROR. When branchfree is
// nonzero, the larger power (with LIBDIVIDE_ADD_MARKER) is always used for
// divisors that are not +/- a power of 2, and the magic number is never
// negated.
static LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_internal_s32_gen(
    int32_t d, int branchfree) {
    if (d == 0) {
        LIBDIVIDE_ERROR("divider must be != 0");
    }

    struct libdivide_s32_t result;

    // If d is a power of 2, or negative a power of 2, we have to use a shift.
    // This is especially important because the magic algorithm fails for -1.
    // To check if d is a power of 2 or its inverse, it suffices to check
    // whether its absolute value has exactly one bit set. This works even for
    // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set
    // and is a power of 2.
    uint32_t ud = (uint32_t)d;
    uint32_t absD = (d < 0) ? -ud : ud;
    uint32_t floor_log_2_d = 31 - libdivide_count_leading_zeros32(absD);
    // check if exactly one bit is set,
    // don't care if absD is 0 since that's divide by zero
    if ((absD & (absD - 1)) == 0) {
        // Branchfree and normal paths are exactly the same
        result.magic = 0;
        result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0));
    } else {
        LIBDIVIDE_ASSERT(floor_log_2_d >= 1);

        uint8_t more;
        // the dividend here is 2**(floor_log_2_d + 31), so the low 32 bit word
        // is 0 and the high word is 2**(floor_log_2_d - 1)
        uint32_t rem, proposed_m;
        proposed_m = libdivide_64_div_32_to_32((uint32_t)1 << (floor_log_2_d - 1), 0, absD, &rem);
        const uint32_t e = absD - rem;

        // We are going to start with a power of floor_log_2_d - 1.
        // This works if e < 2**floor_log_2_d.
        if (!branchfree && e < ((uint32_t)1 << floor_log_2_d)) {
            // This power works
            more = (uint8_t)(floor_log_2_d - 1);
        } else {
            // We need to go one higher. This should not make proposed_m
            // overflow, but it will make it negative when interpreted as an
            // int32_t.
            proposed_m += proposed_m;
            const uint32_t twice_rem = rem + rem;
            if (twice_rem >= absD || twice_rem < rem) proposed_m += 1;
            more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);
        }

        proposed_m += 1;
        int32_t magic = (int32_t)proposed_m;

        // Mark if we are negative. Note we only negate the magic number in the
        // branchfull case.
        if (d < 0) {
            more |= LIBDIVIDE_NEGATIVE_DIVISOR;
            if (!branchfree) {
                magic = -magic;
            }
        }

        result.more = more;
        result.magic = magic;
    }
    return result;
}

// Builds a branchfull signed 32-bit divider for d by delegating to the
// shared generator with the branchfree flag cleared.
static LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_s32_gen(int32_t d) {
    const int branchfree = 0;
    return libdivide_internal_s32_gen(d, branchfree);
}

static LIBDIVIDE_INLINE struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d) {
    struct libdivide_s32_t tmp = libdivide_internal_s32_gen(d, 1);
    struct libdivide_s32_branchfree_t result = {tmp.magic, tmp.more};
    return result;
}

// Divides numer using a divider passed by value (magic, more) rather than
// through a pointer to a libdivide_s32_t.
static LIBDIVIDE_INLINE int32_t libdivide_s32_do_raw(int32_t numer, int32_t magic, uint8_t more) {
    const uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;

    if (magic == 0) {
        // Shift path (zero magic). neg_mask is all ones when the top bit of
        // `more` is set, otherwise 0.
        const uint32_t neg_mask = (int8_t)more >> 7;
        const uint32_t round_mask = ((uint32_t)1 << shift) - 1;
        // Bias negative numerators by 2**shift - 1 before shifting.
        const uint32_t biased = numer + ((numer >> 31) & round_mask);
        int32_t quot = (int32_t)biased;
        quot >>= shift;
        // Conditionally negate: (x ^ -1) - (-1) == -x, (x ^ 0) - 0 == x.
        quot = (quot ^ neg_mask) - neg_mask;
        return quot;
    }

    uint32_t acc = (uint32_t)libdivide_mullhi_s32(numer, magic);
    if (more & LIBDIVIDE_ADD_MARKER) {
        // must be arithmetic shift and then sign extend
        const int32_t neg_mask = (int8_t)more >> 7;
        // acc += (more < 0 ? -numer : numer)
        // cast required to avoid UB
        acc += ((uint32_t)numer ^ neg_mask) - neg_mask;
    }
    int32_t quot = (int32_t)acc;
    quot >>= shift;
    // Add 1 when the shifted value is negative.
    quot += (quot < 0);
    return quot;
}

// Divides numer by the divisor encoded in *denom (branchfull variant).
static LIBDIVIDE_INLINE int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) {
    const int32_t magic = denom->magic;
    const uint8_t more = denom->more;
    return libdivide_s32_do_raw(numer, magic, more);
}

// Divides numer by the divisor encoded in *denom using straight-line code
// (no branches on the divider's contents).
static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_do(int32_t numer, const struct libdivide_s32_branchfree_t *denom) {
    const uint8_t more = denom->more;
    const int32_t magic = denom->magic;
    const uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    // must be arithmetic shift and then sign extend
    const int32_t neg_mask = (int8_t)more >> 7;

    int32_t quot = libdivide_mullhi_s32(numer, magic);
    quot += numer;

    // A non-negative quot needs no correction. A negative quot gets
    // (2**shift)-1 added when d is a power of 2, or (2**shift) when it is
    // not a power of 2.
    const uint32_t pow2 = (magic == 0);
    const uint32_t quot_sign = (uint32_t)(quot >> 31);
    quot += quot_sign & (((uint32_t)1 << shift) - pow2);

    // Arithmetic right shift, then negate if needed.
    quot >>= shift;
    quot = (quot ^ neg_mask) - neg_mask;

    return quot;
}

// Recovers the divisor that a signed 32-bit divider was generated from,
// using the stored magic number and `more` byte.
static LIBDIVIDE_INLINE int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    if (!denom->magic) {
        // Power of 2 case: |d| == 2**shift. The negation is done on the
        // unsigned value before converting back to int32_t.
        uint32_t absD = (uint32_t)1 << shift;
        if (more & LIBDIVIDE_NEGATIVE_DIVISOR) {
            absD = -absD;
        }
        return (int32_t)absD;
    } else {
        // Unsigned math is much easier
        // We negate the magic number only in the branchfull case, and we don't
        // know which case we're in. However we have enough information to
        // determine the correct sign of the magic number. The divisor was
        // negative if LIBDIVIDE_NEGATIVE_DIVISOR is set. If ADD_MARKER is set,
        // the magic number's sign is opposite that of the divisor.
        // We want to compute the positive magic number.
        int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR);
        int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0;

        // No second magic == 0 test is needed here: this branch is only
        // reached when denom->magic is nonzero.
        uint32_t d = (uint32_t)(magic_was_negated ? -denom->magic : denom->magic);
        uint64_t n = (uint64_t)1 << (32 + shift);  // this shift cannot exceed 30
        uint32_t q = (uint32_t)(n / d);
        int32_t result = (int32_t)q;
        // The quotient is floored, so add 1 to get the divisor back.
        result += 1;
        return negative_divisor ? -result : result;
    }
}

// Recovers the divisor of a branchfree signed 32-bit divider by copying its
// fields into a libdivide_s32_t and delegating to libdivide_s32_recover.
static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_recover(const struct libdivide_s32_branchfree_t *denom) {
    struct libdivide_s32_t as_branchfull;
    as_branchfull.magic = denom->magic;
    as_branchfull.more = denom->more;
    return libdivide_s32_recover(&as_branchfull);
}

////////// SINT64

// Builds the (magic, more) pair for dividing signed 64-bit integers by d.
//
// d:          the divisor; LIBDIVIDE_ERROR is invoked when it is 0.
// branchfree: non-zero to generate the encoding used by the branchfree
//             algorithm (always takes the "one higher" power and never
//             negates the magic number), zero for the default encoding.
// Returns the populated libdivide_s64_t.
static LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_internal_s64_gen(
    int64_t d, int branchfree) {
    if (d == 0) {
        LIBDIVIDE_ERROR("divider must be != 0");
    }

    struct libdivide_s64_t result;

    // If d is a power of 2, or negative a power of 2, we have to use a shift.
    // This is especially important because the magic algorithm fails for -1.
    // To check if d is a power of 2 or its inverse, it suffices to check
    // whether its absolute value has exactly one bit set.  This works even for
    // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set
    // and is a power of 2.
    uint64_t ud = (uint64_t)d;
    // Negate in unsigned arithmetic so the most negative value is handled.
    uint64_t absD = (d < 0) ? -ud : ud;
    uint32_t floor_log_2_d = 63 - libdivide_count_leading_zeros64(absD);
    // check if exactly one bit is set,
    // don't care if absD is 0 since that's divide by zero
    if ((absD & (absD - 1)) == 0) {
        // Branchfree and non-branchfree cases are the same
        result.magic = 0;
        result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0));
    } else {
        // the dividend here is 2**(floor_log_2_d + 63), so the low 64 bit word
        // is 0 and the high word is 2**(floor_log_2_d - 1)
        uint8_t more;
        uint64_t rem, proposed_m;
        proposed_m = libdivide_128_div_64_to_64((uint64_t)1 << (floor_log_2_d - 1), 0, absD, &rem);
        const uint64_t e = absD - rem;

        // We are going to start with a power of floor_log_2_d - 1.
        // This works if e < 2**floor_log_2_d.
        if (!branchfree && e < ((uint64_t)1 << floor_log_2_d)) {
            // This power works
            more = (uint8_t)(floor_log_2_d - 1);
        } else {
            // We need to go one higher. This should not make proposed_m
            // overflow, but it will make it negative when interpreted as an
            // int64_t.
            proposed_m += proposed_m;
            const uint64_t twice_rem = rem + rem;
            // Round the doubled quotient up when the doubled remainder reaches
            // absD; `twice_rem < rem` detects wrap-around of the doubling.
            if (twice_rem >= absD || twice_rem < rem) proposed_m += 1;
            // note that we only set the LIBDIVIDE_NEGATIVE_DIVISOR bit if we
            // also set ADD_MARKER this is an annoying optimization that
            // enables algorithm #4 to avoid the mask. However we always set it
            // in the branchfree case
            // NOTE(review): the code below sets LIBDIVIDE_NEGATIVE_DIVISOR
            // whenever d < 0, with or without ADD_MARKER, so the note above
            // looks stale -- confirm.
            more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);
        }
        proposed_m += 1;
        int64_t magic = (int64_t)proposed_m;

        // Mark if we are negative
        if (d < 0) {
            more |= LIBDIVIDE_NEGATIVE_DIVISOR;
            // Only the non-branchfree encoding stores a negated magic number.
            if (!branchfree) {
                magic = -magic;
            }
        }

        result.more = more;
        result.magic = magic;
    }
    return result;
}

static LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_s64_gen(int64_t d) {
    return libdivide_internal_s64_gen(d, 0);
}

static LIBDIVIDE_INLINE struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d) {
    struct libdivide_s64_t tmp = libdivide_internal_s64_gen(d, 1);
    struct libdivide_s64_branchfree_t ret = {tmp.magic, tmp.more};
    return ret;
}

// Divides numer using a raw (magic, more) pair for a signed 64-bit divider.
//
// numer: the dividend.
// magic: the divider's magic number; 0 selects the shift-only path.
// more:  the divider's packed shift amount and flag bits.
// Returns the computed quotient.
static LIBDIVIDE_INLINE int64_t libdivide_s64_do_raw(int64_t numer, int64_t magic, uint8_t more) {
    const uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    // Bit 7 of `more` smeared over the whole word; relies on an arithmetic
    // right shift of the signed 8-bit value.
    const int64_t sign = (int8_t)more >> 7;

    if (magic == 0) {
        // Shift path: add 2**shift - 1 to negative numerators before the
        // arithmetic shift, then conditionally negate.
        const uint64_t low_bits = ((uint64_t)1 << shift) - 1;
        const uint64_t biased = numer + ((numer >> 63) & low_bits);
        const int64_t shifted = (int64_t)biased >> shift;
        return (shifted ^ sign) - sign;
    }

    uint64_t product = (uint64_t)libdivide_mullhi_s64(numer, magic);
    if (more & LIBDIVIDE_ADD_MARKER) {
        // product += (sign ? -numer : numer), done in unsigned arithmetic to
        // avoid signed-overflow UB
        product += ((uint64_t)numer ^ sign) - sign;
    }
    const int64_t shifted = (int64_t)product >> shift;
    // Add one when the shifted value is negative.
    return shifted + (shifted < 0);
}

// Divides numer by the divisor encoded in denom, forwarding the divider's
// fields to libdivide_s64_do_raw.
static LIBDIVIDE_INLINE int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) {
    const int64_t magic = denom->magic;
    const uint8_t more = denom->more;
    return libdivide_s64_do_raw(numer, magic, more);
}

// Divides numer by the divisor encoded in a branchfree signed 64-bit divider.
// The same straight-line sequence is executed for every divider.
//
// numer: the dividend.
// denom: the divider; its `magic` and `more` fields are read.
// Returns the computed quotient.
static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_branchfree_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    // must be arithmetic shift and then sign extend
    // (yields -1 when bit 7 of `more` is set, 0 otherwise)
    int64_t sign = (int8_t)more >> 7;
    int64_t magic = denom->magic;
    int64_t q = libdivide_mullhi_s64(numer, magic);
    q += numer;

    // If q is non-negative, we have nothing to do.
    // If q is negative, we want to add either (2**shift)-1 if d is a power of
    // 2, or (2**shift) if it is not a power of 2.
    uint64_t is_power_of_2 = (magic == 0);
    // All ones when q is negative, zero otherwise.
    uint64_t q_sign = (uint64_t)(q >> 63);
    q += q_sign & (((uint64_t)1 << shift) - is_power_of_2);

    // Arithmetic right shift
    q >>= shift;
    // Negate if needed
    q = (q ^ sign) - sign;

    return q;
}

// Recovers the divisor that a signed 64-bit divider was generated from.
//
// denom: the divider to invert; its `magic` and `more` fields are read.
// Returns the recovered signed 64-bit divisor.
static LIBDIVIDE_INLINE int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom) {
    const uint8_t more = denom->more;
    const uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    const int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR);

    if (denom->magic == 0) {
        // Shift path: magnitude is 2**shift, negated in unsigned arithmetic.
        const uint64_t magnitude = (uint64_t)1 << shift;
        return (int64_t)(negative_divisor ? -magnitude : magnitude);
    }

    // Unsigned math is much easier: first work out whether the stored magic
    // number was negated, then undo that.
    int magic_was_negated;
    if (more & LIBDIVIDE_ADD_MARKER) {
        magic_was_negated = denom->magic > 0;
    } else {
        magic_was_negated = denom->magic < 0;
    }
    const uint64_t d = (uint64_t)(magic_was_negated ? -denom->magic : denom->magic);

    // Divide the 128-bit value whose high word is 2**shift and low word is 0
    // by the positive magic number; the remainder is not needed.
    uint64_t unused_rem;
    const uint64_t q = libdivide_128_div_64_to_64((uint64_t)1 << shift, 0, d, &unused_rem);
    const int64_t magnitude = (int64_t)(q + 1);
    return negative_divisor ? -magnitude : magnitude;
}

// Recovers the divisor of a branchfree signed 64-bit divider by copying its
// fields into a libdivide_s64_t and delegating to libdivide_s64_recover.
static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t *denom) {
    struct libdivide_s64_t as_branchfull;
    as_branchfull.magic = denom->magic;
    as_branchfull.more = denom->more;
    return libdivide_s64_recover(&as_branchfull);
}

// Simplest possible vector type division: treat the vector type as an array
// of underlying native type.
//
// Use a union to read a vector via pointer-to-integer, without violating strict
// aliasing.
//
// The macro expands to a complete function body and ends with a `return`, so
// it must be the only content of the enclosing function. That function must
// have parameters named `numers` (the vector of dividends, of type VecT) and
// `denom` (forwarded unchanged to the scalar routine).
//
//   IntT: scalar element type stored in the vector.
//   VecT: vector type being divided.
//   Algo: suffix selecting the scalar routine libdivide_<Algo>_do that is
//         applied to each element.
#define SIMPLE_VECTOR_DIVISION(IntT, VecT, Algo)                          \
    const size_t count = sizeof(VecT) / sizeof(IntT);                     \
    union type_pun_vec {                                                  \
        VecT vec;                                                         \
        IntT arr[sizeof(VecT) / sizeof(IntT)];                            \
    };                                                                    \
    union type_pun_vec result;                                            \
    union type_pun_vec input;                                             \
    input.vec = numers;                                                   \
    for (size_t loop = 0; loop < count; ++loop) {                         \
        result.arr[loop] = libdivide_##Algo##_do(input.arr[loop], denom); \
    }                                                                     \
    return result.vec;

#if defined(LIBDIVIDE_NEON)

// Forward declarations of the NEON 128-bit vector division entry points.
// Each takes a vector of dividends and a pointer to a pre-generated divider
// and returns a vector of the same element type.

// Dividers of the default struct types.
static LIBDIVIDE_INLINE uint16x8_t libdivide_u16_do_vec128(
    uint16x8_t numers, const struct libdivide_u16_t *denom);
static LIBDIVIDE_INLINE int16x8_t libdivide_s16_do_vec128(
    int16x8_t numers, const struct libdivide_s16_t *denom);
static LIBDIVIDE_INLINE uint32x4_t libdivide_u32_do_vec128(
    uint32x4_t numers, const struct libdivide_u32_t *denom);
static LIBDIVIDE_INLINE int32x4_t libdivide_s32_do_vec128(
    int32x4_t numers, const struct libdivide_s32_t *denom);
static LIBDIVIDE_INLINE uint64x2_t libdivide_u64_do_vec128(
    uint64x2_t numers, const struct libdivide_u64_t *denom);
static LIBDIVIDE_INLINE int64x2_t libdivide_s64_do_vec128(
    int64x2_t numers, const struct libdivide_s64_t *denom);

// Dividers of the branchfree struct types.
static LIBDIVIDE_INLINE uint16x8_t libdivide_u16_branchfree_do_vec128(
    uint16x8_t numers, const struct libdivide_u16_branchfree_t *denom);
static LIBDIVIDE_INLINE int16x8_t libdivide_s16_branchfree_do_vec128(
    int16x8_t numers, const struct libdivide_s16_branchfree_t *denom);
static LIBDIVIDE_INLINE uint32x4_t libdivide_u32_branchfree_do_vec128(
    uint32x4_t numers, const struct libdivide_u32_branchfree_t *denom);
static LIBDIVIDE_INLINE int32x4_t libdivide_s32_branchfree_do_vec128(
    int32x4_t numers, const struct libdivide_s32_branchfree_t *denom);
static LIBDIVIDE_INLINE uint64x2_t libdivide_u64_branchfree_do_vec128(
    uint64x2_t numers, const struct libdivide_u64_branchfree_t *denom);
static LIBDIVIDE_INLINE int64x2_t libdivide_s64_branchfree_do_vec128(
    int64x2_t numers, const struct libdivide_s64_branchfree_t *denom);

//////// Internal Utility Functions

// Logical right shift of every 32-bit lane by a runtime amount.
// NEON implements right shifts as left shifts by negative values.
static LIBDIVIDE_INLINE uint32x4_t libdivide_u32_neon_srl(uint32x4_t v, uint8_t amt) {
    const int32x4_t negated_amounts = vdupq_n_s32(-(int32_t)amt);
    return vshlq_u32(v, negated_amounts);
}

// Logical right shift of every 64-bit lane by a runtime amount, expressed as
// a left shift by the negated amount.
static LIBDIVIDE_INLINE uint64x2_t libdivide_u64_neon_srl(uint64x2_t v, uint8_t amt) {
    const int64x2_t negated_amounts = vdupq_n_s64(-(int64_t)amt);
    return vshlq_u64(v, negated_amounts);
}

// Arithmetic right shift of every signed 32-bit lane by a runtime amount,
// expressed as a left shift by the negated amount.
static LIBDIVIDE_INLINE int32x4_t libdivide_s32_neon_sra(int32x4_t v, uint8_t amt) {
    const int32x4_t negated_amounts = vdupq_n_s32(-(int32_t)amt);
    return vshlq_s32(v, negated_amounts);
}

// Arithmetic right shift of every signed 64-bit lane by a runtime amount,
// expressed as a left shift by the negated amount.
static LIBDIVIDE_INLINE int64x2_t libdivide_s64_neon_sra(int64x2_t v, uint8_t amt) {
    const int64x2_t negated_amounts = vdupq_n_s64(-(int64_t)amt);
    return vshlq_s64(v, negated_amounts);
}

// Shifts every signed 64-bit lane right by 63, leaving each lane filled with
// copies of its sign bit.
static LIBDIVIDE_INLINE int64x2_t libdivide_s64_signbits(int64x2_t v) {
    const int64x2_t sign_fill = vshrq_n_s64(v, 63);
    return sign_fill;
}

// Multiplies each unsigned 32-bit lane of `a` by the scalar `b` and returns
// the upper 32 bits of every 64-bit product.
static LIBDIVIDE_INLINE uint32x4_t libdivide_mullhi_u32_vec128(uint32x4_t a, uint32_t b) {
    // Desire is [x0, x1, x2, x3]
    // Widening multiplies give two 64-bit products per call; viewed as 32-bit
    // lanes the high halves sit in the odd positions.
    uint32x4_t w1 = vreinterpretq_u32_u64(vmull_n_u32(vget_low_u32(a), b));  // [_, x0, _, x1]
    uint32x4_t w2 = vreinterpretq_u32_u64(vmull_high_n_u32(a, b));           // [_, x2, _, x3]
    // Gather the odd-positioned lanes of both vectors.
    return vuzp2q_u32(w1, w2);                                               // [x0, x1, x2, x3]
}

// Multiplies each signed 32-bit lane of `a` by the scalar `b` and returns the
// upper 32 bits of every 64-bit product.
static LIBDIVIDE_INLINE int32x4_t libdivide_mullhi_s32_vec128(int32x4_t a, int32_t b) {
    // Widening multiplies give two 64-bit products per call; viewed as 32-bit
    // lanes the high halves sit in the odd positions.
    int32x4_t w1 = vreinterpretq_s32_s64(vmull_n_s32(vget_low_s32(a), b));  // [_, x0, _, x1]
    int32x4_t w2 = vreinterpretq_s32_s64(vmull_high_n_s32(a, b));           // [_, x2, _, x3]
    // Gather the odd-positioned lanes of both vectors.
    return vuzp2q_s32(w1, w2);                                              // [x0, x1, x2, x3]
}

// Multiplies each unsigned 64-bit lane of `x` by the scalar `sy` and returns
// the upper 64 bits of every 128-bit product, built from 32x32->64 partial
// products.
static LIBDIVIDE_INLINE uint64x2_t libdivide_mullhi_u64_vec128(uint64x2_t x, uint64_t sy) {
    // full 128 bits product is:
    // x0*y0 + (x0*y1 << 32) + (x1*y0 << 32) + (x1*y1 << 64)
    // Note x0,y0,x1,y1 are all conceptually uint32, products are 32x32->64.

    // Get low and high words. x0 contains low 32 bits, x1 is high 32 bits.
    uint64x2_t y = vdupq_n_u64(sy);
    uint32x2_t x0 = vmovn_u64(x);
    uint32x2_t y0 = vmovn_u64(y);
    uint32x2_t x1 = vshrn_n_u64(x, 32);
    uint32x2_t y1 = vshrn_n_u64(y, 32);

    // Compute x0*y0.
    uint64x2_t x0y0 = vmull_u32(x0, y0);
    uint64x2_t x0y0_hi = vshrq_n_u64(x0y0, 32);

    // Compute other intermediate products.
    uint64x2_t temp = vmlal_u32(x0y0_hi, x1, y0);  // temp = x0y0_hi + x1*y0;
    // We want to split temp into its low 32 bits and high 32 bits, both
    // in the low half of 64 bit registers.
    // Use shifts to avoid needing a reg for the mask.
    uint64x2_t temp_lo = vshrq_n_u64(vshlq_n_u64(temp, 32), 32);  // temp_lo = temp & 0xFFFFFFFF;
    uint64x2_t temp_hi = vshrq_n_u64(temp, 32);                   // temp_hi = temp >> 32;

    temp_lo = vmlal_u32(temp_lo, x0, y1);  // temp_lo += x0*y1
    temp_lo = vshrq_n_u64(temp_lo, 32);    // temp_lo >>= 32
    temp_hi = vmlal_u32(temp_hi, x1, y1);  // temp_hi += x1*y1
    uint64x2_t result = vaddq_u64(temp_hi, temp_lo);
    return result;
}

// Signed counterpart of libdivide_mullhi_u64_vec128: takes the unsigned high
// product and subtracts y where x is negative and x where y is negative.
static LIBDIVIDE_INLINE int64x2_t libdivide_mullhi_s64_vec128(int64x2_t x, int64_t sy) {
    const int64x2_t y = vdupq_n_s64(sy);
    const uint64x2_t unsigned_hi =
        libdivide_mullhi_u64_vec128(vreinterpretq_u64_s64(x), (uint64_t)(sy));

    // Each correction term is the other operand masked by this operand's
    // sign bits.
    const int64x2_t y_where_x_negative = vandq_s64(libdivide_s64_signbits(x), y);
    const int64x2_t x_where_y_negative = vandq_s64(libdivide_s64_signbits(y), x);

    int64x2_t hi = vreinterpretq_s64_u64(unsigned_hi);
    hi = vsubq_s64(hi, y_where_x_negative);
    return vsubq_s64(hi, x_where_y_negative);
}

////////// UINT16

// Divides every unsigned 16-bit element of numers by the divisor in denom,
// one element at a time through the scalar libdivide_u16_do.
uint16x8_t libdivide_u16_do_vec128(uint16x8_t numers, const struct libdivide_u16_t *denom) {
    // The macro supplies the whole body, including the return statement.
    SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16)
}

// Divides every unsigned 16-bit element of numers by the divisor in denom,
// one element at a time through the scalar libdivide_u16_branchfree_do.
uint16x8_t libdivide_u16_branchfree_do_vec128(
    uint16x8_t numers, const struct libdivide_u16_branchfree_t *denom) {
    // The macro supplies the whole body, including the return statement.
    SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16_branchfree)
}

////////// UINT32

// Divides every unsigned 32-bit lane of numers by the divisor encoded in
// denom and returns the vector of quotients.
uint32x4_t libdivide_u32_do_vec128(uint32x4_t numers, const struct libdivide_u32_t *denom) {
    const uint8_t more = denom->more;

    // A zero magic number selects the shift-only path.
    if (denom->magic == 0) {
        return libdivide_u32_neon_srl(numers, more);
    }

    const uint32x4_t q = libdivide_mullhi_u32_vec128(numers, denom->magic);
    if ((more & LIBDIVIDE_ADD_MARKER) == 0) {
        return libdivide_u32_neon_srl(q, more);
    }

    // t = ((numers - q) >> 1) + q, then t >> shift.
    // The halving subtract performs the subtraction and the >> 1 together.
    const uint32x4_t halved_diff = vhsubq_u32(numers, q);
    const uint32x4_t t = vaddq_u32(halved_diff, q);
    return libdivide_u32_neon_srl(t, more & LIBDIVIDE_32_SHIFT_MASK);
}

// Branchfree division of every unsigned 32-bit lane of numers by the divisor
// encoded in denom: q = mullhi(numers, magic);
// result = (((numers - q) >> 1) + q) >> more.
uint32x4_t libdivide_u32_branchfree_do_vec128(
    uint32x4_t numers, const struct libdivide_u32_branchfree_t *denom) {
    const uint32x4_t q = libdivide_mullhi_u32_vec128(numers, denom->magic);
    // The halving subtract performs the subtraction and the >> 1 together.
    const uint32x4_t halved_diff = vhsubq_u32(numers, q);
    const uint32x4_t sum = vaddq_u32(halved_diff, q);
    return libdivide_u32_neon_srl(sum, denom->more);
}

////////// UINT64

// Divides every unsigned 64-bit lane of numers by the divisor encoded in
// denom and returns the vector of quotients.
uint64x2_t libdivide_u64_do_vec128(uint64x2_t numers, const struct libdivide_u64_t *denom) {
    const uint8_t more = denom->more;

    // A zero magic number selects the shift-only path.
    if (denom->magic == 0) {
        return libdivide_u64_neon_srl(numers, more);
    }

    const uint64x2_t q = libdivide_mullhi_u64_vec128(numers, denom->magic);
    if ((more & LIBDIVIDE_ADD_MARKER) == 0) {
        return libdivide_u64_neon_srl(q, more);
    }

    // t = ((numers - q) >> 1) + q, then t >> shift.
    // No 64-bit halving subtracts in NEON, so subtract and shift separately.
    const uint64x2_t halved_diff = vshrq_n_u64(vsubq_u64(numers, q), 1);
    const uint64x2_t t = vaddq_u64(halved_diff, q);
    return libdivide_u64_neon_srl(t, more & LIBDIVIDE_64_SHIFT_MASK);
}

// Branchfree division of every unsigned 64-bit lane of numers by the divisor
// encoded in denom: q = mullhi(numers, magic);
// result = (((numers - q) >> 1) + q) >> more.
uint64x2_t libdivide_u64_branchfree_do_vec128(
    uint64x2_t numers, const struct libdivide_u64_branchfree_t *denom) {
    const uint64x2_t q = libdivide_mullhi_u64_vec128(numers, denom->magic);
    const uint64x2_t halved_diff = vshrq_n_u64(vsubq_u64(numers, q), 1);
    const uint64x2_t sum = vaddq_u64(halved_diff, q);
    return libdivide_u64_neon_srl(sum, denom->more);
}

////////// SINT16

// Divides every signed 16-bit element of numers by the divisor in denom, one
// element at a time through the scalar libdivide_s16_do.
int16x8_t libdivide_s16_do_vec128(int16x8_t numers, const struct libdivide_s16_t *denom) {
    // The macro supplies the whole body, including the return statement.
    SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16)
}

// Divides every signed 16-bit element of numers by the divisor in denom, one
// element at a time through the scalar libdivide_s16_branchfree_do.
int16x8_t libdivide_s16_branchfree_do_vec128(
    int16x8_t numers, const struct libdivide_s16_branchfree_t *denom) {
    // The macro supplies the whole body, including the return statement.
    SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16_branchfree)
}

////////// SINT32

// Divides every signed 32-bit lane of numers by the divisor encoded in denom
// and returns the vector of quotients.
int32x4_t libdivide_s32_do_vec128(int32x4_t numers, const struct libdivide_s32_t *denom) {
    const uint8_t more = denom->more;
    const uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;

    if (denom->magic == 0) {
        // Shift path: q = numers + ((numers >> 31) & (2**shift - 1)),
        // arithmetic shift, then conditional negate.
        const uint32_t low_bits = ((uint32_t)1 << shift) - 1;
        const int32x4_t round_to_zero_tweak = vdupq_n_s32((int)low_bits);
        const int32x4_t numer_signs = vshrq_n_s32(numers, 31);
        const int32x4_t sign = vdupq_n_s32((int8_t)more >> 7);

        int32x4_t q = vaddq_s32(numers, vandq_s32(numer_signs, round_to_zero_tweak));
        q = libdivide_s32_neon_sra(q, shift);
        // (q ^ sign) - sign
        return vsubq_s32(veorq_s32(q, sign), sign);
    }

    int32x4_t q = libdivide_mullhi_s32_vec128(numers, denom->magic);
    if (more & LIBDIVIDE_ADD_MARKER) {
        // q += ((numers ^ sign) - sign); sign must come from an arithmetic
        // shift
        const int32x4_t sign = vdupq_n_s32((int8_t)more >> 7);
        const int32x4_t signed_numers = vsubq_s32(veorq_s32(numers, sign), sign);
        q = vaddq_s32(q, signed_numers);
    }
    q = libdivide_s32_neon_sra(q, shift);

    // q += (q < 0): the logical shift isolates each lane's top bit as 0 or 1.
    const uint32x4_t top_bits = vshrq_n_u32(vreinterpretq_u32_s32(q), 31);
    return vaddq_s32(q, vreinterpretq_s32_u32(top_bits));
}

// Branchfree division of every signed 32-bit lane of numers by the divisor
// encoded in denom. The same straight-line sequence is executed for every
// divider.
int32x4_t libdivide_s32_branchfree_do_vec128(
    int32x4_t numers, const struct libdivide_s32_branchfree_t *denom) {
    int32_t magic = denom->magic;
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    // must be arithmetic shift
    // (broadcasts -1 when bit 7 of `more` is set, 0 otherwise)
    int32x4_t sign = vdupq_n_s32((int8_t)more >> 7);
    int32x4_t q = libdivide_mullhi_s32_vec128(numers, magic);
    q = vaddq_s32(q, numers);  // q += numers

    // If q is non-negative, we have nothing to do
    // If q is negative, we want to add either (2**shift)-1 if d is
    // a power of 2, or (2**shift) if it is not a power of 2
    uint32_t is_power_of_2 = (magic == 0);
    int32x4_t q_sign = vshrq_n_s32(q, 31);  // q_sign = q >> 31
    int32x4_t mask = vdupq_n_s32(((uint32_t)1 << shift) - is_power_of_2);
    q = vaddq_s32(q, vandq_s32(q_sign, mask));  // q = q + (q_sign & mask)
    q = libdivide_s32_neon_sra(q, shift);       // q >>= shift
    q = vsubq_s32(veorq_s32(q, sign), sign);    // q = (q ^ sign) - sign
    return q;
}

////////// SINT64

// Divides every signed 64-bit lane of numers by the divisor encoded in denom
// and returns the vector of quotients.
int64x2_t libdivide_s64_do_vec128(int64x2_t numers, const struct libdivide_s64_t *denom) {
    uint8_t more = denom->more;
    int64_t magic = denom->magic;
    if (magic == 0) {  // shift path
        uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
        uint64_t mask = ((uint64_t)1 << shift) - 1;
        int64x2_t roundToZeroTweak = vdupq_n_s64(mask);  // TODO: no need to sign extend
        // q = numer + ((numer >> 63) & roundToZeroTweak);
        int64x2_t q =
            vaddq_s64(numers, vandq_s64(libdivide_s64_signbits(numers), roundToZeroTweak));
        q = libdivide_s64_neon_sra(q, shift);
        // q = (q ^ sign) - sign;
        // The 0 / -1 value is broadcast to every byte and the vector is then
        // reinterpreted as 64-bit lanes, so each lane is all zeros or all ones.
        int64x2_t sign = vreinterpretq_s64_s8(vdupq_n_s8((int8_t)more >> 7));
        q = vsubq_s64(veorq_s64(q, sign), sign);
        return q;
    } else {
        int64x2_t q = libdivide_mullhi_s64_vec128(numers, magic);
        if (more & LIBDIVIDE_ADD_MARKER) {
            // must be arithmetic shift
            int64x2_t sign = vdupq_n_s64((int8_t)more >> 7);  // TODO: no need to widen
            // q += ((numer ^ sign) - sign);
            q = vaddq_s64(q, vsubq_s64(veorq_s64(numers, sign), sign));
        }
        // q >>= denom->mult_path.shift
        q = libdivide_s64_neon_sra(q, more & LIBDIVIDE_64_SHIFT_MASK);
        // The logical shift by 63 isolates each lane's top bit as 0 or 1.
        q = vaddq_s64(
            q, vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(q), 63)));  // q += (q < 0)
        return q;
    }
}

// Branchfree division of every signed 64-bit lane of numers by the divisor
// encoded in denom. The same straight-line sequence is executed for every
// divider.
int64x2_t libdivide_s64_branchfree_do_vec128(
    int64x2_t numers, const struct libdivide_s64_branchfree_t *denom) {
    int64_t magic = denom->magic;
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    // must be arithmetic shift
    // (broadcasts -1 when bit 7 of `more` is set, 0 otherwise)
    int64x2_t sign = vdupq_n_s64((int8_t)more >> 7);  // TODO: avoid sign extend

    // libdivide_mullhi_s64(numers, magic);
    int64x2_t q = libdivide_mullhi_s64_vec128(numers, magic);
    q = vaddq_s64(q, numers);  // q += numers

    // If q is non-negative, we have nothing to do.
    // If q is negative, we want to add either (2**shift)-1 if d is
    // a power of 2, or (2**shift) if it is not a power of 2.
    uint32_t is_power_of_2 = (magic == 0);
    int64x2_t q_sign = libdivide_s64_signbits(q);  // q_sign = q >> 63
    int64x2_t mask = vdupq_n_s64(((uint64_t)1 << shift) - is_power_of_2);
    q = vaddq_s64(q, vandq_s64(q_sign, mask));  // q = q + (q_sign & mask)
    q = libdivide_s64_neon_sra(q, shift);       // q >>= shift
    q = vsubq_s64(veorq_s64(q, sign), sign);    // q = (q ^ sign) - sign
    return q;
}

#endif

#if defined(LIBDIVIDE_AVX512)

// Forward declarations of the AVX-512 512-bit vector division entry points.
// Each takes a vector of dividends and a pointer to a pre-generated divider
// and returns a __m512i.

// Dividers of the default struct types.
static LIBDIVIDE_INLINE __m512i libdivide_u16_do_vec512(
    __m512i numers, const struct libdivide_u16_t *denom);
static LIBDIVIDE_INLINE __m512i libdivide_s16_do_vec512(
    __m512i numers, const struct libdivide_s16_t *denom);
static LIBDIVIDE_INLINE __m512i libdivide_u32_do_vec512(
    __m512i numers, const struct libdivide_u32_t *denom);
static LIBDIVIDE_INLINE __m512i libdivide_s32_do_vec512(
    __m512i numers, const struct libdivide_s32_t *denom);
static LIBDIVIDE_INLINE __m512i libdivide_u64_do_vec512(
    __m512i numers, const struct libdivide_u64_t *denom);
static LIBDIVIDE_INLINE __m512i libdivide_s64_do_vec512(
    __m512i numers, const struct libdivide_s64_t *denom);

// Dividers of the branchfree struct types.
static LIBDIVIDE_INLINE __m512i libdivide_u16_branchfree_do_vec512(
    __m512i numers, const struct libdivide_u16_branchfree_t *denom);
static LIBDIVIDE_INLINE __m512i libdivide_s16_branchfree_do_vec512(
    __m512i numers, const struct libdivide_s16_branchfree_t *denom);
static LIBDIVIDE_INLINE __m512i libdivide_u32_branchfree_do_vec512(
    __m512i numers, const struct libdivide_u32_branchfree_t *denom);
static LIBDIVIDE_INLINE __m512i libdivide_s32_branchfree_do_vec512(
    __m512i numers, const struct libdivide_s32_branchfree_t *denom);
static LIBDIVIDE_INLINE __m512i libdivide_u64_branchfree_do_vec512(
    __m512i numers, const struct libdivide_u64_branchfree_t *denom);
static LIBDIVIDE_INLINE __m512i libdivide_s64_branchfree_do_vec512(
    __m512i numers, const struct libdivide_s64_branchfree_t *denom);

//////// Internal Utility Functions

// Arithmetic right shift of every 64-bit lane of v by 63, leaving each lane
// filled with copies of its sign bit.
static LIBDIVIDE_INLINE __m512i libdivide_s64_signbits_vec512(__m512i v) {
    return _mm512_srai_epi64(v, 63);
}

// Arithmetic right shift of every 64-bit lane of v by amt bits.
static LIBDIVIDE_INLINE __m512i libdivide_s64_shift_right_vec512(__m512i v, int amt) {
    const __m512i shifted = _mm512_srai_epi64(v, amt);
    return shifted;
}

// Returns the upper 32 bits of the unsigned product a*b for every 32-bit
// element.
// Here, b is assumed to contain one 32-bit value repeated.
static LIBDIVIDE_INLINE __m512i libdivide_mullhi_u32_vec512(__m512i a, __m512i b) {
    // Even elements: the multiply produces 64-bit products; shift each
    // product's high half down into the even position.
    __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epu32(a, b), 32);
    // Odd elements: move them down into the even positions, multiply, and
    // keep only the high half of each product (already in the odd position).
    __m512i a1X3X = _mm512_srli_epi64(a, 32);
    __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0);
    __m512i hi_product_Z1Z3 = _mm512_and_si512(_mm512_mul_epu32(a1X3X, b), mask);
    // Interleave the two sets of high halves.
    return _mm512_or_si512(hi_product_0Z2Z, hi_product_Z1Z3);
}

// Returns the upper 32 bits of the signed product a*b for every 32-bit
// element.
// b is one 32-bit value repeated.
static LIBDIVIDE_INLINE __m512i libdivide_mullhi_s32_vec512(__m512i a, __m512i b) {
    // Even elements: the multiply produces 64-bit products; shift each
    // product's high half down into the even position.
    __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epi32(a, b), 32);
    // Odd elements: move them down into the even positions, multiply, and
    // keep only the high half of each product (already in the odd position).
    __m512i a1X3X = _mm512_srli_epi64(a, 32);
    __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0);
    __m512i hi_product_Z1Z3 = _mm512_and_si512(_mm512_mul_epi32(a1X3X, b), mask);
    // Interleave the two sets of high halves.
    return _mm512_or_si512(hi_product_0Z2Z, hi_product_Z1Z3);
}

// Returns the upper 64 bits of the unsigned 128-bit product x*y for every
// 64-bit lane, assembled from 32x32->64 partial products.
// Here, y is assumed to contain one 64-bit value repeated.
static LIBDIVIDE_INLINE __m512i libdivide_mullhi_u64_vec512(__m512i x, __m512i y) {
    // see m128i variant for comments.
    // x0/y0 denote the low 32 bits of each lane, x1/y1 the high 32 bits.
    __m512i x0y0 = _mm512_mul_epu32(x, y);
    __m512i x0y0_hi = _mm512_srli_epi64(x0y0, 32);

    // Duplicate each lane's high 32 bits into its low half so the 32-bit
    // multiply can read them.
    __m512i x1 = _mm512_shuffle_epi32(x, (_MM_PERM_ENUM)_MM_SHUFFLE(3, 3, 1, 1));
    __m512i y1 = _mm512_shuffle_epi32(y, (_MM_PERM_ENUM)_MM_SHUFFLE(3, 3, 1, 1));

    __m512i x0y1 = _mm512_mul_epu32(x, y1);
    __m512i x1y0 = _mm512_mul_epu32(x1, y);
    __m512i x1y1 = _mm512_mul_epu32(x1, y1);

    // temp = x1*y0 + (x0*y0 >> 32), split into its low and high 32 bits.
    __m512i mask = _mm512_set1_epi64(0xFFFFFFFF);
    __m512i temp = _mm512_add_epi64(x1y0, x0y0_hi);
    __m512i temp_lo = _mm512_and_si512(temp, mask);
    __m512i temp_hi = _mm512_srli_epi64(temp, 32);

    // temp_lo = (temp_lo + x0*y1) >> 32; temp_hi += x1*y1
    temp_lo = _mm512_srli_epi64(_mm512_add_epi64(temp_lo, x0y1), 32);
    temp_hi = _mm512_add_epi64(x1y1, temp_hi);
    return _mm512_add_epi64(temp_lo, temp_hi);
}

// Signed counterpart of libdivide_mullhi_u64_vec512: takes the unsigned high
// product and subtracts y where x is negative and x where y is negative.
// y is one 64-bit value repeated.
static LIBDIVIDE_INLINE __m512i libdivide_mullhi_s64_vec512(__m512i x, __m512i y) {
    const __m512i unsigned_hi = libdivide_mullhi_u64_vec512(x, y);

    // Each correction term is the other operand masked by this operand's
    // sign bits.
    const __m512i y_where_x_negative = _mm512_and_si512(libdivide_s64_signbits_vec512(x), y);
    const __m512i x_where_y_negative = _mm512_and_si512(libdivide_s64_signbits_vec512(y), x);

    const __m512i after_first = _mm512_sub_epi64(unsigned_hi, y_where_x_negative);
    return _mm512_sub_epi64(after_first, x_where_y_negative);
}

////////// UINT16

// Divides every unsigned 16-bit element of numers by the divisor in denom,
// one element at a time through the scalar libdivide_u16_do.
__m512i libdivide_u16_do_vec512(__m512i numers, const struct libdivide_u16_t *denom) {
    // The macro supplies the whole body, including the return statement.
    SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16)
}

// Divides every unsigned 16-bit element of numers by the divisor in denom,
// one element at a time through the scalar libdivide_u16_branchfree_do.
__m512i libdivide_u16_branchfree_do_vec512(
    __m512i numers, const struct libdivide_u16_branchfree_t *denom) {
    // The macro supplies the whole body, including the return statement.
    SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16_branchfree)
}

////////// UINT32

// Divides every unsigned 32-bit element of numers by the divisor encoded in
// denom and returns the vector of quotients.
__m512i libdivide_u32_do_vec512(__m512i numers, const struct libdivide_u32_t *denom) {
    const uint8_t more = denom->more;

    // A zero magic number selects the shift-only path.
    if (denom->magic == 0) {
        return _mm512_srli_epi32(numers, more);
    }

    const __m512i q = libdivide_mullhi_u32_vec512(numers, _mm512_set1_epi32(denom->magic));
    if ((more & LIBDIVIDE_ADD_MARKER) == 0) {
        return _mm512_srli_epi32(q, more);
    }

    // t = ((numers - q) >> 1) + q, then t >> shift.
    const uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    const __m512i halved_diff = _mm512_srli_epi32(_mm512_sub_epi32(numers, q), 1);
    const __m512i t = _mm512_add_epi32(halved_diff, q);
    return _mm512_srli_epi32(t, shift);
}

// Branchfree division of every unsigned 32-bit element of numers by the
// divisor encoded in denom: q = mullhi(numers, magic);
// result = (((numers - q) >> 1) + q) >> more.
__m512i libdivide_u32_branchfree_do_vec512(
    __m512i numers, const struct libdivide_u32_branchfree_t *denom) {
    const __m512i magic = _mm512_set1_epi32(denom->magic);
    const __m512i q = libdivide_mullhi_u32_vec512(numers, magic);
    const __m512i halved_diff = _mm512_srli_epi32(_mm512_sub_epi32(numers, q), 1);
    const __m512i sum = _mm512_add_epi32(halved_diff, q);
    return _mm512_srli_epi32(sum, denom->more);
}

////////// UINT64

// Divides every unsigned 64-bit element of numers by the divisor encoded in
// denom and returns the vector of quotients.
__m512i libdivide_u64_do_vec512(__m512i numers, const struct libdivide_u64_t *denom) {
    const uint8_t more = denom->more;

    // A zero magic number selects the shift-only path.
    if (denom->magic == 0) {
        return _mm512_srli_epi64(numers, more);
    }

    const __m512i q = libdivide_mullhi_u64_vec512(numers, _mm512_set1_epi64(denom->magic));
    if ((more & LIBDIVIDE_ADD_MARKER) == 0) {
        return _mm512_srli_epi64(q, more);
    }

    // t = ((numers - q) >> 1) + q, then t >> shift.
    const uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    const __m512i halved_diff = _mm512_srli_epi64(_mm512_sub_epi64(numers, q), 1);
    const __m512i t = _mm512_add_epi64(halved_diff, q);
    return _mm512_srli_epi64(t, shift);
}

// Branchfree division of every unsigned 64-bit element of numers by the
// divisor encoded in denom: q = mullhi(numers, magic);
// result = (((numers - q) >> 1) + q) >> more.
__m512i libdivide_u64_branchfree_do_vec512(
    __m512i numers, const struct libdivide_u64_branchfree_t *denom) {
    const __m512i magic = _mm512_set1_epi64(denom->magic);
    const __m512i q = libdivide_mullhi_u64_vec512(numers, magic);
    const __m512i halved_diff = _mm512_srli_epi64(_mm512_sub_epi64(numers, q), 1);
    const __m512i sum = _mm512_add_epi64(halved_diff, q);
    return _mm512_srli_epi64(sum, denom->more);
}

////////// SINT16

// Divides every signed 16-bit element of numers by the divisor in denom, one
// element at a time through the scalar libdivide_s16_do.
__m512i libdivide_s16_do_vec512(__m512i numers, const struct libdivide_s16_t *denom) {
    // The macro supplies the whole body, including the return statement.
    SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16)
}

// Divides every signed 16-bit element of numers by the divisor in denom, one
// element at a time through the scalar libdivide_s16_branchfree_do.
__m512i libdivide_s16_branchfree_do_vec512(
    __m512i numers, const struct libdivide_s16_branchfree_t *denom) {
    // The macro supplies the whole body, including the return statement.
    SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16_branchfree)
}

////////// SINT32

// Divides every signed 32-bit element of numers by the divisor encoded in
// denom and returns the vector of quotients.
__m512i libdivide_s32_do_vec512(__m512i numers, const struct libdivide_s32_t *denom) {
    const uint8_t more = denom->more;
    const uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;

    if (denom->magic == 0) {
        // Shift path: q = numers + ((numers >> 31) & (2**shift - 1)),
        // arithmetic shift, then conditional negate.
        const uint32_t low_bits = ((uint32_t)1 << shift) - 1;
        const __m512i round_to_zero_tweak = _mm512_set1_epi32(low_bits);
        const __m512i numer_signs = _mm512_srai_epi32(numers, 31);
        const __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);

        __m512i q = _mm512_add_epi32(numers, _mm512_and_si512(numer_signs, round_to_zero_tweak));
        q = _mm512_srai_epi32(q, shift);
        // (q ^ sign) - sign
        return _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign);
    }

    __m512i q = libdivide_mullhi_s32_vec512(numers, _mm512_set1_epi32(denom->magic));
    if (more & LIBDIVIDE_ADD_MARKER) {
        // q += ((numers ^ sign) - sign); sign must come from an arithmetic
        // shift
        const __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
        const __m512i signed_numers = _mm512_sub_epi32(_mm512_xor_si512(numers, sign), sign);
        q = _mm512_add_epi32(q, signed_numers);
    }
    q = _mm512_srai_epi32(q, shift);

    // q += (q < 0): the logical shift isolates each element's top bit.
    const __m512i top_bits = _mm512_srli_epi32(q, 31);
    return _mm512_add_epi32(q, top_bits);
}

// Branchfree division of every signed 32-bit element of numers by the divisor
// encoded in denom. The same straight-line sequence is executed for every
// divider.
__m512i libdivide_s32_branchfree_do_vec512(
    __m512i numers, const struct libdivide_s32_branchfree_t *denom) {
    int32_t magic = denom->magic;
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    // must be arithmetic shift
    // (broadcasts -1 when bit 7 of `more` is set, 0 otherwise)
    __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
    __m512i q = libdivide_mullhi_s32_vec512(numers, _mm512_set1_epi32(magic));
    q = _mm512_add_epi32(q, numers);  // q += numers

    // If q is non-negative, we have nothing to do
    // If q is negative, we want to add either (2**shift)-1 if d is
    // a power of 2, or (2**shift) if it is not a power of 2
    uint32_t is_power_of_2 = (magic == 0);
    __m512i q_sign = _mm512_srai_epi32(q, 31);  // q_sign = q >> 31
    __m512i mask = _mm512_set1_epi32(((uint32_t)1 << shift) - is_power_of_2);
    q = _mm512_add_epi32(q, _mm512_and_si512(q_sign, mask));  // q = q + (q_sign & mask)
    q = _mm512_srai_epi32(q, shift);                          // q >>= shift
    q = _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign);    // q = (q ^ sign) - sign
    return q;
}

////////// SINT64

// Divides every signed 64-bit element of numers by the divisor encoded in
// denom and returns the vector of quotients.
__m512i libdivide_s64_do_vec512(__m512i numers, const struct libdivide_s64_t *denom) {
    uint8_t more = denom->more;
    int64_t magic = denom->magic;
    if (magic == 0) {  // shift path
        uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
        uint64_t mask = ((uint64_t)1 << shift) - 1;
        __m512i roundToZeroTweak = _mm512_set1_epi64(mask);
        // q = numer + ((numer >> 63) & roundToZeroTweak);
        __m512i q = _mm512_add_epi64(
            numers, _mm512_and_si512(libdivide_s64_signbits_vec512(numers), roundToZeroTweak));
        q = libdivide_s64_shift_right_vec512(q, shift);
        // The value is 0 or -1, so broadcasting it as 32-bit elements also
        // fills every 64-bit lane with all zeros or all ones.
        __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
        // q = (q ^ sign) - sign;
        q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign);
        return q;
    } else {
        __m512i q = libdivide_mullhi_s64_vec512(numers, _mm512_set1_epi64(magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            // must be arithmetic shift
            __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
            // q += ((numer ^ sign) - sign);
            q = _mm512_add_epi64(q, _mm512_sub_epi64(_mm512_xor_si512(numers, sign), sign));
        }
        // q >>= denom->mult_path.shift
        q = libdivide_s64_shift_right_vec512(q, more & LIBDIVIDE_64_SHIFT_MASK);
        q = _mm512_add_epi64(q, _mm512_srli_epi64(q, 63));  // q += (q < 0)
        return q;
    }
}

// Branchfree division of every signed 64-bit element of numers by the divisor
// encoded in denom. The same straight-line sequence is executed for every
// divider.
__m512i libdivide_s64_branchfree_do_vec512(
    __m512i numers, const struct libdivide_s64_branchfree_t *denom) {
    int64_t magic = denom->magic;
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    // must be arithmetic shift
    // (the value is 0 or -1, so broadcasting it as 32-bit elements also fills
    // every 64-bit lane with all zeros or all ones)
    __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);

    // libdivide_mullhi_s64(numers, magic);
    __m512i q = libdivide_mullhi_s64_vec512(numers, _mm512_set1_epi64(magic));
    q = _mm512_add_epi64(q, numers);  // q += numers

    // If q is non-negative, we have nothing to do.
    // If q is negative, we want to add either (2**shift)-1 if d is
    // a power of 2, or (2**shift) if it is not a power of 2.
    uint32_t is_power_of_2 = (magic == 0);
    __m512i q_sign = libdivide_s64_signbits_vec512(q);  // q_sign = q >> 63
    __m512i mask = _mm512_set1_epi64(((uint64_t)1 << shift) - is_power_of_2);
    q = _mm512_add_epi64(q, _mm512_and_si512(q_sign, mask));  // q = q + (q_sign & mask)
    q = libdivide_s64_shift_right_vec512(q, shift);           // q >>= shift
    q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign);    // q = (q ^ sign) - sign
    return q;
}

#endif

#if defined(LIBDIVIDE_AVX2)

// Forward declarations of the AVX2 (256-bit) division entry points. The
// static LIBDIVIDE_INLINE specifiers are stated here; the definitions further
// down repeat only the signature.

// Branchfull variants: one per element width (16/32/64) and signedness.
static LIBDIVIDE_INLINE __m256i libdivide_u16_do_vec256(
    __m256i numers, const struct libdivide_u16_t *denom);
static LIBDIVIDE_INLINE __m256i libdivide_s16_do_vec256(
    __m256i numers, const struct libdivide_s16_t *denom);
static LIBDIVIDE_INLINE __m256i libdivide_u32_do_vec256(
    __m256i numers, const struct libdivide_u32_t *denom);
static LIBDIVIDE_INLINE __m256i libdivide_s32_do_vec256(
    __m256i numers, const struct libdivide_s32_t *denom);
static LIBDIVIDE_INLINE __m256i libdivide_u64_do_vec256(
    __m256i numers, const struct libdivide_u64_t *denom);
static LIBDIVIDE_INLINE __m256i libdivide_s64_do_vec256(
    __m256i numers, const struct libdivide_s64_t *denom);

// Branchfree variants: same shape, but taking the *_branchfree_t divider.
static LIBDIVIDE_INLINE __m256i libdivide_u16_branchfree_do_vec256(
    __m256i numers, const struct libdivide_u16_branchfree_t *denom);
static LIBDIVIDE_INLINE __m256i libdivide_s16_branchfree_do_vec256(
    __m256i numers, const struct libdivide_s16_branchfree_t *denom);
static LIBDIVIDE_INLINE __m256i libdivide_u32_branchfree_do_vec256(
    __m256i numers, const struct libdivide_u32_branchfree_t *denom);
static LIBDIVIDE_INLINE __m256i libdivide_s32_branchfree_do_vec256(
    __m256i numers, const struct libdivide_s32_branchfree_t *denom);
static LIBDIVIDE_INLINE __m256i libdivide_u64_branchfree_do_vec256(
    __m256i numers, const struct libdivide_u64_branchfree_t *denom);
static LIBDIVIDE_INLINE __m256i libdivide_s64_branchfree_do_vec256(
    __m256i numers, const struct libdivide_s64_branchfree_t *denom);

//////// Internal Utility Functions

// Implementation of _mm256_srai_epi64(v, 63) (from AVX512).
// The upper 32 bits of every 64-bit lane are duplicated into both halves of
// that lane, then each 32-bit element is arithmetically shifted right by 31.
static LIBDIVIDE_INLINE __m256i libdivide_s64_signbits_vec256(__m256i v) {
    return _mm256_srai_epi32(_mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1)), 31);
}

// Implementation of _mm256_srai_epi64 (from AVX512): arithmetic right shift
// of each 64-bit lane by amt, built from a logical shift plus sign extension.
static LIBDIVIDE_INLINE __m256i libdivide_s64_shift_right_vec256(__m256i v, int amt) {
    // After the logical shift the original sign bit sits at bit (64 - amt) - 1.
    const int sign_pos = (64 - amt) - 1;
    const __m256i sign_bit = _mm256_set1_epi64x((uint64_t)1 << sign_pos);
    const __m256i logical = _mm256_srli_epi64(v, amt);
    // (x ^ m) - m sign-extends x from the bit selected by m.
    return _mm256_sub_epi64(_mm256_xor_si256(logical, sign_bit), sign_bit);
}

// High 32 bits of the unsigned product a * b for every 32-bit element.
// Here, b is assumed to contain one 32-bit value repeated.
static LIBDIVIDE_INLINE __m256i libdivide_mullhi_u32_vec256(__m256i a, __m256i b) {
    // Elements 0 and 2 of each 128-bit half: shift the 64-bit product's high
    // half down into the even slot.
    const __m256i even_product = _mm256_mul_epu32(a, b);
    const __m256i even_hi = _mm256_srli_epi64(even_product, 32);

    // Elements 1 and 3: move them into the even slots, multiply, and keep the
    // high half of the product where it already is (the odd slot).
    const __m256i a_odd = _mm256_srli_epi64(a, 32);
    const __m256i odd_product = _mm256_mul_epu32(a_odd, b);
    const __m256i odd_slots = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0);
    const __m256i odd_hi = _mm256_and_si256(odd_product, odd_slots);

    return _mm256_or_si256(even_hi, odd_hi);
}

// High 32 bits of the signed product a * b for every 32-bit element.
// b is one 32-bit value repeated.
static LIBDIVIDE_INLINE __m256i libdivide_mullhi_s32_vec256(__m256i a, __m256i b) {
    // Even elements: the high half of each 64-bit product is shifted down.
    const __m256i even_product = _mm256_mul_epi32(a, b);
    const __m256i even_hi = _mm256_srli_epi64(even_product, 32);

    // Odd elements: shift them into the even slots, multiply, and keep the
    // high half of the product in the odd slot.
    const __m256i a_odd = _mm256_srli_epi64(a, 32);
    const __m256i odd_product = _mm256_mul_epi32(a_odd, b);
    const __m256i odd_slots = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0);
    const __m256i odd_hi = _mm256_and_si256(odd_product, odd_slots);

    return _mm256_or_si256(even_hi, odd_hi);
}

// Returns, for each 64-bit lane, the upper 64 bits of the unsigned 128-bit
// product x * y, assembled from four 32x32->64 partial products.
// Here, y is assumed to contain one 64-bit value repeated.
static LIBDIVIDE_INLINE __m256i libdivide_mullhi_u64_vec256(__m256i x, __m256i y) {
    // see m128i variant for comments.
    // Low halves multiplied; only the upper 32 bits of x0*y0 matter below.
    __m256i x0y0 = _mm256_mul_epu32(x, y);
    __m256i x0y0_hi = _mm256_srli_epi64(x0y0, 32);

    // Bring the high 32-bit halves (x1, y1) down into the low half of each lane.
    __m256i x1 = _mm256_shuffle_epi32(x, _MM_SHUFFLE(3, 3, 1, 1));
    __m256i y1 = _mm256_shuffle_epi32(y, _MM_SHUFFLE(3, 3, 1, 1));

    // Remaining three partial products.
    __m256i x0y1 = _mm256_mul_epu32(x, y1);
    __m256i x1y0 = _mm256_mul_epu32(x1, y);
    __m256i x1y1 = _mm256_mul_epu32(x1, y1);

    // Sum the middle terms in two steps, splitting the first sum into low and
    // high 32-bit parts so that neither 64-bit addition can overflow.
    __m256i mask = _mm256_set1_epi64x(0xFFFFFFFF);
    __m256i temp = _mm256_add_epi64(x1y0, x0y0_hi);
    __m256i temp_lo = _mm256_and_si256(temp, mask);
    __m256i temp_hi = _mm256_srli_epi64(temp, 32);

    // The carries out of the middle terms are added onto x1*y1.
    temp_lo = _mm256_srli_epi64(_mm256_add_epi64(temp_lo, x0y1), 32);
    temp_hi = _mm256_add_epi64(x1y1, temp_hi);
    return _mm256_add_epi64(temp_lo, temp_hi);
}

// High 64 bits of the signed product x * y for every 64-bit lane, derived
// from the unsigned high product. y is one 64-bit value repeated.
static LIBDIVIDE_INLINE __m256i libdivide_mullhi_s64_vec256(__m256i x, __m256i y) {
    const __m256i unsigned_hi = libdivide_mullhi_u64_vec256(x, y);
    // y in the lanes where x is negative (zero elsewhere), and vice versa.
    const __m256i y_where_x_neg = _mm256_and_si256(libdivide_s64_signbits_vec256(x), y);
    const __m256i x_where_y_neg = _mm256_and_si256(libdivide_s64_signbits_vec256(y), x);
    // Subtract both corrections from the unsigned result.
    return _mm256_sub_epi64(_mm256_sub_epi64(unsigned_hi, y_where_x_neg), x_where_y_neg);
}

////////// UINT16

// Divides sixteen packed unsigned 16-bit lanes by the divisor encoded in denom.
__m256i libdivide_u16_do_vec256(__m256i numers, const struct libdivide_u16_t *denom) {
    const uint8_t more = denom->more;

    // Zero magic: the quotient is a plain logical right shift by more.
    if (!denom->magic) {
        return _mm256_srli_epi16(numers, more);
    }

    const __m256i q = _mm256_mulhi_epu16(numers, _mm256_set1_epi16(denom->magic));
    if (!(more & LIBDIVIDE_ADD_MARKER)) {
        return _mm256_srli_epi16(q, more);
    }

    // t = ((numers - q) >> 1) + q, using the saturating 16-bit subtract and add.
    const __m256i half_diff = _mm256_srli_epi16(_mm256_subs_epu16(numers, q), 1);
    const __m256i t = _mm256_adds_epu16(half_diff, q);
    return _mm256_srli_epi16(t, (more & LIBDIVIDE_16_SHIFT_MASK));
}

// Branchfree division of sixteen packed unsigned 16-bit lanes by the divisor
// encoded in denom.
__m256i libdivide_u16_branchfree_do_vec256(
    __m256i numers, const struct libdivide_u16_branchfree_t *denom) {
    const __m256i magic_vec = _mm256_set1_epi16(denom->magic);
    const __m256i q = _mm256_mulhi_epu16(numers, magic_vec);
    // t = ((numers - q) >> 1) + q, using the saturating 16-bit subtract and add.
    const __m256i half_diff = _mm256_srli_epi16(_mm256_subs_epu16(numers, q), 1);
    const __m256i t = _mm256_adds_epu16(half_diff, q);
    return _mm256_srli_epi16(t, denom->more);
}

////////// UINT32

// Divides eight packed unsigned 32-bit lanes by the divisor encoded in denom.
__m256i libdivide_u32_do_vec256(__m256i numers, const struct libdivide_u32_t *denom) {
    const uint8_t more = denom->more;

    // Zero magic: the quotient is a plain logical right shift by more.
    if (!denom->magic) {
        return _mm256_srli_epi32(numers, more);
    }

    const __m256i q = libdivide_mullhi_u32_vec256(numers, _mm256_set1_epi32(denom->magic));
    if (!(more & LIBDIVIDE_ADD_MARKER)) {
        return _mm256_srli_epi32(q, more);
    }

    // uint32_t t = ((numer - q) >> 1) + q;
    // return t >> denom->shift;
    const uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    const __m256i half_diff = _mm256_srli_epi32(_mm256_sub_epi32(numers, q), 1);
    const __m256i t = _mm256_add_epi32(half_diff, q);
    return _mm256_srli_epi32(t, shift);
}

__m256i libdivide_u32_branchfree_do_vec256(
    __m256i numers, const struct libdivide_u32_branchfree_t *denom) {
    __m256i q = libdivide_mullhi_u32_vec256(numers, _mm256_set1_epi32(denom->magic));
    __m256i t = _mm256_add_epi32(_mm256_srli_epi32(_mm256_sub_epi32(numers, q), 1), q);
    return _mm256_srli_epi32(t, denom->more);
}

////////// UINT64

// Divides four packed unsigned 64-bit lanes by the divisor encoded in denom.
__m256i libdivide_u64_do_vec256(__m256i numers, const struct libdivide_u64_t *denom) {
    const uint8_t more = denom->more;

    // Zero magic: the quotient is a plain logical right shift by more.
    if (!denom->magic) {
        return _mm256_srli_epi64(numers, more);
    }

    const __m256i q = libdivide_mullhi_u64_vec256(numers, _mm256_set1_epi64x(denom->magic));
    if (!(more & LIBDIVIDE_ADD_MARKER)) {
        return _mm256_srli_epi64(q, more);
    }

    // uint64_t t = ((numer - q) >> 1) + q;
    // return t >> denom->shift;
    const uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    const __m256i half_diff = _mm256_srli_epi64(_mm256_sub_epi64(numers, q), 1);
    const __m256i t = _mm256_add_epi64(half_diff, q);
    return _mm256_srli_epi64(t, shift);
}

__m256i libdivide_u64_branchfree_do_vec256(
    __m256i numers, const struct libdivide_u64_branchfree_t *denom) {
    __m256i q = libdivide_mullhi_u64_vec256(numers, _mm256_set1_epi64x(denom->magic));
    __m256i t = _mm256_add_epi64(_mm256_srli_epi64(_mm256_sub_epi64(numers, q), 1), q);
    return _mm256_srli_epi64(t, denom->more);
}

////////// SINT16

// Divides sixteen packed signed 16-bit lanes by the divisor encoded in denom.
__m256i libdivide_s16_do_vec256(__m256i numers, const struct libdivide_s16_t *denom) {
    const uint8_t more = denom->more;

    if (denom->magic) {
        // Multiply path.
        __m256i q = _mm256_mulhi_epi16(numers, _mm256_set1_epi16(denom->magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            // (int8_t)more >> 7 must be an arithmetic shift
            const __m256i sign = _mm256_set1_epi16((int8_t)more >> 7);
            // q += ((numer ^ sign) - sign);
            const __m256i addend = _mm256_sub_epi16(_mm256_xor_si256(numers, sign), sign);
            q = _mm256_add_epi16(q, addend);
        }
        // q >>= shift
        q = _mm256_srai_epi16(q, more & LIBDIVIDE_16_SHIFT_MASK);
        return _mm256_add_epi16(q, _mm256_srli_epi16(q, 15));  // q += (q < 0)
    }

    // Shift path (magic is zero).
    const uint16_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
    const uint16_t mask = ((uint16_t)1 << shift) - 1;
    const __m256i roundToZeroTweak = _mm256_set1_epi16(mask);
    // shifted = (numer + ((numer >> 15) & roundToZeroTweak)) >> shift;
    const __m256i numer_sign = _mm256_srai_epi16(numers, 15);
    __m256i shifted = _mm256_add_epi16(numers, _mm256_and_si256(numer_sign, roundToZeroTweak));
    shifted = _mm256_srai_epi16(shifted, shift);
    const __m256i negate = _mm256_set1_epi16((int8_t)more >> 7);
    // (shifted ^ negate) - negate
    return _mm256_sub_epi16(_mm256_xor_si256(shifted, negate), negate);
}

// Branchfree division of sixteen packed signed 16-bit lanes by the divisor
// encoded in denom.
__m256i libdivide_s16_branchfree_do_vec256(
    __m256i numers, const struct libdivide_s16_branchfree_t *denom) {
    const int16_t magic = denom->magic;
    const uint8_t more = denom->more;
    const uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
    // (int8_t)more >> 7 must be an arithmetic shift
    const __m256i sign = _mm256_set1_epi16((int8_t)more >> 7);

    // q = mulhi(numers, magic) + numers
    const __m256i product_hi = _mm256_mulhi_epi16(numers, _mm256_set1_epi16(magic));
    __m256i q = _mm256_add_epi16(product_hi, numers);

    // Lanes with non-negative q are left alone. Lanes with negative q get
    // (2**shift)-1 added when d is a power of 2, or (2**shift) when it is not.
    const uint16_t is_power_of_2 = (magic == 0);
    const __m256i bias = _mm256_set1_epi16(((uint16_t)1 << shift) - is_power_of_2);
    const __m256i q_sign = _mm256_srai_epi16(q, 15);  // q >> 15
    q = _mm256_add_epi16(q, _mm256_and_si256(q_sign, bias));
    q = _mm256_srai_epi16(q, shift);  // q >>= shift

    // (q ^ sign) - sign
    return _mm256_sub_epi16(_mm256_xor_si256(q, sign), sign);
}

////////// SINT32

// Divides eight packed signed 32-bit lanes by the divisor encoded in denom.
__m256i libdivide_s32_do_vec256(__m256i numers, const struct libdivide_s32_t *denom) {
    const uint8_t more = denom->more;

    if (denom->magic) {
        // Multiply path.
        __m256i q = libdivide_mullhi_s32_vec256(numers, _mm256_set1_epi32(denom->magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            // (int8_t)more >> 7 must be an arithmetic shift
            const __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
            // q += ((numer ^ sign) - sign);
            const __m256i addend = _mm256_sub_epi32(_mm256_xor_si256(numers, sign), sign);
            q = _mm256_add_epi32(q, addend);
        }
        // q >>= shift
        q = _mm256_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK);
        return _mm256_add_epi32(q, _mm256_srli_epi32(q, 31));  // q += (q < 0)
    }

    // Shift path (magic is zero).
    const uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    const uint32_t mask = ((uint32_t)1 << shift) - 1;
    const __m256i roundToZeroTweak = _mm256_set1_epi32(mask);
    // shifted = (numer + ((numer >> 31) & roundToZeroTweak)) >> shift;
    const __m256i numer_sign = _mm256_srai_epi32(numers, 31);
    __m256i shifted = _mm256_add_epi32(numers, _mm256_and_si256(numer_sign, roundToZeroTweak));
    shifted = _mm256_srai_epi32(shifted, shift);
    const __m256i negate = _mm256_set1_epi32((int8_t)more >> 7);
    // (shifted ^ negate) - negate
    return _mm256_sub_epi32(_mm256_xor_si256(shifted, negate), negate);
}

// Branchfree division of eight packed signed 32-bit lanes by the divisor
// encoded in denom.
__m256i libdivide_s32_branchfree_do_vec256(
    __m256i numers, const struct libdivide_s32_branchfree_t *denom) {
    const int32_t magic = denom->magic;
    const uint8_t more = denom->more;
    const uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    // (int8_t)more >> 7 must be an arithmetic shift
    const __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);

    // q = mullhi(numers, magic) + numers
    const __m256i product_hi = libdivide_mullhi_s32_vec256(numers, _mm256_set1_epi32(magic));
    __m256i q = _mm256_add_epi32(product_hi, numers);

    // Lanes with non-negative q are left alone. Lanes with negative q get
    // (2**shift)-1 added when d is a power of 2, or (2**shift) when it is not.
    const uint32_t is_power_of_2 = (magic == 0);
    const __m256i bias = _mm256_set1_epi32(((uint32_t)1 << shift) - is_power_of_2);
    const __m256i q_sign = _mm256_srai_epi32(q, 31);  // q >> 31
    q = _mm256_add_epi32(q, _mm256_and_si256(q_sign, bias));
    q = _mm256_srai_epi32(q, shift);  // q >>= shift

    // (q ^ sign) - sign
    return _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign);
}

////////// SINT64

// Divides four packed signed 64-bit lanes by the divisor encoded in denom.
__m256i libdivide_s64_do_vec256(__m256i numers, const struct libdivide_s64_t *denom) {
    const uint8_t more = denom->more;
    const int64_t magic = denom->magic;

    if (magic != 0) {
        // Multiply path.
        __m256i q = libdivide_mullhi_s64_vec256(numers, _mm256_set1_epi64x(magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            // (int8_t)more >> 7 must be an arithmetic shift
            const __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
            // q += ((numer ^ sign) - sign);
            const __m256i addend = _mm256_sub_epi64(_mm256_xor_si256(numers, sign), sign);
            q = _mm256_add_epi64(q, addend);
        }
        // q >>= denom->mult_path.shift
        q = libdivide_s64_shift_right_vec256(q, more & LIBDIVIDE_64_SHIFT_MASK);
        return _mm256_add_epi64(q, _mm256_srli_epi64(q, 63));  // q += (q < 0)
    }

    // Shift path (magic is zero).
    const uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    const uint64_t mask = ((uint64_t)1 << shift) - 1;
    const __m256i roundToZeroTweak = _mm256_set1_epi64x(mask);
    // shifted = (numer + ((numer >> 63) & roundToZeroTweak)) >> shift;
    const __m256i numer_sign = libdivide_s64_signbits_vec256(numers);
    __m256i shifted = _mm256_add_epi64(numers, _mm256_and_si256(numer_sign, roundToZeroTweak));
    shifted = libdivide_s64_shift_right_vec256(shifted, shift);
    const __m256i negate = _mm256_set1_epi32((int8_t)more >> 7);
    // (shifted ^ negate) - negate
    return _mm256_sub_epi64(_mm256_xor_si256(shifted, negate), negate);
}

// Branchfree division of four packed signed 64-bit lanes by the divisor
// encoded in denom.
__m256i libdivide_s64_branchfree_do_vec256(
    __m256i numers, const struct libdivide_s64_branchfree_t *denom) {
    const int64_t magic = denom->magic;
    const uint8_t more = denom->more;
    const uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    // (int8_t)more >> 7 must be an arithmetic shift
    const __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);

    // q = libdivide_mullhi_s64(numers, magic) + numers
    const __m256i product_hi = libdivide_mullhi_s64_vec256(numers, _mm256_set1_epi64x(magic));
    __m256i q = _mm256_add_epi64(product_hi, numers);

    // Lanes with non-negative q are left alone. Lanes with negative q get
    // (2**shift)-1 added when d is a power of 2, or (2**shift) when it is not.
    const uint32_t is_power_of_2 = (magic == 0);
    const __m256i bias = _mm256_set1_epi64x(((uint64_t)1 << shift) - is_power_of_2);
    const __m256i q_sign = libdivide_s64_signbits_vec256(q);  // q >> 63
    q = _mm256_add_epi64(q, _mm256_and_si256(q_sign, bias));
    q = libdivide_s64_shift_right_vec256(q, shift);  // q >>= shift

    // (q ^ sign) - sign
    return _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign);
}

#endif

#if defined(LIBDIVIDE_SSE2)

// Forward declarations of the SSE2 (128-bit) division entry points. The
// static LIBDIVIDE_INLINE specifiers are stated here; the definitions further
// down repeat only the signature.

// Branchfull variants: one per element width (16/32/64) and signedness.
static LIBDIVIDE_INLINE __m128i libdivide_u16_do_vec128(
    __m128i numers, const struct libdivide_u16_t *denom);
static LIBDIVIDE_INLINE __m128i libdivide_s16_do_vec128(
    __m128i numers, const struct libdivide_s16_t *denom);
static LIBDIVIDE_INLINE __m128i libdivide_u32_do_vec128(
    __m128i numers, const struct libdivide_u32_t *denom);
static LIBDIVIDE_INLINE __m128i libdivide_s32_do_vec128(
    __m128i numers, const struct libdivide_s32_t *denom);
static LIBDIVIDE_INLINE __m128i libdivide_u64_do_vec128(
    __m128i numers, const struct libdivide_u64_t *denom);
static LIBDIVIDE_INLINE __m128i libdivide_s64_do_vec128(
    __m128i numers, const struct libdivide_s64_t *denom);

// Branchfree variants: same shape, but taking the *_branchfree_t divider.
static LIBDIVIDE_INLINE __m128i libdivide_u16_branchfree_do_vec128(
    __m128i numers, const struct libdivide_u16_branchfree_t *denom);
static LIBDIVIDE_INLINE __m128i libdivide_s16_branchfree_do_vec128(
    __m128i numers, const struct libdivide_s16_branchfree_t *denom);
static LIBDIVIDE_INLINE __m128i libdivide_u32_branchfree_do_vec128(
    __m128i numers, const struct libdivide_u32_branchfree_t *denom);
static LIBDIVIDE_INLINE __m128i libdivide_s32_branchfree_do_vec128(
    __m128i numers, const struct libdivide_s32_branchfree_t *denom);
static LIBDIVIDE_INLINE __m128i libdivide_u64_branchfree_do_vec128(
    __m128i numers, const struct libdivide_u64_branchfree_t *denom);
static LIBDIVIDE_INLINE __m128i libdivide_s64_branchfree_do_vec128(
    __m128i numers, const struct libdivide_s64_branchfree_t *denom);

//////// Internal Utility Functions

// Implementation of _mm_srai_epi64(v, 63) (from AVX512).
// The upper 32 bits of every 64-bit lane are duplicated into both halves of
// that lane, then each 32-bit element is arithmetically shifted right by 31.
static LIBDIVIDE_INLINE __m128i libdivide_s64_signbits_vec128(__m128i v) {
    return _mm_srai_epi32(_mm_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1)), 31);
}

// Implementation of _mm_srai_epi64 (from AVX512): arithmetic right shift of
// each 64-bit lane by amt, built from a logical shift plus sign extension.
static LIBDIVIDE_INLINE __m128i libdivide_s64_shift_right_vec128(__m128i v, int amt) {
    // After the logical shift the original sign bit sits at bit (64 - amt) - 1.
    const int sign_pos = (64 - amt) - 1;
    const __m128i sign_bit = _mm_set1_epi64x((uint64_t)1 << sign_pos);
    const __m128i logical = _mm_srli_epi64(v, amt);
    // (x ^ m) - m sign-extends x from the bit selected by m.
    return _mm_sub_epi64(_mm_xor_si128(logical, sign_bit), sign_bit);
}

// High 32 bits of the unsigned product a * b for every 32-bit element.
// Here, b is assumed to contain one 32-bit value repeated.
static LIBDIVIDE_INLINE __m128i libdivide_mullhi_u32_vec128(__m128i a, __m128i b) {
    // Elements 0 and 2: shift the 64-bit product's high half down into the
    // even slot.
    const __m128i even_product = _mm_mul_epu32(a, b);
    const __m128i even_hi = _mm_srli_epi64(even_product, 32);

    // Elements 1 and 3: move them into the even slots, multiply, and keep the
    // high half of the product where it already is (the odd slot).
    const __m128i a_odd = _mm_srli_epi64(a, 32);
    const __m128i odd_product = _mm_mul_epu32(a_odd, b);
    const __m128i odd_slots = _mm_set_epi32(-1, 0, -1, 0);
    const __m128i odd_hi = _mm_and_si128(odd_product, odd_slots);

    return _mm_or_si128(even_hi, odd_hi);
}

// SSE2 does not have a signed multiplication instruction, so the signed high
// product is derived from the unsigned one. Again, b is just a 32 bit value
// repeated four times.
static LIBDIVIDE_INLINE __m128i libdivide_mullhi_s32_vec128(__m128i a, __m128i b) {
    const __m128i unsigned_hi = libdivide_mullhi_u32_vec128(a, b);
    // (a >> 31) & b with an arithmetic shift: b where a is negative, else 0.
    const __m128i b_where_a_neg = _mm_and_si128(_mm_srai_epi32(a, 31), b);
    // Likewise a where b is negative.
    const __m128i a_where_b_neg = _mm_and_si128(_mm_srai_epi32(b, 31), a);
    // Subtract both corrections from the unsigned result.
    return _mm_sub_epi32(_mm_sub_epi32(unsigned_hi, b_where_a_neg), a_where_b_neg);
}

// Returns, for each 64-bit lane, the upper 64 bits of the unsigned 128-bit
// product x * y.
// Here, y is assumed to contain one 64-bit value repeated.
static LIBDIVIDE_INLINE __m128i libdivide_mullhi_u64_vec128(__m128i x, __m128i y) {
    // full 128 bits product is:
    // x0*y0 + (x0*y1 << 32) + (x1*y0 << 32) + (x1*y1 << 64)
    // Note x0,y0,x1,y1 are all conceptually uint32, products are 32x32->64.

    // Compute x0*y0.
    // Note x1, y1 are ignored by mul_epu32.
    __m128i x0y0 = _mm_mul_epu32(x, y);
    __m128i x0y0_hi = _mm_srli_epi64(x0y0, 32);

    // Get x1, y1 in the low bits.
    // We could shuffle or right shift. Shuffles are preferred as they preserve
    // the source register for the next computation.
    __m128i x1 = _mm_shuffle_epi32(x, _MM_SHUFFLE(3, 3, 1, 1));
    __m128i y1 = _mm_shuffle_epi32(y, _MM_SHUFFLE(3, 3, 1, 1));

    // No need to mask off top 32 bits for mul_epu32.
    __m128i x0y1 = _mm_mul_epu32(x, y1);
    __m128i x1y0 = _mm_mul_epu32(x1, y);
    __m128i x1y1 = _mm_mul_epu32(x1, y1);

    // Mask here selects low bits only.
    // The first middle-term sum is split into low and high 32-bit parts so
    // that neither 64-bit addition below can overflow.
    __m128i mask = _mm_set1_epi64x(0xFFFFFFFF);
    __m128i temp = _mm_add_epi64(x1y0, x0y0_hi);
    __m128i temp_lo = _mm_and_si128(temp, mask);
    __m128i temp_hi = _mm_srli_epi64(temp, 32);

    // The carries out of the middle terms are added onto x1*y1.
    temp_lo = _mm_srli_epi64(_mm_add_epi64(temp_lo, x0y1), 32);
    temp_hi = _mm_add_epi64(x1y1, temp_hi);
    return _mm_add_epi64(temp_lo, temp_hi);
}

// High 64 bits of the signed product x * y for every 64-bit lane, derived
// from the unsigned high product. y is one 64-bit value repeated.
static LIBDIVIDE_INLINE __m128i libdivide_mullhi_s64_vec128(__m128i x, __m128i y) {
    const __m128i unsigned_hi = libdivide_mullhi_u64_vec128(x, y);
    // y in the lanes where x is negative (zero elsewhere), and vice versa.
    const __m128i y_where_x_neg = _mm_and_si128(libdivide_s64_signbits_vec128(x), y);
    const __m128i x_where_y_neg = _mm_and_si128(libdivide_s64_signbits_vec128(y), x);
    // Subtract both corrections from the unsigned result.
    return _mm_sub_epi64(_mm_sub_epi64(unsigned_hi, y_where_x_neg), x_where_y_neg);
}

////////// UINT16

// Divides eight packed unsigned 16-bit lanes by the divisor encoded in denom.
__m128i libdivide_u16_do_vec128(__m128i numers, const struct libdivide_u16_t *denom) {
    const uint8_t more = denom->more;

    // Zero magic: the quotient is a plain logical right shift by more.
    if (!denom->magic) {
        return _mm_srli_epi16(numers, more);
    }

    const __m128i q = _mm_mulhi_epu16(numers, _mm_set1_epi16(denom->magic));
    if (!(more & LIBDIVIDE_ADD_MARKER)) {
        return _mm_srli_epi16(q, more);
    }

    // t = ((numers - q) >> 1) + q, using the saturating 16-bit subtract and add.
    const __m128i half_diff = _mm_srli_epi16(_mm_subs_epu16(numers, q), 1);
    const __m128i t = _mm_adds_epu16(half_diff, q);
    return _mm_srli_epi16(t, (more & LIBDIVIDE_16_SHIFT_MASK));
}

// Branchfree division of eight packed unsigned 16-bit lanes by the divisor
// encoded in denom.
__m128i libdivide_u16_branchfree_do_vec128(
    __m128i numers, const struct libdivide_u16_branchfree_t *denom) {
    const __m128i magic_vec = _mm_set1_epi16(denom->magic);
    const __m128i q = _mm_mulhi_epu16(numers, magic_vec);
    // t = ((numers - q) >> 1) + q, using the saturating 16-bit subtract and add.
    const __m128i half_diff = _mm_srli_epi16(_mm_subs_epu16(numers, q), 1);
    const __m128i t = _mm_adds_epu16(half_diff, q);
    return _mm_srli_epi16(t, denom->more);
}

////////// UINT32

// Divides four packed unsigned 32-bit lanes by the divisor encoded in denom.
__m128i libdivide_u32_do_vec128(__m128i numers, const struct libdivide_u32_t *denom) {
    const uint8_t more = denom->more;

    // Zero magic: the quotient is a plain logical right shift by more.
    if (!denom->magic) {
        return _mm_srli_epi32(numers, more);
    }

    const __m128i q = libdivide_mullhi_u32_vec128(numers, _mm_set1_epi32(denom->magic));
    if (!(more & LIBDIVIDE_ADD_MARKER)) {
        return _mm_srli_epi32(q, more);
    }

    // uint32_t t = ((numer - q) >> 1) + q;
    // return t >> denom->shift;
    const uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    const __m128i half_diff = _mm_srli_epi32(_mm_sub_epi32(numers, q), 1);
    const __m128i t = _mm_add_epi32(half_diff, q);
    return _mm_srli_epi32(t, shift);
}

__m128i libdivide_u32_branchfree_do_vec128(
    __m128i numers, const struct libdivide_u32_branchfree_t *denom) {
    __m128i q = libdivide_mullhi_u32_vec128(numers, _mm_set1_epi32(denom->magic));
    __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q);
    return _mm_srli_epi32(t, denom->more);
}

////////// UINT64

// Divides two packed unsigned 64-bit lanes by the divisor encoded in denom.
__m128i libdivide_u64_do_vec128(__m128i numers, const struct libdivide_u64_t *denom) {
    const uint8_t more = denom->more;

    // Zero magic: the quotient is a plain logical right shift by more.
    if (!denom->magic) {
        return _mm_srli_epi64(numers, more);
    }

    const __m128i q = libdivide_mullhi_u64_vec128(numers, _mm_set1_epi64x(denom->magic));
    if (!(more & LIBDIVIDE_ADD_MARKER)) {
        return _mm_srli_epi64(q, more);
    }

    // uint64_t t = ((numer - q) >> 1) + q;
    // return t >> denom->shift;
    const uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    const __m128i half_diff = _mm_srli_epi64(_mm_sub_epi64(numers, q), 1);
    const __m128i t = _mm_add_epi64(half_diff, q);
    return _mm_srli_epi64(t, shift);
}

__m128i libdivide_u64_branchfree_do_vec128(
    __m128i numers, const struct libdivide_u64_branchfree_t *denom) {
    __m128i q = libdivide_mullhi_u64_vec128(numers, _mm_set1_epi64x(denom->magic));
    __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q);
    return _mm_srli_epi64(t, denom->more);
}

////////// SINT16

// Divides eight packed signed 16-bit lanes by the divisor encoded in denom.
__m128i libdivide_s16_do_vec128(__m128i numers, const struct libdivide_s16_t *denom) {
    const uint8_t more = denom->more;

    if (denom->magic) {
        // Multiply path.
        __m128i q = _mm_mulhi_epi16(numers, _mm_set1_epi16(denom->magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            // (int8_t)more >> 7 must be an arithmetic shift
            const __m128i sign = _mm_set1_epi16((int8_t)more >> 7);
            // q += ((numer ^ sign) - sign);
            const __m128i addend = _mm_sub_epi16(_mm_xor_si128(numers, sign), sign);
            q = _mm_add_epi16(q, addend);
        }
        // q >>= shift
        q = _mm_srai_epi16(q, more & LIBDIVIDE_16_SHIFT_MASK);
        return _mm_add_epi16(q, _mm_srli_epi16(q, 15));  // q += (q < 0)
    }

    // Shift path (magic is zero).
    const uint16_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
    const uint16_t mask = ((uint16_t)1 << shift) - 1;
    const __m128i roundToZeroTweak = _mm_set1_epi16(mask);
    // shifted = (numer + ((numer >> 15) & roundToZeroTweak)) >> shift;
    const __m128i numer_sign = _mm_srai_epi16(numers, 15);
    __m128i shifted = _mm_add_epi16(numers, _mm_and_si128(numer_sign, roundToZeroTweak));
    shifted = _mm_srai_epi16(shifted, shift);
    const __m128i negate = _mm_set1_epi16((int8_t)more >> 7);
    // (shifted ^ negate) - negate
    return _mm_sub_epi16(_mm_xor_si128(shifted, negate), negate);
}

// Branchfree division of eight packed signed 16-bit lanes by the divisor
// encoded in denom.
__m128i libdivide_s16_branchfree_do_vec128(
    __m128i numers, const struct libdivide_s16_branchfree_t *denom) {
    const int16_t magic = denom->magic;
    const uint8_t more = denom->more;
    const uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
    // (int8_t)more >> 7 must be an arithmetic shift
    const __m128i sign = _mm_set1_epi16((int8_t)more >> 7);

    // q = mulhi(numers, magic) + numers
    const __m128i product_hi = _mm_mulhi_epi16(numers, _mm_set1_epi16(magic));
    __m128i q = _mm_add_epi16(product_hi, numers);

    // Lanes with non-negative q are left alone. Lanes with negative q get
    // (2**shift)-1 added when d is a power of 2, or (2**shift) when it is not.
    const uint16_t is_power_of_2 = (magic == 0);
    const __m128i bias = _mm_set1_epi16(((uint16_t)1 << shift) - is_power_of_2);
    const __m128i q_sign = _mm_srai_epi16(q, 15);  // q >> 15
    q = _mm_add_epi16(q, _mm_and_si128(q_sign, bias));
    q = _mm_srai_epi16(q, shift);  // q >>= shift

    // (q ^ sign) - sign
    return _mm_sub_epi16(_mm_xor_si128(q, sign), sign);
}

////////// SINT32

// Divides four packed signed 32-bit lanes by the divisor encoded in denom.
__m128i libdivide_s32_do_vec128(__m128i numers, const struct libdivide_s32_t *denom) {
    const uint8_t more = denom->more;

    if (denom->magic) {
        // Multiply path.
        __m128i q = libdivide_mullhi_s32_vec128(numers, _mm_set1_epi32(denom->magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            // (int8_t)more >> 7 must be an arithmetic shift
            const __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
            // q += ((numer ^ sign) - sign);
            const __m128i addend = _mm_sub_epi32(_mm_xor_si128(numers, sign), sign);
            q = _mm_add_epi32(q, addend);
        }
        // q >>= shift
        q = _mm_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK);
        return _mm_add_epi32(q, _mm_srli_epi32(q, 31));  // q += (q < 0)
    }

    // Shift path (magic is zero).
    const uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    const uint32_t mask = ((uint32_t)1 << shift) - 1;
    const __m128i roundToZeroTweak = _mm_set1_epi32(mask);
    // shifted = (numer + ((numer >> 31) & roundToZeroTweak)) >> shift;
    const __m128i numer_sign = _mm_srai_epi32(numers, 31);
    __m128i shifted = _mm_add_epi32(numers, _mm_and_si128(numer_sign, roundToZeroTweak));
    shifted = _mm_srai_epi32(shifted, shift);
    const __m128i negate = _mm_set1_epi32((int8_t)more >> 7);
    // (shifted ^ negate) - negate
    return _mm_sub_epi32(_mm_xor_si128(shifted, negate), negate);
}

// Branchfree division of four packed signed 32-bit lanes by the divisor
// encoded in denom.
__m128i libdivide_s32_branchfree_do_vec128(
    __m128i numers, const struct libdivide_s32_branchfree_t *denom) {
    const int32_t magic = denom->magic;
    const uint8_t more = denom->more;
    const uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    // (int8_t)more >> 7 must be an arithmetic shift
    const __m128i sign = _mm_set1_epi32((int8_t)more >> 7);

    // q = mullhi(numers, magic) + numers
    const __m128i product_hi = libdivide_mullhi_s32_vec128(numers, _mm_set1_epi32(magic));
    __m128i q = _mm_add_epi32(product_hi, numers);

    // Lanes with non-negative q are left alone. Lanes with negative q get
    // (2**shift)-1 added when d is a power of 2, or (2**shift) when it is not.
    const uint32_t is_power_of_2 = (magic == 0);
    const __m128i bias = _mm_set1_epi32(((uint32_t)1 << shift) - is_power_of_2);
    const __m128i q_sign = _mm_srai_epi32(q, 31);  // q >> 31
    q = _mm_add_epi32(q, _mm_and_si128(q_sign, bias));
    q = _mm_srai_epi32(q, shift);  // q >>= shift

    // (q ^ sign) - sign
    return _mm_sub_epi32(_mm_xor_si128(q, sign), sign);
}

////////// SINT64

// Divides two packed signed 64-bit lanes by the divisor encoded in denom.
__m128i libdivide_s64_do_vec128(__m128i numers, const struct libdivide_s64_t *denom) {
    const uint8_t more = denom->more;
    const int64_t magic = denom->magic;

    if (magic != 0) {
        // Multiply path.
        __m128i q = libdivide_mullhi_s64_vec128(numers, _mm_set1_epi64x(magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            // (int8_t)more >> 7 must be an arithmetic shift
            const __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
            // q += ((numer ^ sign) - sign);
            const __m128i addend = _mm_sub_epi64(_mm_xor_si128(numers, sign), sign);
            q = _mm_add_epi64(q, addend);
        }
        // q >>= denom->mult_path.shift
        q = libdivide_s64_shift_right_vec128(q, more & LIBDIVIDE_64_SHIFT_MASK);
        return _mm_add_epi64(q, _mm_srli_epi64(q, 63));  // q += (q < 0)
    }

    // Shift path (magic is zero).
    const uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    const uint64_t mask = ((uint64_t)1 << shift) - 1;
    const __m128i roundToZeroTweak = _mm_set1_epi64x(mask);
    // shifted = (numer + ((numer >> 63) & roundToZeroTweak)) >> shift;
    const __m128i numer_sign = libdivide_s64_signbits_vec128(numers);
    __m128i shifted = _mm_add_epi64(numers, _mm_and_si128(numer_sign, roundToZeroTweak));
    shifted = libdivide_s64_shift_right_vec128(shifted, shift);
    const __m128i negate = _mm_set1_epi32((int8_t)more >> 7);
    // (shifted ^ negate) - negate
    return _mm_sub_epi64(_mm_xor_si128(shifted, negate), negate);
}

// Branchfree division of two packed signed 64-bit lanes by the divisor
// encoded in denom.
__m128i libdivide_s64_branchfree_do_vec128(
    __m128i numers, const struct libdivide_s64_branchfree_t *denom) {
    const int64_t magic = denom->magic;
    const uint8_t more = denom->more;
    const uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    // (int8_t)more >> 7 must be an arithmetic shift
    const __m128i sign = _mm_set1_epi32((int8_t)more >> 7);

    // q = libdivide_mullhi_s64(numers, magic) + numers
    const __m128i product_hi = libdivide_mullhi_s64_vec128(numers, _mm_set1_epi64x(magic));
    __m128i q = _mm_add_epi64(product_hi, numers);

    // Lanes with non-negative q are left alone. Lanes with negative q get
    // (2**shift)-1 added when d is a power of 2, or (2**shift) when it is not.
    const uint32_t is_power_of_2 = (magic == 0);
    const __m128i bias = _mm_set1_epi64x(((uint64_t)1 << shift) - is_power_of_2);
    const __m128i q_sign = libdivide_s64_signbits_vec128(q);  // q >> 63
    q = _mm_add_epi64(q, _mm_and_si128(q_sign, bias));
    q = libdivide_s64_shift_right_vec128(q, shift);  // q >>= shift

    // (q ^ sign) - sign
    return _mm_sub_epi64(_mm_xor_si128(q, sign), sign);
}

#endif

////////// C++ stuff

#ifdef __cplusplus

// Selects which family of division algorithms a dispatcher forwards to:
// the plain libdivide_<type>_* functions or the libdivide_<type>_branchfree_*
// ones.
enum Branching {
    BRANCHFULL,  // use branching algorithms
    BRANCHFREE   // use branchfree algorithms
};

namespace detail {
// Tag used as a template argument to pick the signed or unsigned
// specialization of the helper templates below.
enum Signedness {
    SIGNED,
    UNSIGNED,
};

#if defined(LIBDIVIDE_NEON)
// Helper to deduce NEON vector type for integral type.
// The primary template is empty; each specialization below maps an element
// width in bits plus a signedness to the matching 128-bit NEON vector type,
// exposed as the nested typedef 'type'.
template <int _WIDTH, Signedness _SIGN>
struct NeonVec {};

// 16-bit unsigned elements.
template <>
struct NeonVec<16, UNSIGNED> {
    typedef uint16x8_t type;
};

// 16-bit signed elements.
template <>
struct NeonVec<16, SIGNED> {
    typedef int16x8_t type;
};

// 32-bit unsigned elements.
template <>
struct NeonVec<32, UNSIGNED> {
    typedef uint32x4_t type;
};

// 32-bit signed elements.
template <>
struct NeonVec<32, SIGNED> {
    typedef int32x4_t type;
};

// 64-bit unsigned elements.
template <>
struct NeonVec<64, UNSIGNED> {
    typedef uint64x2_t type;
};

// 64-bit signed elements.
template <>
struct NeonVec<64, SIGNED> {
    typedef int64x2_t type;
};

// Maps an integer type T to its NEON vector type by looking up NeonVec with
// T's bit width and signedness. The signedness test compares (T)0 against
// (T)(-1): zero is greater only when T can represent -1, i.e. when T is signed.
template <typename T>
struct NeonVecFor {
    // See 'class divider' for an explanation of these template parameters.
    typedef typename NeonVec<sizeof(T) * 8, (((T)0 >> 0) > (T)(-1) ? SIGNED : UNSIGNED)>::type type;
};

// Expands to a divide() member overload that takes and returns the NEON vector
// type for INT_TYPE and forwards to libdivide_<ALGO>_do_vec128 with the
// enclosing struct's denom member.
#define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE)                    \
    LIBDIVIDE_INLINE typename NeonVecFor<INT_TYPE>::type divide( \
        typename NeonVecFor<INT_TYPE>::type n) const {           \
        return libdivide_##ALGO##_do_vec128(n, &denom);          \
    }
#else
#define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE)
#endif

// With LIBDIVIDE_SSE2 defined, expands to a divide() member overload for
// __m128i that forwards to libdivide_<ALGO>_do_vec128 with the enclosing
// struct's denom member; otherwise expands to nothing.
#if defined(LIBDIVIDE_SSE2)
#define LIBDIVIDE_DIVIDE_SSE2(ALGO)                     \
    LIBDIVIDE_INLINE __m128i divide(__m128i n) const {  \
        return libdivide_##ALGO##_do_vec128(n, &denom); \
    }
#else
#define LIBDIVIDE_DIVIDE_SSE2(ALGO)
#endif

// With LIBDIVIDE_AVX2 defined, expands to a divide() member overload for
// __m256i that forwards to libdivide_<ALGO>_do_vec256 with the enclosing
// struct's denom member; otherwise expands to nothing.
#if defined(LIBDIVIDE_AVX2)
#define LIBDIVIDE_DIVIDE_AVX2(ALGO)                     \
    LIBDIVIDE_INLINE __m256i divide(__m256i n) const {  \
        return libdivide_##ALGO##_do_vec256(n, &denom); \
    }
#else
#define LIBDIVIDE_DIVIDE_AVX2(ALGO)
#endif

// With LIBDIVIDE_AVX512 defined, expands to a divide() member overload for
// __m512i that forwards to libdivide_<ALGO>_do_vec512 with the enclosing
// struct's denom member; otherwise expands to nothing.
#if defined(LIBDIVIDE_AVX512)
#define LIBDIVIDE_DIVIDE_AVX512(ALGO)                   \
    LIBDIVIDE_INLINE __m512i divide(__m512i n) const {  \
        return libdivide_##ALGO##_do_vec512(n, &denom); \
    }
#else
#define LIBDIVIDE_DIVIDE_AVX512(ALGO)
#endif

// The DISPATCHER_GEN() macro generates C++ methods (for the given integer
// and algorithm types) that redirect to libdivide's C API.
//
// The expansion is meant to be the entire body of a `dispatcher`
// specialization and provides, in order:
//   - `denom`: the libdivide_<ALGO>_t state for the divisor
//   - a default constructor that leaves `denom` uninitialized
//   - a constructor taking nullptr that value-initializes `denom`
//   - a constructor taking the divisor, calling libdivide_<ALGO>_gen
//   - scalar `divide` (libdivide_<ALGO>_do) and `recover`
//     (libdivide_<ALGO>_recover)
//   - the SIMD `divide` overloads from the LIBDIVIDE_DIVIDE_* macros
// NOTE: do not place `//` comments inside the macro body; the trailing
// backslashes would splice the following line into the comment.
#define DISPATCHER_GEN(T, ALGO)                                                       \
    libdivide_##ALGO##_t denom;                                                       \
    LIBDIVIDE_INLINE dispatcher() {}                                                  \
    explicit LIBDIVIDE_CONSTEXPR dispatcher(decltype(nullptr)) : denom{} {}              \
    LIBDIVIDE_INLINE dispatcher(T d) : denom(libdivide_##ALGO##_gen(d)) {}            \
    LIBDIVIDE_INLINE T divide(T n) const { return libdivide_##ALGO##_do(n, &denom); } \
    LIBDIVIDE_INLINE T recover() const { return libdivide_##ALGO##_recover(&denom); } \
    LIBDIVIDE_DIVIDE_NEON(ALGO, T)                                                    \
    LIBDIVIDE_DIVIDE_SSE2(ALGO)                                                       \
    LIBDIVIDE_DIVIDE_AVX2(ALGO)                                                       \
    LIBDIVIDE_DIVIDE_AVX512(ALGO)

// The dispatcher selects a specific division algorithm for a given
// width, signedness, and ALGO using partial template specialization.
// The primary template is empty, so a combination that is not listed below
// (widths other than 16/32/64 bits) has no `denom`, `divide` or `recover`
// members and fails to compile when used through `divider`.
template <int _WIDTH, Signedness _SIGN, Branching _ALGO>
struct dispatcher {};

// One specialization per (width, signedness, algorithm); each body is
// generated by DISPATCHER_GEN with the matching integer type and the suffix
// of the C API functions it forwards to.
template <>
struct dispatcher<16, SIGNED, BRANCHFULL> {
    DISPATCHER_GEN(int16_t, s16)
};
template <>
struct dispatcher<16, SIGNED, BRANCHFREE> {
    DISPATCHER_GEN(int16_t, s16_branchfree)
};
template <>
struct dispatcher<16, UNSIGNED, BRANCHFULL> {
    DISPATCHER_GEN(uint16_t, u16)
};
template <>
struct dispatcher<16, UNSIGNED, BRANCHFREE> {
    DISPATCHER_GEN(uint16_t, u16_branchfree)
};
template <>
struct dispatcher<32, SIGNED, BRANCHFULL> {
    DISPATCHER_GEN(int32_t, s32)
};
template <>
struct dispatcher<32, SIGNED, BRANCHFREE> {
    DISPATCHER_GEN(int32_t, s32_branchfree)
};
template <>
struct dispatcher<32, UNSIGNED, BRANCHFULL> {
    DISPATCHER_GEN(uint32_t, u32)
};
template <>
struct dispatcher<32, UNSIGNED, BRANCHFREE> {
    DISPATCHER_GEN(uint32_t, u32_branchfree)
};
template <>
struct dispatcher<64, SIGNED, BRANCHFULL> {
    DISPATCHER_GEN(int64_t, s64)
};
template <>
struct dispatcher<64, SIGNED, BRANCHFREE> {
    DISPATCHER_GEN(int64_t, s64_branchfree)
};
template <>
struct dispatcher<64, UNSIGNED, BRANCHFULL> {
    DISPATCHER_GEN(uint64_t, u64)
};
template <>
struct dispatcher<64, UNSIGNED, BRANCHFREE> {
    DISPATCHER_GEN(uint64_t, u64_branchfree)
};
}  // namespace detail

#if defined(LIBDIVIDE_NEON)
// Allow NeonVecFor outside of detail namespace.
// This is a thin re-export: `type` is exactly detail::NeonVecFor<T>::type.
template <typename T>
struct NeonVecFor {
    typedef typename detail::NeonVecFor<T>::type type;
};
#endif

// This is the main divider class for use by the user (C++ API).
// The actual division algorithm is selected using the dispatcher struct
// based on the integer width and algorithm template parameters.
//
// Template parameters:
//   T    - integer type of both the divisor and the dividends
//   ALGO - BRANCHFULL (default) or BRANCHFREE algorithm family
template <typename T, Branching ALGO = BRANCHFULL>
class divider {
   private:
    // Dispatch based on the size and signedness.
    // We avoid using type_traits as it's not available in AVR.
    // Detect signedness by checking if T(-1) is less than T(0).
    // Also throw in a shift by 0, which prevents floating point types from being passed.
    typedef detail::dispatcher<sizeof(T) * 8,
        (((T)0 >> 0) > (T)(-1) ? detail::SIGNED : detail::UNSIGNED), ALGO>
        dispatcher_t;

   public:
    // We leave the default constructor empty so that creating
    // an array of dividers and then initializing them
    // later doesn't slow us down.
    // A default-constructed divider therefore holds an uninitialized
    // divisor state and must be assigned before it is used.
    divider() {}

    // constexpr zero-initialization to allow for use w/ static constinit
    explicit LIBDIVIDE_CONSTEXPR divider(decltype(nullptr)) : div(nullptr) {}

    // Constructor that takes the divisor as a parameter
    LIBDIVIDE_INLINE divider(T d) : div(d) {}

    // Divides n by the divisor
    LIBDIVIDE_INLINE T divide(T n) const { return div.divide(n); }

    // Recovers the divisor, returns the value that was
    // used to initialize this divider object.
    T recover() const { return div.recover(); }

    // Two dividers compare equal when their stored `magic` and `more`
    // fields are equal.
    bool operator==(const divider<T, ALGO> &other) const {
        return div.denom.magic == other.div.denom.magic && div.denom.more == other.div.denom.more;
    }

    // Negation of operator==.
    bool operator!=(const divider<T, ALGO> &other) const { return !(*this == other); }

    // Vector variants treat the input as packed integer values with the same type as the divider
    // (e.g. s32, u32, s64, u64) and divides each of them by the divider, returning the packed
    // quotients.
#if defined(LIBDIVIDE_SSE2)
    LIBDIVIDE_INLINE __m128i divide(__m128i n) const { return div.divide(n); }
#endif
#if defined(LIBDIVIDE_AVX2)
    LIBDIVIDE_INLINE __m256i divide(__m256i n) const { return div.divide(n); }
#endif
#if defined(LIBDIVIDE_AVX512)
    LIBDIVIDE_INLINE __m512i divide(__m512i n) const { return div.divide(n); }
#endif
#if defined(LIBDIVIDE_NEON)
    LIBDIVIDE_INLINE typename NeonVecFor<T>::type divide(typename NeonVecFor<T>::type n) const {
        return div.divide(n);
    }
#endif

   private:
    // Storage for the actual divisor
    dispatcher_t div;
};

// Overload of operator / for scalar division
// `n / div` is equivalent to `div.divide(n)`.
template <typename T, Branching ALGO>
LIBDIVIDE_INLINE T operator/(T n, const divider<T, ALGO> &div) {
    return div.divide(n);
}

// Overload of operator /= for scalar division
// Stores div.divide(n) back into n and returns a reference to n.
template <typename T, Branching ALGO>
LIBDIVIDE_INLINE T &operator/=(T &n, const divider<T, ALGO> &div) {
    n = div.divide(n);
    return n;
}

// Overloads for vector types.
// Each enabled SIMD flavour gets an operator/ and an operator/= that forward
// to divider::divide. Note that, unlike the scalar operator/= above, the
// vector operator/= overloads return the updated vector by value rather than
// by reference.
#if defined(LIBDIVIDE_SSE2)
template <typename T, Branching ALGO>
LIBDIVIDE_INLINE __m128i operator/(__m128i n, const divider<T, ALGO> &div) {
    return div.divide(n);
}

template <typename T, Branching ALGO>
LIBDIVIDE_INLINE __m128i operator/=(__m128i &n, const divider<T, ALGO> &div) {
    n = div.divide(n);
    return n;
}
#endif
#if defined(LIBDIVIDE_AVX2)
template <typename T, Branching ALGO>
LIBDIVIDE_INLINE __m256i operator/(__m256i n, const divider<T, ALGO> &div) {
    return div.divide(n);
}

template <typename T, Branching ALGO>
LIBDIVIDE_INLINE __m256i operator/=(__m256i &n, const divider<T, ALGO> &div) {
    n = div.divide(n);
    return n;
}
#endif
#if defined(LIBDIVIDE_AVX512)
template <typename T, Branching ALGO>
LIBDIVIDE_INLINE __m512i operator/(__m512i n, const divider<T, ALGO> &div) {
    return div.divide(n);
}

template <typename T, Branching ALGO>
LIBDIVIDE_INLINE __m512i operator/=(__m512i &n, const divider<T, ALGO> &div) {
    n = div.divide(n);
    return n;
}
#endif

#if defined(LIBDIVIDE_NEON)
// The NEON vector type depends on T, which is deduced from the divider
// argument.
template <typename T, Branching ALGO>
LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/(
    typename NeonVecFor<T>::type n, const divider<T, ALGO> &div) {
    return div.divide(n);
}

template <typename T, Branching ALGO>
LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/=(
    typename NeonVecFor<T>::type &n, const divider<T, ALGO> &div) {
    n = div.divide(n);
    return n;
}
#endif

// Alias templates need C++11 (or MSVC 2015 and later), hence the guard.
#if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900)
// libdivide::branchfree_divider<T>
// Shorthand for divider<T, BRANCHFREE>.
template <typename T>
using branchfree_divider = divider<T, BRANCHFREE>;
#endif

}  // namespace libdivide

#endif  // __cplusplus

#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(pop)
#endif

#endif  // LIBDIVIDE_H


================================================
FILE: ext/skeletontricks/skeletontricks.hpp
================================================
/*
 * This file is part of Kimimaro.
 * 
 * Kimimaro is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * Kimimaro is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with Kimimaro.  If not, see <https://www.gnu.org/licenses/>.
 *
 * 
 * Author: William Silversmith
 * Affiliation: Seung Lab, Princeton University
 * Date: September 2018 - April 2025
 */

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <queue>
#include <stack>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

#include "unordered_dense.hpp"

#ifndef SKELETONTRICKS_HPP
#define SKELETONTRICKS_HPP

namespace skeletontricks {

/* Zero out the labels inside an axis-aligned box around each path vertex.
 *
 * For every linear index `loc` in `path` a radius is computed as
 *   scale * DBF[loc] + constant
 * and converted to a per-axis voxel extent by dividing by (wx, wy, wz).
 * The resulting box is clamped to the volume. Linear indices follow the
 * layout loc = x + sx * y + sx * sy * z.
 *
 * Implementation: instead of painting every voxel of every box, the first
 * pass only writes interval markers per (y, z) row: +1 at the first x of the
 * box and -1 at the last x. The second pass sweeps each row of the combined
 * bounding box with a running sum; a voxel is erased when the sum is
 * positive or when the marker at that voxel is non-zero (which covers the
 * last voxel of an interval).
 *
 * labels: image modified in place; erased voxels are set to 0
 * DBF: per-voxel values used to derive the radius
 * sx, sy, sz: volume dimensions in voxels
 * wx, wy, wz: anisotropy (physical size of a voxel along each axis)
 * path, path_size: linear voxel indices of the path and their count
 * scale, constant: coefficients of the radius formula
 *
 * Returns: the number of non-zero voxels in `labels` that were set to 0.
 */
size_t _roll_invalidation_cube(
  uint8_t* labels, float* DBF,
  const int64_t sx, const int64_t sy, const int64_t sz,
  const float wx, const float wy, const float wz,
  size_t* path, const size_t path_size,
  const float scale, const float constant
) {

  if (path_size == 0) {
    return 0;
  }

  const size_t sxy = sx * sy;
  const size_t voxels = sxy * sz;

  int64_t minx, maxx, miny, maxy, minz, maxz;
  int64_t x, y, z;

  // Bounding box of all per-vertex boxes; limits the second pass.
  int64_t global_minx = sx;
  int64_t global_maxx = 0;
  int64_t global_miny = sy;
  int64_t global_maxy = 0;
  int64_t global_minz = sz;
  int64_t global_maxz = 0;

  std::vector<int16_t> topology(voxels);
  
  // When both sx and sy are powers of two the index decomposition can use
  // shifts instead of divisions.
  const bool power_of_two = !((sx & (sx - 1)) || (sy & (sy - 1))); 
  const int xshift = std::log2(sx); // must use log2 here, not lg/lg2 to avoid fp errors
  const int yshift = std::log2(sy);

  size_t loc;
  float radius;
  size_t invalidated = 0;

  // First pass: compute topology (interval start/end markers per row)
  for (size_t i = 0; i < path_size; i++) {
    loc = path[i];
    radius = scale * DBF[loc] + constant;

    if (power_of_two) {
      z = loc >> (xshift + yshift);
      y = (loc - (z << (xshift + yshift))) >> xshift;
      x = loc - ((y + (z << yshift)) << xshift);
    }
    else {
      z = loc / sxy;
      y = (loc - (z * sxy)) / sx;
      x = loc - sx * (y + z * sy);
    }

    const int64_t ZERO = 0;

    minx = std::max(ZERO,    static_cast<int64_t>(x - (radius / wx)));
    maxx = std::min(sx-1, static_cast<int64_t>(0.5 + (x + (radius / wx))));
    miny = std::max(ZERO,    static_cast<int64_t>(y - (radius / wy)));
    maxy = std::min(sy-1, static_cast<int64_t>(0.5 + (y + (radius / wy))));
    minz = std::max(ZERO,    static_cast<int64_t>(z - (radius / wz)));
    maxz = std::min(sz-1, static_cast<int64_t>(0.5 + (z + (radius / wz))));

    if (minx == maxx) {
      // Degenerate one-voxel-wide interval: its +1 and -1 markers would be
      // written to the same cell and cancel out, so the sweep below would
      // never see it. Erase these voxels directly instead.
      for (y = miny; y <= maxy; y++) {
        for (z = minz; z <= maxz; z++) {
          const size_t idx = minx + sx * y + sxy * z;
          invalidated += static_cast<size_t>(labels[idx] > 0);
          labels[idx] = 0;
        }
      }
      continue;
    }

    global_minx = std::min(global_minx, minx);
    global_maxx = std::max(global_maxx, maxx);
    global_miny = std::min(global_miny, miny);
    global_maxy = std::max(global_maxy, maxy);
    global_minz = std::min(global_minz, minz);
    global_maxz = std::max(global_maxz, maxz);

    for (y = miny; y <= maxy; y++) {
      for (z = minz; z <= maxz; z++) {
        topology[minx + sx * y + sxy * z] += 1;
        topology[maxx + sx * y + sxy * z] -= 1;
      }
    }
  }

  // Second pass: invalidate labels
  int coloring;
  size_t yzoffset;
  for (z = global_minz; z <= global_maxz; z++) {
    for (y = global_miny; y <= global_maxy; y++) {
      yzoffset = sx * y + sxy * z;

      // Running sum of markers: > 0 means we are inside at least one interval.
      coloring = 0;
      for (x = global_minx; x <= global_maxx; x++) {
        coloring += topology[x + yzoffset];
        // The second condition catches the closing voxel of an interval,
        // where the running sum has already dropped back to zero.
        if (coloring > 0 || topology[x + yzoffset]) {
          invalidated += static_cast<size_t>(labels[x + yzoffset] > 0); // convert non-bool vals
          labels[x + yzoffset] = 0;
        }
      }
    }
  }

  return invalidated;
}

// Largest element of edges[0..size), converted to size_t.
// Returns 0 for an empty range.
template <typename T>
inline size_t max(T* edges, const size_t size) {
  if (size == 0) {
    return 0;
  }

  size_t largest = edges[0];
  const T* const stop = edges + size;
  for (const T* it = edges; it != stop; ++it) {
    const size_t candidate = static_cast<size_t>(*it);
    if (candidate > largest) {
      largest = candidate;
    }
  }

  return largest;
}

// Debugging helper: print the elements of `vec` on one line, each followed
// by ", ", then a newline.
//
// Every element is converted to long long and printed with "%lld" so that
// the format specifier always matches the argument type, whatever integer
// type T is. Unsigned 64-bit values above the long long range will print as
// negative numbers.
template <typename T>
void printvec(const std::vector<T>& vec) {
  for (T v : vec) {
    printf("%lld, ", static_cast<long long>(v));
  }
  printf("\n");
}

// Debugging helper: print the contents of `stack` from top to bottom on one
// line, each followed by ", ", then a newline.
//
// The stack is taken by value because printing pops it; the caller's stack
// is not modified. Elements are converted to long long and printed with
// "%lld" so the format specifier matches the argument for any integer T.
template <typename T>
void printstack(std::stack<T> stack) {
  while (!stack.empty()) {
    printf("%lld, ", static_cast<long long>(stack.top()));
    stack.pop();
  }

  printf("\n");
}

// Copy a stack into a vector ordered from the bottom of the stack (index 0)
// to the top (last element). The argument is a by-value copy, so the
// caller's stack is left untouched.
template <typename T>
std::vector<T> stack2vec(std::stack<T> stk) {
  std::vector<T> top_first;
  top_first.reserve(stk.size());

  // Draining a stack yields the elements top-first...
  for (; !stk.empty(); stk.pop()) {
    top_first.push_back(stk.top());
  }

  // ...so hand them back in the opposite order.
  return std::vector<T>(top_first.rbegin(), top_first.rend());
}

// Find one cycle in an undirected graph using an iterative depth first
// search that starts at edges[0].
//
// edges: flat array of vertex pairs (e1, e2, e1, e2, ...)
// Ne = size of edges / 2
// Nv = number of vertices (max of edge values)
//
// Vertex ids are used directly as indices into the adjacency index and the
// visited table, so they must be non-negative.
//
// Returns: the vertices along the detected cycle, beginning at the first
//   occurrence of the vertex that closed the cycle and ending with that same
//   vertex again. An empty vector is returned if there are no edges, if the
//   search finishes without revisiting a vertex, or if the trimmed path has
//   fewer than three entries.
template <typename T>
std::vector<T> _find_cycle(const T* edges, const size_t Ne) {
  if (Ne == 0) {
    return std::vector<T>(0);
  }

  size_t Nv = max(edges, Ne * 2) + 1; // +1 to ensure zero is counted

  // Adjacency sets: index[v] holds the neighbors of vertex v.
  std::vector< ankerl::unordered_dense::set<T> > index(Nv);
  index.reserve(Nv);

  // NB: consolidate handles the trivial loops (e1 == e2)
  //     and deduplication of edges
  for (size_t i = 0; i < 2 * Ne; i += 2) {
    T e1 = edges[i];
    T e2 = edges[i+1];

    index[e1].insert(e2);
    index[e2].insert(e1);
  }

  T root = edges[0];
  T node = -1;
  T parent = -1;
  uint32_t depth = -1;

  // Three parallel stacks describe each pending DFS visit: the vertex, the
  // vertex it was reached from, and its depth. `path` holds the chain of
  // vertices from the root to the vertex currently being visited.
  std::stack<T> stack;
  std::stack<T> parents;
  std::stack<uint32_t> depth_stack;
  std::stack<T> path;

  stack.push(root);
  parents.push(-1);
  depth_stack.push(0);
  
  std::vector<bool> visited(Nv, false);

  while (!stack.empty()) {
    node = stack.top();
    parent = parents.top();
    depth = depth_stack.top();

    stack.pop();
    parents.pop();
    depth_stack.pop();

    // Backtrack: drop path entries that are deeper than the vertex we are
    // about to visit.
    while (path.size() > depth) {
      path.pop();
    }

    path.push(node);

    // Reaching an already visited vertex means a cycle was closed; `node`
    // and `path` are left as they are for the trimming step below.
    if (visited[node]) {
      break;
    }
    visited[node] = true;

    for (T child : index[node]) {
      // Do not walk straight back along the edge we arrived on.
      if (child == parent) {
        continue;
      }

      stack.push(child);
      parents.push(node);
      depth_stack.push(depth + 1);
    }
  }

  if (path.size() <= 1) {
    return std::vector<T>(0);
  }

  // Copy the path stack into a vector ordered from root to current vertex.
  std::vector<T> vec_path = stack2vec<T>(path);

  // Find start of loop. Since a cycle was detected,
  // the last node found started the cycle. We need
  // to trim the path leading up to that connection.
  // If no earlier occurrence exists (no cycle was found), i ends up at the
  // last index and the length check below returns an empty result.
  size_t i;
  for (i = 0; i < vec_path.size() - 1; i++) {
    if (vec_path[i] == node) {
      break;
    }
  }

  if (vec_path.size() - i < 3) {
    return std::vector<T>(0);
  }

  return std::vector<T>(vec_path.begin() + i, vec_path.end());
}

// Build the distance "supergraph" of a tree-shaped skeleton: for every pair
// of critical points that are joined by a run of non-critical vertices, the
// summed Euclidean length of that run.
//
// Had trouble returning an unordered_map< pair<int,int>, float>
// to python, so I decided to just pack two uint32s into a uint64
// and unpack them on the other side.
//
// vertices: Nv * 3 floats, (x, y, z) per vertex
// Nv: number of vertices
// edges: Ne * 2 vertex indices, (e1, e2) per edge
// Ne: number of edges
// start_node: vertex at which the depth first traversal starts
// critical_points_vec: vertex indices that delimit the supergraph edges
//
// Returns: map from packed key to distance. The smaller vertex index of the
//   pair is stored in the low 32 bits of the key and the larger one in the
//   high 32 bits.
//
// Throws std::runtime_error if start_node, an edge endpoint or a critical
// point is not a valid vertex index, or if the traversal reaches a vertex
// twice (i.e. the graph contains a cycle).
std::unordered_map<uint64_t, float> _create_distance_graph(
  float* vertices, size_t Nv, 
  uint32_t* edges, size_t Ne, uint32_t start_node,
  std::vector<int32_t> critical_points_vec
) {

  if (start_node >= Nv) {
    throw std::runtime_error(
      std::string("Start node out of range. Node: ") + std::to_string(start_node)
    );
  }

  // Adjacency list: tree[v] holds the neighbors of vertex v.
  std::vector< std::vector<uint32_t> > tree(Nv);

  std::vector<bool> critical_points(Nv, false);
  for (int32_t point : critical_points_vec) {
    if (point < 0 || static_cast<size_t>(point) >= Nv) {
      throw std::runtime_error(
        std::string("Critical point out of range. Node: ") + std::to_string(point)
      );
    }
    critical_points[point] = true;
  }

  for (size_t i = 0; i < Ne; i++) {
    uint32_t e1 = edges[2*i];
    uint32_t e2 = edges[2*i + 1];

    if (e1 >= Nv || e2 >= Nv) {
      throw std::runtime_error(
        std::string("Edge endpoint out of range. Edge: ") + std::to_string(i)
      );
    }

    tree[e1].push_back(e2);
    tree[e2].push_back(e1);
  }

  std::unordered_map<uint64_t, float> distgraph;

  // Four parallel stacks describe each pending visit: the vertex, the vertex
  // it was reached from, the distance accumulated since the last critical
  // point, and that last critical point ("root").
  std::stack<uint32_t> stack;
  std::stack<int32_t> parents;
  std::stack<float> dist_stack;
  std::stack<uint32_t> root_stack;

  stack.push(start_node);
  parents.push(-1);
  dist_stack.push(0.0);
  root_stack.push(start_node);

  uint32_t node, root;
  int32_t parent;
  float dist;

  uint64_t key = 0;

  std::vector<bool> visited(Nv, false);

  while (!stack.empty()) {
    node = stack.top();
    dist = dist_stack.top();
    root = root_stack.top();
    parent = parents.top();

    if (visited[node]) {
      throw std::runtime_error(std::string("Cycle detected. Node: ") + std::to_string(node));
    }
    visited[node] = true;

    stack.pop();
    dist_stack.pop();
    root_stack.pop();
    parents.pop();

    if (critical_points[node] && node != root) {
      // Pack (smaller, larger) as low/high 32 bits so that the key does not
      // depend on the direction of travel.
      key = (root < node)
        ? static_cast<uint64_t>(root) | (static_cast<uint64_t>(node) << 32)
        : static_cast<uint64_t>(node) | (static_cast<uint64_t>(root) << 32);

      distgraph[key] = dist;
      // Start measuring the next segment from this critical point.
      dist = 0.0;
      root = node;
    }

    for (uint32_t child : tree[node]) {
      // Do not walk straight back along the edge we arrived on.
      if (static_cast<int32_t>(child) == parent) {
        continue;
      }

      float dx = vertices[3*node + 0] - vertices[3*child + 0];
      float dy = vertices[3*node + 1] - vertices[3*child + 1];
      float dz = vertices[3*node + 2] - vertices[3*child + 2];

      dx *= dx;
      dy *= dy;
      dz *= dz;

      stack.push(child);
      parents.push(static_cast<int32_t>(node));
      dist_stack.push(
        dist + sqrt(dx + dy + dz)
      );
      root_stack.push(root);
    }
  }

  return distgraph;
}

// extracting skeletons from binary images produced by
// other thinning based skeletonization algorithms

// Fill `neighborhood` (13 entries) with the linear-index offsets from voxel
// (x, y, z) to the "backward" half of its neighbors in a volume laid out as
// loc = x + sx * y + sx * sy * z. An entry is 0 when that neighbor would fall
// outside the volume or is not part of the requested connectivity.
//
// Entries 0-2 are the face neighbors (always computed), 3-8 the edge
// neighbors (connectivity > 6) and 9-12 the corner neighbors
// (connectivity > 18). sz is accepted but not used.
inline void compute_neighborhood(
  int *neighborhood, 
  const int x, const int y, const int z,
  const uint64_t sx, const uint64_t sy, const uint64_t sz,
  const int connectivity = 26
) {

  const int row = static_cast<int>(sx);
  const int slab = sx * sy;

  // Single-axis steps; each is 0 when the step would leave the volume.
  const int right = (x < (row - 1)) ? 1 : 0;                        // +x
  const int left = (x > 0) ? -1 : 0;                                // -x
  const int next_row = (y < static_cast<int>(sy) - 1) ? row : 0;    // +y
  const int prev_row = (y > 0) ? -row : 0;                          // -y
  const int prev_slab = (z > 0) ? -slab : 0;                        // -z

  const bool with_edges = connectivity > 6;
  const bool with_corners = connectivity > 18;

  // faces
  neighborhood[0] = left;
  neighborhood[1] = prev_row;
  neighborhood[2] = prev_slab;

  // edges in the xy plane
  neighborhood[3] = (with_edges && left && prev_row) ? (left + prev_row) : 0;
  neighborhood[4] = (with_edges && right && prev_row) ? (right + prev_row) : 0;

  // edges in the xz plane
  neighborhood[5] = (with_edges && left && prev_slab) ? (left + prev_slab) : 0;
  neighborhood[6] = (with_edges && right && prev_slab) ? (right + prev_slab) : 0;

  // edges in the yz plane
  neighborhood[7] = (with_edges && prev_row && prev_slab) ? (prev_row + prev_slab) : 0;
  neighborhood[8] = (with_edges && next_row && prev_slab) ? (next_row + prev_slab) : 0;

  // corners of the previous z slab; only the y and z steps are checked, so
  // at an x border the x component contributes 0 to the offset
  neighborhood[9] = (with_corners && prev_row && prev_slab) ? (left + prev_row + prev_slab) : 0;
  neighborhood[10] = (with_corners && prev_row && prev_slab) ? (right + prev_row + prev_slab) : 0;
  neighborhood[11] = (with_corners && next_row && prev_slab) ? (left + next_row + prev_slab) : 0;
  neighborhood[12] = (with_corners && next_row && prev_slab) ? (right + next_row + prev_slab) : 0;
}

// Hash functor for pairs of 64-bit indices, usable as the hasher of an
// unordered container keyed by std::pair<uint64_t, uint64_t>.
struct pair_hash {
  inline std::size_t operator()(const std::pair<uint64_t,uint64_t> & v) const {
    // first * 31 + second, with the multiplication written as a shift and
    // a subtraction; unsigned wrap-around makes both forms identical.
    const uint64_t first_times_31 = (v.first << 5) - v.first;
    return first_times_31 + v.second; // arbitrary hash fn
  }
};

// Collect the set of edges between adjacent foreground voxels of a binary
// image. Voxels are identified by their linear index
// loc = x + sx * y + sx * sy * z and each edge is stored once as
// (smaller index, larger index).
//
// image: sx * sy * sz voxels; any non-zero value counts as foreground
// connectivity: forwarded to compute_neighborhood (6, 18 or 26)
std::unordered_set<std::pair<uint64_t, uint64_t>, pair_hash> 
_extract_edges_from_binary_image(
  const uint8_t* image,
  const uint64_t sx, const uint64_t sy, const uint64_t sz,
  const int connectivity = 26
) {

  std::unordered_set<std::pair<uint64_t, uint64_t>, pair_hash> edges;
  edges.reserve(sx * sy * sz / 100);

  int offsets[13];

  // The loops run in raster order, so the linear index simply advances by
  // one for every voxel visited.
  uint64_t loc = 0;
  for (uint64_t z = 0; z < sz; z++) {
    for (uint64_t y = 0; y < sy; y++) {
      for (uint64_t x = 0; x < sx; x++, loc++) {
        if (image[loc] == 0) {
          continue;
        }

        compute_neighborhood(offsets, x, y, z, sx, sy, sz, connectivity);

        for (const int delta : offsets) {
          // A zero offset marks a neighbor that is out of bounds or disabled.
          if (delta == 0) {
            continue;
          }

          const uint64_t neighbor = loc + delta;
          if (image[neighbor] == 0) {
            continue;
          }

          edges.emplace(std::min(loc, neighbor), std::max(loc, neighbor));
        }
      }
    }
  }

  return edges;
}

};

#endif


================================================
FILE: ext/skeletontricks/skeletontricks.pyx
================================================
# cython: language_level=3
"""
Certain operations have to be fast for the skeletonization
procedure. The ones that didn't fit elsewhere have a home here.

Author: William Silversmith
Affiliation: Seung Lab, Princeton Neuroscience Institute
Date: August 2018 - May 2024

*****************************************************************
This file is part of Kimimaro.

Kimimaro is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Kimimaro is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Kimimaro.  If not, see <https://www.gnu.org/licenses/>.
*****************************************************************
"""
cimport cython
from libc.stdlib cimport calloc, free
from libc.stdint cimport (
  int8_t, int16_t, int32_t, int64_t,
  uint8_t, uint16_t, uint32_t, uint64_t
)
from libcpp cimport bool
from cpython cimport array 
import array
import sys

from libcpp.vector cimport vector
from libcpp.unordered_map cimport unordered_map
from libcpp.unordered_set cimport unordered_set
from libcpp.utility cimport pair as cpp_pair

cimport numpy as cnp
import numpy as np

cnp.import_array()

from collections import defaultdict

# Positive infinity as a C float; compared against field values by the
# helpers in this module.
cdef float INFINITY = float('inf')

# Unsigned integer element types accepted by functions generic over labels.
ctypedef fused UINT:
  uint8_t
  uint16_t
  uint32_t
  uint64_t
  unsigned char

# Any supported integer element type: the signed types plus everything in UINT.
ctypedef fused INTEGER: 
  int8_t
  int16_t
  int32_t
  int64_t
  UINT

# C++ routine wrapped by roll_invalidation_ball_inside_component below.
# `sources` and `max_distances` are parallel vectors; a NULL
# voxel_connectivity_graph is passed when the caller supplies no graph.
cdef extern from "dijkstra_invalidation.hpp" namespace "dijkstra_invalidation":
  cdef int64_t _roll_invalidation_ball(
    uint8_t* field,
    uint64_t sx, uint64_t sy, uint64_t sz, 
    float wx, float wy, float wz, 
    vector[uint64_t] sources,
    vector[float] max_distances,
    int connectivity,
    uint32_t* voxel_connectivity_graph
  )

# Declarations of the C++ helpers implemented in skeletontricks.hpp.
cdef extern from "skeletontricks.hpp" namespace "skeletontricks":
  # Zeroes a box of labels around each path vertex; returns the number
  # of voxels that were erased.
  cdef size_t _roll_invalidation_cube(
    uint8_t* labels, float* DBF,
    int64_t sx, int64_t sy, int64_t sz,
    float wx, float wy, float wz,
    size_t* path, size_t path_size,
    float scale, float constant
  )

  # Depth first search for a cycle in a flat (e1, e2, e1, e2, ...) edge list.
  cdef vector[T] _find_cycle[T](T* edges, size_t Ne)
  
  # This function throws std::runtime_error (e.g. "Cycle detected").
  # `except +` makes Cython translate the C++ exception into a Python
  # exception instead of letting it terminate the interpreter.
  cdef unordered_map[ uint64_t, float ] _create_distance_graph(
    float* vertices, size_t Nv, 
    uint32_t* edges, size_t Ne, uint32_t start_node,
    vector[int32_t] critical_points_vec
  ) except +

  # Hash functor required to name the unordered_set type returned below.
  cdef struct pair_hash:
    size_t __call__(cpp_pair[uint64_t,uint64_t] v)
  cdef unordered_set[ cpp_pair[uint64_t, uint64_t], pair_hash ] _extract_edges_from_binary_image(
    uint8_t* image, 
    uint64_t sx, uint64_t sy, uint64_t sz,
    int connectivity
  )

def find_cycle(cnp.ndarray[int32_t, ndim=2] edges):
  """
  Given a graph of edges that are a single connected component,
  find a cycle via depth first search.

  edges: (N, 2) int32 array of vertex index pairs. Vertex indices
    are used as array offsets by the C++ search and must be
    non-negative.

  Returns: 1D int32 array of the vertices along a cycle, starting and
    ending with the same vertex (empty array if no cycle is found)
  """
  # Use the same dtype as the non-empty result so callers always
  # receive an int32 array.
  if edges.size == 0:
    return np.zeros((0,), dtype=np.int32)

  # The C++ routine reads the edge list as one flat, row-major buffer.
  edges = np.ascontiguousarray(edges)

  cdef cnp.ndarray[int32_t, ndim=1] elist = np.array(
    _find_cycle[int32_t](
      <int32_t*>&edges[0,0], <size_t>(edges.size // 2)
    ),
    dtype=np.int32
  )
  return elist

def create_distance_graph(skeleton):
  """
  Creates the distance "supergraph" from a single connected component 
  skeleton as described in _remove_ticks.

  skeleton: object exposing `vertices` (2D float32 array) and
    `edges` (2D uint32 array of vertex index pairs).

  Critical points are the vertices that appear in exactly one edge
  (terminals) or in three or more edges (branch points). The traversal
  starts at the first terminal vertex, so a skeleton without any
  terminal vertex raises an IndexError here.

  Returns: a distance "supergraph" describing the physical distance
    between the critical points in the skeleton's structure.

  Example skeleton with output:

      60nm   60nm   60nm     
    1------2------3------4
      30nm |  70nm \
           5        ----6

  { 
    (1,2): 60,  
    (2,3): 60,
    (2,5): 30,
    (3,4): 60,
    (3,6): 70,
  }

  Note: each key is unpacked as (upper 32 bits, lower 32 bits) of the
  packed C++ key. The C++ side stores the smaller vertex index in the
  lower bits, so the tuples actually come out as (larger, smaller);
  the example above lists them smaller-first for readability.
  """
  cdef cnp.ndarray[float, ndim=2] vertices = skeleton.vertices
  cdef cnp.ndarray[uint32_t, ndim=2] edges = skeleton.edges

  # Vertex degree = number of times the vertex appears in the edge list.
  unique_nodes, unique_counts = np.unique(edges, return_counts=True)
  terminal_nodes = unique_nodes[ unique_counts == 1 ]
  branch_nodes = set(unique_nodes[ unique_counts >= 3 ])
  
  critical_points = set(terminal_nodes)
  critical_points.update(branch_nodes)

  res = _create_distance_graph(
    <float*>&vertices[0,0], vertices.shape[0],
    <uint32_t*>&edges[0,0], edges.shape[0], terminal_nodes[0],
    list(critical_points)
  )
  # The C++ unordered_map is converted to a Python dict on assignment.
  cdef dict supergraph = res

  cdef dict real_supergraph = {}
  cdef uint64_t key = 0
  cdef int32_t e1, e2

  # Split every packed 64-bit key back into its two vertex indices.
  for key in supergraph.keys():
    e2 = <int32_t>(key & 0xffffffff)
    e1 = <int32_t>(key >> 32)
    real_supergraph[ (e1, e2) ] = supergraph[key]

  return real_supergraph


@cython.boundscheck(False)
@cython.wraparound(False)  # no negative indexing is used in this function
@cython.nonecheck(False)
def inf2zero(cnp.ndarray[float, cast=True, ndim=3] field):
  """
  inf2zero(cnp.ndarray[float, cast=True, ndim=3] field)

  Overwrite every element equal to positive infinity with 0. The
  array is modified in place; all other values are left untouched.

  Returns: field
  """
  cdef size_t nx = field.shape[0]
  cdef size_t ny = field.shape[1]
  cdef size_t nz = field.shape[2]
  cdef size_t i, j, k

  for k in range(nz):
    for j in range(ny):
      for i in range(nx):
        if field[i, j, k] == INFINITY:
          field[i, j, k] = 0

  return field

@cython.boundscheck(False)
@cython.wraparound(False)  # no negative indexing is used in this function
@cython.nonecheck(False)
def zero2inf(cnp.ndarray[float, cast=True, ndim=3] field):
  """
  zero2inf(cnp.ndarray[float, cast=True, ndim=3] field)

  Overwrite every element equal to zero with positive infinity. The
  array is modified in place; all other values are left untouched.

  Returns: field
  """
  cdef size_t nx = field.shape[0]
  cdef size_t ny = field.shape[1]
  cdef size_t nz = field.shape[2]
  cdef size_t i, j, k

  for k in range(nz):
    for j in range(ny):
      for i in range(nx):
        if field[i, j, k] == 0:
          field[i, j, k] = INFINITY

  return field

@cython.boundscheck(False)
@cython.wraparound(False)  # no negative indexing is used in this function
@cython.nonecheck(False)
def zero_out_all_except(cnp.ndarray[INTEGER, cast=True, ndim=3] field, INTEGER leave_alone):
  """
  zero_out_all_except(cnp.ndarray[INTEGER, cast=True, ndim=3] field, INTEGER leave_alone)

  Set every element of `field` that differs from `leave_alone` to
  zero. The array is modified in place.

  Returns: field
  """
  cdef size_t nx = field.shape[0]
  cdef size_t ny = field.shape[1]
  cdef size_t nz = field.shape[2]
  cdef size_t i, j, k

  for k in range(nz):
    for j in range(ny):
      for i in range(nx):
        if field[i, j, k] != leave_alone:
          field[i, j, k] = 0

  return field

@cython.boundscheck(False)
@cython.wraparound(False)  # no negative indexing is used in this function
@cython.nonecheck(False)
def finite_max(cnp.ndarray[float, cast=True, ndim=3] field):
  """
  float finite_max(cnp.ndarray[float, cast=True, ndim=3] field)

  Scan a float field that may contain infinities and report the
  largest value that is strictly below +infinity. If no element
  qualifies, -infinity is returned.
  """
  cdef size_t nx = field.shape[0]
  cdef size_t ny = field.shape[1]
  cdef size_t nz = field.shape[2]
  cdef size_t i, j, k

  cdef float best = -INFINITY
  cdef float value
  for k in range(nz):
    for j in range(ny):
      for i in range(nx):
        value = field[i, j, k]
        if value > best and value < +INFINITY:
          best = value

  return best

@cython.boundscheck(False)
@cython.wraparound(False)  # turn off negative index wrapping for entire function
@cython.nonecheck(False)
def finite_min(cnp.ndarray[float, cast=True, ndim=3] field):
  """
  float finite_min(cnp.ndarray[float, cast=True, ndim=3] field)

  Given a field of floats that may include infinities, find the 
  minimum finite value.

  Returns: the smallest value strictly greater than -infinity, or
    +infinity if the field contains no such value.
  """
  cdef size_t sx, sy, sz 
  cdef size_t  x,  y,  z

  sx = field.shape[0]
  sy = field.shape[1]
  sz = field.shape[2]

  # Start from +infinity so that any finite value replaces it. Starting
  # from -infinity would make the comparison below impossible to satisfy.
  cdef float minimum = +INFINITY
  for z in range(0, sz):
    for y in range(0, sy):
      for x in range(0, sx):
        if (field[x,y,z] < minimum) and (field[x,y,z] > -INFINITY):
          minimum = field[x,y,z]

  return minimum

@cython.boundscheck(False)
@cython.wraparound(False)  # turn off negative index wrapping for entire function
@cython.nonecheck(False)
def first_label(cnp.ndarray[uint8_t, cast=True, ndim=3] labels):
  """
  first_label(cnp.ndarray[uint8_t, cast=True, ndim=3] labels)

  Scan through labels to find the first non-zero voxel and return
  its position. The scan varies x fastest, then y, then z.

  Returns: (x, y, z) of the first non-zero voxel, or None if every
    voxel is zero.
  """
  cdef size_t sx, sy, sz 
  cdef size_t  x,  y,  z

  sx = labels.shape[0]
  sy = labels.shape[1]
  sz = labels.shape[2]

  for z in range(0, sz):
    for y in range(0, sy):
      for x in range(0, sx):
        if labels[x,y,z]:
          return (x,y,z)

  return None

@cython.boundscheck(False)
@cython.wraparound(False)  # no negative indexing is used in this function
@cython.nonecheck(False)
def find_target(
    cnp.ndarray[uint8_t, cast=True, ndim=3] labels, 
    cnp.ndarray[float, ndim=3] PDRF
  ):
  """
  find_target(ndarray[uint8_t, cast=True, ndim=3] labels, ndarray[float, ndim=3] PDRF)

  Among the voxels where `labels` is non-zero, locate the one with
  the largest PDRF value. Ties go to the voxel encountered first when
  scanning with z varying fastest, then y, then x.

  Returns: (x, y, z), or (-1, -1, -1) if no labeled voxel has a
    PDRF value above -infinity.
  """
  cdef size_t nx = labels.shape[0]
  cdef size_t ny = labels.shape[1]
  cdef size_t nz = labels.shape[2]
  cdef size_t i, j, k

  cdef int64_t best_x = -1
  cdef int64_t best_y = -1
  cdef int64_t best_z = -1

  cdef float best = -INFINITY
  cdef float value
  for i in range(nx):
    for j in range(ny):
      for k in range(nz):
        # Only look up PDRF for voxels that are part of the labeled region.
        if not labels[i, j, k]:
          continue

        value = PDRF[i, j, k]
        if value > best:
          best = value
          best_x = i
          best_y = j
          best_z = k

  return (best_x, best_y, best_z)

@cython.boundscheck(False)
@cython.wraparound(False)  # turn off negative index wrapping for entire function
@cython.nonecheck(False)
@cython.binding(True)
def roll_invalidation_ball_inside_component(
    cnp.ndarray[uint8_t, cast=True, ndim=3] labels, 
    cnp.ndarray[float, ndim=3] DBF, 
    float scale, 
    float constant,
    anisotropy,
    path,
    voxel_connectivity_graph = None,
    connectivity = 26,
):
  """
  Invalidate voxels of `labels` around a path by delegating to the
  C++ routine _roll_invalidation_ball.

  For each (x, y, z) vertex in `path` a maximum distance of
  scale * DBF[x,y,z] + constant is computed and the vertex is
  converted to the linear index x + sx * y + sx * sy * z. Both lists
  are handed to the C++ routine together with the anisotropy, the
  connectivity and, if given as a numpy array, the voxel connectivity
  graph.

  labels: binary image passed to the C++ routine by pointer
  DBF: float field used to derive the per-vertex maximum distance
  anisotropy: (wx, wy, wz)
  path: iterable of (x, y, z) coordinates

  Returns: (invalidated, labels) where `invalidated` is the value
    returned by the C++ routine.
  """
  cdef int64_t sx, sy, sz 
  sx = labels.shape[0]
  sy = labels.shape[1]
  sz = labels.shape[2]

  cdef size_t sxy = sx * sy

  cdef float wx, wy, wz
  (wx, wy, wz) = anisotropy

  # One maximum distance per path vertex, in the same order as `path`.
  max_distances = [ 
    (scale * DBF[x,y,z] + constant) for (x,y,z) in path 
  ]

  # Convert coordinates to linear indices. NOTE(review): the
  # `if tuple(coord)` filter only drops empty coordinates, which would
  # already have failed the unpacking above - confirm whether it is needed.
  path = [ 
    coord[0] + sx * coord[1] + sxy * coord[2] 
    for coord in path if tuple(coord)
  ]

  # Stays NULL unless a numpy array was supplied.
  cdef uint32_t* vcg = NULL
  cdef cnp.ndarray[uint32_t, ndim=3] vcg_arr

  if isinstance(voxel_connectivity_graph, np.ndarray):
    vcg_arr = voxel_connectivity_graph
    vcg = <uint32_t*>&vcg_arr[0,0,0]

  invalidated = _roll_invalidation_ball(
    <uint8_t*>&labels[0,0,0],
    sx, sy, sz, 
    wx, wy, wz,
    path, max_distances,
    connectivity, 
    vcg
  )

  return (invalidated, labels)

@cython.boundscheck(False)
@cython.wraparound(False)  # turn off negative index wrapping for entire function
@cython.nonecheck(False)
@cython.binding(True)
def roll_invalidation_ball(
    cnp.ndarray[uint8_t, cast=True, ndim=3] labels, 
    cnp.ndarray[float, ndim=3] DBF, 
    path, float scale, float const,
    anisotropy=(1,1,1),
    invalid_vertices={},
  ):
  """
  Given an anisotropic binary image, its distance transform, and a path 
  traversing the binary image, erase the voxels surrounding the path
  in a sphere around each vertex on the path corresponding to the 
  equation: 

  r = scale * DBF[x,y,z] + const

  labels: binary image, modified in place
  DBF: distance transform of the image (physical units)
  path: iterable of (x, y, z) voxel coordinates
  anisotropy: (wx, wy, wz) physical size of a voxel along each axis
  invalid_vertices: container of (x, y, z) tuples to skip; it is only
    used for membership tests and never modified

  Returns: (number of voxels erased, modified labels)
  """
  cdef int64_t sx, sy, sz 
  sx = labels.shape[0]
  sy = labels.shape[1]
  sz = labels.shape[2]

  cdef float wx, wy, wz
  (wx, wy, wz) = anisotropy
    
  cdef float radius, dist
  cdef int64_t minx, maxx, miny, maxy, minz, maxz

  cdef int64_t x,y,z
  cdef int64_t x0, y0, z0

  cdef size_t invalidated = 0

  for coord in path:
    if tuple(coord) in invalid_vertices:
      continue

    (x0, y0, z0) = coord
    radius = DBF[x0,y0,z0] * scale + const # physical units (e.g. nm)

    # A negative radius describes an empty sphere. Skip it explicitly,
    # since squaring the radius below would otherwise make it positive.
    if radius < 0:
      continue

    # Bounding box of the sphere in voxel units, clamped to the volume.
    # The max* values are used as exclusive range() ends, so they are one
    # past the last voxel of the box; without the +1 the voxels on the
    # positive side of the sphere would never be visited.
    minx = max(0,  <int64_t>(0.5 + (x0 - (radius / wx))))
    maxx = min(sx, <int64_t>(0.5 + (x0 + (radius / wx))) + 1)
    miny = max(0,  <int64_t>(0.5 + (y0 - (radius / wy))))
    maxy = min(sy, <int64_t>(0.5 + (y0 + (radius / wy))) + 1)
    minz = max(0,  <int64_t>(0.5 + (z0 - (radius / wz))))
    maxz = min(sz, <int64_t>(0.5 + (z0 + (radius / wz))) + 1)

    # Compare squared distances to avoid a square root per voxel.
    radius *= radius 

    for x in range(minx, maxx):
      for y in range(miny, maxy):
        for z in range(minz, maxz):
          if not labels[x,y,z]:
            continue 

          dist = (wx * (x - x0)) ** 2 + (wy * (y - y0)) ** 2 + (wz * (z - z0)) ** 2
          if dist <= radius:
            invalidated += 1
            labels[x,y,z] = 0

  return invalidated, labels

@cython.boundscheck(False)
@cython.wraparound(False)  # turn off negative index wrapping for entire function
@cython.nonecheck(False)
@cython.binding(True)
def get_mapping(
    cnp.ndarray[INTEGER, ndim=3] orig_labels, 
    cnp.ndarray[UINT, ndim=3] cc_labels
  ):
  """
  Build the inverse lookup from connected component label
  to the original label it was derived from.

  orig_labels: 3D image of (possibly disconnected) labels.
  cc_labels: 3D image of connected component labels, indexed
    with the extents of orig_labels (bounds are not checked).

  Returns: { $CC_LABEL: $ORIGINAL_LABEL }
  """
  cdef size_t nx = orig_labels.shape[0]
  cdef size_t ny = orig_labels.shape[1]
  cdef size_t nz = orig_labels.shape[2]

  cdef size_t i, j, k
  cdef UINT current
  cdef UINT previous

  mapping = {}

  if orig_labels.size == 0:
    return mapping

  previous = cc_labels[0,0,0]
  mapping[previous] = orig_labels[0,0,0]

  # Only touch the dictionary when the component label differs
  # from the one seen on the previous voxel.
  for k in range(nz):
    for j in range(ny):
      for i in range(nx):
        current = cc_labels[i,j,k]
        if current != previous:
          mapping[current] = orig_labels[i,j,k]
          previous = current

  return mapping

@cython.binding(True)
def compute_centroids(
    cnp.ndarray[UINT, ndim=2] labels,
    float wx, float wy
  ):
  """
  Compute the centroid for every label on a 2D image at once.

  labels: 2D label image. Label 0 is skipped. Label values are used
    directly as indices into accumulators of length labels.size, so
    they must be smaller than labels.size.
  wx, wy: anisotropy weights for the x and y axes.

  Returns: { $segid: (x, y), ... } with x and y as integer
    voxel coordinates.
  """

  # Coordinate sums are accumulated in float64. Sums of integer
  # coordinates stay exact far beyond the 2^24 limit of float32,
  # so the result does not depend on accumulation order for large
  # labels (e.g. one label filling a 512 x 512 plane).
  cdef double[:] xsum = np.zeros( (labels.size,), dtype=np.float64)
  cdef double[:] ysum = np.zeros( (labels.size,), dtype=np.float64)
  cdef uint32_t[:] labelct = np.zeros( (labels.size,), dtype=np.uint32)

  cdef size_t sx, sy
  sx = labels.shape[0]
  sy = labels.shape[1]

  cdef size_t x, y
  cdef uint32_t label = 0

  for x in range(sx):
    for y in range(sy):
      label = labels[x,y]
      if label == 0:
        continue

      xsum[label] += x 
      ysum[label] += y 
      labelct[label] += 1

  result = {}

  # center of the image in weighted units
  cdef float cx = wx * sx / 2
  cdef float cy = wy * sy / 2

  cdef float px, py

  for label in range(labels.size):
    if labelct[label] == 0:
      continue

    px = wx * <float>xsum[label] / <float>labelct[label]
    py = wy * <float>ysum[label] / <float>labelct[label]

    # Since we don't know which coordinate frame we 
    # are using, round toward the center of the image
    # to ensure we get the same pixel every time.
    if px - cx >= 0:
      px = px # will be truncated towards center
    else:
      px = px + wx

    if py - cy >= 0:
      py = py # will be truncated towards center
    else:
      py = py + wy

    # back to voxel units; the cast truncates
    result[label] = (<int>(px / wx), <int>(py / wy))

  return result

@cython.binding(True)
def find_border_targets(
    cnp.ndarray[float, ndim=2] dt,
    cnp.ndarray[UINT, ndim=2] cc_labels,
    float wx, float wy
  ):
  """
  Given a set of connected components that lie within 
  a plane and their distance transform, return a map of
  label ID to the coordinate of its maximum distance 
  transform value. When several voxels of a label share
  the maximum, compute_tiebreaker_maxima chooses between
  them using criteria intended to be independent of the
  coordinate frame, so that rotated and mirrored frames
  do not need special handling.

  The purpose of this function is to fix the edge effect
  the standard TEASAR algorithm generates and ensure that
  we can trivially join skeletons from adjacent chunks.  

  Rotating the (x,y) pairs into their appropriate frame
  is performed in the function that calls this one.

  dt: 2D distance transform of the plane.
  cc_labels: 2D connected component labels; 0 is ignored.
  wx, wy: anisotropy weights for the x and y axes.

  Returns: { $SEGID: (x, y), ... }
  """
  cdef size_t nx = dt.shape[0]
  cdef size_t ny = dt.shape[1]

  cdef size_t i, j
  cdef UINT segid = 0
  cdef float value
  cdef float prev_x, prev_y
  cdef float cent_x, cent_y

  best = defaultdict(float) # largest dt value seen so far per label
  targets = {}              # coordinate holding that value per label

  cdef dict centroids = compute_centroids(cc_labels, wx, wy)

  for j in range(ny):
    for i in range(nx):
      segid = cc_labels[i,j]
      if segid == 0:
        continue

      value = dt[i,j]
      if value == 0:
        continue

      if value > best[segid]:
        best[segid] = value
        targets[segid] = (i,j)
      elif best[segid] == value:
        # Same maximum found again: let the tiebreaker pick
        # between the stored coordinate and this one.
        prev_x, prev_y = targets[segid]
        cent_x, cent_y = centroids[segid]
        targets[segid] = compute_tiebreaker_maxima(
          prev_x, prev_y, i, j, 
          cent_x, cent_y,
          nx, ny, wx, wy
        )

  return targets

def compute_tiebreaker_maxima(
    float px, float py, 
    float x, float y, 
    float centx, float centy,
    float sx, float sy,
    float wx, float wy
  ):
  """
  compute_tiebreaker_maxima(
    float px, float py, 
    float x, float y, 
    float centx, float centy,
    float sx, float sy,
    float wx, float wy
  )

  This function breaks ties for `find_border_targets`.

  (px,py): A previously found distance transform maximum.
  (x,y): The coordinate of the newly found maximum.
  (sx,sy): The length and width of the image plane.
  (wx,wy): Weighting for anisotropy.
  (centx, centy): The centroid of the current label.

  All coordinates are in voxel units; the anisotropy weights
  are applied to coordinate differences inside the distance
  helpers.

  We use the following topological criteria to achieve
  a coordinate frame-free voxel selection. We pick
  the result of the first criterion that is satisfied.

  1) Pick the voxel closest to the centroid of the label.
  2) The voxel closest to the center of the plane.
  3) Closest to a corner of the plane.
  4) Closest to an edge of the plane.
  5) The previous maximum.

  The worst case would be an annulus drawn around the center,
  which would result in four equally eligible pixels....

  Hopefully this won't happen too often...

  Returns: (x, y) if the new maximum wins, otherwise (px, py).
    The values are returned as floats.
  """
  # Center of the plane in voxel units. It is compared against
  # voxel coordinates by distsq, which applies (wx, wy) to the
  # differences itself, so it must not be pre-multiplied by the
  # anisotropy.
  cdef float cx = sx / 2.0
  cdef float cy = sy / 2.0

  cdef float dist1 = distsq(px,py, centx,centy, wx,wy)
  cdef float dist2 = distsq( x, y, centx,centy, wx,wy)

  # Each criterion is only consulted when every earlier
  # one resulted in an exact tie.
  if dist2 < dist1:
    return (x, y)
  elif dist1 == dist2:
    dist1 = distsq(px,py, cx,cy, wx,wy)
    dist2 = distsq( x, y, cx,cy, wx,wy)
    if dist2 < dist1:
      return (x,y)
    elif dist1 == dist2:
      dist1 = cornerness(px, py, sx, sy, wx,wy)
      dist2 = cornerness( x,  y, sx, sy, wx,wy)
      if dist2 < dist1:
        return (x, y)
      elif dist1 == dist2:
        dist1 = edgeness(px, py, sx, sy, wx,wy)
        dist2 = edgeness( x,  y, sx, sy, wx,wy)
        if dist2 < dist1:
          return (x, y)

  return (px, py)

cdef float edgeness(
    float x, float y, float sx, float sy,
    float wx, float wy
  ):
  """
  float edgeness(
      float x, float y, float sx, float sy,
      float wx, float wy
  )

  Nearness of (x,y) to the edge of an image of size (sx,sy).

  Voxel centers sit on integer coordinates, so the image spans
  [-0.5, sx - 0.5] x [-0.5, sy - 0.5] (the same convention used 
  by cornerness). The distance to the low edges is therefore 
  x + 0.5 and y + 0.5, which makes the result identical for a 
  voxel and its mirror image.

  Returns: the smallest anisotropy weighted distance from (x,y)
    to any of the four edges.
  """
  return min(
    wx * (x + 0.5),
    wx * (sx - 0.5 - x),
    wy * (y + 0.5),
    wy * (sy - 0.5 - y)
  )

cdef float cornerness(
    float x, float y, float sx, float sy,
    float wx, float wy
  ):
  """
  float cornerness(
      float x, float y, float sx, float sy,
      float wx, float wy
  )

  Nearness of (x,y) to a corner of an image of size (sx,sy).

  The four corners are taken at the outer voxel boundaries:
  (-0.5, -0.5), (sx-0.5, -0.5), (sx-0.5, sy-0.5), (-0.5, sy-0.5).

  Returns: the smallest anisotropy weighted squared distance
    from (x,y) to any of the four corners.
  """
  return min( 
    distsq(x,y,-0.5,-0.5, wx, wy), 
    distsq(x,y,sx-0.5,-0.5, wx, wy),
    distsq(x,y,sx-0.5,sy-0.5, wx, wy),
    distsq(x,y,-0.5,sy-0.5, wx, wy) # y extent is sy, not sx
  )

cdef float distsq(
    float p1x, float p1y, 
    float p2x, float p2y, 
    float wx, float wy
  ):
  """
  Squared distance between (p1x,p1y) and (p2x,p2y) after 
  scaling the x and y offsets by the weights wx and wy.
  """
  cdef float dx = wx * (p1x - p2x)
  cdef float dy = wy * (p1y - p2y)
  return dx * dx + dy * dy 

@cython.boundscheck(False)
@cython.wraparound(False)  # turn off negative index wrapping for entire function
@cython.nonecheck(False)
@cython.binding(True)
def roll_invalidation_cube(
    cnp.ndarray[uint8_t, cast=True, ndim=3] labels, 
    cnp.ndarray[float, ndim=3] DBF, 
    path, float scale, float const,
    anisotropy=(1,1,1),
    invalid_vertices={},
  ):
  """
  Erase the voxels of an anisotropic binary image that surround
  a path, using a cube around each path vertex instead of the 
  sphere used by `roll_invalidation_ball`. Unlike that function,
  this one runs in time linear in the number of image pixels.

  labels: 3D binary image handed to the C++ routine.
  DBF: 3D float32 array indexed like labels.
  path: iterable of (x, y, z) voxel coordinates.
  scale, const: forwarded unchanged to the C++ routine.
  anisotropy: (wx, wy, wz) weights.
  invalid_vertices: container of (x, y, z) tuples; path vertices
    found in it are left out. It is only read, never modified.

  Returns: (count reported by the C++ routine, labels)
  """
  cdef int64_t sx = labels.shape[0]
  cdef int64_t sy = labels.shape[1]
  cdef int64_t sz = labels.shape[2]

  cdef size_t sxy = sx * sy

  cdef float wx, wy, wz
  (wx, wy, wz) = anisotropy

  # Flatten each retained vertex into a linear index with
  # x varying fastest: x + sx * (y + sy * z).
  linear = []
  for coord in path:
    if tuple(coord) in invalid_vertices:
      continue
    linear.append(coord[0] + sx * coord[1] + sxy * coord[2])

  linear = np.array(linear, dtype=np.uintp)

  cdef size_t[:] pathview = linear

  cdef size_t invalidated = _roll_invalidation_cube(
    <uint8_t*>&labels[0,0,0], <float*>&DBF[0,0,0],
    sx, sy, sz, 
    wx, wy, wz,
    <size_t*>&pathview[0], linear.size,
    scale, const
  )

  return invalidated, labels

@cython.boundscheck(False)
@cython.wraparound(False)  # turn off negative index wrapping for entire function
@cython.nonecheck(False)
def find_cycle_cython(cnp.ndarray[int32_t, ndim=2] edges):
  """
  Given a graph of edges that are a single connected component,
  find a cycle via depth first search.

  edges: int32 array whose rows are (vertex, vertex) pairs of an
    undirected graph. The search starts from edges[0,0].

  Returns: int32 array of the vertices visited along a cycle, with
    the vertex that closes the cycle appearing both first and last
    (empty array if no cycle is found)
  """
  # adjacency sets and visited flags keyed by vertex id
  index = defaultdict(set)
  visited = defaultdict(int)

  if edges.size == 0:
    return np.array([], dtype=np.int32)

  for e1, e2 in edges:
    index[e1].add(e2)
    index[e2].add(e1)

  cdef int root = edges[0,0]
  cdef int node = -1
  cdef int child = -1
  cdef int parent = -1
  cdef int depth = -1
  cdef int i = 0

  # Three parallel stacks: the vertex to visit, the vertex it was
  # reached from, and its depth below the root.
  cdef list stack = [root]
  cdef list parents = [-1]
  cdef list depth_stack = [0]
  cdef list path = []

  while stack:
    node = stack.pop()
    parent = parents.pop()
    depth = depth_stack.pop()

    # Trim the current path back to the depth of this vertex
    # so that it holds only the ancestors of `node`.
    for i in range(len(path) - depth):
      path.pop()

    path.append(node)

    # Reaching an already visited vertex closes a cycle.
    if visited[node] == 1:
      break

    visited[node] = 1

    for child in index[node]:
      if child != parent: # don't walk straight back along the same edge
        stack.append(child)
        parents.append(node)
        depth_stack.append(depth + 1)

  if len(path) <= 1:
    return np.array([], dtype=np.int32)
  
  # Locate the earlier occurrence of the closing vertex. If there
  # is none, i ends on the second to last index, which leaves a 
  # two element path that is rejected below.
  for i in range(len(path) - 1):
    if path[i] == node:
      break

  path = path[i:]

  if len(path) < 3:
    return np.array([], dtype=np.int32)

  return np.array(path, dtype=np.int32)

def find_avocado_fruit(
  cnp.ndarray[INTEGER, ndim=3] labels, 
  size_t cx, size_t cy, size_t cz,
  INTEGER background = 0
):
  """
  Tests to see if the current coordinate is inside 
  the nucleus of a somata that has been assigned
  to a separate label from the rest of the cell.

  Starting at (cx, cy, cz), the image is scanned along the six
  axis directions. Each scan stops at the first background voxel
  (recording nothing) or at the first voxel carrying a different
  label (recording that label). If fewer than three scans recorded
  a label, or the recorded labels disagree too much, the starting
  label is reported as its own fruit.

  labels: 3D label image.
  (cx, cy, cz): voxel to test; must lie inside labels.
  background: label value treated as empty space.

  Raises: ValueError if (cx, cy, cz) is outside of labels.

  Returns: (pit, fruit)
  """
  cdef size_t sx, sy, sz
  sx, sy, sz = labels.shape[:3]

  if cx >= sx or cy >= sy or cz >= sz:
    raise ValueError(
      "<{},{},{}> must be be contained within shape <{},{},{}>".format(
        cx,cy,cz,sx,sy,sz
    ))

  cdef size_t x, y, z 
  # Signed index for the descending scans: an unsigned loop 
  # variable cannot count down through 0, and stopping at 1 
  # would leave the voxels at index 0 unexamined.
  cdef int64_t i
  cdef INTEGER label = labels[cx, cy, cz]
  cdef list changes = [ None ] * 6

  for x in range(cx, sx):
    if labels[x,cy,cz] == background:
      break
    elif labels[x,cy,cz] != label:
      changes[0] = labels[x,cy,cz]
      break

  for i in range(<int64_t>cx, -1, -1):
    if labels[i,cy,cz] == background:
      break
    elif labels[i,cy,cz] != label:
      changes[1] = labels[i,cy,cz]
      break

  for y in range(cy, sy):
    if labels[cx,y,cz] == background:
      break
    if labels[cx,y,cz] != label:
      changes[2] = labels[cx,y,cz]
      break

  for i in range(<int64_t>cy, -1, -1):
    if labels[cx,i,cz] == background:
      break
    if labels[cx,i,cz] != label:
      changes[3] = labels[cx,i,cz]
      break

  for z in range(cz, sz):
    if labels[cx,cy,z] == background:
      break
    if labels[cx,cy,z] != label:
      changes[4] = labels[cx,cy,z]
      break

  for i in range(<int64_t>cz, -1, -1):
    if labels[cx,cy,i] == background:
      break
    if labels[cx,cy,i] != label:
      changes[5] = labels[cx,cy,i]
      break

  changes = [ _ for _ in changes if _ is not None ]

  # Too little info to make a decision
  if len(changes) < 3:
    return (label, label)

  if len(changes) > 3: # if more than 3, allow one non-match
    allowed_differences = 1
  else: # allow no non-matches (we're in a corner)
    allowed_differences = 0

  # the most frequently encountered label is the candidate fruit
  uniq, cts = np.unique(changes, return_counts=True)
  candidate_fruit_index = np.argmax(cts)
  differences = len(changes) - cts[candidate_fruit_index]

  # it's not an avocado if there's lots of
  # labels surrounding the candidate "pit"
  if differences > allowed_differences:
    return (label, label)
  
  return (label, uniq[candidate_fruit_index])

class CachedTargetFinder:
  def __init__(self, mask: np.ndarray, daf: np.ndarray):
    """
    Precompute the flat (Fortran order) indices of the non-zero
    voxels of mask, ordered by descending DAF value, so that 
    repeatedly finding the current maximum becomes very fast.
    """
    foreground = np.flatnonzero(mask.ravel(order='F'))
    # use the narrower index type when every index fits in it
    if mask.size < np.iinfo(np.uint32).max:
      foreground = foreground.astype(np.uint32, copy=False)
    ranking = np.argsort(daf.ravel(order='F')[foreground])[::-1]
    self.daf_indices = foreground[ranking]

  def find_target(self, mask: np.ndarray):
    """
    Locate the voxel with the largest cached map value
    that is still non-zero in mask.

    Returns: (x, y, z), or None if no cached voxel remains set
    """
    position = self.first_label_indexed(
      mask.ravel(order='F'), self.daf_indices
    )
    if position is None:
      self.daf_indices = self.daf_indices[:0]  # nothing left to search
      return None

    # Every cached index before `position` is now zero in the mask.
    # That is assumed to be permanent, so those entries are dropped 
    # and never scanned again.
    self.daf_indices = self.daf_indices[position:]

    return np.unravel_index(self.daf_indices[0], mask.shape, order='F')

  @cython.boundscheck(False)
  @cython.wraparound(False)  # turn off negative index wrapping for entire function
  @cython.nonecheck(False)
  def first_label_indexed(self, uint8_t[:] labels not None, INTEGER[:] indices not None):
    """
    Returns: first i for which labels[indices[i]] is non-zero,
      or None if there is no such i.
    """
    cdef size_t count = indices.size
    cdef size_t pos = 0

    while pos < count:
      if labels[indices[pos]]:
        return pos
      pos += 1

    return None  

def extract_edges_from_binary_image(uint8_t[:,:,:] binimg, int connectivity = 26):
  """
  Convert a 3D binary image into a graph of its foreground voxels.

  The edges are computed by the C++ routine 
  _extract_edges_from_binary_image as pairs of linear voxel
  indices (x + sx * (y + sy * z)). Each distinct index is then
  given a consecutive vertex number in order of first appearance.

  binimg: 3D uint8 image; it is converted to Fortran order
    before being handed to the C++ routine.
  connectivity: forwarded unchanged to the C++ routine.

  Returns: (vertices, edges)
    vertices: uint32 array of shape (N, 3) holding (x, y, z)
    edges: uint32 array of shape (M, 2) indexing into vertices

    Both keep their two dimensional shape when there are no
    edges, i.e. (0, 3) and (0, 2).
  """
  cdef uint64_t sx, sy, sz
  sx, sy, sz = tuple(binimg.shape)[:3]

  cdef uint64_t sxy = sx * sy
  cdef uint64_t loc, x, y, z
  cdef int64_t i = 0

  # An image without voxels has no element to take the address of.
  if sx == 0 or sy == 0 or sz == 0:
    return (
      np.zeros((0, 3), dtype=np.uint32), 
      np.zeros((0, 2), dtype=np.uint32)
    )

  binimg = np.asfortranarray(binimg)
  cdef unordered_set[cpp_pair[uint64_t,uint64_t], pair_hash] edges = _extract_edges_from_binary_image(
    &binimg[0,0,0], 
    sx, sy, sz, 
    connectivity
  )

  # Number the vertices in order of first appearance and decode
  # their coordinates at the same time, so vertices[k] is the
  # voxel that was assigned number k.
  numbering = {}
  vertices = []
  for edge in edges:
    for v in (edge.first, edge.second):
      if v not in numbering:
        numbering[v] = i
        i += 1

        loc = v
        z = loc // sxy
        y = (loc - z * sxy) // sx
        x = loc - z * sxy - y * sx
        vertices.append((x,y,z))

  int_edges = []
  for v1,v2 in edges:
    int_edges.append((numbering[v1], numbering[v2]))

  # reshape keeps the column count even when the lists are empty
  vertices = np.array(vertices, dtype=np.uint32).reshape((-1, 3))
  int_edges = np.array(int_edges, dtype=np.uint32).reshape((-1, 2))

  return (vertices, int_edges)


================================================
FILE: ext/skeletontricks/unordered_dense.hpp
================================================
///////////////////////// ankerl::unordered_dense::{map, set} /////////////////////////

// A fast & densely stored hashmap and hashset based on robin-hood backward shift deletion.
// Version 4.5.0
// https://github.com/martinus/unordered_dense
//
// Licensed under the MIT License <http://opensource.org/licenses/MIT>.
// SPDX-License-Identifier: MIT
// Copyright (c) 2022-2024 Martin Leitner-Ankerl <martin.ankerl@gmail.com>
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

#ifndef ANKERL_UNORDERED_DENSE_H
#define ANKERL_UNORDERED_DENSE_H

// see https://semver.org/spec/v2.0.0.html
#define ANKERL_UNORDERED_DENSE_VERSION_MAJOR 4 // NOLINT(cppcoreguidelines-macro-usage) incompatible API changes
#define ANKERL_UNORDERED_DENSE_VERSION_MINOR 5 // NOLINT(cppcoreguidelines-macro-usage) backwards compatible functionality
#define ANKERL_UNORDERED_DENSE_VERSION_PATCH 0 // NOLINT(cppcoreguidelines-macro-usage) backwards compatible bug fixes

// API versioning with inline namespace, see https://www.foonathan.net/2018/11/inline-namespaces/

// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define ANKERL_UNORDERED_DENSE_VERSION_CONCAT1(major, minor, patch) v##major##_##minor##_##patch
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define ANKERL_UNORDERED_DENSE_VERSION_CONCAT(major, minor, patch) ANKERL_UNORDERED_DENSE_VERSION_CONCAT1(major, minor, patch)
#define ANKERL_UNORDERED_DENSE_NAMESPACE   \
    ANKERL_UNORDERED_DENSE_VERSION_CONCAT( \
        ANKERL_UNORDERED_DENSE_VERSION_MAJOR, ANKERL_UNORDERED_DENSE_VERSION_MINOR, ANKERL_UNORDERED_DENSE_VERSION_PATCH)

// Language standard in use: taken from _MSVC_LANG when that macro is defined,
// otherwise from __cplusplus.
#if defined(_MSVC_LANG)
#    define ANKERL_UNORDERED_DENSE_CPP_VERSION _MSVC_LANG
#else
#    define ANKERL_UNORDERED_DENSE_CPP_VERSION __cplusplus
#endif

// ANKERL_UNORDERED_DENSE_PACK(decl): wraps a declaration so that it is packed, using the
// attribute syntax for __GNUC__ compilers and pack pragmas for _MSC_VER.
// NOTE(review): no fallback is defined when neither macro is present.
#if defined(__GNUC__)
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#    define ANKERL_UNORDERED_DENSE_PACK(decl) decl __attribute__((__packed__))
#elif defined(_MSC_VER)
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#    define ANKERL_UNORDERED_DENSE_PACK(decl) __pragma(pack(push, 1)) decl __pragma(pack(pop))
#endif

// exceptions: ANKERL_UNORDERED_DENSE_HAS_EXCEPTIONS() expands to 1 when any of the
// compiler's exception feature macros is defined, otherwise to 0.
#if defined(__cpp_exceptions) || defined(__EXCEPTIONS) || defined(_CPPUNWIND)
#    define ANKERL_UNORDERED_DENSE_HAS_EXCEPTIONS() 1 // NOLINT(cppcoreguidelines-macro-usage)
#else
#    define ANKERL_UNORDERED_DENSE_HAS_EXCEPTIONS() 0 // NOLINT(cppcoreguidelines-macro-usage)
#endif
// Compiler specific spelling of a "never inline this function" attribute.
#ifdef _MSC_VER
#    define ANKERL_UNORDERED_DENSE_NOINLINE __declspec(noinline)
#else
#    define ANKERL_UNORDERED_DENSE_NOINLINE __attribute__((noinline))
#endif

// defined in unordered_dense.cpp
// Expands to nothing unless it was defined before this point.
#if !defined(ANKERL_UNORDERED_DENSE_EXPORT)
#    define ANKERL_UNORDERED_DENSE_EXPORT
#endif

#if ANKERL_UNORDERED_DENSE_CPP_VERSION < 201703L
#    error ankerl::unordered_dense requires C++17 or higher
#else
#    include <array>            // for array
#    include <cstdint>          // for uint64_t, uint32_t, uint8_t, UINT64_C
#    include <cstring>          // for size_t, memcpy, memset
#    include <functional>       // for equal_to, hash
#    include <initializer_list> // for initializer_list
#    include <iterator>         // for pair, distance
#    include <limits>           // for numeric_limits
#    include <memory>           // for allocator, allocator_traits, shared_ptr
#    include <optional>         // for optional
#    include <stdexcept>        // for out_of_range
#    include <string>           // for basic_string
#    include <string_view>      // for basic_string_view, hash
#    include <tuple>            // for forward_as_tuple
#    include <type_traits>      // for enable_if_t, declval, conditional_t, ena...
#    include <utility>          // for forward, exchange, pair, as_const, piece...
#    include <vector>           // for vector
#    if ANKERL_UNORDERED_DENSE_HAS_EXCEPTIONS() == 0
#        include <cstdlib> // for abort
#    endif

#    if defined(__has_include) && !defined(ANKERL_UNORDERED_DENSE_DISABLE_PMR)
#        if __has_include(<memory_resource>)
#            define ANKERL_UNORDERED_DENSE_PMR std::pmr // NOLINT(cppcoreguidelines-macro-usage)
#            include <memory_resource>                  // for polymorphic_allocator
#        elif __has_include(<experimental/memory_resource>)
#            define ANKERL_UNORDERED_DENSE_PMR std::experimental::pmr // NOLINT(cppcoreguidelines-macro-usage)
#            include <experimental/memory_resource>                   // for polymorphic_allocator
#        endif
#    endif

#    if defined(_MSC_VER) && defined(_M_X64)
#        include <intrin.h>
#        pragma intrinsic(_umul128)
#    endif

// Branch prediction hints: map to __builtin_expect on GCC, Intel and clang, and to
// the plain expression on every other compiler.
#    if defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__)
#        define ANKERL_UNORDERED_DENSE_LIKELY(x) __builtin_expect(x, 1)   // NOLINT(cppcoreguidelines-macro-usage)
#        define ANKERL_UNORDERED_DENSE_UNLIKELY(x) __builtin_expect(x, 0) // NOLINT(cppcoreguidelines-macro-usage)
#    else
#        define ANKERL_UNORDERED_DENSE_LIKELY(x) (x)   // NOLINT(cppcoreguidelines-macro-usage)
#        define ANKERL_UNORDERED_DENSE_UNLIKELY(x) (x) // NOLINT(cppcoreguidelines-macro-usage)
#    endif

namespace ankerl::unordered_dense {
inline namespace ANKERL_UNORDERED_DENSE_NAMESPACE {

// Error handlers. With exceptions enabled each one throws a standard exception
// carrying a fixed message; without exceptions each one calls abort().
namespace detail {

#    if ANKERL_UNORDERED_DENSE_HAS_EXCEPTIONS()

// make sure this is not inlined as it is slow and dramatically enlarges code, thus making other
// inlinings more difficult. Throws are also generally the slow path.
[[noreturn]] inline ANKERL_UNORDERED_DENSE_NOINLINE void on_error_key_not_found() {
    throw std::out_of_range("ankerl::unordered_dense::map::at(): key not found");
}
[[noreturn]] inline ANKERL_UNORDERED_DENSE_NOINLINE void on_error_bucket_overflow() {
    throw std::overflow_error("ankerl::unordered_dense: reached max bucket size, cannot increase size");
}
[[noreturn]] inline ANKERL_UNORDERED_DENSE_NOINLINE void on_error_too_many_elements() {
    throw std::out_of_range("ankerl::unordered_dense::map::replace(): too many elements");
}

#    else

// Exceptions are unavailable: terminate the process instead of throwing.
[[noreturn]] inline void on_error_key_not_found() {
    abort();
}
[[noreturn]] inline void on_error_bucket_overflow() {
    abort();
}
[[noreturn]] inline void on_error_too_many_elements() {
    abort();
}

#    endif

} // namespace detail

// hash ///////////////////////////////////////////////////////////////////////

// This is a stripped-down implementation of wyhash: https://github.com/wangyi-fudan/wyhash
// No big-endian support (because different values on different machines don't matter),
// hardcodes seed and the secret, reformats the code, and clang-tidy fixes.
namespace detail::wyhash {

// Full 64 x 64 -> 128 bit multiplication of *a and *b. On return *a holds the low
// 64 bits of the product and *b the high 64 bits. Uses a 128 bit integer type when
// available, the _umul128 intrinsic on 64 bit MSVC, and otherwise a portable version
// built from four 32 x 32 bit partial products.
inline void mum(uint64_t* a, uint64_t* b) {
#    if defined(__SIZEOF_INT128__)
    __uint128_t r = *a;
    r *= *b;
    *a = static_cast<uint64_t>(r);
    *b = static_cast<uint64_t>(r >> 64U);
#    elif defined(_MSC_VER) && defined(_M_X64)
    *a = _umul128(*a, *b, b);
#    else
    // split both operands into 32 bit halves
    uint64_t ha = *a >> 32U;
    uint64_t hb = *b >> 32U;
    uint64_t la = static_cast<uint32_t>(*a);
    uint64_t lb = static_cast<uint32_t>(*b);
    uint64_t hi{};
    uint64_t lo{};
    uint64_t rh = ha * hb;
    uint64_t rm0 = ha * lb;
    uint64_t rm1 = hb * la;
    uint64_t rl = la * lb;
    // add the middle products into the low word, tracking the carries in c
    uint64_t t = rl + (rm0 << 32U);
    auto c = static_cast<uint64_t>(t < rl);
    lo = t + (rm1 << 32U);
    c += static_cast<uint64_t>(lo < t);
    hi = rh + (rm0 >> 32U) + (rm1 >> 32U) + c;
    *a = lo;
    *b = hi;
#    endif
}

// multiply and xor mix function, aka MUM
// Returns the low half of the 128 bit product a * b XORed with its high half.
[[nodiscard]] inline auto mix(uint64_t a, uint64_t b) -> uint64_t {
    mum(&a, &b);
    return a ^ b;
}

// read functions. WARNING: we don't care about endianness, so results are different on big endian!
// r8: copies 8 bytes starting at p into a uint64_t.
[[nodiscard]] inline auto r8(const uint8_t* p) -> uint64_t {
    uint64_t v{};
    std::memcpy(&v, p, 8U);
    return v;
}

// r4: copies 4 bytes starting at p into a uint32_t and widens it to uint64_t.
[[nodiscard]] inline auto r4(const uint8_t* p) -> uint64_t {
    uint32_t v{};
    std::memcpy(&v, p, 4);
    return v;
}

// reads 1, 2, or 3 bytes
// Combines the first byte, the byte at index k / 2 and the last byte (index k - 1)
// of a k byte input into one value.
[[nodiscard]] inline auto r3(const uint8_t* p, size_t k) -> uint64_t {
    return (static_cast<uint64_t>(p[0]) << 16U) | (static_cast<uint64_t>(p[k >> 1U]) << 8U) | p[k - 1];
}

// Hashes the len bytes starting at key, using a fixed seed and fixed secret constants.
[[maybe_unused]] [[nodiscard]] inline auto hash(void const* key, size_t len) -> uint64_t {
    static constexpr auto secret = std::array{UINT64_C(0xa0761d6478bd642f),
                                              UINT64_C(0xe7037ed1a0b428db),
                                              UINT64_C(0x8ebc6af09c88c6e3),
                                              UINT64_C(0x589965cc75374cc3)};

    auto const* p = static_cast<uint8_t const*>(key);
    uint64_t seed = secret[0];
    uint64_t a{};
    uint64_t b{};
    if (ANKERL_UNORDERED_DENSE_LIKELY(len <= 16)) {
        // short inputs: a and b are assembled directly from (possibly overlapping) reads
        if (ANKERL_UNORDERED_DENSE_LIKELY(len >= 4)) {
            a = (r4(p) << 32U) | r4(p + ((len >> 3U) << 2U));
            b = (r4(p + len - 4) << 32U) | r4(p + len - 4 - ((len >> 3U) << 2U));
        } else if (ANKERL_UNORDERED_DENSE_LIKELY(len > 0)) {
            a = r3(p, len);
            b = 0;
        } else {
            a = 0;
            b = 0;
        }
    } else {
        size_t i = len;
        if (ANKERL_UNORDERED_DENSE_UNLIKELY(i > 48)) {
            // consume 48 bytes per iteration into three independent states
            uint64_t see1 = seed;
            uint64_t see2 = seed;
            do {
                seed = mix(r8(p) ^ secret[1], r8(p + 8) ^ seed);
                see1 = mix(r8(p + 16) ^ secret[2], r8(p + 24) ^ see1);
                see2 = mix(r8(p + 32) ^ secret[3], r8(p + 40) ^ see2);
                p += 48;
                i -= 48;
            } while (ANKERL_UNORDERED_DENSE_LIKELY(i > 48));
            seed ^= see1 ^ see2;
        }
        // consume the remainder 16 bytes at a time
        while (ANKERL_UNORDERED_DENSE_UNLIKELY(i > 16)) {
            seed = mix(r8(p) ^ secret[1], r8(p + 8) ^ seed);
            i -= 16;
            p += 16;
        }
        // a and b are the final 16 bytes of the input (these reads may reach back
        // into bytes that were already consumed above)
        a = r8(p + i - 16);
        b = r8(p + i - 8);
    }

    return mix(secret[1] ^ len, mix(a ^ secret[1], b ^ seed));
}

// Hashes a single 64 bit value by mixing it with a fixed constant.
[[nodiscard]] inline auto hash(uint64_t x) -> uint64_t {
    return detail::wyhash::mix(x, UINT64_C(0x9E3779B97F4A7C15));
}

} // namespace detail::wyhash

// Primary hash template: forwards to std::hash<T> and is noexcept exactly when
// std::hash<T>'s call operator is. It does not declare is_avalanching.
ANKERL_UNORDERED_DENSE_EXPORT template <typename T, typename Enable = void>
struct hash {
    auto operator()(T const& obj) const noexcept(noexcept(std::declval<std::hash<T>>().operator()(std::declval<T const&>())))
        -> uint64_t {
        return std::hash<T>{}(obj);
    }
};

// Selected when std::hash<T> itself declares is_avalanching: still forwards to
// std::hash<T>, and re-exports the is_avalanching marker.
template <typename T>
struct hash<T, typename std::hash<T>::is_avalanching> {
    using is_avalanching = void;
    auto operator()(T const& obj) const noexcept(noexcept(std::declval<std::hash<T>>().operator()(std::declval<T const&>())))
        -> uint64_t {
        return std::hash<T>{}(obj);
    }
};

// std::basic_string: hashes the raw character storage (size() * sizeof(CharT) bytes)
// with wyhash.
template <typename CharT>
struct hash<std::basic_string<CharT>> {
    using is_avalanching = void;
    auto operator()(std::basic_string<CharT> const& str) const noexcept -> uint64_t {
        return detail::wyhash::hash(str.data(), sizeof(CharT) * str.size());
    }
};

// std::basic_string_view: hashes the viewed characters (size() * sizeof(CharT) bytes)
// with wyhash.
template <typename CharT>
struct hash<std::basic_string_view<CharT>> {
    using is_avalanching = void;
    auto operator()(std::basic_string_view<CharT> const& sv) const noexcept -> uint64_t {
        return detail::wyhash::hash(sv.data(), sizeof(CharT) * sv.size());
    }
};

// Raw pointers: hashes the pointer value (the address), not the pointed-to object.
template <class T>
struct hash<T*> {
    using is_avalanching = void;
    auto operator()(T* ptr) const noexcept -> uint64_t {
        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
        return detail::wyhash::hash(reinterpret_cast<uintptr_t>(ptr));
    }
};

// std::unique_ptr<T>: hashes the address of the managed object (ptr.get()).
template <class T>
struct hash<std::unique_ptr<T>> {
    using is_avalanching = void;
    auto operator()(std::unique_ptr<T> const& ptr) const noexcept -> uint64_t {
        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
        return detail::wyhash::hash(reinterpret_cast<uintptr_t>(ptr.get()));
    }
};

// std::shared_ptr<T>: hashes the address of the managed object (ptr.get()).
template <class T>
struct hash<std::shared_ptr<T>> {
    using is_avalanching = void;
    auto operator()(std::shared_ptr<T> const& ptr) const noexcept -> uint64_t {
        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
        return detail::wyhash::hash(reinterpret_cast<uintptr_t>(ptr.get()));
    }
};

// Enumerations: hashes the enumerator converted to its underlying integer type.
template <typename Enum>
struct hash<Enum, typename std::enable_if<std::is_enum<Enum>::value>::type> {
    using is_avalanching = void;
    auto operator()(Enum e) const noexcept -> uint64_t {
        using underlying = typename std::underlying_type_t<Enum>;
        return detail::wyhash::hash(static_cast<underlying>(e));
    }
};

// Shared implementation for hashing tuple-like types (used by the std::tuple and
// std::pair specializations of hash).
template <typename... Args>
struct tuple_hash_helper {
    // Converts the value into 64bit. If it is an integral type, just cast it. Mixing is doing the rest.
    // If it isn't an integral we need to hash it.
    template <typename Arg>
    [[nodiscard]] constexpr static auto to64(Arg const& arg) -> uint64_t {
        if constexpr (std::is_integral_v<Arg> || std::is_enum_v<Arg>) {
            return static_cast<uint64_t>(arg);
        } else {
            return hash<Arg>{}(arg);
        }
    }

    // Combines the running state with the next 64 bit value: their sum is mixed with a fixed constant.
    [[nodiscard]] static auto mix64(uint64_t state, uint64_t v) -> uint64_t {
        return detail::wyhash::mix(state + v, uint64_t{0x9ddfea08eb382d69});
    }

    // Folds the elements of the tuple into a single hash, in index order: starting from a state of 0,
    // each element is converted with to64() and merged into the state with mix64(). The fold
    // expression is expanded at compile time, one step per element.
    template <typename T, std::size_t... Idx>
    [[nodiscard]] static auto calc_hash(T const& t, std::index_sequence<Idx...>) noexcept -> uint64_t {
        auto h = uint64_t{};
        ((h = mix64(h, to64(std::get<Idx>(t)))), ...);
        return h;
    }
};

// std::tuple: hashes all elements in order via tuple_hash_helper::calc_hash.
template <typename... Args>
struct hash<std::tuple<Args...>> : tuple_hash_helper<Args...> {
    using is_avalanching = void;
    auto operator()(std::tuple<Args...> const& t) const noexcept -> uint64_t {
        return tuple_hash_helper<Args...>::calc_hash(t, std::index_sequence_for<Args...>{});
    }
};

// std::pair: hashes first and second in order via tuple_hash_helper::calc_hash.
template <typename A, typename B>
struct hash<std::pair<A, B>> : tuple_hash_helper<A, B> {
    using is_avalanching = void;
    auto operator()(std::pair<A, B> const& t) const noexcept -> uint64_t {
        return tuple_hash_helper<A, B>::calc_hash(t, std::index_sequence_for<A, B>{});
    }
};

// Generates a full specialization of hash for the type T that converts the value to
// uint64_t with static_cast and hashes the result with wyhash.
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#    define ANKERL_UNORDERED_DENSE_HASH_STATICCAST(T)                    \
        template <>                                                      \
        struct hash<T> {                                                 \
            using is_avalanching = void;                                 \
            auto operator()(T const& obj) const noexcept -> uint64_t {   \
                return detail::wyhash::hash(static_cast<uint64_t>(obj)); \
            }                                                            \
        }

// The cast above is a no-op for types that already are uint64_t, so GCC's
// -Wuseless-cast diagnostic is silenced around the instantiations.
#    if defined(__GNUC__) && !defined(__clang__)
#        pragma GCC diagnostic push
#        pragma GCC diagnostic ignored "-Wuseless-cast"
#    endif
// see https://en.cppreference.com/w/cpp/utility/hash
// One specialization per built-in boolean, character and integer type.
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(bool);
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(char);
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(signed char);
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(unsigned char);
#    if ANKERL_UNORDERED_DENSE_CPP_VERSION >= 202002L && defined(__cpp_char8_t)
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(char8_t);
#    endif
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(char16_t);
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(char32_t);
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(wchar_t);
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(short);
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(unsigned short);
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(int);
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(unsigned int);
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(long);
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(long long);
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(unsigned long);
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(unsigned long long);

#    if defined(__GNUC__) && !defined(__clang__)
#        pragma GCC diagnostic pop
#    endif

// bucket_type //////////////////////////////////////////////////////////

// Bucket layouts usable by the table. Both store a combined "distance + fingerprint" word and an index into the
// table's value container; they differ only in the width of that index.
namespace bucket_type {

// Bucket with a 32 bit value index.
struct standard {
    static constexpr uint32_t dist_inc = 1U << 8U;             // skip 1 byte fingerprint
    static constexpr uint32_t fingerprint_mask = dist_inc - 1; // mask for 1 byte of fingerprint

    uint32_t m_dist_and_fingerprint; // upper 3 byte: distance to original bucket. lower byte: fingerprint from hash
    uint32_t m_value_idx;            // index into the m_values vector.
};

// Bucket with a size_t value index. The definition is wrapped in ANKERL_UNORDERED_DENSE_PACK; presumably that macro
// removes the padding between the 32 bit word and the size_t index (macro is defined elsewhere - verify).
ANKERL_UNORDERED_DENSE_PACK(struct big {
    static constexpr uint32_t dist_inc = 1U << 8U;             // skip 1 byte fingerprint
    static constexpr uint32_t fingerprint_mask = dist_inc - 1; // mask for 1 byte of fingerprint

    uint32_t m_dist_and_fingerprint; // upper 3 byte: distance to original bucket. lower byte: fingerprint from hash
    size_t m_value_idx;              // index into the m_values vector.
});

} // namespace bucket_type

namespace detail {

// Placeholder handed to detector as its fallback type by is_detected.
struct nonesuch {};
// Tag type: a table whose BucketContainer parameter is this type uses its built-in bucket container.
struct default_container_t {};

// Detection idiom, primary template: selected whenever Op<Args...> is not a valid type.
template <class Fallback, class Enabler, template <class...> class Op, class... Args>
struct detector {
    using type = Fallback;
    using value_t = std::false_type;
};

// Detection idiom, partial specialization: selected when Op<Args...> is well-formed.
template <class Fallback, template <class...> class Op, class... Args>
struct detector<Fallback, std::void_t<Op<Args...>>, Op, Args...> {
    using type = Op<Args...>;
    using value_t = std::true_type;
};

// std::true_type if Op<Args...> is a valid type, std::false_type otherwise.
template <template <class...> class Op, class... Args>
using is_detected = typename detector<nonesuch, void, Op, Args...>::value_t;

// Boolean shorthand for is_detected.
template <template <class...> class Op, class... Args>
constexpr bool is_detected_v = is_detected<Op, Args...>::value;

// Probes for use with is_detected: each one is only well-formed when T offers the named member.

template <typename T>
using detect_avalanching = typename T::is_avalanching;

template <typename T>
using detect_is_transparent = typename T::is_transparent;

template <typename T>
using detect_iterator = typename T::iterator;

template <typename T>
using detect_reserve = decltype(std::declval<T&>().reserve(size_t{}));

// enable_if helpers

// A table acts as a map exactly when its mapped type is not void.
template <typename Mapped>
constexpr bool is_map_v = !std::is_void<Mapped>::value;

// True only if both the hasher and the key comparison declare is_transparent.
// clang-format off
template <typename Hash, typename KeyEqual>
constexpr bool is_transparent_v = is_detected_v<detect_is_transparent, KeyEqual> && is_detected_v<detect_is_transparent, Hash>;
// clang-format on

// True if From converts implicitly to neither To1 nor To2.
template <typename From, typename To1, typename To2>
constexpr bool is_neither_convertible_v = !(std::is_convertible_v<From, To1> || std::is_convertible_v<From, To2>);

// True if T has a reserve member callable with a size_t.
template <typename T>
constexpr bool has_reserve = is_detected_v<detect_reserve, T>;

// base type for map has mapped_type
template <class T>
struct base_table_type_map {
    using mapped_type = T;
};

// base type for set doesn't have mapped_type
struct base_table_type_set {};

} // namespace detail

// Very much like std::deque, but faster for indexing (in most cases). As of now this doesn't implement the full std::vector
// API, but merely what's necessary to work as an underlying container for ankerl::unordered_dense::{map, set}.
// It allocates blocks of equal size and puts them into the m_blocks vector. That means it can grow simply by adding a new
// block to the back of m_blocks, and doesn't double its size like an std::vector. The disadvantage is that memory is not
// linear and thus there is one more indirection necessary for indexing.
//
// Template parameters:
//   T                    element type
//   Allocator            allocator used for the element blocks (rebound for the vector of block pointers)
//   MaxSegmentSizeBytes  upper bound for the byte size of one block; the element count per block is the largest power
//                        of two that still fits
template <typename T, typename Allocator = std::allocator<T>, size_t MaxSegmentSizeBytes = 4096>
class segmented_vector {
    template <bool IsConst>
    class iter_t;

public:
    using allocator_type = Allocator;
    using pointer = typename std::allocator_traits<allocator_type>::pointer;
    using const_pointer = typename std::allocator_traits<allocator_type>::const_pointer;
    using difference_type = typename std::allocator_traits<allocator_type>::difference_type;
    using value_type = T;
    using size_type = std::size_t;
    using reference = T&;
    using const_reference = T const&;
    using iterator = iter_t<false>;
    using const_iterator = iter_t<true>;

private:
    using vec_alloc = typename std::allocator_traits<Allocator>::template rebind_alloc<pointer>;
    std::vector<pointer, vec_alloc> m_blocks{}; // one entry per allocated block
    size_t m_size{};                            // number of constructed elements

    // Calculates the maximum number for x in  (s << x) <= max_val
    static constexpr auto num_bits_closest(size_t max_val, size_t s) -> size_t {
        auto f = size_t{0};
        while (s << (f + 1) <= max_val) {
            ++f;
        }
        return f;
    }

    using self_t = segmented_vector<T, Allocator, MaxSegmentSizeBytes>;
    // An element index i is split into block number (i >> num_bits) and offset inside the block (i & mask).
    static constexpr auto num_bits = num_bits_closest(MaxSegmentSizeBytes, sizeof(T));
    static constexpr auto num_elements_in_block = 1U << num_bits;
    static constexpr auto mask = num_elements_in_block - 1U;

    /**
     * Iterator class doubles as const_iterator and iterator.
     *
     * It stores a pointer to the block table plus a flat element index; dereferencing splits the index the same way
     * operator[] does. Equality compares the index only.
     */
    template <bool IsConst>
    class iter_t {
        using ptr_t = typename std::conditional_t<IsConst, segmented_vector::const_pointer const*, segmented_vector::pointer*>;
        ptr_t m_data{};
        size_t m_idx{};

        template <bool B>
        friend class iter_t;

    public:
        using difference_type = segmented_vector::difference_type;
        using value_type = T;
        using reference = typename std::conditional_t<IsConst, value_type const&, value_type&>;
        using pointer = typename std::conditional_t<IsConst, segmented_vector::const_pointer, segmented_vector::pointer>;
        using iterator_category = std::forward_iterator_tag;

        iter_t() noexcept = default;

        // Converting constructor: only iterator -> const_iterator is enabled.
        template <bool OtherIsConst, typename = typename std::enable_if<IsConst && !OtherIsConst>::type>
        // NOLINTNEXTLINE(google-explicit-constructor,hicpp-explicit-conversions)
        constexpr iter_t(iter_t<OtherIsConst> const& other) noexcept
            : m_data(other.m_data)
            , m_idx(other.m_idx) {}

        constexpr iter_t(ptr_t data, size_t idx) noexcept
            : m_data(data)
            , m_idx(idx) {}

        // Converting assignment: only iterator -> const_iterator is enabled.
        template <bool OtherIsConst, typename = typename std::enable_if<IsConst && !OtherIsConst>::type>
        constexpr auto operator=(iter_t<OtherIsConst> const& other) noexcept -> iter_t& {
            m_data = other.m_data;
            m_idx = other.m_idx;
            return *this;
        }

        constexpr auto operator++() noexcept -> iter_t& {
            ++m_idx;
            return *this;
        }

        constexpr auto operator++(int) noexcept -> iter_t {
            iter_t prev(*this);
            this->operator++();
            return prev;
        }

        // Returns an iterator advanced by diff elements. Const-qualified because *this is not modified, so the
        // operator is usable on const iterator objects as well.
        constexpr auto operator+(difference_type diff) const noexcept -> iter_t {
            return {m_data, static_cast<size_t>(static_cast<difference_type>(m_idx) + diff)};
        }

        // Returns the signed distance between two iterators. Const-qualified because *this is not modified.
        template <bool OtherIsConst>
        constexpr auto operator-(iter_t<OtherIsConst> const& other) const noexcept -> difference_type {
            return static_cast<difference_type>(m_idx) - static_cast<difference_type>(other.m_idx);
        }

        constexpr auto operator*() const noexcept -> reference {
            return m_data[m_idx >> num_bits][m_idx & mask];
        }

        constexpr auto operator->() const noexcept -> pointer {
            return &m_data[m_idx >> num_bits][m_idx & mask];
        }

        template <bool O>
        constexpr auto operator==(iter_t<O> const& o) const noexcept -> bool {
            return m_idx == o.m_idx;
        }

        template <bool O>
        constexpr auto operator!=(iter_t<O> const& o) const noexcept -> bool {
            return !(*this == o);
        }
    };

    // slow path: need to allocate a new segment every once in a while
    void increase_capacity() {
        auto ba = Allocator(m_blocks.get_allocator());
        pointer block = std::allocator_traits<Allocator>::allocate(ba, num_elements_in_block);
        m_blocks.push_back(block);
    }

    // Moves everything from other
    void append_everything_from(segmented_vector&& other) {
        reserve(size() + other.size());
        for (auto&& o : other) {
            emplace_back(std::move(o));
        }
    }

    // Copies everything from other
    void append_everything_from(segmented_vector const& other) {
        reserve(size() + other.size());
        for (auto const& o : other) {
            emplace_back(o);
        }
    }

    // Returns every block to the allocator. Does not destroy elements and does not touch m_blocks itself, so callers
    // have to destroy elements first and must replace or drop m_blocks afterwards.
    void dealloc() {
        auto ba = Allocator(m_blocks.get_allocator());
        for (auto ptr : m_blocks) {
            std::allocator_traits<Allocator>::deallocate(ba, ptr, num_elements_in_block);
        }
    }

    // Number of blocks needed to hold `capacity` elements (rounded up).
    [[nodiscard]] static constexpr auto calc_num_blocks_for_capacity(size_t capacity) {
        return (capacity + num_elements_in_block - 1U) / num_elements_in_block;
    }

public:
    segmented_vector() = default;

    // NOLINTNEXTLINE(google-explicit-constructor,hicpp-explicit-conversions)
    segmented_vector(Allocator alloc)
        : m_blocks(vec_alloc(alloc)) {}

    segmented_vector(segmented_vector&& other, Allocator alloc)
        : segmented_vector(alloc) {
        *this = std::move(other);
    }

    segmented_vector(segmented_vector const& other, Allocator alloc)
        : m_blocks(vec_alloc(alloc)) {
        append_everything_from(other);
    }

    // Takes the allocator from `other`. It must not be read from *this here: this object's m_blocks has not been
    // constructed yet while the arguments of the delegated constructor are evaluated.
    segmented_vector(segmented_vector&& other) noexcept
        : segmented_vector(std::move(other), other.get_allocator()) {}

    segmented_vector(segmented_vector const& other) {
        append_everything_from(other);
    }

    auto operator=(segmented_vector const& other) -> segmented_vector& {
        if (this == &other) {
            return *this;
        }
        clear();
        append_everything_from(other);
        return *this;
    }

    // Move assignment. With equal allocators the block table is taken over wholesale; otherwise the elements are
    // moved one by one into blocks obtained from other's allocator.
    auto operator=(segmented_vector&& other) noexcept -> segmented_vector& {
        if (this == &other) {
            // self-move: keep the content instead of destroying it and then moving m_blocks onto itself
            return *this;
        }
        clear();
        dealloc();
        if (other.get_allocator() == get_allocator()) {
            m_blocks = std::move(other.m_blocks);
            m_size = std::exchange(other.m_size, {});
        } else {
            // make sure to construct with other's allocator!
            m_blocks = std::vector<pointer, vec_alloc>(vec_alloc(other.get_allocator()));
            append_everything_from(std::move(other));
        }
        return *this;
    }

    ~segmented_vector() {
        clear();
        dealloc();
    }

    [[nodiscard]] constexpr auto size() const -> size_t {
        return m_size;
    }

    [[nodiscard]] constexpr auto capacity() const -> size_t {
        return m_blocks.size() * num_elements_in_block;
    }

    // Indexing is highly performance critical
    [[nodiscard]] constexpr auto operator[](size_t i) const noexcept -> T const& {
        return m_blocks[i >> num_bits][i & mask];
    }

    [[nodiscard]] constexpr auto operator[](size_t i) noexcept -> T& {
        return m_blocks[i >> num_bits][i & mask];
    }

    [[nodiscard]] constexpr auto begin() -> iterator {
        return {m_blocks.data(), 0U};
    }
    [[nodiscard]] constexpr auto begin() const -> const_iterator {
        return {m_blocks.data(), 0U};
    }
    [[nodiscard]] constexpr auto cbegin() const -> const_iterator {
        return {m_blocks.data(), 0U};
    }

    [[nodiscard]] constexpr auto end() -> iterator {
        return {m_blocks.data(), m_size};
    }
    [[nodiscard]] constexpr auto end() const -> const_iterator {
        return {m_blocks.data(), m_size};
    }
    [[nodiscard]] constexpr auto cend() const -> const_iterator {
        return {m_blocks.data(), m_size};
    }

    // back() and pop_back() index with m_size - 1 and therefore require a non-empty container.
    [[nodiscard]] constexpr auto back() -> reference {
        return operator[](m_size - 1);
    }
    [[nodiscard]] constexpr auto back() const -> const_reference {
        return operator[](m_size - 1);
    }

    void pop_back() {
        back().~T();
        --m_size;
    }

    [[nodiscard]] auto empty() const {
        return 0 == m_size;
    }

    // Allocates blocks until at least new_capacity elements fit. Never releases blocks.
    void reserve(size_t new_capacity) {
        m_blocks.reserve(calc_num_blocks_for_capacity(new_capacity));
        while (new_capacity > capacity()) {
            increase_capacity();
        }
    }

    [[nodiscard]] auto get_allocator() const -> allocator_type {
        return allocator_type{m_blocks.get_allocator()};
    }

    // Constructs a new element in place behind the current last one; allocates one more block when all are full.
    template <class... Args>
    auto emplace_back(Args&&... args) -> reference {
        if (m_size == capacity()) {
            increase_capacity();
        }
        auto* ptr = static_cast<void*>(&operator[](m_size));
        auto& ref = *new (ptr) T(std::forward<Args>(args)...);
        // only count the element once its constructor has finished
        ++m_size;
        return ref;
    }

    // Destroys all elements but keeps the allocated blocks.
    void clear() {
        if constexpr (!std::is_trivially_destructible_v<T>) {
            for (size_t i = 0, s = size(); i < s; ++i) {
                operator[](i).~T();
            }
        }
        m_size = 0;
    }

    // Releases every block that is not needed for the current size.
    void shrink_to_fit() {
        auto ba = Allocator(m_blocks.get_allocator());
        auto num_blocks_required = calc_num_blocks_for_capacity(m_size);
        while (m_blocks.size() > num_blocks_required) {
            std::allocator_traits<Allocator>::deallocate(ba, m_blocks.back(), num_elements_in_block);
            m_blocks.pop_back();
        }
        m_blocks.shrink_to_fit();
    }
};

namespace detail {

// This is it, the table. Doubles as map and set, and uses `void` for T when it's used as a set.
template <class Key,
          class T, // when void, treat it as a set.
          class Hash,
          class KeyEqual,
          class AllocatorOrContainer,
          class Bucket,
          class BucketContainer,
          bool IsSegmented>
class table : public std::conditional_t<is_map_v<T>, base_table_type_map<T>, base_table_type_set> {
    using underlying_value_type = typename std::conditional_t<is_map_v<T>, std::pair<Key, T>, Key>;
    using underlying_container_type = std::conditional_t<IsSegmented,
                                                         segmented_vector<underlying_value_type, AllocatorOrContainer>,
                                                         std::vector<underlying_value_type, AllocatorOrContainer>>;

public:
    using value_container_type = std::
        conditional_t<is_detected_v<detect_iterator, AllocatorOrContainer>, AllocatorOrContainer, underlying_container_type>;

private:
    using bucket_alloc =
        typename std::allocator_traits<typename value_container_type::allocator_type>::template rebind_alloc<Bucket>;
    using default_bucket_container_type =
        std::conditional_t<IsSegmented, segmented_vector<Bucket, bucket_alloc>, std::vector<Bucket, bucket_alloc>>;

    using bucket_container_type = std::conditional_t<std::is_same_v<BucketContainer, detail::default_container_t>,
                                                     default_bucket_container_type,
                                                     BucketContainer>;

    static constexpr uint8_t initial_shifts = 64 - 2; // 2^(64-m_shift) number of buckets
    static constexpr float default_max_load_factor = 0.8F;

public:
    using key_type = Key;
    using value_type = typename value_container_type::value_type;
    using size_type = typename value_container_type::size_type;
    using difference_type = typename value_container_type::difference_type;
    using hasher = Hash;
    using key_equal = KeyEqual;
    using allocator_type = typename value_container_type::allocator_type;
    using reference = typename value_container_type::reference;
    using const_reference = typename value_container_type::const_reference;
    using pointer = typename value_container_type::pointer;
    using const_pointer = typename value_container_type::const_pointer;
    using const_iterator = typename value_container_type::const_iterator;
    using iterator = std::conditional_t<is_map_v<T>, typename value_container_type::iterator, const_iterator>;
    using bucket_type = Bucket;

private:
    using value_idx_type = decltype(Bucket::m_value_idx);
    using dist_and_fingerprint_type = decltype(Bucket::m_dist_and_fingerprint);

    static_assert(std::is_trivially_destructible_v<Bucket>, "assert there's no need to call destructor / std::destroy");
    static_assert(std::is_trivially_copyable_v<Bucket>, "assert we can just memset / memcpy");

    value_container_type m_values{}; // Contains all the key-value pairs in one densely stored container. No holes.
    bucket_container_type m_buckets{};
    size_t m_max_bucket_capacity = 0;
    float m_max_load_factor = default_max_load_factor;
    Hash m_hash{};
    KeyEqual m_equal{};
    uint8_t m_shifts = initial_shifts;

    // Returns the index of the bucket that follows bucket_idx, wrapping around to 0 behind the last bucket.
    [[nodiscard]] auto next(value_idx_type bucket_idx) const -> value_idx_type {
        auto const successor = bucket_idx + 1U;
        if (ANKERL_UNORDERED_DENSE_UNLIKELY(successor == bucket_count())) {
            return 0;
        }
        return static_cast<value_idx_type>(successor);
    }

    // Bucket accessor (mutable overload): yields the bucket stored at position `offset` of the given container.
    [[nodiscard]] static constexpr auto at(bucket_container_type& bucket, size_t offset) -> Bucket& {
        Bucket& slot = bucket[offset];
        return slot;
    }

    // Bucket accessor (read-only overload): yields the bucket stored at position `offset` of the given container.
    [[nodiscard]] static constexpr auto at(const bucket_container_type& bucket, size_t offset) -> const Bucket& {
        const Bucket& slot = bucket[offset];
        return slot;
    }

    // Adds one distance step (Bucket::dist_inc) to x. The arithmetic result is cast back explicitly so that narrow
    // types such as uint16_t work without warning.
    [[nodiscard]] static constexpr auto dist_inc(dist_and_fingerprint_type x) -> dist_and_fingerprint_type {
        auto const bumped = x + Bucket::dist_inc;
        return static_cast<dist_and_fingerprint_type>(bumped);
    }

    // Removes one distance step (Bucket::dist_inc) from x; the result is cast back to the bucket's field type.
    [[nodiscard]] static constexpr auto dist_dec(dist_and_fingerprint_type x) -> dist_and_fingerprint_type {
        auto const lowered = x - Bucket::dist_inc;
        return static_cast<dist_and_fingerprint_type>(lowered);
    }

    // Produces a 64bit hash of good quality for key, whatever the user supplied hasher delivers:
    //  * hasher without is_avalanching: its result is mixed once more with wyhash
    //  * hasher with is_avalanching and a result narrower than 64 bit: multiplied with a constant to spread the bits
    //    upwards
    //  * hasher with is_avalanching and a 64 bit result: used unchanged
    template <typename K>
    [[nodiscard]] constexpr auto mixed_hash(K const& key) const -> uint64_t {
        if constexpr (!is_detected_v<detect_avalanching, Hash>) {
            return wyhash::hash(m_hash(key));
        } else if constexpr (sizeof(decltype(m_hash(key))) < sizeof(uint64_t)) {
            return m_hash(key) * UINT64_C(0x9ddfea08eb382d69);
        } else {
            return m_hash(key);
        }
    }

    // Builds the initial "distance + fingerprint" word for a hash: a distance of one step combined with the hash bits
    // selected by Bucket::fingerprint_mask.
    [[nodiscard]] constexpr auto dist_and_fingerprint_from_hash(uint64_t hash) const -> dist_and_fingerprint_type {
        auto const fingerprint = static_cast<dist_and_fingerprint_type>(hash) & Bucket::fingerprint_mask;
        return Bucket::dist_inc | fingerprint;
    }

    // Maps a hash to its home bucket by keeping only the bits that remain after shifting right by m_shifts.
    [[nodiscard]] constexpr auto bucket_idx_from_hash(uint64_t hash) const -> value_idx_type {
        auto const top_bits = hash >> m_shifts;
        return static_cast<value_idx_type>(top_bits);
    }

    // Extracts the key from a stored value: the value itself for a set, the pair's first member for a map.
    [[nodiscard]] static constexpr auto get_key(value_type const& vt) -> key_type const& {
        if constexpr (!is_map_v<T>) {
            return vt;
        } else {
            return vt.first;
        }
    }

    // Starts at the home bucket of key and walks forward for as long as the probe's distance+fingerprint word is
    // smaller than the one stored in the visited bucket.
    // The result reuses the Bucket struct as a plain pair: m_dist_and_fingerprint is the probe word at the stopping
    // point, m_value_idx holds the *bucket* index where the walk stopped.
    template <typename K>
    [[nodiscard]] auto next_while_less(K const& key) const -> Bucket {
        auto const h = mixed_hash(key);
        auto probe = dist_and_fingerprint_from_hash(h);
        auto idx = bucket_idx_from_hash(h);

        for (; probe < at(m_buckets, idx).m_dist_and_fingerprint; idx = next(idx)) {
            probe = dist_inc(probe);
        }
        return {probe, idx};
    }

    // Stores `bucket` at position `place`. Every occupied bucket met on the way is displaced: it is swapped out, gets
    // its distance increased by one step and is carried on to the following position, until an empty bucket
    // (m_dist_and_fingerprint == 0) takes the last carried entry.
    void place_and_shift_up(Bucket bucket, value_idx_type place) {
        for (; 0 != at(m_buckets, place).m_dist_and_fingerprint; place = next(place)) {
            bucket = std::exchange(at(m_buckets, place), bucket);
            bucket.m_dist_and_fingerprint = dist_inc(bucket.m_dist_and_fingerprint);
        }
        at(m_buckets, place) = bucket;
    }

    // Number of buckets that belongs to a shift value: 2^(64 - shifts), limited to max_bucket_count().
    [[nodiscard]] static constexpr auto calc_num_buckets(uint8_t shifts) -> size_t {
        auto const wanted = size_t{1} << (64U - shifts);
        return (std::min)(max_bucket_count(), wanted);
    }

    // Finds the largest shift value (i.e. the smallest bucket array) whose load-factor-limited capacity holds at
    // least s elements. Starts at initial_shifts and stops at 0 at the latest.
    [[nodiscard]] constexpr auto calc_shifts_for_size(size_t s) const -> uint8_t {
        auto shifts = initial_shifts;
        for (; shifts > 0; --shifts) {
            auto const usable = static_cast<size_t>(static_cast<float>(calc_num_buckets(shifts)) * max_load_factor());
            if (!(usable < s)) {
                break;
            }
        }
        return shifts;
    }

    // Sets up this table's bucket array as a copy of other's.
    // Preconditions: m_values already holds the copied values, m_buckets has not been filled yet and m_shifts still
    // has its initial value.
    void copy_buckets(table const& other) {
        if (empty()) {
            // when empty, at least allocate an initial buckets and clear them.
            allocate_buckets_from_shift();
            clear_buckets();
        } else {
            // same number of buckets as other, so that the bucket content can be taken over 1:1
            m_shifts = other.m_shifts;
            allocate_buckets_from_shift();
            if constexpr (IsSegmented || !std::is_same_v<BucketContainer, default_container_t>) {
                // No contiguous storage guaranteed: copy bucket by bucket. The counter is a size_t because
                // `unsigned long` is only 32 bit wide on LLP64 targets and could not index every bucket there.
                for (size_t i = 0; i < bucket_count(); ++i) {
                    at(m_buckets, i) = at(other.m_buckets, i);
                }
            } else {
                // std::vector of trivially copyable buckets: one block copy is enough
                std::memcpy(m_buckets.data(), other.m_buckets.data(), sizeof(Bucket) * bucket_count());
            }
        }
    }

    /**
     * True when the number of stored values exceeds m_max_bucket_capacity, i.e. the bucket array has to grow before
     * another value can be indexed.
     */
    [[nodiscard]] auto is_full() const -> bool {
        return m_max_bucket_capacity < size();
    }

    // Drops all buckets, asks the container to give back its memory, and resets the capacity limit to 0.
    void deallocate_buckets() {
        m_buckets.clear();
        m_buckets.shrink_to_fit();
        m_max_bucket_capacity = 0;
    }

    void allocate_buckets_from_shift() {
        auto num_buckets = calc_num_buckets(m_shifts);
        if constexpr (IsSegmented || !std::is_same_v<BucketContainer, default_container_t>) {
            if constexpr (has_reserve<bucket_container_type>) {
                m_buckets.reserve(num_buckets);
            }
            for (size_t i = m_buckets.size(); i < num_buckets; ++i) {
                m_buckets.emplace_back();
            }
        } else {
            m_buckets.resize(num_buckets);
        }
        if (num_buckets == max_bucket_count()) {
            // reached the maximum, make sure we can use each bucket
            m_max_bucket_capacity = max_bucket_count();
        } else {
            m_max_bucket_capacity = static_cast<value_idx_type>(static_cast<float>(num_buckets) * max_load_factor());
        }
    }

    // Zeroes every bucket, which marks all of them as empty.
    void clear_buckets() {
        if constexpr (!IsSegmented && std::is_same_v<BucketContainer, default_container_t>) {
            // std::vector storage: wipe the whole array at once
            std::memset(m_buckets.data(), 0, sizeof(Bucket) * bucket_count());
        } else {
            // otherwise wipe bucket by bucket
            for (auto&& slot : m_buckets) {
                std::memset(&slot, 0, sizeof(slot));
            }
        }
    }

    void clear_and_fill_buckets_from_values() {
        clear_buckets();
        for (value_idx_type value_idx = 0, end_idx = static_cast<value_idx_type>(m_values.size()); value_idx < end_idx;
             ++value_idx) {
            auto const& key = get_key(m_values[value_idx]);
            auto [dist_and_fingerprint, bucket] = next_while_less(key);

            // we know for certain that key has not yet been inserted, so no need to check it.
            place_and_shift_up({dist_and_fingerprint, value_idx}, bucket);
        }
    }

    // Enlarges the bucket array by one step (m_shifts is decremented, see calc_num_buckets) and re-indexes all
    // values. Called after a value has already been appended to m_values.
    void increase_size() {
        if (m_max_bucket_capacity == max_bucket_count()) {
            // remove the value again, we can't add it!
            m_values.pop_back();
            // NOTE(review): presumably this does not return (throws or aborts) - otherwise execution would continue
            // below; confirm at its definition.
            on_error_bucket_overflow();
        }
        --m_shifts;
        if constexpr (!IsSegmented || std::is_same_v<BucketContainer, default_container_t>) {
            deallocate_buckets();
        }
        allocate_buckets_from_shift();
        clear_and_fill_buckets_from_values();
    }

    // Removes the entry referenced by the bucket at bucket_idx.
    //  1. The following buckets are shifted one position down until an empty bucket or one sitting at its home
    //     position is reached; the freed bucket is cleared.
    //  2. handle_erased_value is invoked with the (moved-from-able) value that is being removed.
    //  3. To keep m_values dense, the last value is moved into the freed slot and the bucket referring to it is
    //     patched to the new index; finally the last slot is popped.
    template <typename Op>
    void do_erase(value_idx_type bucket_idx, Op handle_erased_value) {
        auto const value_idx_to_remove = at(m_buckets, bucket_idx).m_value_idx;

        // shift down until either empty or an element with correct spot is found
        auto next_bucket_idx = next(bucket_idx);
        // a distance of at least two steps means the entry is not in its home bucket and can move one down
        while (at(m_buckets, next_bucket_idx).m_dist_and_fingerprint >= Bucket::dist_inc * 2) {
            at(m_buckets, bucket_idx) = {dist_dec(at(m_buckets, next_bucket_idx).m_dist_and_fingerprint),
                                         at(m_buckets, next_bucket_idx).m_value_idx};
            bucket_idx = std::exchange(next_bucket_idx, next(next_bucket_idx));
        }
        at(m_buckets, bucket_idx) = {};
        handle_erased_value(std::move(m_values[value_idx_to_remove]));

        // update m_values
        if (value_idx_to_remove != m_values.size() - 1) {
            // no luck, we'll have to replace the value with the last one and update the index accordingly
            auto& val = m_values[value_idx_to_remove];
            val = std::move(m_values.back());

            // update the values_idx of the moved entry. No need to play the info game, just look until we find the values_idx
            auto mh = mixed_hash(get_key(val));
            bucket_idx = bucket_idx_from_hash(mh);

            auto const values_idx_back = static_cast<value_idx_type>(m_values.size() - 1);
            while (values_idx_back != at(m_buckets, bucket_idx).m_value_idx) {
                bucket_idx = next(bucket_idx);
            }
            at(m_buckets, bucket_idx).m_value_idx = value_idx_to_remove;
        }
        m_values.pop_back();
    }

    // Erases the entry whose key compares equal to `key`, if there is one.
    // handle_erased_value is forwarded to do_erase. Returns the number of erased entries (0 or 1).
    template <typename K, typename Op>
    auto do_erase_key(K&& key, Op handle_erased_value) -> size_t {
        if (empty()) {
            return 0;
        }

        auto [probe, idx] = next_while_less(key);

        for (;;) {
            auto const& slot = at(m_buckets, idx);
            if (probe != slot.m_dist_and_fingerprint) {
                // the run of candidate buckets is over, key is not stored
                return 0;
            }
            if (m_equal(key, get_key(m_values[slot.m_value_idx]))) {
                break;
            }
            // same distance and fingerprint but a different key: try the following bucket
            probe = dist_inc(probe);
            idx = next(idx);
        }

        do_erase(idx, handle_erased_value);
        return 1;
    }

    // Inserts key with the given mapped value, or overwrites the mapped value of an entry that already exists.
    // Returns the iterator to the entry and whether an insertion took place.
    template <class K, class M>
    auto do_insert_or_assign(K&& key, M&& mapped) -> std::pair<iterator, bool> {
        auto result = try_emplace(std::forward<K>(key), std::forward<M>(mapped));
        if (result.second) {
            return result;
        }
        // key was already present: assign the mapped value
        result.first->second = std::forward<M>(mapped);
        return result;
    }

    // Appends a new value built from args to m_values and makes it reachable through the bucket array: either by
    // growing and rebuilding the buckets (when full) or by placing one bucket starting at bucket_idx.
    // Returns the iterator to the new value together with `true`.
    template <typename... Args>
    auto do_place_element(dist_and_fingerprint_type dist_and_fingerprint, value_idx_type bucket_idx, Args&&... args)
        -> std::pair<iterator, bool> {

        // emplace the new value. If that throws an exception, no harm done; index is still in a valid state
        m_values.emplace_back(std::forward<Args>(args)...);
        auto const new_idx = static_cast<value_idx_type>(m_values.size() - 1);

        if (ANKERL_UNORDERED_DENSE_LIKELY(!is_full())) {
            // place element and shift up until we find an empty spot
            place_and_shift_up({dist_and_fingerprint, new_idx}, bucket_idx);
        } else {
            // rebuilding the buckets indexes the freshly appended value as well
            increase_size();
        }

        return {begin() + static_cast<difference_type>(new_idx), true};
    }

    // Looks key up and inserts a new entry only if it is missing.
    // Returns {iterator to the existing entry, false} or {iterator to the new entry, true}. The new entry is built
    // piecewise from the forwarded key and args.
    template <typename K, typename... Args>
    auto do_try_emplace(K&& key, Args&&... args) -> std::pair<iterator, bool> {
        auto const h = mixed_hash(key);
        auto probe = dist_and_fingerprint_from_hash(h);
        auto idx = bucket_idx_from_hash(h);

        for (;;) {
            auto const& slot = at(m_buckets, idx);
            if (probe == slot.m_dist_and_fingerprint) {
                // candidate: same distance and fingerprint, compare the real keys
                if (m_equal(key, get_key(m_values[slot.m_value_idx]))) {
                    return {begin() + static_cast<difference_type>(slot.m_value_idx), false};
                }
            } else if (probe > slot.m_dist_and_fingerprint) {
                // key is not stored; the new entry goes into this position
                return do_place_element(probe,
                                        idx,
                                        std::piecewise_construct,
                                        std::forward_as_tuple(std::forward<K>(key)),
                                        std::forward_as_tuple(std::forward<Args>(args)...));
            }
            probe = dist_inc(probe);
            idx = next(idx);
        }
    }

    // Returns the iterator to the entry whose key equals `key`, or end() if there is none.
    // A bucket is a candidate when its distance+fingerprint word equals the probe's; only then are the keys compared.
    // The search ends unsuccessfully at the first bucket whose word is smaller than the probe's.
    template <typename K>
    auto do_find(K const& key) -> iterator {
        if (ANKERL_UNORDERED_DENSE_UNLIKELY(empty())) {
            return end();
        }

        auto mh = mixed_hash(key);
        auto dist_and_fingerprint = dist_and_fingerprint_from_hash(mh);
        auto bucket_idx = bucket_idx_from_hash(mh);
        auto* bucket = &at(m_buckets, bucket_idx);

        // unrolled loop. *Always* check a few directly, then enter the loop. This is faster.
        // 1st probe: home bucket
        if (dist_and_fingerprint == bucket->m_dist_and_fingerprint && m_equal(key, get_key(m_values[bucket->m_value_idx]))) {
            return begin() + static_cast<difference_type>(bucket->m_value_idx);
        }
        dist_and_fingerprint = dist_inc(dist_and_fingerprint);
        bucket_idx = next(bucket_idx);
        bucket = &at(m_buckets, bucket_idx);

        // 2nd probe
        if (dist_and_fingerprint == bucket->m_dist_and_fingerprint && m_equal(key, get_key(m_values[bucket->m_value_idx]))) {
            return begin() + static_cast<difference_type>(bucket->m_value_idx);
        }
        dist_and_fingerprint = dist_inc(dist_and_fingerprint);
        bucket_idx = next(bucket_idx);
        bucket = &at(m_buckets, bucket_idx);

        // remaining probes: generic loop, which is also the only place that detects "not found"
        while (true) {
            if (dist_and_fingerprint == bucket->m_dist_and_fingerprint) {
                if (m_equal(key, get_key(m_values[bucket->m_value_idx]))) {
                    return begin() + static_cast<difference_type>(bucket->m_value_idx);
                }
            } else if (dist_and_fingerprint > bucket->m_dist_and_fingerprint) {
                return end();
            }
            dist_and_fingerprint = dist_inc(dist_and_fingerprint);
            bucket_idx = next(bucket_idx);
            bucket = &at(m_buckets, bucket_idx);
        }
    }

    // Read-only lookup: reuses the non-const do_find and converts its result to a const_iterator.
    template <typename K>
    auto do_find(K const& key) const -> const_iterator {
        auto* self = const_cast<table*>(this); // NOLINT(cppcoreguidelines-pro-type-const-cast)
        return self->do_find(key);
    }

    // Map only: returns the mapped value stored for key. When the key is missing, on_error_key_not_found() is called.
    template <typename K, typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>
    auto do_at(K const& key) -> Q& {
        auto it = find(key);
        if (ANKERL_UNORDERED_DENSE_LIKELY(end() != it)) {
            return it->second;
        }
        on_error_key_not_found();
    }

    // Map only, read-only access: delegates to the non-const at() and hands the result out as a const reference.
    template <typename K, typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>
    auto do_at(K const& key) const -> Q const& {
        auto* self = const_cast<table*>(this); // NOLINT(cppcoreguidelines-pro-type-const-cast)
        return self->at(key);
    }

public:
    explicit table(size_t bucket_count,
                   Hash const& hash = Hash(),
                   KeyEqual const& equal = KeyEqual(),
                   allocator_type const& alloc_or_container = allocator_type())
        : m_values(alloc_or_container)
        , m_buckets(alloc_or_container)
        , m_hash(hash)
        , m_equal(equal) {
        if (0 != bucket_count) {
            reserve(bucket_count);
        } else {
            allocate_buckets_from_shift();
            clear_buckets();
        }
    }

    // Convenience constructors. All of them delegate to the primary constructor and fill the parameters that are
    // not given with default constructed objects (bucket_count 0 where it is not given).

    // Default constructor: bucket_count 0.
    table()
        : table(0) {}

    // bucket_count and allocator.
    table(size_t bucket_count, allocator_type const& alloc)
        : table(bucket_count, Hash(), KeyEqual(), alloc) {}

    // bucket_count, hasher and allocator.
    table(size_t bucket_count, Hash const& hash, allocator_type const& alloc)
        : table(bucket_count, hash, KeyEqual(), alloc) {}

    // Allocator only: bucket_count 0.
    explicit table(allocator_type const& alloc)
        : table(0, Hash(), KeyEqual(), alloc) {}

    // Range constructor: sets the table up like the primary constructor, then inserts [first, last).
    template <class InputIt>
    table(InputIt first,
          InputIt last,
          size_type bucket_count = 0,
          Hash const& hash = Hash(),
          KeyEqual const& equal = KeyEqual(),
          allocator_type const& alloc = allocator_type())
        : table(bucket_count, hash, equal, alloc) {
        insert(first, last);
    }

    // Range constructor with default constructed hasher and key comparison.
    template <class InputIt>
    table(InputIt first, InputIt last, size_type bucket_count, allocator_type const& alloc)
        : table(first, last, bucket_count, Hash(), KeyEqual(), alloc) {}

    // Range constructor with default constructed key comparison.
    template <class InputIt>
    table(InputIt first, InputIt last, size_type bucket_count, Hash const& hash, allocator_type const& alloc)
        : table(first, last, bucket_count, hash, KeyEqual(), alloc) {}

    // Copy constructor: copies `other` using the allocator of other's value container.
    table(table const& other)
        : table(other, other.m_values.get_allocator()) {}

    // Allocator-extended copy constructor: copies values, max load factor, hasher and equality
    // functor member by member. m_buckets is not named in the initializer list; the bucket state
    // is established afterwards by copy_buckets(other).
    table(table const& other, allocator_type const& alloc)
        : m_values(other.m_values, alloc)
        , m_max_load_factor(other.m_max_load_factor)
        , m_hash(other.m_hash)
        , m_equal(other.m_equal) {
        copy_buckets(other);
    }

    // Move constructor: delegates to the allocator-extended move constructor with other's
    // allocator. std::move() is only a cast, so reading other.m_values in the same argument list
    // is still valid.
    table(table&& other) noexcept
        : table(std::move(other), other.m_values.get_allocator()) {}

    // Allocator-extended move constructor: starts with an empty value container bound to `alloc`
    // and then reuses move assignment to take over other's contents.
    table(table&& other, allocator_type const& alloc) noexcept
        : m_values(alloc) {
        *this = std::move(other);
    }

    // Initializer-list constructor: builds an empty table via the primary constructor, then
    // inserts the listed elements through insert(ilist).
    table(std::initializer_list<value_type> ilist,
          size_t bucket_count = 0,
          Hash const& hash = Hash(),
          KeyEqual const& equal = KeyEqual(),
          allocator_type const& alloc = allocator_type())
        : table(bucket_count, hash, equal, alloc) {
        insert(ilist);
    }

    // Initializer-list overloads that default-construct Hash and/or KeyEqual and forward to the
    // full initializer-list constructor.
    table(std::initializer_list<value_type> ilist, size_type bucket_count, allocator_type const& alloc)
        : table(ilist, bucket_count, Hash(), KeyEqual(), alloc) {}

    table(std::initializer_list<value_type> init, size_type bucket_count, Hash const& hash, allocator_type const& alloc)
        : table(init, bucket_count, hash, KeyEqual(), alloc) {}

    // Nothing to release by hand: the destructor body is empty and member destructors do the work.
    ~table() = default;

    // Copy assignment. Self-assignment is a no-op. Otherwise the current buckets are released
    // first, then values, load factor, hasher and equality functor are copied, m_shifts is reset
    // to initial_shifts and the bucket state is rebuilt from `other` by copy_buckets().
    auto operator=(table const& other) -> table& {
        if (this == &other) {
            return *this;
        }

        // release the buckets before m_values is assigned (it might bring another allocator)
        deallocate_buckets();

        m_values = other.m_values;
        m_max_load_factor = other.m_max_load_factor;
        m_hash = other.m_hash;
        m_equal = other.m_equal;

        m_shifts = initial_shifts;
        copy_buckets(other);
        return *this;
    }

    // Move assignment. Self-assignment is a no-op. The values are always moved over; the bucket
    // array is only taken over when both tables' allocators compare equal, otherwise the bucket
    // state is rebuilt through copy_buckets(). In both paths other's values and buckets are
    // cleared, so `other` ends up empty but usable.
    //
    // NOTE(review): noexcept(noexcept(<bool expression>)) asks whether *evaluating* the boolean
    // expression can throw. Combining constant bools cannot throw, so this specification appears
    // to be noexcept(true) regardless of what the traits report -- confirm whether that is intended.
    auto operator=(table&& other) noexcept(noexcept(std::is_nothrow_move_assignable_v<value_container_type> &&
                                                    std::is_nothrow_move_assignable_v<Hash> &&
                                                    std::is_nothrow_move_assignable_v<KeyEqual>)) -> table& {
        if (&other != this) {
            deallocate_buckets(); // deallocate before m_values is set (might have another allocator)
            m_values = std::move(other.m_values);
            // make sure the moved-from container really is empty
            other.m_values.clear();

            // we can only reuse m_buckets when both maps have the same allocator!
            if (get_allocator() == other.get_allocator()) {
                // take over other's bucket array and settings; std::exchange resets each of
                // other's members to its initial/default value in the same step
                m_buckets = std::move(other.m_buckets);
                other.m_buckets.clear();
                m_max_bucket_capacity = std::exchange(other.m_max_bucket_capacity, 0);
                m_shifts = std::exchange(other.m_shifts, initial_shifts);
                m_max_load_factor = std::exchange(other.m_max_load_factor, default_max_load_factor);
                m_hash = std::exchange(other.m_hash, {});
                m_equal = std::exchange(other.m_equal, {});
                // give other a fresh, empty bucket array matching its reset m_shifts
                other.allocate_buckets_from_shift();
                other.clear_buckets();
            } else {
                // set max_load_factor *before* copying the other's buckets, so we have the same
                // behavior
                m_max_load_factor = other.m_max_load_factor;

                // copy_buckets sets m_buckets, m_num_buckets, m_max_bucket_capacity, m_shifts
                copy_buckets(other);
                // clears the other's buckets so other is now already usable.
                other.clear_buckets();
                // hasher and equality functor are copied (not moved) in this path
                m_hash = other.m_hash;
                m_equal = other.m_equal;
            }
            // map "other" is now already usable, it's empty.
        }
        return *this;
    }

    // Initializer-list assignment: drops the current contents, then inserts the listed elements.
    auto operator=(std::initializer_list<value_type> ilist) -> table& {
        clear();
        insert(ilist);
        return *this;
    }

    // Returns the allocator of the value container (m_values).
    auto get_allocator() const noexcept -> allocator_type {
        return m_values.get_allocator();
    }

    // iterators //////////////////////////////////////////////////////////////

    // All iterator accessors forward directly to the value container m_values, so iteration
    // walks the elements in the order in which they are stored there.

    auto begin() noexcept -> iterator {
        return m_values.begin();
    }

    auto begin() const noexcept -> const_iterator {
        return m_values.begin();
    }

    auto cbegin() const noexcept -> const_iterator {
        return m_values.cbegin();
    }

    auto end() noexcept -> iterator {
        return m_values.end();
    }

    auto cend() const noexcept -> const_iterator {
        return m_values.cend();
    }

    auto end() const noexcept -> const_iterator {
        return m_values.end();
    }

    // capacity ///////////////////////////////////////////////////////////////

    // True when the value container holds no elements.
    [[nodiscard]] auto empty() const noexcept -> bool {
        return m_values.empty();
    }

    // Number of stored elements, as reported by the value container.
    [[nodiscard]] auto size() const noexcept -> size_t {
        return m_values.size();
    }

    // Upper bound on the number of elements, derived from the width of value_idx_type:
    // 2^(bits of value_idx_type). When value_idx_type has the same maximum as size_t that value
    // would not fit (and a shift by the full width is invalid), so one bit less is used.
    [[nodiscard]] static constexpr auto max_size() noexcept -> size_t {
        constexpr auto idx_bits = sizeof(value_idx_type) * 8;
        if constexpr ((std::numeric_limits<value_idx_type>::max)() == (std::numeric_limits<size_t>::max)()) {
            return size_t{1} << (idx_bits - 1);
        } else {
            return size_t{1} << idx_bits;
        }
    }

    // modifiers //////////////////////////////////////////////////////////////

    // Removes all elements: empties the value container and resets the buckets via
    // clear_buckets(). No deallocation of the bucket array is requested here.
    void clear() {
        m_values.clear();
        clear_buckets();
    }

    // Single-element insert overloads. All of them forward to emplace(); the returned pair holds
    // an iterator to the element and a bool that is false when emplace() found the key already
    // present.

    auto insert(value_type const& value) -> std::pair<iterator, bool> {
        return emplace(value);
    }

    auto insert(value_type&& value) -> std::pair<iterator, bool> {
        return emplace(std::move(value));
    }

    // Accepts anything value_type can be constructed from and perfect-forwards it.
    template <class P, std::enable_if_t<std::is_constructible_v<value_type, P&&>, bool> = true>
    auto insert(P&& value) -> std::pair<iterator, bool> {
        return emplace(std::forward<P>(value));
    }

    // Hinted overloads: the hint is accepted for interface compatibility but ignored; only the
    // iterator part of the result is returned.

    auto insert(const_iterator /*hint*/, value_type const& value) -> iterator {
        return insert(value).first;
    }

    auto insert(const_iterator /*hint*/, value_type&& value) -> iterator {
        return insert(std::move(value)).first;
    }

    template <class P, std::enable_if_t<std::is_constructible_v<value_type, P&&>, bool> = true>
    auto insert(const_iterator /*hint*/, P&& value) -> iterator {
        return insert(std::forward<P>(value)).first;
    }

    // Range insert: feeds every element of [first, last) to the single-element insert(), in order.
    template <class InputIt>
    void insert(InputIt first, InputIt last) {
        for (; first != last; ++first) {
            insert(*first);
        }
    }

    // Initializer-list insert: forwards to the range insert over the list's elements.
    void insert(std::initializer_list<value_type> ilist) {
        insert(ilist.begin(), ilist.end());
    }

    // nonstandard API: moves the whole value container out of the table; *this is emptied.
    // Only callable on rvalues (&&-qualified). The buckets are not touched here.
    // Also see "A Standard flat_map" https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/p0429r9.pdf
    auto extract() && -> value_container_type {
        return std::move(m_values);
    }

    // nonstandard API:
    // Discards the internally held container and replaces it with the one passed. Erases non-unique elements.
    //
    // container - taken by rvalue reference and moved into m_values. If it holds more than
    //             max_size() elements, on_error_too_many_elements() is called.
    //
    // For duplicated keys the first occurrence (in container order) stays; each later duplicate
    // is overwritten with the current last element and the container shrinks by one, so the
    // relative order of the remaining elements can change.
    auto replace(value_container_type&& container) {
        if (ANKERL_UNORDERED_DENSE_UNLIKELY(container.size() > max_size())) {
            on_error_too_many_elements();
        }
        // Reallocate the bucket array when there is none yet, when the shift computed for the
        // new size is smaller than the current one, or when the incoming container brings a
        // different allocator.
        auto shifts = calc_shifts_for_size(container.size());
        if (0 == bucket_count() || shifts < m_shifts || container.get_allocator() != m_values.get_allocator()) {
            m_shifts = shifts;
            deallocate_buckets();
            allocate_buckets_from_shift();
        }
        clear_buckets();

        m_values = std::move(container);

        // can't use clear_and_fill_buckets_from_values() because container elements might not be unique
        auto value_idx = value_idx_type{};

        // loop until we reach the end of the container. duplicated entries will be replaced with back().
        // m_values.size() is re-read on every iteration because it shrinks when duplicates are dropped.
        while (value_idx != static_cast<value_idx_type>(m_values.size())) {
            auto const& key = get_key(m_values[value_idx]);

            auto hash = mixed_hash(key);
            auto dist_and_fingerprint = dist_and_fingerprint_from_hash(hash);
            auto bucket_idx = bucket_idx_from_hash(hash);

            // Probe forward from the key's starting bucket. The probe stops either at the first
            // bucket whose dist_and_fingerprint is smaller than ours (key not present; this is
            // the insertion position) or at a bucket holding an equal key.
            bool key_found = false;
            while (true) {
                auto const& bucket = at(m_buckets, bucket_idx);
                if (dist_and_fingerprint > bucket.m_dist_and_fingerprint) {
                    break;
                }
                if (dist_and_fingerprint == bucket.m_dist_and_fingerprint &&
                    m_equal(key, get_key(m_values[bucket.m_value_idx]))) {
                    key_found = true;
                    break;
                }
                dist_and_fingerprint = dist_inc(dist_and_fingerprint);
                bucket_idx = next(bucket_idx);
            }

            if (key_found) {
                // duplicate: overwrite it with the last element (unless it is the last element)
                // and shrink. value_idx is NOT advanced, so the moved-in element is examined next.
                if (value_idx != static_cast<value_idx_type>(m_values.size() - 1)) {
                    m_values[value_idx] = std::move(m_values.back());
                }
                m_values.pop_back();
            } else {
                // new key: record it in the buckets at the position found by the probe
                place_and_shift_up({dist_and_fingerprint, value_idx}, bucket_idx);
                ++value_idx;
            }
        }
    }

    // insert_or_assign overloads (maps only). Each forwards key and mapped value to
    // do_insert_or_assign(), which is defined outside this section and performs the actual work.
    // The overloads differ only in how the key is passed: const reference, rvalue, or -- when
    // Hash and KeyEqual are transparent -- any forwarded key type K.

    template <class M, typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>
    auto insert_or_assign(Key const& key, M&& mapped) -> std::pair<iterator, bool> {
        return do_insert_or_assign(key, std::forward<M>(mapped));
    }

    template <class M, typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>
    auto insert_or_assign(Key&& key, M&& mapped) -> std::pair<iterator, bool> {
        return do_insert_or_assign(std::move(key), std::forward<M>(mapped));
    }

    template <typename K,
              typename M,
              typename Q = T,
              typename H = Hash,
              typename KE = KeyEqual,
              std::enable_if_t<is_map_v<Q> && is_transparent_v<H, KE>, bool> = true>
    auto insert_or_assign(K&& key, M&& mapped) -> std::pair<iterator, bool> {
        return do_insert_or_assign(std::forward<K>(key), std::forward<M>(mapped));
    }

    // Hinted variants: the hint is ignored and only the iterator part of the result is returned.

    template <class M, typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>
    auto insert_or_assign(const_iterator /*hint*/, Key const& key, M&& mapped) -> iterator {
        return do_insert_or_assign(key, std::forward<M>(mapped)).first;
    }

    template <class M, typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>
    auto insert_or_assign(const_iterator /*hint*/, Key&& key, M&& mapped) -> iterator {
        return do_insert_or_assign(std::move(key), std::forward<M>(mapped)).first;
    }

    template <typename K,
              typename M,
              typename Q = T,
              typename H = Hash,
              typename KE = KeyEqual,
              std::enable_if_t<is_map_v<Q> && is_transparent_v<H, KE>, bool> = true>
    auto insert_or_assign(const_iterator /*hint*/, K&& key, M&& mapped) -> iterator {
        return do_insert_or_assign(std::forward<K>(key), std::forward<M>(mapped)).first;
    }

    // Single arguments for unordered_set can be used without having to construct the value_type
    //
    // Enabled only for sets with transparent Hash/KeyEqual. The key is hashed and compared as-is;
    // an element is only created (by do_place_element) when no equal key is stored yet.
    // Returns {iterator to the element, true if it was newly inserted}.
    template <class K,
              typename Q = T,
              typename H = Hash,
              typename KE = KeyEqual,
              std::enable_if_t<!is_map_v<Q> && is_transparent_v<H, KE>, bool> = true>
    auto emplace(K&& key) -> std::pair<iterator, bool> {
        auto hash = mixed_hash(key);
        auto dist_and_fingerprint = dist_and_fingerprint_from_hash(hash);
        auto bucket_idx = bucket_idx_from_hash(hash);

        // probe forward while the stored dist_and_fingerprint is not smaller than ours; a smaller
        // stored value means the key cannot be further along the probe sequence
        while (dist_and_fingerprint <= at(m_buckets, bucket_idx).m_dist_and_fingerprint) {
            if (dist_and_fingerprint == at(m_buckets, bucket_idx).m_dist_and_fingerprint &&
                m_equal(key, m_values[at(m_buckets, bucket_idx).m_value_idx])) {
                // found it, return without ever actually creating anything
                return {begin() + static_cast<difference_type>(at(m_buckets, bucket_idx).m_value_idx), false};
            }
            dist_and_fingerprint = dist_inc(dist_and_fingerprint);
            bucket_idx = next(bucket_idx);
        }

        // value is new, insert element first, so when exception happens we are in a valid state
        return do_place_element(dist_and_fingerprint, bucket_idx, std::forward<K>(key));
    }

    // General emplace: constructs the element in place at the back of m_values from `args`, then
    // checks whether its key already exists. If it does, the freshly built element is popped
    // again. Returns {iterator to the element with that key, true if it was newly inserted}.
    template <class... Args>
    auto emplace(Args&&... args) -> std::pair<iterator, bool> {
        // we have to instantiate the value_type to be able to access the key.
        // 1. emplace_back the object so it is constructed. 2. If the key is already there, pop it later in the loop.
        // `key` refers into the element just appended to m_values.
        auto& key = get_key(m_values.emplace_back(std::forward<Args>(args)...));
        auto hash = mixed_hash(key);
        auto dist_and_fingerprint = dist_and_fingerprint_from_hash(hash);
        auto bucket_idx = bucket_idx_from_hash(hash);

        // probe forward while the stored dist_and_fingerprint is not smaller than ours
        while (dist_and_fingerprint <= at(m_buckets, bucket_idx).m_dist_and_fingerprint) {
            if (dist_and_fingerprint == at(m_buckets, bucket_idx).m_dist_and_fingerprint &&
                m_equal(key, get_key(m_values[at(m_buckets, bucket_idx).m_value_idx]))) {
                m_values.pop_back(); // value was already there, so get rid of it
                // `key` is dangling from here on and is not used again
                return {begin() + static_cast<difference_type>(at(m_buckets, bucket_idx).m_value_idx), false};
            }
            dist_and_fingerprint = dist_inc(dist_and_fingerprint);
            bucket_idx = next(bucket_idx);
        }

        // value is new, place the bucket and shift up until we find an empty spot
        // the new element is the last one in m_values
        auto value_idx = static_cast<value_idx_type>(m_values.size() - 1);
        if (ANKERL_UNORDERED_DENSE_UNLIKELY(is_full())) {
            // increase_size just rehashes all the data we have in m_values
            increase_size();
        } else {
            // place element and shift up until we find an empty spot
            place_and_shift_up({dist_and_fingerprint, value_idx}, bucket_idx);
        }
        return {begin() + static_cast<difference_type>(value_idx), true};
    }

    // Hinted emplace: the hint is ignored; forwards to emplace() and returns only the iterator.
    template <class... Args>
    auto emplace_hint(const_iterator /*hint*/, Args&&... args) -> iterator {
        return emplace(std::forward<Args>(args)...).first;
    }

    // try_emplace overloads (maps only). Each forwards the key and the mapped-value constructor
    // arguments to do_try_emplace(), which is defined outside this section. The overloads differ
    // only in how the key is passed and in whether an (ignored) hint iterator is accepted.

    template <class... Args, typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>
    auto try_emplace(Key const& key, Args&&... args) -> std::pair<iterator, bool> {
        return do_try_emplace(key, std::forward<Args>(args)...);
    }

    template <class... Args, typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>
    auto try_emplace(Key&& key, Args&&... args) -> std::pair<iterator, bool> {
        return do_try_emplace(std::move(key), std::forward<Args>(args)...);
    }

    // Hinted variants: the hint is ignored and only the iterator part of the result is returned.

    template <class... Args, typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>
    auto try_emplace(const_iterator /*hint*/, Key const& key, Args&&... args) -> iterator {
        return do_try_emplace(key, std::forward<Args>(args)...).first;
    }

    template <class... Args, typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>
    auto try_emplace(const_iterator /*hint*/, Key&& key, Args&&... args) -> iterator {
        return do_try_emplace(std::move(key), std::forward<Args>(args)...).first;
    }

    // Transparent-key variants. They are additionally disabled when K&& is convertible to
    // iterator or const_iterator, so a call that passes a hint iterator first is not captured
    // by the un-hinted overload.

    template <
        typename K,
        typename... Args,
        typename Q = T,
        typename H = Hash,
        typename KE = KeyEqual,
        std::enable_if_t<is_map_v<Q> && is_transparent_v<H, KE> && is_neither_convertible_v<K&&, iterator, const_iterator>,
                         bool> = true>
    auto try_emplace(K&& key, Args&&... args) -> std::pair<iterator, bool> {
        return do_try_emplace(std::forward<K>(key), std::forward<Args>(args)...);
    }

    template <
        typename K,
        typename... Args,
        typename Q = T,
        typename H = Hash,
        typename KE = KeyEqual,
        std::enable_if_t<is_map_v<Q> && is_transparent_v<H, KE> && is_neither_convertible_v<K&&, iterator, const_iterator>,
                         bool> = true>
    auto try_emplace(const_iterator /*hint*/, K&& key, Args&&... args) -> iterator {
        return do_try_emplace(std::forward<K>(key), std::forward<Args>(args)...).first;
    }

    // Erases the element `it` points to and returns an iterator to the same index in the value
    // container. `it` must refer to an element of this table: the bucket search below has no
    // exit other than finding the bucket that references that element.
    auto erase(iterator it) -> iterator {
        auto hash = mixed_hash(get_key(*it));
        auto bucket_idx = bucket_idx_from_hash(hash);

        // walk the probe sequence from the key's starting bucket until we hit the bucket that
        // stores this element's index (no key comparison needed, the index identifies it)
        auto const value_idx_to_remove = static_cast<value_idx_type>(it - cbegin());
        while (at(m_buckets, bucket_idx).m_value_idx != value_idx_to_remove) {
            bucket_idx = next(bucket_idx);
        }

        // the removed value is not needed, so the callback discards it
        do_erase(bucket_idx, [](value_type&& /*unused*/) {
        });
        return begin() + static_cast<difference_type>(value_idx_to_remove);
    }

    // Removes the element `it` points to and returns it by value. Same bucket lookup as
    // erase(iterator); `it` must refer to an element of this table.
    auto extract(iterator it) -> value_type {
        auto hash = mixed_hash(get_key(*it));
        auto bucket_idx = bucket_idx_from_hash(hash);

        // find the bucket that stores this element's index
        auto const value_idx_to_remove = static_cast<value_idx_type>(it - cbegin());
        while (at(m_buckets, bucket_idx).m_value_idx != value_idx_to_remove) {
            bucket_idx = next(bucket_idx);
        }

        // do_erase hands the removed value to the callback, which stashes it in `tmp`;
        // an optional is used so value_type does not need to be default-constructible
        auto tmp = std::optional<value_type>{};
        do_erase(bucket_idx, [&tmp](value_type&& val) {
            tmp = std::move(val);
        });
        return std::move(tmp).value();
    }

    // const_iterator overloads, enabled for maps only. The const_iterator is turned into a
    // mutable iterator via its offset from cbegin(), then the iterator overload is called.

    template <typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>
    auto erase(const_iterator it) -> iterator {
        return erase(begin() + (it - cbegin()));
    }

    template <typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>
    auto extract(const_iterator it) -> value_type {
        return extract(begin() + (it - cbegin()));
    }

    // Erases all elements in [first, last) and returns an iterator to the position where the
    // erased range started. Works in two passes around a pivot index:
    //   pivot = start of range + min(length of range, number of elements behind the range)
    auto erase(const_iterator first, const_iterator last) -> iterator {
        auto const range_begin = first - cbegin();
        auto const range_end = last - cbegin();
        auto const range_len = std::distance(first, last);
        auto const tail_len = std::distance(last, cend());
        auto const pivot = range_begin + (std::min)(range_len, tail_len);

        // pass 1: erase left to right up to the pivot; this moves elements from the end back
        for (auto pos = range_begin; pos != pivot; ++pos) {
            erase(begin() + pos);
        }

        // pass 2: everything from the right has been moved; erase the rest from the back
        // of the range down to the pivot
        for (auto pos = range_end; pos != pivot;) {
            --pos;
            erase(begin() + pos);
        }

        return begin() + range_begin;
    }

    // Key-based erase/extract. All four forward to do_erase_key(), which is defined outside this
    // section; it is handed a callback that receives the removed value.

    // Erases by key and discards the value. The result of do_erase_key() is passed through --
    // presumably the number of erased elements; confirm against do_erase_key().
    auto erase(Key const& key) -> size_t {
        return do_erase_key(key, [](value_type&& /*unused*/) {
        });
    }

    // Removes by key and returns the removed value; the optional stays empty if the callback
    // was never invoked.
    auto extract(Key const& key) -> std::optional<value_type> {
        auto tmp = std::optional<value_type>{};
        do_erase_key(key, [&tmp](value_type&& val) {
            tmp = std::move(val);
        });
        return tmp;
    }

    // Transparent-key variant of erase(key); only enabled when Hash and KeyEqual are transparent.
    template <class K, class H = Hash, class KE = KeyEqual, std::enable_if_t<is_transparent_v<H, KE>, bool> = true>
    auto erase(K&& key) -> size_t {
        return do_erase_key(std::forward<K>(key), [](value_type&& /*unused*/) {
        });
    }

    // Transparent-key variant of extract(key); only enabled when Hash and KeyEqual are transparent.
    template <class K, class H = Hash, class KE = KeyEqual, std::enable_if_t<is_transparent_v<H, KE>, bool> = true>
    auto extract(K&& key) -> std::optional<value_type> {
        auto tmp = std::optional<value_type>{};
        do_erase_key(std::forward<K>(key), [&tmp](value_type&& val) {
            tmp = std::move(val);
        });
        return tmp;
    }

    // Exchanges the contents of *this and `other` by calling swap() on the two whole tables,
    // with std::swap made visible by the using-declaration.
    //
    // NOTE(review): noexcept(noexcept(<bool expression>)) tests whether evaluating the boolean
    // expression can throw, not its value, so this specification appears to be always
    // noexcept(true) -- confirm whether that is intended.
    void swap(table& other) noexcept(noexcept(std::is_nothrow_swappable_v<value_container_type> &&
                                              std::is_nothrow_swappable_v<Hash> && std::is_nothrow_swappable_v<KeyEqual>)) {
        using std::swap;
        swap(other, *this);
    }

    // lookup /////////////////////////////////////////////////////////////////

    // Checked element access (maps only). All overloads forward to do_at(), which reports a
    // missing key through on_error_key_not_found(). The K-templated overloads are only enabled
    // when Hash and KeyEqual are transparent.

    template <typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>
    auto at(key_type const& key) -> Q& {
        return do_at(key);
    }

    template <typename K,
              typename Q = T,
              typename H = Hash,
              typename KE = KeyEqual,
              std::enable_if_t<is_map_v<Q> && is_transparent_v<H, KE>, bool> = true>
    auto at(K const& key) -> Q& {
        return do_at(key);
    }

    template <typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>
    auto at(key_type const& key) const -> Q const& {
        return do_at(key);
    }

    template <typename K,
              typename Q = T,
              typename H = Hash,
              typename KE = KeyEqual,
              std::enable_if_t<is_map_v<Q> && is_transparent_v<H, KE>, bool> = true>
    auto at(K const& key) const -> Q const& {
        return do_at(key);
    }

    // Subscript access (maps only): calls try_emplace() with the key and no mapped-value
    // arguments, then returns a reference to the mapped value of the resulting element.
    // Unlike at(), there is no missing-key error path here.

    template <typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>
    auto operator[](Key const& key) -> Q& {
        return try_emplace(key).first->second;
    }

    template <typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>
    auto operator[](Key&& key) -> Q& {
        return try_emplace(std::move(key)).first->second;
    }

    // Transparent-key variant; only enabled when Hash and KeyEqual are transparent.
    template <typename K,
              typename Q = T,
              typename H = Hash,
              typename KE = KeyEqual,
              std::enable_if_t<is_map_v<Q> && is_transparent_v<H, KE>, bool> = true>
    auto operator[](K&& key) -> Q& {
        return try_emplace(std::forward<K>(key)).first->second;
    }

    // Number of elements with the given key: 1 if find() locates it, otherwise 0.
    auto count(Key const& key) const -> size_t {
        return static_cast<size_t>(find(key) != end());
    }

    // Transparent-key variant; only enabled when Hash and KeyEqual are transparent.
    template <class K, class H = Hash, class KE = KeyEqual, std::enable_if_t<is_transparent_v<H, KE>, bool> = true>
    auto count(K const& key) const -> size_t {
        return static_cast<size_t>(find(key) != end());
    }

    // Lookup by key. All overloads forward to do_find(); the result compares equal to end()
    // when the key is not present (see count()/contains(), which test exactly that).
    // The K-templated overloads are only enabled when Hash and KeyEqual are transparent.

    auto find(Key const& key) -> iterator {
        return do_find(key);
    }

    auto find(Key const& key) const -> const_iterator {
        return do_find(key);
    }

    template <class K, class H = Hash, class KE = KeyEqual, std::enable_if_t<is_transparent_v<H, KE>, bool> = true>
    auto find(K const& key) -> iterator {
        return do_find(key);
    }

    template <class K, class H = Hash, class KE = KeyEqual, std::enable_if_t<is_transparent_v<H, KE>, bool> = true>
    auto find(K const& key) const -> const_iterator {
        return do_find(key);
    }

    // True when find() locates an element with the given key.
    auto contains(Key const& key) const -> bool {
        return find(key) != end();
    }

    // Transparent-key variant; only enabled when Hash and KeyEqual are transparent.
    template <class K, class H = Hash, class KE = KeyEqual, std::enable_if_t<is_transparent_v<H, KE>, bool> = true>
    auto contains(K const& key) const -> bool {
        return find(key) != end();
    }

    // Range of elements matching `key`. A key occurs at most once in the result, so the range is
    // either empty ({end, end}) or covers exactly the one element found ({pos, pos + 1}).
    auto equal_range(Key const& key) -> std::pair<iterator, iterator> {
        auto const pos = do_find(key);
        if (pos == end()) {
            return {pos, pos};
        }
        return {pos, pos + 1};
    }

    auto equal_range(const Key& key) const -> std::pair<const_iterator, const_iterator> {
        auto const pos = do_find(key);
        if (pos == end()) {
            return {pos, pos};
        }
        return {pos, pos + 1};
    }

    // Transparent-key variants; only enabled when Hash and KeyEqual are transparent.

    template <class K, class H = Hash, class KE = KeyEqual, std::enable_if_t<is_transparent_v<H, KE>, bool> = true>
    auto equal_range(K const& key) -> std::pair<iterator, iterator> {
        auto const pos = do_find(key);
        if (pos == end()) {
            return {pos, pos};
        }
        return {pos, pos + 1};
    }

    template <class K, class H = Hash, class KE = KeyEqual, std::enable_if_t<is_transparent_v<H, KE>, bool> = true>
    auto equal_range(K const& key) const -> std::pair<const_iterator, const_iterator> {
        auto const pos = do_find(key);
        if (pos == end()) {
            return {pos, pos};
        }
        return {pos, pos + 1};
    }

    // bucket interface ///////////////////////////////////////////////////////

    // Current number of buckets, i.e. the size of the bucket container.
    auto bucket_count() const noexcept -> size_t { // NOLINT(modernize-use-nodiscard)
        return m_buckets.size();
    }

    // Largest possible bucket count; identical to max_size().
    static constexpr auto max_bucket_count() noexcept -> size_t { // NOLINT(modernize-use-nodiscard)
        return max_size();
    }

    // hash policy ////////////////////////////////////////////////////////////

    // Current fill ratio: size() divided by bucket_count(), or 0 when there are no buckets.
    [[nodiscard]] auto load_factor() const -> float {
        if (!bucket_count()) {
            return 0.0F;
        }
        return static_cast<float>(size()) / static_cast<float>(bucket_count());
    }

    // Returns the configured maximum load factor.
    [[nodiscard]] auto max_load_factor() const -> float {
        return m_max_load_factor;
    }

    // Sets the maximum load factor and recomputes m_max_bucket_capacity from the current bucket
    // count. The capacity is left untouched when the table already has max_bucket_count() buckets.
    void max_load_factor(float ml) {
        m_max_load_factor = ml;
        if (bucket_count() == max_bucket_count()) {
            return;
        }
        m_max_bucket_capacity = static_cast<value_idx_type>(static_cast<float>(bucket_count()) * max_load_factor());
    }

    // Re-sizes the bucket array for `count` elements (clamped to max_size(), and never less than
    // the current size()). Nothing happens when the resulting shift equals the current one;
    // otherwise the buckets are reallocated and refilled from the stored values, and the value
    // container is asked to shrink_to_fit().
    void rehash(size_t count) {
        auto const clamped = (std::min)(count, max_size());
        auto const wanted_shifts = calc_shifts_for_size((std::max)(clamped, size()));
        if (wanted_shifts == m_shifts) {
            return;
        }

        m_shifts = wanted_shifts;
        deallocate_buckets();
        m_values.shrink_to_fit();
        allocate_buckets_from_shift();
        clear_and_fill_buckets_from_values();
    }

    // Prepares the table for `capa` elements (clamped to max_size()). The value container's own
    // reserve() is used when it has one. The bucket array is only rebuilt when there is none yet
    // or when the shift computed for the requested size is smaller than the current m_shifts.
    void reserve(size_t capa) {
        auto const wanted = (std::min)(capa, max_size());
        if constexpr (has_reserve<value_container_type>) {
            // std::deque doesn't have reserve(). Make sure we only call when available
            m_values.reserve(wanted);
        }

        auto const wanted_shifts = calc_shifts_for_size((std::max)(wanted, size()));
        if (0 != bucket_count() && wanted_shifts >= m_shifts) {
            // buckets exist and the shift would not get smaller: keep them
            return;
        }

        m_shifts = wanted_shifts;
        deallocate_buckets();
        allocate_buckets_from_shift();
        clear_and_fill_buckets_from_values();
    }

    // observers //////////////////////////////////////////////////////////////

    // Returns a copy of the hash functor.
    auto hash_function() const -> hasher {
        return m_hash;
    }

    // Returns a copy of the key-equality functor.
    auto key_eq() const -> key_equal {
        return m_equal;
    }

    // nonstandard API: expose the underlying values container
    // (read-only reference; no copy is made)
    [[nodiscard]] auto values() const noexcept -> value_container_type const& {
        return m_values;
    }

    // non-member functions ///////////////////////////////////////////////////

    // Equality: two tables are equal when they have the same number of elements and every
    // element of `b` has a counterpart in `a` -- same key and, for maps, an equal mapped value.
    // Element order does not matter.
    friend auto operator==(table const& a, table const& b) -> bool {
        if (&a == &b) {
            return true;
        }
        if (a.size() != b.size()) {
            return false;
        }
        for (auto const& b_entry : b) {
            auto const match = a.find(get_key(b_entry));
            // both sets and maps: the key has to be present in `a`
            if (a.end() == match) {
                return false;
            }
            if constexpr (is_map_v<T>) {
                // map: the mapped value has to compare equal as well
                if (!(b_entry.second == match->second)) {
                    return false;
                }
            }
        }
        return true;
    }

    // Inequality is the negation of operator==.
    friend auto operator!=(table const& a, table const& b) -> bool {
        return !(a == b);
    }
};

} // namespace detail

// Public container aliases. All four are instantiations of detail::table; they differ in the
// mapped type (T for maps, void for sets) and in the final bool template argument (false for the
// plain variants, true for the segmented_* variants).

// Key -> T map; elements are std::pair<Key, T>.
ANKERL_UNORDERED_DENSE_EXPORT template <class Key,
                                        class T,
                                        class Hash = hash<Key>,
                                        class KeyEqual = std::equal_to<Key>,
                                        class AllocatorOrContainer = std::allocator<std::pair<Key, T>>,
                                        class Bucket = bucket_type::standard,
                                        class BucketContainer = detail::default_container_t>
using map = detail::table<Key, T, Hash, KeyEqual, AllocatorOrContainer, Bucket, BucketContainer, false>;

// Same as map, but instantiates the table with the segmented flag set to true.
ANKERL_UNORDERED_DENSE_EXPORT template <class Key,
                                        class T,
                                        class Hash = hash<Key>,
                                        class KeyEqual = std::equal_to<Key>,
                                        class AllocatorOrContainer = std::allocator<std::pair<Key, T>>,
                                        class Bucket = bucket_type::standard,
                                        class BucketContainer = detail::default_container_t>
using segmented_map = detail::table<Key, T, Hash, KeyEqual, AllocatorOrContainer, Bucket, BucketContainer, true>;

// Set of Key; the table's mapped type is void and the elements are the keys themselves.
ANKERL_UNORDERED_DENSE_EXPORT template <class Key,
                                        class Hash = hash<Key>,
                                        class KeyEqual = std::equal_to<Key>,
                                        class AllocatorOrContainer = std::allocator<Key>,
                                        class Bucket = bucket_type::standard,
                                        class BucketContainer = detail::default_container_t>
using set = detail::table<Key, void, Hash, KeyEqual, AllocatorOrContainer, Bucket, BucketContainer, false>;

// Same as set, but instantiates the table with the segmented flag set to true.
ANKERL_UNORDERED_DENSE_EXPORT template <class Key,
                                        class Hash = hash<Key>,
                                        class KeyEqual = std::equal_to<Key>,
                                        class AllocatorOrContainer = std::allocator<Key>,
                                        class Bucket = bucket_type::standard,
                                        class BucketContainer = detail::default_container_t>
using segmented_set = detail::table<Key, void, Hash, KeyEqual, AllocatorOrContainer, Bucket, BucketContainer, true>;

#    if defined(ANKERL_UNORDERED_DENSE_PMR)

// Aliases that fix the allocator to ANKERL_UNORDERED_DENSE_PMR::polymorphic_allocator and the
// bucket container to detail::default_container_t. Only available when
// ANKERL_UNORDERED_DENSE_PMR is defined. Apart from that they mirror map / segmented_map /
// set / segmented_set.
namespace pmr {

// Key -> T map using a polymorphic allocator.
ANKERL_UNORDERED_DENSE_EXPORT template <class Key,
                                        class T,
                                        class Hash = hash<Key>,
                                        class KeyEqual = std::equal_to<Key>,
                                        class Bucket = bucket_type::standard>
using map = detail::table<Key,
                          T,
                          Hash,
                          KeyEqual,
                          ANKERL_UNORDERED_DENSE_PMR::polymorphic_allocator<std::pair<Key, T>>,
                          Bucket,
                          detail::default_container_t,
                          false>;

// Segmented Key -> T map using a polymorphic allocator.
ANKERL_UNORDERED_DENSE_EXPORT template <class Key,
                                        class T,
                                        class Hash = hash<Key>,
                                        class KeyEqual = std::equal_to<Key>,
                                        class Bucket = bucket_type::standard>
using segmented_map = detail::table<Key,
                                    T,
                                    Hash,
                                    KeyEqual,
                                    ANKERL_UNORDERED_DENSE_PMR::polymorphic_allocator<std::pair<Key, T>>,
                                    Bucket,
                                    detail::default_container_t,
                                    true>;

// Set of Key using a polymorphic allocator.
ANKERL_UNORDERED_DENSE_EXPORT template <class Key,
                                        class Hash = hash<Key>,
                                        class KeyEqual = std::equal_to<Key>,
                                        class Bucket = bucket_type::standard>
using set = detail::table<Key,
                          void,
                          Hash,
                          KeyEqual,
                          ANKERL_UNORDERED_DENSE_PMR::polymorphic_allocator<Key>,
                          Bucket,
                          detail::default_container_t,
                          false>;

// Segmented set of Key using a polymorphic allocator.
ANKERL_UNORDERED_DENSE_EXPORT template <class Key,
                                        class Hash = hash<Key>,
                                        class KeyEqual = std::equal_to<Key>,
                                        class Bucket = bucket_type::standard>
using segmented_set = detail::table<Key,
                                    void,
                                    Hash,
                                    KeyEqual,
                                    ANKERL_UNORDERED_DENSE_PMR::polymorphic_allocator<Key>,
                                    Bucket,
                                    detail::default_container_t,
                                    true>;

} // namespace pmr

#    endif

// deduction guides ///////////////////////////////////////////////////////////

// deduction guides for alias templates are only possible since C++20
// see https://en.cppreference.com/w/cpp/language/class_template_argument_deduction

} // namespace ANKERL_UNORDERED_DENSE_NAMESPACE
} // namespace ankerl::unordered_dense

// std extensions /////////////////////////////////////////////////////////////

namespace std { // NOLINT(cert-dcl58-cpp)

ANKERL_UNORDERED_DENSE_EXPORT template <class Key,
                                        class T,
                                        class Hash,
                                        class KeyEqual,
                                        class AllocatorOrContainer,
                                        class Bucket,
                                        class Pred,
                                        class BucketContainer,
                                        bool IsSegmented>
// NOLINTNEXTLINE(cert-dcl58-cpp)
auto erase_if(
    ankerl::unordered_dense::detail::table<Key, T, Hash, KeyEqual, AllocatorOrContainer, Bucket, BucketContainer, IsSegmented>&
        map,
    Pred pred) -> size_t {
    // Removes every element for which pred(element) is true and returns how
    // many elements were removed.
    using table_t = ankerl::unordered_dense::detail::
        table<Key, T, Hash, KeyEqual, AllocatorOrContainer, Bucket, BucketContainer, IsSegmented>;
    using diff_t = typename table_t::difference_type;

    auto const size_before = map.size();

    // Visit positions from the last one down to the first: erase() invalidates
    // the end iterator, so the iterator is rebuilt from begin() on each step.
    for (auto remaining = size_before; remaining != 0; --remaining) {
        auto candidate = map.begin() + static_cast<diff_t>(remaining - 1);
        if (pred(*candidate)) {
            map.erase(candidate);
        }
    }

    return size_before - map.size();
}

} // namespace std

#endif
#endif


================================================
FILE: kimimaro/__init__.py
================================================
"""
Kimimaro: TEASAR derived skeletonization for 3D densely labeled images.

Kimimaro is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Kimimaro is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Kimimaro.  If not, see <https://www.gnu.org/licenses/>.
"""

from .intake import skeletonize, DimensionError, synapses_to_targets, connect_points
from .post import postprocess, join_close_components
from .utility import (
	extract_skeleton_from_binary_image,
	cross_sectional_area, 
	cross_sectional_area_single,
	oversegment,
)


================================================
FILE: kimimaro/intake.py
================================================
"""
This file is part of Kimimaro.

Kimimaro is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Kimimaro is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Kimimaro.  If not, see <https://www.gnu.org/licenses/>.
"""

from collections import defaultdict
from functools import partial
import gc
import multiprocessing as mp
import signal
import uuid

import numpy as np
import pathos.pools
import scipy.spatial
from tqdm import tqdm

from osteoid import Skeleton, Bbox

import cc3d # connected components
from crackle import CrackleArray
import edt # euclidean distance transform
import fastremap
import fill_voids

import kimimaro.skeletontricks
import kimimaro.trace

from . import sharedmemory as shm
from .utility import compute_cc_labels, find_objects

class DimensionError(Exception):
  """Raised when input labels have more than three non-trivial dimensions."""

# Default value of the teasar_params argument of skeletonize(). The individual
# keys are described in the skeletonize() docstring; skeletonize_subset()
# forwards them as keyword arguments to kimimaro.trace.trace.
# NOTE(review): soma_acceptance_threshold is not described in the
# skeletonize() docstring -- confirm its meaning against kimimaro.trace.
DEFAULT_TEASAR_PARAMS = {
  "scale": 1.5, 
  "const": 300,
  "pdrf_scale": 100000,
  "pdrf_exponent": 4,
  "soma_acceptance_threshold": 3500,
  "soma_detection_threshold": 750,
  "soma_invalidation_const": 300,
  "soma_invalidation_scale": 2
}

def skeletonize(
  all_labels, teasar_params=DEFAULT_TEASAR_PARAMS, anisotropy=(1,1,1),
  object_ids=None, dust_threshold=1000, 
  progress=True, fix_branching=True, in_place=False, 
  fix_borders=True, parallel=1, parallel_chunk_size=100,
  extra_targets_before=[], extra_targets_after=[],
  fill_holes=False, fix_avocados=False,
  voxel_graph=None
):
  """
  Skeletonize all non-zero labels in a given 2D or 3D image.

  Required:
    all_labels: a 2D or 3D numpy array of integer type (signed or unsigned) 

  Optional:
    anisotropy: the physical dimensions of each axis (e.g. 4nm x 4nm x 40nm)
    object_ids: If not none, zero out all labels other than those specified here.
    teasar_params: {
      scale: during the "rolling ball" invalidation phase, multiply 
          the DBF value by this.
      const: during the "rolling ball" invalidation phase, this 
          is the minimum radius in chosen physical units (i.e. nm).
      soma_detection_threshold: if object has a DBF value larger than this, 
          root will be placed at largest DBF value and special one time invalidation
          will be run over that root location (see soma_invalidation scale)
          expressed in chosen physical units (i.e. nm) 
      pdrf_scale: scale factor in front of dbf, used to weight dbf over euclidean distance (higher to pay more attention to dbf) (default 5000)
      pdrf_exponent: exponent in dbf formula on distance from edge, faster if factor of 2 (default 16)
      soma_invalidation_scale: the 'scale' factor used in the one time soma root invalidation (default .5)
      soma_invalidation_const: the 'const' factor used in the one time soma root invalidation (default 0)
                             (units in chosen physical units (i.e. nm))
      max_paths: max paths to trace on a single object. Moves onto the next object after this point.
    }
    dust_threshold: don't bother skeletonizing connected components smaller than
      this many voxels.
    fill_holes: preemptively run a void filling algorithm on all connected
      components and delete labels that get filled in. This can improve the
      quality of the reconstruction if holes in the shapes are artifacts introduced
      by the segmentation pipeline. This option incurs moderate overhead.

      WARNING: THIS WILL REMOVE INPUT LABELS THAT ARE DEEMED TO BE HOLES.

    extra_targets_before: List of x,y,z voxel coordinates that will all 
      be traced to from the root regardless of whether those points have 
      been invalidated. These targets will be applied BEFORE the regular
      target selection algorithm is run.      

      e.g. [ (x,y,z), (x,y,z) ]

    extra_targets_after: Same as extra_targets_before but the additional
      targets will be applied AFTER the usual algorithm runs.

    progress: if true, display a progress bar
    fix_branching: When enabled, zero the edge weights by of previously 
      traced paths. This causes branch points to occur closer to 
      the actual path divergence. However, there is a performance penalty
      associated with this as dijkstra's algorithm is computed once per a path
      rather than once per a skeleton.
    in_place: if true, allow input labels to be modified to reduce
      memory usage and possibly improve performance.
    fix_borders: ensure that segments touching the border place a 
      skeleton endpoint in a predictable place to make merging 
      adjacent chunks easier.
    fix_avocados: If nuclei are segmented separately from somata
      then we can try to detect and fix this issue.
    voxel_graph: a connection graph that defines permissible 
      directions of motion between voxels. This is useful for
      dealing with self-touches. The graph is defined by the
      conventions used in cc3d.voxel_connectivity_graph 
      (https://github.com/seung-lab/connected-components-3d/blob/3.2.0/cc3d_graphs.hpp#L73-L92)
    parallel: number of subprocesses to use.
      <= 0: Use multiprocessing.cpu_count() 
         1: Only use the main process.
      >= 2: Use this number of subprocesses.
    parallel_chunk_size: default number of skeletons to 
      submit to each parallel process before returning results,
      updating the progress bar, and submitting a new task set. 
      Setting this number too low results in excess IPC overhead,
      and setting it too high can result in task starvation towards
      the end of a job and infrequent progress bar updates. If the
      chunk size is set higher than num tasks // parallel, that number
      is used instead.

  Returns: { $segid: osteoid.Skeleton, ... }
  """

  anisotropy = np.array(anisotropy, dtype=np.float32)

  all_labels = format_labels(all_labels, in_place=in_place)
  all_labels = apply_object_mask(all_labels, object_ids)

  # An image with no more voxels than the dust threshold cannot
  # contain any component large enough to skeletonize.
  if all_labels.size <= dust_threshold:
    return {}
  
  if isinstance(all_labels, CrackleArray):
    minlabel = all_labels.min()
    maxlabel = all_labels.max()
  else:
    minlabel, maxlabel = fastremap.minmax(all_labels)

  # Nothing but background.
  if minlabel == 0 and maxlabel == 0:
    return {}

  cc_labels, remapping = compute_cc_labels(all_labels, voxel_graph)
  del all_labels

  # Test the voxel graph against None explicitly: it is an array, and
  # evaluating the truth value of a multi-element ndarray raises ValueError.
  needs_dense_labels = (fill_holes or (voxel_graph is not None) or fix_avocados)
  if isinstance(cc_labels, CrackleArray) and needs_dense_labels:
    cc_labels = cc_labels.numpy()

  if fill_holes:
    cc_labels = fill_all_holes(cc_labels, progress)

  extra_targets_before = points_to_labels(extra_targets_before, cc_labels)
  extra_targets_after = points_to_labels(extra_targets_after, cc_labels)

  def edtfn(labels):
    if isinstance(labels, CrackleArray):
      labels = labels[:]

    return edt.edt(labels, 
      anisotropy=anisotropy,
      black_border=(minlabel == maxlabel),
      parallel=parallel,
      voxel_graph=voxel_graph,
    )

  all_dbf = edtfn(cc_labels)
  
  if fix_avocados:
    cc_labels, all_dbf, remapping = engage_avocado_protection(
      cc_labels, all_dbf, remapping,
      soma_detection_threshold=teasar_params.get('soma_detection_threshold', 0),
      edtfn=edtfn,
      progress=progress,
    )

  if isinstance(cc_labels, CrackleArray):
    cc_ct_iterator = cc_labels.voxel_counts().items()
  else:
    cc_segids, pxct = fastremap.unique(cc_labels, return_counts=True)
    cc_ct_iterator = zip(cc_segids, pxct)
  
  # Only components strictly larger than the dust threshold are traced.
  cc_segids = [ sid for sid, ct in cc_ct_iterator if ct > dust_threshold and sid != 0 ]

  all_slices = find_objects(cc_labels)

  border_targets = defaultdict(list)
  if fix_borders:
    border_targets = compute_border_targets(cc_labels, anisotropy)

  print_quotes(parallel) # easter egg

  if parallel <= 0:
    parallel = mp.cpu_count()

  if parallel == 1:
    return skeletonize_subset(
      all_dbf, cc_labels, voxel_graph, remapping, 
      teasar_params, anisotropy, all_slices, 
      border_targets, extra_targets_before, extra_targets_after,
      progress, fix_borders, fix_branching, 
      cc_segids
    )
  else:
    # The following section can't be moved into 
    # skeletonize parallel because then all_dbf 
    # and cc_labels can't be deleted to save memory.
    suffix = uuid.uuid1().hex

    dbf_shm_location = 'kimimaro-shm-dbf-' + suffix
    cc_shm_location = 'kimimaro-shm-cc-labels-' + suffix
    vg_shm_location = 'kimimaro-shm-voxel-graph-' + suffix

    # Pre-bind the handles so the finally block can tell which
    # allocations actually happened. This also avoids consulting
    # voxel_graph during cleanup, which is deleted below once it
    # has been copied into shared memory.
    dbf_mmap = None
    cc_mmap = None
    vg_mmap = None

    try:
      dbf_mmap, all_dbf_shm = shm.ndarray( all_dbf.shape, all_dbf.dtype, dbf_shm_location, order='F')
      all_dbf_shm[:] = all_dbf 
      del all_dbf 

      cc_mmap, cc_labels_shm = shm.ndarray( cc_labels.shape, cc_labels.dtype, cc_shm_location, order='F')    
      cc_labels_shm[:] = cc_labels[:]
      del cc_labels

      voxel_graph_shm = None
      if voxel_graph is not None:
        vg_mmap, voxel_graph_shm = shm.ndarray( voxel_graph.shape, voxel_graph.dtype, vg_shm_location, order='F')    
        voxel_graph_shm[:] = voxel_graph
        del voxel_graph

      skeletons = skeletonize_parallel(      
        all_dbf_shm, dbf_shm_location, 
        cc_labels_shm, cc_shm_location, remapping, 
        voxel_graph_shm, vg_shm_location,
        teasar_params, anisotropy, all_slices, 
        border_targets, extra_targets_before, extra_targets_after,
        progress, fix_borders, fix_branching, 
        cc_segids, parallel, parallel_chunk_size
      )
    finally:
      if dbf_mmap is not None:
        dbf_mmap.close()
      if cc_mmap is not None:
        cc_mmap.close()
      shm.unlink(dbf_shm_location)
      shm.unlink(cc_shm_location)
      if vg_mmap is not None:
        vg_mmap.close()
        shm.unlink(vg_shm_location)

    return skeletons

def connect_points(
  labels, start, end,
  anisotropy=(1,1,1), 
  fill_holes=False, 
  in_place=False,
  pdrf_scale=100000, 
  pdrf_exponent=4,
):
  """
  Extract a single centerline skeleton between
  two preselected points from a binary image.

  labels: a 2D or 3D binary image
  start: an (x,y,z) tuple
  end: an (x,y,z) tuple

  anisotropy: the physical dimensions of each axis (e.g. 4nm x 4nm x 40nm)
  fill_holes: preemptively run a void filling algorithm on all connected
    components and delete labels that get filled in. This can improve the
    quality of the reconstruction if holes in the shapes are artifacts introduced
    by the segmentation pipeline.

    NOTE: this argument is accepted but is not currently referenced by
    this function.
  in_place: forwarded to format_labels; if true, the boolean copy of
    labels may be reused rather than copied again.

  pdrf_scale: scale factor in front of dbf, used to weight dbf over euclidean distance (higher to pay more attention to dbf)
  pdrf_exponent: exponent in dbf formula on distance from edge, faster if factor of 2

  Raises: ValueError if start lies on background or if start and end
    are in different connected components.

  Returns: the skeleton produced by kimimaro.trace.point_to_point with 
    its vertices multiplied by anisotropy and its space set to 'physical'.
  """
  anisotropy = np.array(anisotropy, dtype=np.float32)
  start = tuple(start)
  end = tuple(end)

  # Use the builtin bool: the np.bool alias was removed in NumPy 1.24
  # and raises AttributeError on the releases that lack it.
  labels = labels.astype(bool)
  labels = format_labels(labels, in_place=in_place)

  # The connected components are only needed to validate that both
  # points belong to the same foreground object.
  cc_labels, remapping = compute_cc_labels(labels)
  if cc_labels[start] == 0 or cc_labels[start] != cc_labels[end]:
    raise ValueError("Cannot extract centerline from disconnected components.")
  del cc_labels
  del remapping

  skel = kimimaro.trace.point_to_point(
    labels, start, end,
    anisotropy=anisotropy, 
    pdrf_scale=pdrf_scale, pdrf_exponent=pdrf_exponent,
  )
  skel.vertices *= anisotropy
  skel.space = 'physical'
  return skel

def format_labels(labels, in_place):
  """
  Normalize a label image to a three dimensional Fortran ordered array.

  CrackleArray inputs are returned untouched. Otherwise the input is 
  converted to Fortran order (copied unless in_place is true), boolean
  images are reinterpreted as uint8, trailing axes are appended until
  the array is 3D, and trailing axes of length one are dropped from 
  arrays with more than three dimensions.

  Raises: DimensionError if a trailing axis beyond the third has 
    a length other than one.

  Returns: the formatted labels
  """
  if isinstance(labels, CrackleArray):
    return labels

  labels = (
    fastremap.asfortranarray(labels) 
    if in_place 
    else np.copy(labels, order='F')
  )

  if labels.dtype == bool:
    labels = labels.view(np.uint8)

  input_shape = labels.shape

  # Pad 1D and 2D inputs out to three dimensions.
  for _ in range(max(3 - labels.ndim, 0)):
    labels = labels[..., np.newaxis ]

  # Peel trivial trailing axes off of higher dimensional inputs.
  while labels.ndim > 3:
    if labels.shape[-1] != 1:
      raise DimensionError(
        "Input labels may be no more than three non-trivial dimensions. Got: {}".format(
          input_shape
        )
      )
    labels = labels[..., 0]

  return labels

def skeletonize_parallel(
    all_dbf_shm, dbf_shm_location, 
    cc_labels_shm, cc_shm_location, remapping, 
    voxel_graph_shm, vg_shm_location,
    teasar_params, anisotropy, all_slices, 
    border_targets, extra_targets_before, extra_targets_after,
    progress, fix_borders, fix_branching, 
    cc_segids, parallel, chunk_size
  ):
    """
    Skeletonize cc_segids using a pool of `parallel` worker processes.

    The *_shm arrays are only consulted for their shape and dtype; 
    workers re-attach to the shared memory by location name via 
    parallel_skeletonize_subset. voxel_graph_shm may be None when 
    no voxel graph is in use.

    cc_segids are split into batches of chunk_size unless chunk_size
    is at least len(cc_segids) // parallel, in which case they are 
    dealt round-robin into `parallel` batches.

    SIGINT/SIGTERM handlers that unlink the shared memory and terminate 
    the pool are installed while the pool runs, and the previous 
    handlers are restored afterwards even if a worker raises.

    Returns: { $segid: merged skeleton, ... }
    """
    prevsigint = signal.getsignal(signal.SIGINT)
    prevsigterm = signal.getsignal(signal.SIGTERM)
    
    # Don't fork, spawn entirely new processes. This
    # avoids accidental deadlocks.
    mp.set_start_method("spawn", force=True)
    
    executor = pathos.pools.ProcessPool(parallel)

    def cleanup(signum, frame):
      shm.unlink(dbf_shm_location)
      shm.unlink(cc_shm_location)
      shm.unlink(vg_shm_location)
      executor.terminate()

    # Compare against None explicitly: the truth value of a 
    # multi-element ndarray is ambiguous and raises ValueError.
    has_voxel_graph = voxel_graph_shm is not None
    vg_shape = voxel_graph_shm.shape if has_voxel_graph else None
    vg_dtype = voxel_graph_shm.dtype if has_voxel_graph else None

    skeletonizefn = partial(parallel_skeletonize_subset, 
      dbf_shm_location, all_dbf_shm.shape, all_dbf_shm.dtype, 
      cc_shm_location, cc_labels_shm.shape, cc_labels_shm.dtype,
      vg_shm_location, vg_shape, vg_dtype,
      remapping, teasar_params, anisotropy, all_slices, 
      border_targets, extra_targets_before, extra_targets_after, 
      False, # progress, use our own progress bar below
      fix_borders, fix_branching, 
    )

    ccids = []
    if chunk_size < len(cc_segids) // parallel:
      for i in range(0, len(cc_segids), chunk_size):
        ccids.append(cc_segids[i:i+chunk_size])
    else:
      for i in range(parallel):
        ccids.append(cc_segids[i::parallel])

    skeletons = defaultdict(list)

    signal.signal(signal.SIGINT, cleanup)
    signal.signal(signal.SIGTERM, cleanup)   

    try:
      with tqdm(total=len(cc_segids), disable=(not progress), desc="Skeletonizing Labels") as pbar:
        for skels in executor.uimap(skeletonizefn, ccids):
          for segid, skel in skels.items():
            skeletons[segid].append(skel)
          pbar.update(len(skels))
      executor.close()
      executor.join()
      executor.clear()
    finally:
      # Never leave our cleanup handlers installed once we are done,
      # whether the pool finished normally or a worker raised.
      signal.signal(signal.SIGINT, prevsigint)
      signal.signal(signal.SIGTERM, prevsigterm)
    
    shm.unlink(dbf_shm_location)
    shm.unlink(cc_shm_location)
    shm.unlink(vg_shm_location)

    return merge(skeletons)

def parallel_skeletonize_subset(    
    dbf_shm_location, dbf_shape, dbf_dtype, 
    cc_shm_location, cc_shape, cc_dtype, 
    vg_shm_location, vg_shape, vg_dtype,
    *args, **kwargs
  ):
  """
  Worker entry point: attach to the shared memory arrays by location
  name, run skeletonize_subset on them, and release the mappings.

  vg_shape is None when there is no voxel graph, in which case 
  no voxel graph mapping is opened and None is passed downstream.
  The remaining positional and keyword arguments are forwarded to 
  skeletonize_subset unchanged.

  Returns: the result of skeletonize_subset
  """
  dbf_mmap = None
  cc_mmap = None
  vg_mmap = None

  # try/finally so that the mappings are released even
  # if attaching or tracing raises inside the worker.
  try:
    dbf_mmap, all_dbf = shm.ndarray( dbf_shape, dtype=dbf_dtype, location=dbf_shm_location, order='F')
    cc_mmap, cc_labels = shm.ndarray( cc_shape, dtype=cc_dtype, location=cc_shm_location, order='F')

    voxel_graph = None
    if vg_shape is not None:
      vg_mmap, voxel_graph = shm.ndarray( vg_shape, dtype=vg_dtype, location=vg_shm_location, order='F')

    skels = skeletonize_subset(all_dbf, cc_labels, voxel_graph, *args, **kwargs)
  finally:
    for handle in (dbf_mmap, cc_mmap, vg_mmap):
      if handle is not None:
        handle.close()

  return skels

def skeletonize_subset(
    all_dbf, cc_labels, voxel_graph, remapping, 
    teasar_params, anisotropy, all_slices, 
    border_targets, extra_targets_before, extra_targets_after,
    progress, fix_borders, fix_branching, 
    cc_segids
  ):
  """
  Trace a skeleton for each connected component id in cc_segids.

  all_dbf: distance transform of the whole image, cropped per component
  cc_labels: connected component image (numpy array or CrackleArray)
  voxel_graph: optional connectivity graph, cropped per component
  remapping: maps a connected component id to the label the resulting
    skeleton is filed under
  teasar_params: forwarded as keyword arguments to kimimaro.trace.trace
  anisotropy: multiplied into the skeleton vertices after tracing
  all_slices: bounding slices indexed by (segid - 1); None entries are skipped
  border_targets, extra_targets_before, extra_targets_after: 
    mappings from connected component id to voxel coordinates 
    expressed in whole-image coordinates
  progress: if true, display a progress bar
  fix_borders: not referenced in this function body
  fix_branching: forwarded to kimimaro.trace.trace
  cc_segids: the connected component ids to process

  Returns: { $label: skeleton, ... } as produced by merge()
  """

  skeletons = defaultdict(list)

  with tqdm(cc_segids, disable=(not progress), desc="Skeletonizing Labels") as pbar:
    for segid in pbar:

      # Display the remapped label rather than the component id.
      pbar.set_postfix(label=str(remapping[segid]))

      # Crop DBF to ROI
      slices = all_slices[segid - 1]
      if slices is None:
        continue

      # Bounding boxes of at most one voxel are not traced.
      roi = Bbox.from_slices(slices)
      if roi.volume() <= 1:
        continue

      # Build the mask of this component within its bounding box.
      if isinstance(cc_labels, CrackleArray):
        labels = cc_labels.decompress(label=segid, crop=True)
        label_slcs = (slices[0], slices[1], slice(None))
        labels = np.asfortranarray(labels[label_slcs])
      else:
        labels = cc_labels[slices]
        labels = (labels == segid)

      # Zero the distance transform outside of this component.
      dbf = np.where(labels, all_dbf[slices], 0.0)
      cropped_voxel_graph = (voxel_graph[slices] if voxel_graph is not None else None)

      manual_targets_before = []
      manual_targets_after = []
      root = None 

      # Convert whole-image coordinates into coordinates
      # relative to the corner of this component's bounding box.
      def translate_to_roi(targets):
        targets = np.array(targets)
        targets -= roi.minpt.astype(np.uint32)
        return targets.tolist()      

      # We only source a predetermined root from 
      # border_targets because we understand that it's
      # located at a reasonable place at the edge of the
      # shape. In theory, extra targets can be positioned
      # anywhere within the shape or off the shape, making it 
      # a dicey proposition. 
      if len(border_targets[segid]) > 0:
        manual_targets_before = translate_to_roi(border_targets[segid])
        # The last border target becomes the root; the rest remain targets.
        root = manual_targets_before.pop()

      if segid in extra_targets_before and len(extra_targets_before[segid]) > 0:
        manual_targets_before.extend( translate_to_roi(extra_targets_before[segid]) )

      if segid in extra_targets_after and len(extra_targets_after[segid]) > 0:
        manual_targets_after.extend( translate_to_roi(extra_targets_after[segid]) )

      skeleton = kimimaro.trace.trace(
        labels, 
        dbf, 
        anisotropy=anisotropy, 
        fix_branching=fix_branching, 
        manual_targets_before=manual_targets_before,
        manual_targets_after=manual_targets_after,
        root=root,
        voxel_graph=cropped_voxel_graph,
        **teasar_params
      )

      if skeleton.empty():
        continue

      # Shift the vertices from bounding box coordinates 
      # back into whole-image coordinates.
      skeleton.vertices += roi.minpt.astype(skeleton.vertices.dtype, copy=False)

      orig_segid = remapping[segid]
      skeleton.id = orig_segid
      # Scale voxel coordinates by the anisotropy.
      skeleton.vertices = np.multiply(skeleton.vertices, anisotropy, dtype=np.float32)
      skeleton.space = 'physical'
      skeletons[orig_segid].append(skeleton)

  return merge(skeletons)

def apply_object_mask(all_labels, object_ids):
  """
  Zero out every label that is not listed in object_ids.

  all_labels: label image (numpy array or CrackleArray)
  object_ids: a sized iterable of labels to keep, or None to 
    keep everything. 

  Returns: the masked labels. Numpy inputs may be modified in place.
  """
  if object_ids is None:
    return all_labels

  if isinstance(all_labels, CrackleArray):
    mask = all_labels.labels()
    mask = { u: 0 for u in mask }
    for segid in object_ids:
      mask[segid] = segid
    return all_labels.remap(mask).condense()

  if len(object_ids) == 1:
    # next(iter(...)) rather than object_ids[0] so that 
    # non-indexable collections such as sets are accepted too.
    only_id = next(iter(object_ids))
    all_labels = kimimaro.skeletontricks.zero_out_all_except(all_labels, only_id) # faster
  else:
    all_labels = fastremap.mask_except(all_labels, object_ids, in_place=True)

  return all_labels

def points_to_labels(pts, cc_labels):
  """
  Group points by the label found at each point in cc_labels.

  pts: iterable of coordinates, each convertible to a tuple
  cc_labels: object indexable by a coordinate tuple

  Returns: defaultdict(list) of { label: [ point_tuple, ... ] }
  """
  by_label = defaultdict(list)
  for coord in map(tuple, pts):
    by_label[ cc_labels[coord] ].append(coord)
  return by_label

def compute_border_targets(cc_labels, anisotropy):
  """
  Compute target points on each of the six faces of the image.

  Each face is processed as a 2D image: it is split into 2D connected 
  components, a 2D distance transform is computed with a black border,
  and kimimaro.skeletontricks.find_border_targets selects a point per
  2D component. The 2D points are converted back to 3D coordinates on
  that face and grouped by label.

  cc_labels: 3D connected component image (a CrackleArray is
    converted to a numpy array first)
  anisotropy: per-axis weights; the two entries matching a face's 
    axes are used for that face's distance transform

  Returns: defaultdict of { label: uint32 array of (x,y,z) points }
    whose missing keys yield an empty uint32 array.
  """
  sx, sy, sz = cc_labels.shape

  if isinstance(cc_labels, CrackleArray):
    cc_labels = cc_labels.numpy()

  # Each entry: (2D face, indices of its two axes, function mapping
  # a 2D coordinate on that face back to a 3D coordinate)
  planes = (
    ( cc_labels[:,:,0], (0, 1), lambda x,y: (x, y, 0) ),     # top xy
    ( cc_labels[:,:,-1], (0, 1), lambda x,y: (x, y, sz-1) ), # bottom xy
    ( cc_labels[:,0,:], (0, 2), lambda x,z: (x, 0, z) ),     # left xz
    ( cc_labels[:,-1,:], (0, 2), lambda x,z: (x, sy-1, z) ), # right xz
    ( cc_labels[0,:,:], (1, 2), lambda y,z: (0, y, z) ),     # front yz
    ( cc_labels[-1,:,:], (1, 2), lambda y,z: (sx-1, y, z) )  # back yz
  )

  # A set per label so that a point found on more than 
  # one face is recorded only once.
  target_list = defaultdict(set)

  for plane, dims, rotatefn in planes:
    wx, wy = anisotropy[dims[0]], anisotropy[dims[1]]
    plane = np.copy(plane, order='F')
    cc_plane = cc3d.connected_components(np.ascontiguousarray(plane))
    dt_plane = edt.edt(cc_plane, black_border=True, anisotropy=(wx, wy))

    plane_targets = kimimaro.skeletontricks.find_border_targets(
      dt_plane, cc_plane, wx, wy
    )

    # get_mapping is handed 3D arrays, so add a trailing axis.
    # NOTE(review): presumably the mapping goes from 2D component id 
    # to the label in `plane` -- confirm against skeletontricks.get_mapping
    plane = plane[..., np.newaxis]
    cc_plane = cc_plane[..., np.newaxis]
    remapping = kimimaro.skeletontricks.get_mapping(plane, cc_plane)

    for label, pt in plane_targets.items():
      label = remapping[label]
      target_list[label].add(
        rotatefn( int(pt[0]), int(pt[1]) )
      )

  # From here on, missing labels yield an empty array rather than a set.
  target_list.default_factory = lambda: np.array([], np.uint32)
  for label, pts in target_list.items():
    target_list[label] = np.array(list(pts), dtype=np.uint32)

  return target_list

def merge(skeletons):
  """
  Combine the list of skeletons stored under each segid into 
  a single consolidated skeleton.

  skeletons: { $segid: [ Skeleton, ... ], ... }

  Returns: { $segid: Skeleton, ... }
  """
  return {
    segid: Skeleton.simple_merge(pieces).consolidate()
    for segid, pieces in skeletons.items()
  }

def argmax(arr):
  """
  Return the multi-dimensional index of the maximum value of arr.

  C contiguous arrays are scanned as is. Any other layout is scanned
  through its transpose and the flat index is unraveled in Fortran order.
  """
  if arr.flags['C_CONTIGUOUS']:
    scan, order = arr, 'C'
  else:
    scan, order = arr.T, 'F'

  flat_index = np.argmax(scan)
  return np.unravel_index(flat_index, arr.shape, order=order)

def engage_avocado_protection(
  cc_labels, all_dbf, remapping,
  soma_detection_threshold, edtfn, 
  progress
):
  """
  Repeatedly run engage_avocado_protection_single_pass until a pass 
  reports no changes (or 20 passes have run), recomputing the distance 
  transform with edtfn after each pass that changed something.

  cc_labels: connected component image; modified by the passes
  all_dbf: distance transform matching cc_labels
  remapping: maps connected component ids to output labels
  soma_detection_threshold: labels having any voxel whose distance
    transform value exceeds this divided by 2.5 are candidates
  edtfn: function that recomputes the distance transform from labels
  progress: if true, display progress bars

  Returns: (cc_labels, all_dbf, adjusted_remapping) where cc_labels
    has been renumbered and adjusted_remapping is keyed by the 
    renumbered ids.
  """
  # Keep the original numbering so that the renumbered 
  # labels can be related back to `remapping` at the end.
  orig_cc_labels = np.copy(cc_labels, order='F')

  # Labels that a previous pass left untouched are not reconsidered.
  unchanged = set()

  # This loop handles nested avocados
  # Unless there are deeply nested double avocados,
  # this should complete in 2-3 passes. We limit it
  # to 20 just to make sure this loop terminates no matter what.
  # Avocados aren't the end of the world.
  for _ in tqdm(range(20), disable=(not progress), desc="Avocado Pass"): 
    # Note: Divide soma_detection_threshold by a bit more than 2 because the nuclei are going to be
    # about a factor of 2 or less smaller than what we'd expect from a cell. For example,
    # in an avocado I saw, the DBF of the nucleus was 499 when the detection threshold was 
    # set to 1100.
    candidates = set(fastremap.unique(cc_labels * (all_dbf > soma_detection_threshold / 2.5)))
    candidates -= unchanged
    candidates.discard(0)

    cc_labels, unchanged_this_cycle, changes = engage_avocado_protection_single_pass(
      cc_labels, all_dbf,
      candidates=candidates,
      progress=progress,
    )
    unchanged |= unchanged_this_cycle

    if len(changes) == 0:
      break 
    
    # Labels changed, so the distance transform is stale.
    all_dbf = edtfn(cc_labels)

  # Downstream logic assumes cc_labels is contiguously numbered
  cc_labels, _ = fastremap.renumber(cc_labels, in_place=True)
  cc_remapping = kimimaro.skeletontricks.get_mapping(orig_cc_labels, cc_labels)

  # Compose the renumbering with the caller's remapping. 
  # Entries whose value is not a key of `remapping` are dropped.
  adjusted_remapping = {}
  for new_cc, cc in cc_remapping.items():
    if cc in remapping:
      adjusted_remapping[new_cc] = remapping[cc]

  return cc_labels, all_dbf, adjusted_remapping

def engage_avocado_protection_single_pass(
  cc_labels, all_dbf, 
  candidates=None, progress=False
):
  """
  For each candidate, check if there's a fruit around the
  avocado pit roughly from the center (the max EDT).

  cc_labels: connected component image; modified in place
  all_dbf: distance transform matching cc_labels
  candidates: labels to examine; defaults to every label in cc_labels.
    Zero is always excluded.
  progress: if true, display a progress bar

  Returns: (cc_labels, unchanged, changed) where unchanged and changed
    are sets of labels.
  """

  if candidates is None:
    candidates = fastremap.unique(cc_labels)

  candidates = [ label for label in candidates if label != 0 ]

  unchanged = set()
  changed = set()

  if len(candidates) == 0:
    return cc_labels, unchanged, changed

  def paint_walls(binimg):
    """
    Ensure that inclusions that touch the wall are handled
    by performing a 2D fill on each wall.
    """
    binimg[:,:,0 ] = fill_voids.fill(binimg[:,:,0 ])
    binimg[:,:,-1] = fill_voids.fill(binimg[:,:,-1])
    binimg[:,0,: ] = fill_voids.fill(binimg[:,0,: ])
    binimg[:,-1,:] = fill_voids.fill(binimg[:,-1,:])
    binimg[0,:,: ] = fill_voids.fill(binimg[0,:,: ])
    binimg[-1,:,:] = fill_voids.fill(binimg[-1,:,:])
    return binimg

  # Bounding slices indexed by (label - 1)
  slcs = find_objects(cc_labels)

  for label in tqdm(candidates, disable=(not progress), desc="Fixing Avocados"):
    slc = slcs[label - 1]
    offset = Bbox.from_slices(slc).minpt
    binimg = paint_walls(cc_labels[slc] == label) # image of the pit
    # Location of the largest distance transform value within 
    # the pit, shifted into whole-image coordinates.
    coord = argmax(binimg * all_dbf[slc]) + offset

    (pit, fruit) = kimimaro.skeletontricks.find_avocado_fruit(
      cc_labels, coord[0], coord[1], coord[2]
    )
    if pit == fruit and pit not in changed:
      unchanged.add(pit)
    else:
      unchanged.discard(pit)
      unchanged.discard(fruit)
      changed.add(pit)
      changed.add(fruit)
      # Include the fruit's voxels (within this bounding box) in the mask.
      binimg |= (cc_labels[slc] == fruit)

    fruit = np.asarray(fruit, dtype=cc_labels.dtype)
    # N (the fill count) is not used below.
    binimg, N = fill_voids.fill(binimg, in_place=True, return_fill_count=True)
    # Relabel every masked voxel as the fruit: clear them, then add the fruit label.
    cc_labels[slc] *= ~binimg
    cc_labels[slc] += fruit * binimg

  return cc_labels, unchanged, changed

def synapses_to_targets(labels, synapses, progress=False):
  """
  Convert synapse centroids into skeletonization targets by snapping 
  each centroid to the nearest voxel carrying the associated label.

  labels: a 3d array containing labels
  synapses: { label: [ (centroid, swc_label), (centroid, swc_label), ... ] }
    where centroid is an (x,y,z) float triple in voxel coordinate space
      where the origin is the same as for labels
    where swc_label is the label to be added to the vertex attributes for
      the resulting target.
    where label is a presynaptic OR a postsynaptic label
      (submit two items to cover both)

  Labels that do not appear in the volume are skipped.

  Returns: { (x,y,z): swc_label, ... } targets for skeletonization
  """
  # Drop extra trailing axes by taking their first element.
  while labels.ndim > 3:
    labels = labels[...,0]

  targets = {}

  synapse_iter = tqdm(
    synapses.items(), 
    disable=(not progress), 
    desc='Converting Synapses to Targets'
  )

  for label, pairs in synapse_iter:
    voxels = np.vstack((labels == label).nonzero()).T # [ [x,y,z], ... ]
    if len(voxels) == 0:
      continue

    # Bucket the centroids by the swc label they carry.
    centroids_by_swc = defaultdict(list) 
    for centroid, swc_label in pairs:
      centroids_by_swc[swc_label].append(centroid)

    for swc_label, centroids in centroids_by_swc.items():
      distances = scipy.spatial.distance.cdist(voxels, centroids)
      # One nearest voxel per centroid, deduplicated.
      nearest = np.unique(np.argmin(distances, axis=0))
      for idx in nearest:
        targets[tuple(voxels[idx])] = swc_label

  return targets

def fill_all_holes(cc_labels, progress=False, return_fill_count=False):
  """
  Fill the holes of every connected component, overwriting whatever 
  labels (or background) were enclosed. Enclosed holes are presumed to
  be segmentation artifacts, e.g. a nucleus segmented separately from
  its cell or a void left in a dendrite by a manual segmentation.

  cc_labels: an image containing connected components with labels smaller than
    the number of voxels in the image. Modified in place.
  progress: Display a progress bar or not.
  return_fill_count: if specified, return a tuple (filled_image, N) where N is
    the number of voxels that were filled in.

  Returns: filled_in_labels, or (filled_in_labels, N) if return_fill_count
  """
  labels = fastremap.unique(cc_labels)

  # Labels still eligible for filling. Labels swallowed by 
  # another component's fill are removed as we go.
  remaining = set(labels)
  remaining.discard(0)

  all_slices = find_objects(cc_labels)
  total_filled = 0

  for label in tqdm(labels, disable=(not progress), desc="Filling Holes"):
    if label not in remaining:
      continue

    slices = all_slices[label - 1]
    if slices is None:
      continue

    filled_mask, num_filled = fill_voids.fill(
      (cc_labels[slices] == label), 
      in_place=True, 
      return_fill_count=True
    )
    total_filled += num_filled
    if num_filled == 0:
      continue 

    # Everything under the filled mask other than the label 
    # itself has been swallowed and need not be visited later.
    swallowed = set(fastremap.unique(cc_labels[slices] * filled_mask))
    swallowed.remove(label)
    remaining -= swallowed
    cc_labels[slices] = cc_labels[slices] * ~filled_mask + label * filled_mask

  if not return_fill_count:
    return cc_labels
  return cc_labels, total_filled

def print_quotes(parallel):
  """
  Easter egg: print a quote when parallel is -1 or -2, followed 
  by a closing line for any value in the range [-2, 0).
  """
  quotes = {
    -1: "Against the power of will I possess... The capability of my body is nothing.",
    -2: "I will see the truth of this world... OROCHIMARU-SAMA WILL SHOW ME!!!",
  }

  quote = quotes.get(parallel)
  if quote is not None:
    print(quote)

  if -2 <= parallel and parallel < 0:
    print("CURSED SEAL OF THE EARTH!!!")  


================================================
FILE: kimimaro/post.py
================================================
"""
Postprocessing for joining skeletons chunks generated by
skeletonizing adjacent image chunks. 

Authors: Alex Bae and Will Silversmith
Affiliation: Seung Lab, Princeton Neuroscience Institute
Date: June 2018 - June 2019

This file is part of Kimimaro.

Kimimaro is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Kimimaro is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Kimimaro.  If not, see <https://www.gnu.org/licenses/>.
"""
from typing import Sequence

from collections import defaultdict

import fastremap
import networkx as nx
import numpy as np

from scipy import spatial
from scipy.sparse import lil_matrix
from scipy.sparse.csgraph import dijkstra
import scipy.sparse.csgraph as csgraph
import scipy.spatial.distance

try:
  from pykdtree.kdtree import KDTree
except ImportError:
  from scipy.spatial import cKDTree as KDTree

from osteoid import Skeleton, Bbox

import kimimaro.skeletontricks

## Public API of Module

def postprocess(
  skeleton:Skeleton, 
  dust_threshold:float = 1500.0, 
  tick_threshold:float = 3000.0,
) -> Skeleton:
  """
  Clean up a skeleton assembled from adjacent or overlapping
  skeletonized image chunks so that it reads as a single
  coherent skeleton.

  The stages run in this order:
  1) Consolidate the representation (remove_loops and remove_ticks
     assume a clean representation).
  2) Drop disconnected components whose cable length does not
     exceed dust_threshold (physical distance).
  3) Remove loops, since skeletons are supposed to be trees and
     joining chunks can introduce cycles.
  4) Join disconnected components whose nearest vertices are no
     further apart than the sum of their radii.
  5) Remove "ticks" (short branches off the main skeleton) that
     are shorter than tick_threshold (physical distance).
  6) Restore the original id and consolidate once more.

  skeleton: the Skeleton to clean up
  dust_threshold: components at or below this cable length are removed;
    0 disables dust removal.
  tick_threshold: branches shorter than this are removed;
    0 disables tick removal.

  Returns: Skeleton
  """
  original_id = skeleton.id

  stages = (
    lambda skl: remove_dust(skl, dust_threshold),
    remove_loops,
    lambda skl: join_close_components(skl, restrict_by_radius=True),
    lambda skl: remove_ticks(skl, tick_threshold),
  )

  # the stages below rely on trivial loops etc. already being gone
  cleaned = skeleton.consolidate()
  for stage in stages:
    cleaned = stage(cleaned)

  # the stages rebuild the skeleton, so put the label back
  cleaned.id = original_id
  return cleaned.consolidate()

def join_close_components(
  skeletons:Sequence[Skeleton], 
  radius:float = np.inf,
  restrict_by_radius:bool = False,
) -> Skeleton:
  """
  Given a set of skeletons which may contain multiple connected components,
  attempt to connect each component to the nearest other component via the
  nearest two vertices. Repeat until a single component remains or no points 
  closer than `radius` are available.

  skeletons: a single Skeleton or an iterable of Skeletons.
  radius: in same units as skeletons, don't join pieces 
    further away than this. None is treated as infinity.
  restrict_by_radius: If the skeletons have a radius property,
    don't join pieces if the neighboring nodes are further away
    than r1 + r2. Note that when this is set, the search radius
    is replaced by twice the largest radius found among the components.

  Raises: ValueError if radius is not greater than zero.

  Returns: Skeleton
  """
  if radius is None:
    radius = np.inf

  if radius <= 0:
    raise ValueError("radius must be greater than zero: " + str(radius))

  # Accept a lone Skeleton as well as a sequence of them.
  try:
    iter(skeletons)
  except TypeError:
    skeletons = [ skeletons ]

  skels = []
  for skeleton in skeletons:
    skels += skeleton.components()

  skels = [ skl.consolidate() for skl in skels if not skl.empty() ]

  if len(skels) == 1:
    return skels[0]
  elif len(skels) == 0:
    return Skeleton()

  N = len(skels)
  no_index = np.iinfo(np.uint32).max # sentinel for "no pair computed"

  # radii_matrix[i,j]: distance between the closest vertices of component
  #   i and component j (inf if they may not be joined).
  # index_matrix[i,j]: (vertex index in component i, vertex index in component j)
  #   of that closest pair.
  radii_matrix = np.full( (N, N), np.inf, dtype=np.float32 )
  index_matrix = np.full( (N, N, 2), no_index, dtype=np.uint32 )

  if restrict_by_radius:
    radius = 2 * np.max([ np.max(s.radii) for s in skels ])
    radius = max(radius, 0)

  def compute_nearest(tree, i, j):
    """Record the closest vertex pair between skels[i] (indexed by tree) and skels[j]."""
    s1, s2 = skels[i], skels[j]
    r, idx = tree.query(
      s2.vertices, 
      k=1, 
      distance_upper_bound=(radius + 0.000001), # < bound, so +epsilon
    )
    idx_s2 = np.argmin(r)
    idx_s1 = idx[idx_s2]

    local_radius = r[idx_s2]

    if (
      restrict_by_radius
      and not np.isinf(local_radius)
      and hasattr(s1, "radii")
      and hasattr(s2, "radii")
      and local_radius > (s1.radii[idx_s1] + s2.radii[idx_s2])
    ):
      local_radius = np.inf

    radii_matrix[i,j] = local_radius
    radii_matrix[j,i] = local_radius

    # Keep both orientations consistent: entry [a,b] always reads
    # (vertex in component a, vertex in component b).
    index_matrix[i,j] = ( idx_s1, idx_s2 )
    index_matrix[j,i] = ( idx_s2, idx_s1 )

  def symmetric_delete(matrix, ks):
    """Drop the rows and columns listed in ks."""
    matrix = np.delete(matrix, ks, axis=0)
    return np.delete(matrix, ks, axis=1)

  for i in range(N):
    tree = KDTree(skels[i].vertices)
    for j in range(i + 1, N):  # compute upper triangle only
      compute_nearest(tree, i, j)
    del tree

  while len(skels) > 1:
    # An all-inf matrix means no joinable pair is left.
    min_radius = np.min(radii_matrix)
    if np.isinf(min_radius) or min_radius > radius:
      break

    i, j = np.unravel_index( np.argmin(radii_matrix), radii_matrix.shape )
    s1, s2 = skels[i], skels[j]
    fused = Skeleton.simple_merge([s1, s2])

    # s2's vertices follow s1's in the merged skeleton, so offset its index.
    fused.edges = np.concatenate([
      fused.edges,
      [[ index_matrix[i,j,0], index_matrix[i,j,1] + s1.vertices.shape[0] ]]
    ])
    skels[i] = None
    skels[j] = None
    skels = [ fused ] + [ _ for _ in skels if _ is not None ]

    # Remove both merged components at once (order independent), then
    # prepend an empty row/column for the fused component at index 0.
    radii_matrix = symmetric_delete(radii_matrix, [i, j])
    
    N = len(skels)
    radii_matrix2 = np.full((N,N), np.inf, dtype=np.float32)
    radii_matrix2[1:,1:] = radii_matrix
    radii_matrix = radii_matrix2
    del radii_matrix2

    index_matrix = symmetric_delete(index_matrix, [i, j])
    
    index_matrix2 = np.full((N,N,2), no_index, dtype=np.uint32 )
    index_matrix2[1:,1:] = index_matrix
    index_matrix = index_matrix2
    del index_matrix2

    # Only the fused component's distances need to be recomputed.
    tree = KDTree(skels[0].vertices)
    for k in range(1,N):
      compute_nearest(tree, 0, k)
    del tree

  return Skeleton.simple_merge(skels).consolidate()

## Implementation Details Below

def remove_dust(skeleton, dust_threshold):
  """
  Discard connected components that are too short to keep.

  skeleton: the Skeleton to filter
  dust_threshold: physical cable length; only components strictly
    longer than this survive. 0 disables filtering.

  Returns: the input skeleton unchanged if it is empty or the threshold
    is 0, otherwise a merge of the surviving components.
  """
  if skeleton.empty() or dust_threshold == 0:
    return skeleton

  survivors = [
    component 
    for component in skeleton.components()
    if component.cable_length() > dust_threshold
  ]
  return Skeleton.simple_merge(survivors)

def remove_ticks(skeleton, threshold):
  """
  Simple merging of individual TEASAR cubes results in lots of little 
  ticks due to the edge effect. They are removed by thresholding
  the path length from a given branch to the "main body" of the neurite,
  culling paths from shortest to longest until no branch below the
  threshold remains.

  If TEASAR parameters were chosen such that they allowed for spines to
  be traced, this is also an opportunity to correct for that.

  Each connected component is processed independently by _remove_ticks.
  This algorithm is O(N^2) in the number of terminal nodes.

  Parameters:
    threshold: The maximum length in nanometers that may be culled.
      0 disables tick removal.

  Returns: tick free skeleton
  """
  if skeleton.empty() or threshold == 0:
    return skeleton

  cleaned_components = [
    _remove_ticks(component, threshold)
    for component in skeleton.components()
  ]
  merged = Skeleton.simple_merge(cleaned_components)
  return merged.consolidate(remove_disconnected_vertices=False)

def _remove_ticks(skeleton, threshold):
  """
  For a single connected component, remove "ticks" below a threshold. 
  Ticks are a path connecting a terminal node to a branch point that
  are physically shorter than the specified threshold. 

  Every time a tick is removed, it potentially changes the topology
  of the components. Once a branch point's number of edges drops to
  two, the two paths connecting to it can be unified into one. Sometimes
  a single object exists that has no branches but is below threshold. We
  do not delete these objects as there would be nothing left.

  Each time the minimum length tick is removed, it can change which 
  tick is the new minimum tick and requires reevaluation of the whole 
  skeleton. Previously, we did not perform this reevaluation and it 
  resulted in the ends of neurites being clipped. 

  This makes the algorithm quadratic in the number of terminal branches.
  As high resolution skeletons can have tens of thousands of nodes and 
  dozens of branches, a full topological reevaluation becomes relatively 
  expensive. However, we only need to know the graph of distances between
  critical points, defined as the set of branch points and terminal points, 
  in the skeleton in order to evaluate the topology. 

  Therefore, we first compute this distance graph before proceeding with
  tick removal. The algorithm remains quadratic in the number of terminal
  points, but the constant speed up is very large as we move from a regime
  of tens of thousands to hundreds of thousands of points needing reevaluation
  to at most hundreds and often only a handful in typical cases. In the 
  pathological case of a skeleton with numerous single point extrusions,
  the performance of the algorithm collapses approximately to the previous
  regime (though without the assistence of the constant factor of numpy speed).

  Requires:
    skeleton: a Skeleton that is guaranteed to be a single 
      connected component.
    threshold: distance in nanometers below which a branch is considered
      a "tick" eligible to be removed.

  Returns: a "tick" free Skeleton
  """
  if skeleton.empty():
    return skeleton

  dgraph = kimimaro.skeletontricks.create_distance_graph(skeleton)
  vertices = skeleton.vertices
  edges = skeleton.edges

  unique_nodes, unique_counts = fastremap.unique(edges, return_counts=True)
  terminal_nodes = set(unique_nodes[ unique_counts == 1 ])

  branch_idx = np.where(unique_counts >= 3)[0]

  branch_counts = defaultdict(int)
  for i in branch_idx:
    branch_counts[unique_nodes[i]] = unique_counts[i]

  G = nx.Graph()
  G.add_edges_from(edges)

  terminal_superedges = set([ edg for edg in dgraph.keys() if (edg[0] in terminal_nodes or edg[1] in terminal_nodes) ])

  def fuse_edge(edg1):
    unify = [ edg for edg in dgraph.keys() if edg1 in edg ]
    new_dist = 0.0
    for edg in unify:
      terminal_superedges.discard(edg)
      new_dist += dgraph[edg]
      del dgraph[edg]
    unify = set([ item for sublist in unify for item in sublist ])
    unify.remove(edg1)
    dgraph[tuple(unify)] = new_dist
    terminal_superedges.add(tuple(unify))
    branch_counts[edg1] = 0

  while len(dgraph) > 1:
    min_edge = min(terminal_superedges, key=dgraph.get)
    e1, e2 = min_edge

    if branch_counts[e1] == 1 and branch_counts[e2] == 1:
      break
    elif dgraph[min_edge] >= threshold:
      break

    path = nx.shortest_path(G, e1, e2)
    path = [ (path[i], path[i+1]) for i in range(len(path) - 1) ]
    G.remove_edges_from(path)

    del dgraph[min_edge]
    terminal_superedges.remove(min_edge)
    branch_counts[e1] -= 1
    branch_counts[e2] -= 1

    if branch_counts[e1] == 2:
      fuse_edge(e1)
    if branch_counts[e2] == 2:
      fuse_edge(e2)

  skel = skeleton.clone()
  skel.edges = np.array(list(G.edges), dtype=np.uint32)
  return skel

def _create_distance_graph(skeleton):
  """
  Creates the distance "supergraph" from a single connected component 
  skeleton as described in _remove_ticks.

  The traversal starts from the first terminal (degree one) node and
  assumes the component is a tree. If the component has no terminal
  node (e.g. it has no edges), an empty graph is returned.

  Returns: a distance "supergraph" describing the physical distance
    between the critical points in the skeleton's structure.

  Example skeleton with output:

      60nm   60nm   60nm     
    1------2------3------4
      30nm |  70nm \
           5        ----6

  { 
    (1,2): 60,  
    (2,3): 60,
    (2,5): 30,
    (3,4): 60,
    (3,6): 70,
  }
  """
  vertices = skeleton.vertices
  edges = skeleton.edges

  distgraph = defaultdict(float) # the distance "supergraph"

  unique_nodes, unique_counts = fastremap.unique(edges, return_counts=True)
  terminal_nodes = unique_nodes[ unique_counts == 1 ]
  branch_nodes = set(unique_nodes[ unique_counts >= 3 ])

  # Without a terminal node there is nowhere to start the traversal.
  if len(terminal_nodes) == 0:
    return distgraph
  
  critical_points = set(terminal_nodes)
  critical_points.update(branch_nodes)

  # adjacency list of the skeleton
  tree = defaultdict(set)
  for e1, e2 in edges:
    tree[e1].add(e2)
    tree[e2].add(e1)

  # The below depth first search would be
  # more elegantly implemented as recursion,
  # but it quickly blows the stack, mandating
  # an iterative implementation.

  # Each entry: (node, parent, distance from root, last critical point seen)
  start = terminal_nodes[0]
  stack = [ (start, -1, 0.0, start) ]

  while stack:
    node, parent, dist, root = stack.pop()

    # Reaching a new critical point closes out one superedge.
    if node in critical_points and node != root:
      distgraph[ (root, node) ] = dist
      dist = 0.0
      root = node

    for child in tree[node]:
      if child != parent:
        step = np.linalg.norm(vertices[node,:] - vertices[child,:])
        stack.append( (child, node, dist + step, root) )

  return distgraph

def remove_loops(skeleton):
  """
  Remove cycles from every connected component of the skeleton.

  Each component is handed to _remove_loops and the results are
  merged back together. Disconnected vertices are retained during
  the final consolidation.

  Returns: the input skeleton unchanged if it is empty, otherwise
    the merged, consolidated skeleton.
  """
  if skeleton.empty():
    return skeleton

  loop_free = [ 
    _remove_loops(component) for component in skeleton.components() 
  ]
  merged = Skeleton.simple_merge(loop_free)
  return merged.consolidate(remove_disconnected_vertices=False)

def _remove_loops(skeleton):
  """
  Repeatedly find a cycle in the skeleton's edge list and break or 
  collapse it until kimimaro.skeletontricks.find_cycle reports none.

  How a cycle is handled depends on how many of its nodes are branch
  points (degree >= 3 in the current edge list); see the summary 
  comment inside the loop.

  skeleton: a Skeleton; intended to be a single connected component
    (it is called once per component by remove_loops). Its radii are
    read when a cycle has three or more branch points.

  Note: the passed skeleton is modified in place (its vertices and 
  edges attributes are reassigned) and is also the return value.

  Returns: the same Skeleton object with updated edges (uint32)
  """
  nodes = skeleton.vertices
  edges = np.copy(skeleton.edges).astype(np.int32)

  while True: # Loop until all cycles are removed
    # some branches below concatenate arrays of another dtype,
    # so normalize back to int32 on every pass
    edges = edges.astype(np.int32)
    cycle_path = kimimaro.skeletontricks.find_cycle(edges)
    # cycle_path = kimimaro.skeletontricks.find_cycle_cython(edges)

    # an empty result is treated as "no cycles remain"
    if len(cycle_path) == 0:
      break

    # consecutive nodes of the path become the cycle's edges
    edges_cycle = path2edge(cycle_path)

    # sort each edge's endpoints so rows can be compared directly
    edges_cycle = np.array(edges_cycle, dtype=np.uint32)
    edges_cycle.sort(axis=1, kind='quicksort')

    nodes_cycle = fastremap.unique(edges_cycle)
    nodes_cycle = nodes_cycle.astype(np.int32)
    
    # branch points are computed over the whole current edge list
    unique_nodes, unique_counts = fastremap.unique(edges, return_counts=True)
    branch_nodes = unique_nodes[ unique_counts >= 3 ]

    # branch cycles are cycle nodes that coincide with a branch point
    branch_cycle = nodes_cycle[np.isin(nodes_cycle,branch_nodes)]
    branch_cycle = branch_cycle.astype(np.int32)

    # Summary:
    # 0 external branches: isolated loop, just remove it
    # 1 external branch  : remove the loop but draw a line
    #   from the branch point to the farthest node in the loop.
    # 2 external branches: keep the arc between the two entry/exit
    #   points that is selected below (the one spanning fewer path
    #   entries) and remove the remaining edges of the cycle.
    # 3+ external branches: collapse the cycle into its centroid
    #   if the radius of the centroid is less than the EDT radius
    #   of the pixel located at the centroid. Otherwise, arbitrarily
    #   cut an edge from the cycle to break it. This radius rule prevents
    #   issues where we collapse to a point outside of the neurite.

    # Loop with a tail
    if branch_cycle.shape[0] == 1:
      branch_cycle_point = nodes[branch_cycle, :]
      cycle_points = nodes[nodes_cycle, :]

      # squared distance is sufficient for picking the farthest node
      dist = np.sum((cycle_points - branch_cycle_point) ** 2, 1)
      end_node = nodes_cycle[np.argmax(dist)]

      edges = remove_row(edges, edges_cycle)        
      new_edge = np.array([[branch_cycle[0], end_node]], dtype=np.int32) 
      edges = np.concatenate((edges, new_edge), 0)

    # Loop with an entrance and an exit
    elif branch_cycle.shape[0] == 2:

      # compute the shortest path between the two branch points
      # NOTE(review): dropping the first entry presumes find_cycle 
      # returns a closed path that repeats its start node -- confirm
      path = np.array(cycle_path[1:])
      pos = np.where(np.isin(path, branch_cycle))[0]
      if (pos[1] - pos[0]) < len(path) / 2:
        path = path[pos[0]:pos[1]+1]
      else:
        # the wrap-around arc is the shorter one
        path = np.concatenate((path[pos[1]:], path[:pos[0]+1]), 0)

      edge_path = path2edge(path)
      edge_path.sort(axis=1, kind='quicksort')

      # row_valid ends up nonzero only for cycle edges that are
      # NOT on the selected arc
      row_valid = np.ones(edges_cycle.shape[0])
      for i in range(edge_path.shape[0]):
        row_valid -= (edges_cycle[:,0] == edge_path[i,0]) * (edges_cycle[:,1] == edge_path[i,1])

      row_valid = row_valid.astype(bool)
      # from here on edge_path holds the cycle edges to delete
      edge_path = edges_cycle[row_valid,:]

      edges = remove_row(edges, edge_path)

    # Totally isolated loop
    elif branch_cycle.shape[0] == 0:
      edges = remove_row(edges, edges_cycle)

    # Loops with many ways in and out
    # looks like here we unify them into their
    # centroid. This doesn't work well if the loop
    # is large.
    else:
      branch_cycle_points = nodes[branch_cycle,:]

      # snap the centroid of the branch points to the nearest
      # existing vertex (searched over all vertices, not just the cycle)
      centroid = np.mean(branch_cycle_points, axis=0)
      dist = (nodes - centroid)
      dist *= dist
      dist = np.sum(dist, axis=1)
      intersect_node = np.argmin(dist)
      intersect_point = nodes[intersect_node,:]

      # distance from that vertex to the farthest branch point
      dist = np.sum((branch_cycle_points - intersect_point) ** 2, 1)
      dist = np.sqrt(np.max(dist))

      # Fix the "stargate" issue where a large loop
      # can join lots of things to the near center
      # by just making a tiny snip if the distance
      # is greater than the radius of the connected node.
      if dist > skeleton.radii[ intersect_node ]:
        edges = remove_row(edges, edges_cycle[:1,:])
        continue

      edges = remove_row(edges, edges_cycle)      

      # connect every branch point to the collapsed center
      # (float array here; recast to int32 at the top of the loop)
      new_edges = np.zeros((branch_cycle.shape[0], 2))
      new_edges[:,0] = branch_cycle
      new_edges[:,1] = intersect_node

      # avoid adding a self edge if the center is itself a branch point
      if np.isin(intersect_node, branch_cycle):
        idx = np.where(branch_cycle == intersect_node)
        new_edges = np.delete(new_edges, idx, 0)

      edges = np.concatenate((edges,new_edges), 0)

  skeleton.vertices = nodes
  skeleton.edges = edges.astype(np.uint32)
  return skeleton

def path2edge(path):
  """
  Convert an ordered node path into the edges joining
  consecutive nodes.

  path: sequence of nodes

  Returns: uint32 array of shape (len(path) - 1, 2) where
    row i is (path[i], path[i+1])
  """
  num_edges = len(path) - 1
  edges = np.empty((num_edges, 2), dtype=np.uint32)
  # column 0 holds edge starts, column 1 holds edge ends
  for column, nodes in enumerate((path[:-1], path[1:])):
    edges[:, column] = nodes
  return edges

def remove_row(array, rows2remove): 
  """
  Delete every row of `array` that matches a row of `rows2remove`,
  ignoring the order of the two entries within a row.

  Note: both inputs are sorted in place along axis 1 (rows2remove
  only when array is non-empty).

  array: 2D array of rows (edges)
  rows2remove: 2D array of rows to delete from array

  Returns: int32 array with the matching rows removed
  """
  # normalize the orientation of each row so that (a,b) matches (b,a)
  array.sort(axis=1, kind='quicksort')

  if array.size != 0:
    rows2remove.sort(axis=1, kind='quicksort')

    for target in rows2remove:
      idx = find_row(array, target)
      # -1 is find_row's "not found" marker
      if not np.any(idx == -1):
        array = np.delete(array, idx, axis=0)

  return array.astype(np.int32, copy=False)

def find_row(array, row): 
  """ 
  Locate the rows of a two column array equal to `row`.

  array: array to search
  row: row to find (only its first two entries are compared)

  Returns: array of matching row indices (empty if there are none)
  """ 
  first_matches = (array[:,0] == row[0])
  second_matches = (array[:,1] == row[1])
  located = np.where(first_matches & second_matches)

  # np.where yields one index array per dimension, so for the 
  # one dimensional mask above this fallback is not expected to trigger
  return located[0] if len(located) != 0 else -1


================================================
FILE: kimimaro/sharedmemory.py
================================================
from collections import defaultdict
import errno
import mmap
import os
import sys
import time

import multiprocessing as mp

import numpy as np

from osteoid import Bbox, Vec

from .utility import mkdir

# Directory whose presence decides whether native shared memory is used.
SHM_DIRECTORY = '/dev/shm/'
# Directory of ordinary files used to emulate shared memory otherwise.
EMULATED_SHM_DIRECTORY = '/tmp/kimimaro-shm'

# True when SHM_DIRECTORY is not an existing directory on this machine.
EMULATE_SHM = not os.path.isdir(SHM_DIRECTORY)
# The directory that applies to this platform given the check above.
PLATFORM_SHM_DIRECTORY = SHM_DIRECTORY if not EMULATE_SHM else EMULATED_SHM_DIRECTORY

class SharedMemoryReadError(Exception):
  """
  Raised when a readonly request names a shared memory location that
  has not been allocated or whose size does not match the request.
  """
  pass

class SharedMemoryAllocationError(Exception):
  """
  Raised by ndarray_shm when a request needs more additional memory
  than is reported as available.
  """
  pass

def ndarray(shape, dtype, location, order='F', readonly=False, lock=None, **kwargs):
  """
  Create a numpy array backed by shared memory.

  On platforms where EMULATE_SHM is set (no /dev/shm style shared 
  memory), the filesystem is used to emulate it via ndarray_fs; 
  otherwise ndarray_shm is used. The lock is only passed along
  on the emulated path, so it is only needed while doing 
  multiprocessing on such platforms.

  Allocating the shared array requires cleanup on your part.
  A shared memory file will be located at sharedmemory.PLATFORM_SHM_DIRECTORY + location
  and must be unlinked when you're done. It will outlive the program.

  You should also call .close() on the mmap file handle when done. However,
  this is less of a problem because the operating system will close the
  file handle on process termination.

  Parameters:
  shape: same as numpy.ndarray
  dtype: same as numpy.ndarray
  location: the shared memory filename 
  order: memory layout passed to numpy.ndarray (default 'F')
  readonly: if True, the returned array is not writeable
  lock: (optional) multiprocessing.Lock

  Returns: (mmap filehandle, shared ndarray)
  """
  if not EMULATE_SHM:
    return ndarray_shm(shape, dtype, location, readonly, order, **kwargs)

  return ndarray_fs(
    shape, dtype, location, lock, 
    readonly, order, emulate_shm=True, **kwargs
  )

def ndarray_fs(
    shape, dtype, location, lock, 
    readonly=False, order='F', emulate_shm=False,
    **kwargs
  ):
  """
  Emulate shared memory using the filesystem.

  shape, dtype, order, kwargs: passed to numpy.ndarray
  location: a filename inside EMULATED_SHM_DIRECTORY when emulate_shm
    is set, otherwise the path of the backing file itself.
  lock: if truthy, it is held while the backing file is allocated.
  readonly: if True, the returned array is not writeable.

  Returns: (mmap object, ndarray backed by it)
  """
  itemsize = np.dtype(dtype).itemsize
  total_bytes = Vec(*shape).rectVolume() * itemsize

  filename = location
  if emulate_shm:
    filename = os.path.join(mkdir(EMULATED_SHM_DIRECTORY), location)

  # Only the allocation of the backing file is serialized.
  if lock:
    lock.acquire()
    try:
      allocate_shm_file(filename, total_bytes, itemsize, readonly)
    finally:
      lock.release()
  else:
    allocate_shm_file(filename, total_bytes, itemsize, readonly)

  with open(filename, 'r+b') as handle:
    mapped = mmap.mmap(handle.fileno(), 0) # map entire file
  
  shared_array = np.ndarray(
    buffer=mapped, dtype=dtype, shape=shape, order=order, **kwargs
  )
  shared_array.setflags(write=(not readonly))
  return mapped, shared_array

def allocate_shm_file(filename, nbytes, dbytes, readonly):
  """
  Make sure a backing file of exactly nbytes exists at filename.

  filename: path of the backing file
  nbytes: required size of the file in bytes
  dbytes: bytes per element (not used by this function)
  readonly: if True, nothing is created or resized; the file 
    must already exist with exactly nbytes.

  Note: when an existing file is larger than requested it is reopened
  in 'wb' mode before being resized, and when it is smaller it is
  deleted and remade, so in both cases the previous contents are 
  not kept.

  Raises: SharedMemoryReadError if readonly is set and the file is
    missing or has a different size.
  """
  try:
    current_size = os.path.getsize(filename)
  except FileNotFoundError:
    current_size = None

  if readonly:
    if current_size is None:
      raise SharedMemoryReadError(filename + " has not been allocated. Requested " + str(nbytes) + " bytes.")
    if current_size != nbytes:
      raise SharedMemoryReadError("{} exists, but the allocation size ({} bytes) does not match the request ({} bytes).".format(
        filename, current_size, nbytes
      ))

  if current_size is not None:
    if current_size > nbytes:
      with open(filename, 'wb') as f:
        os.ftruncate(f.fileno(), nbytes)
    elif current_size < nbytes:
      # too small? just remake it below
      os.unlink(filename) 

  if os.path.exists(filename):
    return

  # Previously we were writing out real files full of zeros, 
  # but a) that takes forever and b) modern OSes support sparse
  # files (i.e. gigabytes of zeros that take up only a few real bytes).
  #
  # The following should take advantage of this functionality and be faster.
  # It should work on Python 2.7 Unix, and Python 3.5+ on Unix and Windows.
  #
  # References:
  #   https://stackoverflow.com/questions/8816059/create-file-of-particular-size-in-python
  #   https://docs.python.org/3/library/os.html#os.ftruncate
  #   https://docs.python.org/2/library/os.html#os.ftruncate
  #
  with open(filename, 'wb') as f:
    os.ftruncate(f.fileno(), nbytes)

def ndarray_shm(shape, dtype, location, readonly=False, order='F', **kwargs):
  """
  Create a shared memory numpy array. Requires /dev/shm to exist.

  shape, dtype, order, kwargs: passed to numpy.ndarray
  location: name of the shared memory segment
  readonly: if True, the segment must already exist with exactly 
    the requested size, it is opened without the create flag, and 
    the returned array is not writeable.

  Raises: 
    SharedMemoryReadError: readonly was set and the segment is 
      missing or its size does not match the request.
    SharedMemoryAllocationError: the additional bytes required exceed
      the available memory reported by psutil.
    OSError: re-raised from opening or mapping the segment.

  Returns: (mmap object, ndarray backed by it)
  """
  # imported here so they are only required when this code path is used
  import posix_ipc
  from posix_ipc import O_CREAT
  import psutil

  nbytes = Vec(*shape).rectVolume() * np.dtype(dtype).itemsize
  available = psutil.virtual_memory().available

  preexisting = 0
  # This might only work on Ubuntu
  shmloc = os.path.join(SHM_DIRECTORY, location)
  if os.path.exists(shmloc):
    preexisting = os.path.getsize(shmloc)
  elif readonly:
    raise SharedMemoryReadError(shmloc + " has not been allocated. Requested " + str(nbytes) + " bytes.")

  if readonly and preexisting != nbytes:
    raise SharedMemoryReadError("{} exists, but the allocation size ({} bytes) does not match the request ({} bytes).".format(
      shmloc, preexisting, nbytes
    ))

  # only the bytes beyond what is already allocated count against 
  # the available memory
  if (nbytes - preexisting) > available:
    overallocated = nbytes - preexisting - available
    overpercent = (100 * overallocated / (preexisting + available))
    raise SharedMemoryAllocationError("""
      Requested more memory than is available. 

      Shared Memory Location:  {}

      Shape:                   {}
      Requested Bytes:         {} 
      
      Available Bytes:         {} 
      Preexisting Bytes*:      {} 

      Overallocated Bytes*:    {} (+{:.2f}%)

      * Preexisting is only correct on linux systems that support /dev/shm/""" \
        .format(location, shape, nbytes, available, preexisting, overallocated, overpercent))

  # This might seem like we're being "extra safe" but consider
  # a threading condition where the condition of the shared memory
  # was adjusted between the check above and now. Better to make sure
  # that we don't accidently change anything if readonly is set.
  flags = 0 if readonly else O_CREAT 
  size = 0 if readonly else int(nbytes) 

  try:
    shared = posix_ipc.SharedMemory(location, flags=flags, size=size)
    array_like = mmap.mmap(shared.fd, shared.size)
    # the descriptor is closed once the mapping has been created
    os.close(shared.fd)
    renderbuffer = np.ndarray(buffer=array_like, dtype=dtype, shape=shape, order=order, **kwargs)
  except OSError as err:
    # on out of memory, remove the segment before re-raising
    if err.errno == errno.ENOMEM: # Out of Memory
      posix_ipc.unlink_shared_memory(location)      
    raise

  renderbuffer.setflags(write=(not readonly))
  return array_like, renderbuffer

def unlink(location):
  """
  Remove the shared memory at `location` using whichever 
  backend applies to this platform.

  Returns: the result of unlink_fs when shared memory is 
    emulated, otherwise the result of unlink_shm.
  """
  remover = unlink_fs if EMULATE_SHM else unlink_shm
  return remover(location)

def unlink_shm(location):
  """
  Unlink the posix shared memory segment named `location`.

  Returns: True if the segment was unlinked, False if 
    posix_ipc raised ExistentialError.
  """
  import posix_ipc

  try:
    posix_ipc.unlink_shared_memory(location)
  except posix_ipc.ExistentialError:
    return False
  else:
    return True

def unlink_fs(location):
  """
  Delete the emulated shared memory file named `location`
  inside EMULATED_SHM_DIRECTORY.

  Returns: True if the file was deleted, False if deleting 
    it raised an OSError.
  """
  directory = mkdir(EMULATED_SHM_DIRECTORY)
  target = os.path.join(directory, location)

  try:
    os.unlink(target)
  except OSError:
    return False
  return True


================================================
FILE: kimimaro/trace.py
================================================
"""
Skeletonization algorithm based on TEASAR (Sato et al. 2000).

Authors: Alex Bae and Will Silversmith
Affiliation: Seung Lab, Princeton Neuroscience Institute
Date: June 2018 - February 2025

This file is part of Kimimaro.

Kimimaro is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Kimimaro is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Kimimaro.  If not, see <https://www.gnu.org/licenses/>.
"""
from collections import defaultdict
from math import log

import dijkstra3d
import edt
import fill_voids
import numpy as np
from scipy import ndimage

import kimimaro.skeletontricks

from osteoid import Skeleton

def trace(
    labels, DBF, 
    scale=10, const=10, anisotropy=(1,1,1), 
    soma_detection_threshold=1100, 
    soma_acceptance_threshold=4000, 
    pdrf_scale=5000, pdrf_exponent=16,
    soma_invalidation_scale=0.5,
    soma_invalidation_const=0,
    fix_branching=True,
    manual_targets_before=[],
    manual_targets_after=[],
    root=None,
    max_paths=None,
    voxel_graph=None,
  ):
  """
  Given the euclidean distance transform of a label ("Distance to Boundary Function"), 
  convert it into a skeleton using an algorithm based on TEASAR. 

  labels: image of the single label to skeletonize. 
  DBF: Result of the euclidean distance transform. Must represent a single label,
       assumed to be expressed in chosen physical units (i.e. nm)
  scale: during the "rolling ball" invalidation phase, multiply the DBF value by this.
  const: during the "rolling ball" invalidation phase, this is the minimum radius in chosen physical units (i.e. nm).
  anisotropy: (x,y,z) conversion factor for voxels to chosen physical units (i.e. nm)
  soma_detection_threshold: if object has a DBF value larger than this, 
    voids in the label are filled and the DBF is recomputed if anything
    was filled. Expressed in chosen physical units (i.e. nm) 
  soma_acceptance_threshold: if, after the above, the maximum DBF value
    is larger than this, root will be placed at largest DBF value and 
    special one time invalidation will be run over that root location 
    (see soma_invalidation_scale). Expressed in chosen physical units (i.e. nm) 
  pdrf_scale: scale factor in front of dbf, used to weight dbf over euclidean distance (higher to pay more attention to dbf) (default 5000)
  pdrf_exponent: exponent in dbf formula on distance from edge, faster if factor of 2 (default 16)
  soma_invalidation_scale: the 'scale' factor used in the one time soma root invalidation (default .5)
  soma_invalidation_const: the 'const' factor used in the one time soma root invalidation (default 0)
                           (units in chosen physical units (i.e. nm))
  fix_branching: When enabled, zero out the graph edge weights traversed by 
    previously found paths. This causes branch points to occur closer to 
    the actual path divergence. However, there is a large performance penalty
    associated with this as dijkstra's algorithm is computed once per path
    rather than once per skeleton.
  manual_targets_before: list of (x,y,z) that correspond to locations that must 
    have paths drawn to. Used for specifying root and border targets for
    merging adjacent chunks out-of-core. Targets are applied before ordinary
    target selection. The passed list is copied and is not modified.
  manual_targets_after: Same as manual_targets_before but the additional 
    targets are applied after the usual algorithm runs. The current 
    invalidation status of the shape makes no difference.
  max_paths: If a label requires drawing this number of paths or more,
    abort and move onto the next label.
  root: If you want to force the root to be a particular voxel, you can
    specify it here. In soma mode the root is recomputed and the 
    specified voxel is instead added as a target.
  voxel_graph: a connection graph that defines permissible 
    directions of motion between voxels. This is useful for
    dealing with self-touches. The graph is defined by the
    conventions used in cc3d.voxel_connectivity_graph 
    (https://github.com/seung-lab/connected-components-3d/blob/3.2.0/cc3d_graphs.hpp#L73-L92)

  Based on the algorithm by:

  M. Sato, I. Bitter, M. Bender, A. Kaufman, and M. Nakajima. 
  "TEASAR: tree-structure extraction algorithm for accurate and robust skeletons"  
    Proc. the Eighth Pacific Conference on Computer Graphics and Applications. Oct. 2000.
    doi:10.1109/PCCGA.2000.883951 (https://ieeexplore.ieee.org/document/883951/)

  Returns: Skeleton object
  """
  # The target lists are inserted into, appended to, and consumed below.
  # Work on private copies so that neither the shared default lists nor
  # the caller's lists are altered, which would otherwise let targets 
  # leak from one call into the next.
  manual_targets_before = (
    list(manual_targets_before) if manual_targets_before is not None else []
  )
  manual_targets_after = (
    list(manual_targets_after) if manual_targets_after is not None else []
  )

  dbf_max = np.max(DBF)
  labels = np.asfortranarray(labels)
  DBF = np.asfortranarray(DBF)

  soma_mode = False
  # > 5000 nm, gonna be a soma or blood vessel
  # For somata: specially handle the root by 
  # placing it at the approximate center of the soma
  if dbf_max > soma_detection_threshold:
    labels, num_voxels_filled = fill_voids.fill(labels, in_place=True, return_fill_count=True)
    # the distance transform is only stale if something was filled
    if num_voxels_filled > 0:
      del DBF
      DBF = edt.edt(
        labels, 
        anisotropy=anisotropy, 
        black_border=np.all(labels),
        voxel_graph=voxel_graph,
      )
    dbf_max = np.max(DBF) 
    soma_mode = dbf_max > soma_acceptance_threshold

  soma_radius = 0.0

  if soma_mode:
    # a user specified root becomes an ordinary target since
    # the root is moved to the soma center
    if root is not None:
      manual_targets_before.insert(0, root)
    root = find_soma_root(DBF, dbf_max)    
    soma_radius = dbf_max * soma_invalidation_scale + soma_invalidation_const
  elif root is None:
    root = find_root(labels, anisotropy, voxel_graph)
  
  if root is None:
    return Skeleton()
 
  free_space_radius = 0 if not soma_mode else DBF[root]
  # DBF: Distance to Boundary Field
  # DAF: Distance from any voxel Field (distance from root field)
  # PDRF: Penalized Distance from Root Field
  DBF = kimimaro.skeletontricks.zero2inf(DBF) # DBF[ DBF == 0 ] = np.inf
  DAF, target = dijkstra3d.euclidean_distance_field(
    labels, root, 
    anisotropy=anisotropy, 
    free_space_radius=free_space_radius,
    voxel_graph=voxel_graph,
    return_max_location=True,
  )
  DAF = kimimaro.skeletontricks.inf2zero(DAF) # DAF[ DAF == np.inf ] = 0
  target_finder = kimimaro.skeletontricks.CachedTargetFinder(labels, DAF)
  PDRF = compute_pdrf(dbf_max, pdrf_scale, pdrf_exponent, DBF, DAF, DAF[target])
  del DAF

  # Use dijkstra propagation w/o a target to generate a field of
  # pointers from each voxel to its parent. Then we can rapidly
  # compute multiple paths by simply hopping pointers using path_from_parents
  if not fix_branching:
    parents = dijkstra3d.parental_field(PDRF, root, voxel_graph=voxel_graph)
    del PDRF
  else:
    parents = PDRF

  if soma_mode:
    invalidated, labels = kimimaro.skeletontricks.roll_invalidation_ball_inside_component(
      labels, DBF, 
      soma_invalidation_scale,
      soma_invalidation_const, 
      anisotropy,
      [root],
      voxel_connectivity_graph=voxel_graph,
    )
  # This target is only valid if no 
  # invalidations have occurred yet.
  elif len(manual_targets_before) == 0:
    manual_targets_before.append(target)
  
  paths = compute_paths(
    root, labels, DBF, target_finder, 
    parents, scale, const, anisotropy, 
    soma_mode, soma_radius, fix_branching,
    manual_targets_before, manual_targets_after, 
    max_paths, voxel_graph
  )

  skel = Skeleton.simple_merge(
    [ Skeleton.from_path(path) for path in paths if len(path) > 0 ]
  ).consolidate()

  # vertices are voxel coordinates here, so they index the DBF directly
  verts = skel.vertices.flatten().astype(np.uint32)
  skel.radii = DBF[verts[::3], verts[1::3], verts[2::3]]
  skel.transform = np.array([
    [anisotropy[0], 0, 0, 0],
    [0, anisotropy[1], 0, 0],
    [0, 0, anisotropy[2], 0],
  ], dtype=np.float32)

  return skel

def compute_paths(
    root, labels, DBF, target_finder, 
    parents, scale, const, anisotropy, 
    soma_mode, soma_radius, fix_branching,
    manual_targets_before, manual_targets_after,
    max_paths, voxel_graph
  ):
  """
  Trace paths through the object until it is covered.

  Each iteration picks a target, draws a path for it, and
  then invalidates the labels around that path. Iteration stops
  once no valid labels and no manual targets remain, or once
  max_paths paths have been collected.

  Target selection order:
    1. manual_targets_before (popped from the end) while any remain
    2. manual_targets_after (popped from the end) once the count
       of valid labels has reached exactly zero
    3. otherwise target_finder.find_target(labels)

  When soma_mode is set, every path vertex (other than the first
  one on the path) that lies within soma_radius of the root, 
  measured after scaling by anisotropy, is dropped.

  Side effects: parents is written to (the root entry, and when
  fix_branching is set, every vertex of each path is zeroed), and
  both manual target lists are consumed.

  max_paths: None means "number of nonzero voxels in labels".

  Returns: list of paths. An empty list is returned immediately 
    if the number of manual targets is >= max_paths.
  """
  root = tuple(root)
  remaining = np.count_nonzero(labels)

  if max_paths is None:
    max_paths = remaining

  num_manual = len(manual_targets_before) + len(manual_targets_after)
  if num_manual >= max_paths:
    return []

  parents[root] = 0 # provide initial rail for dijkstra.railroad

  traced = []
  while len(traced) < max_paths:
    if not (remaining > 0 or manual_targets_before or manual_targets_after):
      break

    if manual_targets_before:
      target = manual_targets_before.pop()
    elif remaining == 0:
      target = manual_targets_after.pop()
    else:
      target = target_finder.find_target(labels)

    if not fix_branching:
      path = dijkstra3d.path_from_parents(parents, target)
    else:
      # Draw a path (a "road") from the target to the nearest zero weighted
      # path (a "rail"). This has some minor efficiencies vs drawing
      # from a target all the way to the source. Also, target -> source
      # is much more efficient than source -> target for three reasons.
      # (a) target -> catches a rail instead of exploring all rails
      # (b) target has a natural edge effect that restrict exploration
      # (c) in soma, target -> source follows gradients vs fights them
      path = dijkstra3d.railroad(
        parents, target, voxel_graph=voxel_graph
      )

    if soma_mode:
      # keep the first vertex plus everything beyond soma_radius of root
      dist_to_soma_root = np.linalg.norm(anisotropy * (path - root), axis=1)
      outside_soma = path[dist_to_soma_root > soma_radius, :]
      path = np.concatenate((path[:1,:], outside_soma))

    if remaining > 0:
      invalidated, labels = kimimaro.skeletontricks.roll_invalidation_ball_inside_component(
        labels, DBF, scale, const, 
        anisotropy, path,
        voxel_connectivity_graph=voxel_graph,
      )
      remaining -= invalidated

    if fix_branching:
      # the finished path becomes a zero weighted rail for later paths
      for vertex in path:
        parents[tuple(vertex)] = 0.0

    traced.append(path)

  return traced

def find_soma_root(DBF, dbf_max):
  """
  Choose a root voxel among the voxels where DBF == dbf_max.

  This perhaps overcomplicates things, but it's possible,
  for example in a rectangular cuboid, for there to be
  multiple maxima at the center of a shape. We pick
  the one closest to the center of mass of the set of 
  maxima to ensure the choice is sensible.

  DBF: array compared elementwise against dbf_max
  dbf_max: the value that marks a candidate voxel

  Returns: (x,y,z) tuple of np.uint32 coordinates

  Raises: ValueError (from np.argmin) if no voxel equals dbf_max.
  """
  maxima = (DBF == dbf_max)

  # Call center_of_mass from the top level ndimage namespace.
  # The ndimage.measurements sub-namespace has been deprecated 
  # since SciPy 1.8 and warns on use.
  com = ndimage.center_of_mass(maxima)
  com = np.asarray(com, dtype=np.float32)

  # one row of coordinates per candidate voxel
  coords = np.vstack(np.where(maxima)).T
  sq_dist_to_com = np.sum((coords - com) ** 2, axis=1)
  root = np.argmin(sq_dist_to_com)

  return tuple(coords[root].astype(np.uint32))

def find_root(labels, anisotropy, voxel_graph):
  """
  "4.4 DAF:  Compute distance from any voxel field"

  Seed a euclidean distance field at the voxel reported by
  skeletontricks.first_label and return the location that
  dijkstra3d reports via return_max_location. The distance 
  field itself is discarded.

  Returns: the max location, or None when first_label finds
    no voxel in labels.
  """
  seed = kimimaro.skeletontricks.first_label(labels)
  if seed is None:
    return None

  _, max_location = dijkstra3d.euclidean_distance_field(
    labels, seed,
    voxel_graph=voxel_graph,
    return_max_location=True,
    anisotropy=anisotropy,
  )
  return max_location

def is_power_of_two(num):
  """
  Report whether num is a positive integral power of two 
  (1, 2, 4, 8, ...).

  Integral floats such as 16.0 are accepted. The bit test is
  run on the value converted to int, since the & operator is 
  not defined for floats. Non-integral, non-finite, zero, and 
  negative values return False.
  """
  try:
    as_int = int(num)
  except (OverflowError, ValueError):
    # int() rejects inf (OverflowError) and nan (ValueError)
    return False

  if as_int != num:
    return False

  # a power of two has exactly one bit set
  return as_int > 0 and (as_int & (as_int - 1)) == 0

def compute_pdrf(
  dbf_max, pdrf_scale, 
  pdrf_exponent, DBF, DAF,
  max_daf
):
  """
  "4.5 PDRF: Compute penalized distance from root voxel field"
  (pp. 4, section 4.5)

  What is computed, in float32:

    M = 1 / (dbf_max ** 1.01)
    PDRF = pdrf_scale * ((1 - DBF * M) ** pdrf_exponent)
    PDRF += DAF / max_daf    (skipped when max_daf == 0)

  IMPLEMENTATION NOTE: 
  When pdrf_exponent is a power of two below 2^16, the power
  is taken by squaring the array log2(pdrf_exponent) times.
  Repeated *= was measured to be much faster than "** f(16)":
  12,740.0 microseconds vs 4 x 560 = 2,240 microseconds (5.69x)
  Any other exponent goes through np.power.

  Side effect: when max_daf != 0, DAF is rescaled in place.

  Returns: float32 array with the shape of DBF, Fortran ordered
  """
  as_f32 = np.float32
  inv_m = as_f32( 1 / (dbf_max ** 1.01) )

  field = np.empty(DBF.shape, dtype=np.float32, order="F")
  np.multiply(DBF, inv_m, out=field)
  np.subtract(as_f32(1), field, out=field)

  # Squaring is much faster than ** which presumably
  # uses logarithms to do the exponentiation.
  can_square = is_power_of_two(pdrf_exponent) and (pdrf_exponent < (2 ** 16))
  if can_square:
    num_squarings = int(np.log2(pdrf_exponent))
    for _ in range(num_squarings):
      field *= field # ^pdrf_exponent
  else: 
    np.power(field, pdrf_exponent, out=field)

  field *= as_f32(pdrf_scale)

  # provide trickle of gradient so open spaces don't collapse
  if max_daf != 0:
    DAF *= (1 / max_daf)
    field += DAF

  return np.asfortranarray(field)

def point_to_point(
  binary_img, start, end,
  anisotropy=(1,1,1), 
  pdrf_scale=100000, 
  pdrf_exponent=4,
):
  """
  Trace a single centerline path between start and end
  through binary_img.

  A PDRF is built from the image's distance transform (DBF)
  and a euclidean distance field seeded at start (DAF), and
  dijkstra3d.dijkstra is then run over it from end to start.

  Returns: Skeleton built from the path, with skel.radii 
    read from the DBF at each vertex.
  """
  DBF = edt.edt(
    binary_img, 
    black_border=True,
    anisotropy=anisotropy,
  )
  dbf_max = np.max(DBF)
  DBF = kimimaro.skeletontricks.zero2inf(DBF) # DBF[ DBF == 0 ] = np.inf

  DAF, max_location = dijkstra3d.euclidean_distance_field(
    binary_img, start, 
    return_max_location=True,
    anisotropy=anisotropy,
  )
  DAF = kimimaro.skeletontricks.inf2zero(DAF) # DAF[ DAF == np.inf ] = 0
  max_daf = DAF[max_location]
  PDRF = compute_pdrf(
    dbf_max, pdrf_scale, pdrf_exponent, 
    DBF, DAF, max_daf
  )
  del DAF

  centerline = dijkstra3d.dijkstra(PDRF, end, start)
  skel = Skeleton.from_path(centerline)

  # vertices flattened to x0,y0,z0,x1,y1,z1,... so a stride 
  # of three picks out one coordinate axis
  flat = skel.vertices.flatten().astype(np.uint32)
  xs, ys, zs = flat[::3], flat[1::3], flat[2::3]
  skel.radii = DBF[xs, ys, zs]
  return skel


================================================
FILE: kimimaro/utility.py
================================================
from typing import Dict, Union, List, Tuple, Optional

from collections import defaultdict
import copy
import os

import numpy as np
import numpy.typing as npt
import scipy.ndimage
from tqdm import tqdm

from osteoid import Skeleton, Bbox, Vec

import kimimaro.skeletontricks

import cc3d
from crackle import CrackleArray
import dijkstra3d
import fastremap
import fill_voids
import xs3d

# Attribute descriptor registered on skeletons (via add_property)
# for the per-vertex cross sectional area values.
XS_PROP = dict(
  id="cross_sectional_area",
  data_type="float32",
  num_components=1,
)

# Attribute descriptor registered on skeletons (via add_property)
# for the per-vertex image border contact flags.
XS_CONTACT_PROP = dict(
  id="cross_sectional_area_contacts",
  data_type="uint8",
  num_components=1,
)

def toabs(path):
  """Expand a leading ~ in path and return the absolute path."""
  return os.path.abspath(os.path.expanduser(path))

def mkdir(path):
  """
  Create the directory at path, including any missing parents,
  unless something already exists there.

  path is first made absolute with toabs.

  Returns: the absolute path
  Raises: OSError if the directory cannot be created.
  """
  path = toabs(path)

  if path != '' and not os.path.exists(path):
    # exist_ok=True tolerates another process creating the 
    # directory between the exists check above and this call.
    # The previous recovery for that race called time.sleep, but
    # the time module is not imported in this file.
    os.makedirs(path, exist_ok=True)

  return path

def extract_skeleton_from_binary_image(image):
  """
  Build a Skeleton from the vertices and edges that 
  skeletontricks.extract_edges_from_binary_image returns 
  for image.
  """
  vertices, edge_list = kimimaro.skeletontricks.extract_edges_from_binary_image(image)
  return Skeleton(vertices, edge_list)

def compute_cc_labels(all_labels, voxel_graph = None):
  """
  Compute a connected components labeling of all_labels.

  all_labels: numpy array or CrackleArray of labels
  voxel_graph: optional voxel connectivity graph. When given,
    components are colored from the graph with 
    cc3d.color_connectivity_graph and then masked to the 
    nonzero voxels of all_labels.

  A CrackleArray with no voxel_graph is handled entirely by
  its own connected_components method. With a voxel_graph,
  it is first read out with all_labels[:].

  Returns: (cc_labels, remapping) where remapping comes from
    skeletontricks.get_mapping(all_labels, cc_labels), or, on
    the CrackleArray fast path, whatever connected_components
    returns with return_mapping=True.
  """
  if isinstance(all_labels, CrackleArray):
    if voxel_graph is None:
      return all_labels.connected_components(
        connectivity=26,
        memory_target=int(500e6), 
        return_mapping=True,
      )
    all_labels = all_labels[:]

  if voxel_graph is not None:
    cc_labels = cc3d.color_connectivity_graph(voxel_graph, connectivity=26)
    cc_labels *= all_labels > 0
  else:
    # Renumber only on the branch that reads the result. The
    # voxel_graph branch never looks at the renumbered labels,
    # so doing it there was a wasted pass over the volume.
    tmp_labels = all_labels
    if np.dtype(all_labels.dtype).itemsize > 1:
      tmp_labels, _ = fastremap.renumber(all_labels, in_place=False)
    cc_labels = cc3d.connected_components(tmp_labels)
    del tmp_labels

  cc_labels = fastremap.refit(cc_labels)

  remapping = kimimaro.skeletontricks.get_mapping(all_labels, cc_labels) 
  return cc_labels, remapping

def find_objects(labels):
  """  
  Find the bounding slices of each label in labels.

  scipy.ndimage.find_objects performs about 7-8x faster on C 
  ordered arrays, so we just do it that way and convert
  the results if it's in F order.

  For a CrackleArray, the array's own bounding_boxes() are
  used instead: the entry for label 0 is dropped and the 
  remaining boxes are returned sorted by label.

  Returns: list of bounding boxes. For numpy input this is the 
    scipy.ndimage.find_objects result (index i is label i+1, 
    None where a label is absent).
  """
  if isinstance(labels, CrackleArray):
    bbxes = labels.bounding_boxes()
    # Tolerate a volume with no label 0 entry. 
    # A bare pop(0) raises KeyError in that case.
    bbxes.pop(0, None)
    by_label = sorted(bbxes.items(), key=lambda pair: pair[0])
    return [ bbox for _, bbox in by_label ]

  if labels.flags['C_CONTIGUOUS']:
    return scipy.ndimage.find_objects(labels)

  # The transpose of an F ordered array is C ordered. The slices
  # come back in reversed axis order, so flip each tuple back.
  all_slices = scipy.ndimage.find_objects(labels.T)
  return [ (slcs and slcs[::-1]) for slcs in all_slices ]

def add_property(skel, prop):
  """
  Append prop to skel.extra_attributes unless an entry with 
  the same "id" is already listed there.
  """
  already_listed = any(
    existing["id"] == prop["id"] 
    for existing in skel.extra_attributes
  )
  if not already_listed:
    skel.extra_attributes.append(prop)

def shape_iterator(all_labels, skeletons, fill_holes, in_place, progress, fn):
  """
  Call fn(skel, binimg, roi) for each skeleton whose label
  can be located in all_labels.

  skeletons: a dict (its values are visited), a single 
    skeleton (anything with a .vertices attribute), or a 
    sequence of skeletons.
  all_labels: a boolean image (every skeleton is treated as 
    label 1) or a label image, which is renumbered with 
    fastremap.renumber (in place if in_place is set).
  fill_holes: pass the cutout through fill_voids.fill
  progress: show a tqdm progress bar
  fn: callback receiving the skeleton, a Fortran ordered
    binary cutout of its label, and the cutout's Bbox.

  A skeleton is skipped when its label is 0, is absent from
  the renumbering, has no bounding slices, or has a bounding
  box of volume <= 1.

  Returns: the iterable of skeletons that was walked
  """
  if type(skeletons) == dict:
    iterator = skeletons.values()
    total = len(skeletons)
  elif hasattr(skeletons, "vertices"):
    iterator = [ skeletons ]
    total = 1
  else:
    iterator = skeletons
    total = len(skeletons)

  # renumbering never yields a bool array, so this flag
  # holds for the rest of the function
  is_binary = (all_labels.dtype == bool)
  if is_binary:
    remapping = { True: 1, False: 0, 1:1, 0:0 }
  else:
    all_labels, remapping = fastremap.renumber(all_labels, in_place=in_place)

  all_slices = find_objects(all_labels)

  with tqdm(iterator, desc="Labels", disable=(not progress), total=total) as pbar:
    for skel in pbar:
      label = 1 if is_binary else skel.id
      pbar.set_postfix(label=str(label))

      if label == 0 or label not in remapping:
        continue

      label = remapping[label]
      bounds = all_slices[label - 1]
      if bounds is None:
        continue

      roi = Bbox.from_slices(bounds)
      if roi.volume() <= 1:
        continue

      # enlarge the box, then keep its minimum corner from going negative
      roi.grow(1)
      roi.minpt = Vec.clamp(roi.minpt, Vec(0,0,0), roi.maxpt)

      cutout = all_labels[roi.to_slices()]
      binimg = np.asfortranarray(cutout == label)
      if fill_holes:
        binimg = fill_voids.fill(binimg, in_place=True)

      fn(skel, binimg, roi)

  return iterator

def cross_sectional_area_single(
  binimg:npt.NDArray[np.bool_], 
  skel:Skeleton, 
  roi:Optional[Bbox] = None,
  anisotropy:npt.NDArray[np.float32] = np.array([1,1,1], dtype=np.float32),
  smoothing_window:int = 1,
  progress:bool = False,
  in_place:bool = False,
  multipass:bool = False,
  repair_contacts:bool = False,
  visualize_section_planes:bool = False,
  step:int = 1,
) -> Skeleton:
  """
  Analyze the cross sectional area for a single skeleton given 
  an overlapping binary image. For many skeletons at once, 
  use cross_sectional_area which may be faster.

  When the smoothing_window is >1, these plane normal 
  vectors will be smoothed with a rolling average. This
  is useful since there can be high frequency
  oscillations in the skeleton.

  This function will add the following attributes to
  each skeleton provided.

  skel.cross_sectional_area: float32 array of cross 
    sectional area per a vertex.

  skel.cross_sectional_area_contacts: uint8 array
    where non-zero entries indicate that the image
    border was contacted during the cross section
    computation, indicating a possible underestimate.

    The first six bits are a bitfield xxyyzz that
    tell you which image faces were touched and
    alternate from low (0) to high (size-1).

  binimg: the binary image that is sectioned.

  skel: the skeleton to analyze. It is modified (the two
    attributes above are assigned) and returned.

  roi: when provided, roi.minpt is subtracted from the 
    vertex coordinates before they are used to index binimg.

  anisotropy: when skel.space == "physical", vertices are
    divided by this and rounded to obtain voxel coordinates.
    It is also passed through to xs3d.

  progress: show a tqdm progress bar over the skeleton's paths.

  in_place: accepted, but not referenced by this function.

  multipass: When True, preserve existing cross_sectional_area
    and contact values and allow values with zero to be recalculated.
    This is useful for example, when using a large skeleton with
    different sections of an image. Very similar to repair_contacts,
    except that any vertex can be considered, not just contacts.

  repair_contacts: When True, only examine vertices
    that have a nonzero value for 
    skel.cross_sectional_area_contacts. This is intended
    to be used as a second pass after widening the image.

  visualize_section_planes: For debugging, paint section planes
    and display them using microviewer.

  step: when > 1, skip (step-1) vertices. This can be used to
    go faster. These days, evaluating a single vertex takes 
    between a few hundred microseconds to a few thousand microseconds.
      example calculation: 
      1 msec x 100,000 vertices = 100 sec
      A neuron I recently examined had over 300,000 vertices across 
      the entire dataset.
      Kimimaro's benchmark task produced 622,293 vertices over 1667 objects 
      using reasonable parameters and took a little over 4 minutes on an M3 
      processor (or about 2.5 msec/vertex). The most expensive shape was the soma.

  Returns: skel
  """
  assert step > 0
  assert smoothing_window > 0

  # debug canvas: voxels of each section plane get painted
  # with the index of the vertex that produced them
  cross_sections = None
  if visualize_section_planes:
    cross_sections = np.zeros(binimg.shape, dtype=np.uint32, order="F")

  # bring the vertices into the voxel coordinates of binimg
  if skel.space == "physical":
    all_verts = (skel.vertices / anisotropy).round().astype(int)
  else:
    all_verts = np.copy(skel.vertices)

  if roi is not None:
    all_verts -= roi.minpt

  # coordinate tuple -> vertex index, used to translate
  # path vertices back into indices of areas / contacts
  mapping = { tuple(v): i for i, v in enumerate(all_verts) }

  visited = np.zeros([ all_verts.shape[0] ], dtype=bool)

  # Continue from previously stored results when repairing or 
  # when doing a later multipass run; otherwise start from zeros.
  # Note that repair_contacts reads the stored attributes
  # unconditionally, so they must already exist on skel.
  if repair_contacts or (multipass and hasattr(skel, "cross_sectional_area")):
    areas = skel.cross_sectional_area
    contacts = skel.cross_sectional_area_contacts
  else:
    areas = np.zeros([all_verts.shape[0]], dtype=np.float32)
    contacts = np.zeros([all_verts.shape[0]], dtype=np.uint8)

  # a branch point gets one measurement per evaluation,
  # and these are averaged after all paths are processed
  branch_pts = set(skel.branches())
  branch_pt_vals = defaultdict(list)

  paths = skel.paths()

  normal = np.array([1,0,0], dtype=np.float32)

  shape = np.array(binimg.shape)

  try:
    # NOTE(review): presumably prepares the state that the
    # use_persistent_data=True calls below rely on, and
    # clear_shape releases it -- confirm against xs3d
    xs3d.set_shape(binimg)
    
    for path in tqdm(paths, disable=(not progress), desc="Cross Section Analysis Paths"):
      # same coordinate conversion as applied to all_verts above
      if skel.space == "physical":
        path = (path / anisotropy).round().astype(int)
      if roi is not None:
        path -= roi.minpt

      # The section plane normal at a vertex is the vector to the 
      # next vertex. The last vertex reuses the previous vector.
      normals = (path[1:] - path[:-1]).astype(np.float32)
      normals = np.concatenate([ normals, [normals[-1]] ])

      # Running the filter in the forward and then backwards
      # direction eliminates phase shift.
      normals = moving_average(normals, smoothing_window)
      normals = moving_average(normals[::-1], smoothing_window)[::-1]

      # scale to unit length
      normals /= np.linalg.norm(normals, axis=1, keepdims=True)   

      end_i = len(path) - 1
      ct = 0

      for i, vert in enumerate(path):
        ct += 1

        # ct counts up to step. Vertices in between are skipped,
        # but the first and last vertex of a path never are.
        if ct < step and not (i == 0 or i == end_i):
          continue
        elif ct == step:
          ct = 0

        # ignore vertices that fall outside of binimg
        if ( 
             (vert[0] < 0) 
          or (vert[0] >= shape[0])
          or (vert[1] < 0) 
          or (vert[1] >= shape[1])
          or (vert[2] < 0) 
          or (vert[2] >= shape[2])
        ):
          continue

        idx = mapping[tuple(vert)]
        normal = normals[i]

        # Measure when there is no value yet (area of zero), 
        # when the vertex is a branch point, or, in repair mode, 
        # when it has a recorded contact and has not yet been 
        # measured during this call.
        if (
          areas[idx] == 0 
          or (idx in branch_pts) 
          or (repair_contacts and contacts[idx] > 0 and not visited[idx])
        ):
          visited[idx] = True
          areas[idx], contact = xs3d.cross_sectional_area(
            binimg, vert, 
            normal, anisotropy,
            return_contact=True,
            use_persistent_data=True,
          )
          # repair mode replaces the stored flags with the new 
          # result, otherwise the flags are OR'd together
          if repair_contacts:
            contacts[idx] = contact
          else:
            contacts[idx] |= contact # accumulate for branch points
          if idx in branch_pts:
            branch_pt_vals[idx].append(areas[idx])
          if visualize_section_planes:
            img = xs3d.cross_section(
              binimg, vert, 
              normal, anisotropy,
            )
            cross_sections[img > 0] = idx
  finally:
    xs3d.clear_shape()

  if visualize_section_planes:
    import microviewer
    microviewer.view(cross_sections, seg=True)

  # a branch point reports the mean of its measurements
  for idx, vals in branch_pt_vals.items():
    areas[idx] = sum(vals) / len(vals)

  skel.cross_sectional_area = areas
  skel.cross_sectional_area_contacts = contacts

  add_property(skel, XS_PROP)
  add_property(skel, XS_CONTACT_PROP)

  return skel

def cross_sectional_area(
  all_labels:np.ndarray, 
  skeletons:Union[Dict[int,Skeleton],List[Skeleton],Skeleton],
  anisotropy:np.ndarray = np.array([1,1,1], dtype=np.float32),
  smoothing_window:int = 1,
  progress:bool = False,
  in_place:bool = False,
  fill_holes:bool = False,
  multipass:bool = False,
  repair_contacts:bool = False,
  visualize_section_planes:bool = False,
  step:int = 1,
) -> Union[Dict[int,Skeleton],List[Skeleton],Skeleton]:
  """
  Given a set of skeletons, find the cross sectional area
  for each vertex indicated by the sectioning plane
  defined by the vector pointing to the next vertex.

  When the smoothing_window is >1, these plane normal 
  vectors will be smoothed with a rolling average. This
  is useful since there can be high frequency
  oscillations in the skeleton.

  This function will add the following attributes to
  each skeleton provided.

  skel.cross_sectional_area: float32 array of cross 
    sectional area per a vertex. Skeletons that were
    never processed (e.g. their label was not found) 
    are filled with -1.

  skel.cross_sectional_area_contacts: uint8 array
    where non-zero entries indicate that the image
    border was contacted during the cross section
    computation, indicating a possible underestimate.

    The first six bits are a bitfield xxyyzz that
    tell you which image faces were touched and
    alternate from low (0) to high (size-1).

  all_labels: the label image. A CrackleArray is iterated 
    with its own each() method, and on that path skeletons
    must be a dict keyed by label (skeletons.keys() is used)
    while fill_holes and in_place are not applied. Any other 
    input goes through shape_iterator.

  skeletons: dict of skeletons, list of skeletons, or a 
    single skeleton. They are modified and returned.

  anisotropy: when skel.space == "physical", vertices are
    divided by this and rounded to obtain voxel coordinates.
    It is also passed through to xs3d.

  progress: show a progress bar.

  in_place: passed to shape_iterator, which passes it to 
    fastremap.renumber for all_labels.

  fill_holes: passed to shape_iterator, which fills each 
    binary cutout with fill_voids before it is sectioned.

  multipass: When True, preserve existing cross_sectional_area
    and contact values and allow values with zero to be recalculated.
    This is useful for example, when using a large skeleton with
    different sections of an image. Very similar to repair_contacts,
    except that any vertex can be considered, not just contacts.

  repair_contacts: When True, only examine vertices
    that have a nonzero value for 
    skel.cross_sectional_area_contacts. This is intended
    to be used as a second pass after widening the image.

  visualize_section_planes: For debugging, paint section planes
    and display them using microviewer.

  step: when > 1, skip (step-1) vertices. This can be used to
    go faster. These days, evaluating a single vertex takes 
    between a few hundred microseconds to a few thousand microseconds.
      example calculation: 
      1 msec x 100,000 vertices = 100 sec
      A neuron I recently examined had over 300,000 vertices across 
      the entire dataset.
      Kimimaro's benchmark task produced 622,293 vertices over 1667 objects 
      using reasonable parameters and took a little over 4 minutes on an M3 
      processor (or about 2.5 msec/vertex). The most expensive shape was the soma.

  Returns: skeletons
  """
  assert step > 0
  assert smoothing_window > 0

  # Per skeleton worker. Receives the skeleton, the binary
  # cutout of its label, and the bounding box of that cutout.
  # Results are stored on the skeleton, nothing is returned.
  def cross_sectional_area_helper(skel, binimg, roi):
    # debug canvas: voxels of each section plane get painted
    # with the index of the vertex that produced them
    cross_sections = None
    if visualize_section_planes:
      cross_sections = np.zeros(binimg.shape, dtype=np.uint32, order="F")

    # bring the vertices into the voxel coordinates of the cutout
    if skel.space == "physical":
      all_verts = (skel.vertices / anisotropy).round().astype(int)
    else:
      all_verts = np.copy(skel.vertices)

    all_verts -= roi.minpt

    # coordinate tuple -> vertex index, used to translate
    # path vertices back into indices of areas / contacts
    mapping = { tuple(v): i for i, v in enumerate(all_verts) }

    visited = np.zeros([ all_verts.shape[0] ], dtype=bool)

    # Continue from previously stored results when repairing or 
    # when doing a later multipass run; otherwise start from zeros.
    # Note that repair_contacts reads the stored attributes
    # unconditionally, so they must already exist on skel.
    if repair_contacts or (multipass and hasattr(skel, "cross_sectional_area")):
      areas = skel.cross_sectional_area
      contacts = skel.cross_sectional_area_contacts
    else:
      areas = np.zeros([all_verts.shape[0]], dtype=np.float32)
      contacts = np.zeros([all_verts.shape[0]], dtype=np.uint8)

    # a branch point gets one measurement per evaluation,
    # and these are averaged after all paths are processed
    branch_pts = set(skel.branches())
    branch_pt_vals = defaultdict(list)

    paths = skel.paths()

    normal = np.array([1,0,0], dtype=np.float32)

    shape = np.array(binimg.shape)

    for path in paths:
      # same coordinate conversion as applied to all_verts above
      if skel.space == "physical":
        path = (path / anisotropy).round().astype(int)
      path -= roi.minpt

      # The section plane normal at a vertex is the vector to the 
      # next vertex. The last vertex reuses the previous vector.
      normals = (path[1:] - path[:-1]).astype(np.float32)
      normals = np.concatenate([ normals, [normals[-1]] ])

      # Running the filter in the forward and then backwards
      # direction eliminates phase shift.
      normals = moving_average(normals, smoothing_window)
      normals = moving_average(normals[::-1], smoothing_window)[::-1]

      # scale to unit length
      normals /= np.linalg.norm(normals, axis=1, keepdims=True)   

      end_i = len(path) - 1
      ct = 0

      for i, vert in enumerate(path):
        ct += 1

        # ct counts up to step. Vertices in between are skipped,
        # but the first and last vertex of a path never are.
        if ct < step and not (i == 0 or i == end_i):
          continue
        elif ct == step:
          ct = 0

        # ignore vertices that fall outside of the cutout
        if ( 
             (vert[0] < 0) 
          or (vert[0] >= shape[0])
          or (vert[1] < 0) 
          or (vert[1] >= shape[1])
          or (vert[2] < 0) 
          or (vert[2] >= shape[2])
        ):
          continue

        idx = mapping[tuple(vert)]
        normal = normals[i]

        # Measure when there is no value yet (area of zero), 
        # when the vertex is a branch point, or, in repair mode, 
        # when it has a recorded contact and has not yet been 
        # measured during this call.
        if (
          areas[idx] == 0 
          or (idx in branch_pts) 
          or (repair_contacts and contacts[idx] > 0 and not visited[idx])
        ):
          visited[idx] = True
          areas[idx], contact = xs3d.cross_sectional_area(
            binimg, vert, 
            normal, anisotropy,
            return_contact=True,
            use_persistent_data=True,
          )
          # repair mode replaces the stored flags with the new 
          # result, otherwise the flags are OR'd together
          if repair_contacts:
            contacts[idx] = contact
          else:
            contacts[idx] |= contact # accumulate for branch points
          if idx in branch_pts:
            branch_pt_vals[idx].append(areas[idx])
          if visualize_section_planes:
            img = xs3d.cross_section(
              binimg, vert, 
              normal, anisotropy,
            )
            cross_sections[img > 0] = idx

    if visualize_section_planes:
      import microviewer
      microviewer.view(cross_sections, seg=True)

    # a branch point reports the mean of its measurements
    for idx, vals in branch_pt_vals.items():
      areas[idx] = sum(vals) / len(vals)

    skel.cross_sectional_area = areas
    skel.cross_sectional_area_contacts = contacts

  try:
    # NOTE(review): presumably prepares the state that the
    # use_persistent_data=True calls in the helper rely on, and
    # clear_shape releases it -- confirm against xs3d
    xs3d.set_shape(all_labels)
    if isinstance(all_labels, CrackleArray):
      # CrackleArray path: the array yields (label, cropped binary 
      # image) pairs for the requested labels itself
      bboxes = all_labels.bounding_boxes()
      iterator = tqdm(
        all_labels.each(crop=True, labels=list(skeletons.keys())),
        disable=(not progress),
        desc="Cross Section Analysis Paths"
      )
      for label, binimg in iterator:
        slc = Bbox.from_slices(bboxes[label])
        cross_sectional_area_helper(skeletons[label], binimg, slc)
    else:
      shape_iterator(
        all_labels, skeletons, 
        fill_holes, in_place, progress, 
        cross_sectional_area_helper
      )
  finally:
    xs3d.clear_shape()

  # normalize the accepted input forms into something iterable
  if hasattr(skeletons, "vertices"):
    skelitr = [ skeletons ]
  elif isinstance(skeletons, dict):
    skelitr = skeletons.values()
  else:
    skelitr = iter(skeletons)

  for skel in skelitr:
    add_property(skel, XS_PROP)
    add_property(skel, XS_CONTACT_PROP)

    # Skeletons the helper never ran on still receive both
    # attributes: an area of -1 and contact flags of 0.
    if not hasattr(skel, "cross_sectional_area"):
      skel.cross_sectional_area = np.full(len(skel.vertices), -1, dtype=np.float32, order="F")
    if not hasattr(skel, "cross_sectional_area_contacts"):
      skel.cross_sectional_area_contacts = np.zeros(len(skel.vertices), dtype=np.uint8, order="F")

  return skeletons

def oversegment(
  all_labels:np.ndarray, 
  skeletons:Union[Dict[int,Skeleton],List[Skeleton],Skeleton],
  anisotropy:np.ndarray = np.array([1,1,1], dtype=np.float32),
  progress:bool = False,
  fill_holes:bool = False,
  in_place:bool = False,
  downsample:int = 0,
) -> Tuple[np.ndarray, Union[Dict[int,Skeleton],List[Skeleton],Skeleton]]:
  """
  Use skeletons to create an oversegmentation of a pre-existing set
  of labels. This is useful for proofreading systems that work by merging
  labels.

  For each skeleton, the feature map of a euclidean distance field 
  seeded at its vertices is computed inside the skeleton's binary 
  cutout. The feature maps of all skeletons are offset so they do 
  not overlap in value, composited into a single image, and that 
  image is renumbered with fastremap.renumber.

  The input skeletons are deep copied, and the copies are what 
  is modified and returned. Each one gets a new attribute 
  skel.segments holding the composite image's value at each vertex.

  progress, fill_holes, in_place: passed to shape_iterator
  downsample: when > 0, the seeds are the vertices of 
    skel.downsample(downsample) rather than of skel itself.

  Returns: (all_features, skeletons)
  """
  segments_prop = {
    "id": "segments",
    "data_type": "uint64",
    "num_components": 1,
  }

  skeletons = copy.deepcopy(skeletons)

  # flat view of the (copied) skeletons for the final pass
  if hasattr(skeletons, "vertices"):
    skeleton_list = [skeletons]
  elif isinstance(skeletons, dict):
    skeleton_list = list(skeletons.values())
  else:
    skeleton_list = skeletons

  next_label = 0
  all_features = np.zeros(all_labels.shape, dtype=np.uint64, order="F")

  def oversegment_helper(skel, binimg, roi):
    nonlocal next_label

    seed_skel = skel.downsample(downsample) if downsample > 0 else skel

    seeds = (seed_skel.vertices / anisotropy).round().astype(int)
    seeds -= roi.minpt

    field, feature_map = dijkstra3d.euclidean_distance_field(
      binimg, seeds, 
      return_feature_map=True,
      anisotropy=anisotropy, 
    )
    del field

    add_property(skel, segments_prop)

    # Fortran order efficient version of:
    # feature_map[binimg] += next_label
    foreground = binimg.ravel('F')
    flat_features = feature_map.ravel('F')
    flat_features[foreground] += next_label

    next_label += seeds.shape[0]
    all_features[roi.to_slices()] += feature_map

  shape_iterator(
    all_labels, skeletons, fill_holes, in_place, progress, 
    oversegment_helper
  )

  all_features, _ = fastremap.renumber(all_features)

  for skel in skeleton_list:
    voxels = (skel.vertices / anisotropy).round().astype(int)
    skel.segments = all_features[voxels[:,0], voxels[:,1], voxels[:,2]]

  return all_features, skeletons

# From SO: https://stackoverflow.com/questions/14313510/how-to-calculate-rolling-moving-average-using-python-numpy-scipy
def moving_average(a:np.ndarray, n:int, mode:str = "symmetric") -> np.ndarray:
  """
  Smooth an array along its first axis using a trailing window
  of n samples.

  The array is padded by n entries at both ends of axis 0 (via
  np.pad with the given mode) so that the output has the same
  shape as the input. Output element i is the mean of the n
  padded samples ending at (and including) input index i.

  a: array to smooth. Averaging is performed along axis 0 and
    any trailing axes are treated as independent channels.
  n: window size in samples, must be >= 1.
  mode: padding mode forwarded to np.pad.

  Raises ValueError if n < 1.

  Returns: a float array with the same shape as a. If n == 1 or
    a is empty, a itself is returned unchanged.
  """
  if n <= 0:
    raise ValueError(f"Window size ({n}), must be >= 1.")
  elif n == 1:
    return a

  if len(a) == 0:
    return a

  # Pad only the axis being averaged. Padding the trailing axes as
  # well (which a bare [n, n] pad width does for ndim >= 3) would
  # return an array larger than the input along those axes.
  pad_width = [[n, n]] + [[0, 0]] * (a.ndim - 1)
  a = np.pad(a, pad_width, mode=mode)

  # Windowed sums via differences of the running total: 
  # cumsum[i + n] - cumsum[i] is the sum of n consecutive samples.
  ret = np.cumsum(a, dtype=float, axis=0)
  # The trailing [:-n] drops the windows that end inside the right
  # hand padding so the result lines up with the original samples.
  ret = (ret[n:] - ret[:-n])[:-n]
  ret /= float(n)
  return ret


================================================
FILE: kimimaro_cli/LICENSE
================================================
                    GNU GENERAL PUBLIC LICENSE
                       Version 3, 29 June 2007

 Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
 Everyone is permitted to copy and distribute verbatim copies
 of this license document, but changing it is not allowed.

                            Preamble

  The GNU General Public License is a free, copyleft license for
software and other kinds of works.

  The licenses for most software and other practical works are designed
to take away your freedom to share and change the works.  By contrast,
the GNU General Public License is intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users.  We, the Free Software Foundation, use the
GNU General Public License for most of our software; it applies also to
any other work released this way by its authors.  You can apply it to
your programs, too.

  When we speak of free software, we are referring to freedom, not
price.  Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.

  To protect your rights, we need to prevent others from denying you
these rights or asking you to surrender the rights.  Therefore, you have
certain responsibilities if you distribute copies of the software, or if
you modify it: responsibilities to respect the freedom of others.

  For example, if you distribute copies of such a program, whether
gratis or for a fee, you must pass on to the recipients the same
freedoms that you received.  You must make sure that they, too, receive
or can get the source code.  And you must show them these terms so they
know their rights.

  Developers that use the GNU GPL protect your rights with two steps:
(1) assert copyright on the software, and (2) offer you this License
giving you legal permission to copy, distribute and/or modify it.

  For the developers' and authors' protection, the GPL clearly explains
that there is no warranty for this free software.  For both users' and
authors' sake, the GPL requires that modified versions be marked as
changed, so that their problems will not be attributed erroneously to
authors of previous versions.

  Some devices are designed to deny users access to install or run
modified versions of the software inside them, although the manufacturer
can do so.  This is fundamentally incompatible with the aim of
protecting users' freedom to change the software.  The systematic
pattern of such abuse occurs in the area of products for individuals to
use, which is precisely where it is most unacceptable.  Therefore, we
have designed this version of the GPL to prohibit the practice for those
products.  If such problems arise substantially in other domains, we
stand ready to extend this provision to those domains in future versions
of the GPL, as needed to protect the freedom of users.

  Finally, every program is threatened constantly by software patents.
States should not allow patents to restrict development and use of
software on general-purpose computers, but in those that do, we wish to
avoid the special danger that patents applied to a free program could
make it effectively proprietary.  To prevent this, the GPL assures that
patents cannot be used to render the program non-free.

  The precise terms and conditions for copying, distribution and
modification follow.

                       TERMS AND CONDITIONS

  0. Definitions.

  "This License" refers to version 3 of the GNU General Public License.

  "Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.

  "The Program" refers to any copyrightable work licensed under this
License.  Each licensee is addressed as "you".  "Licensees" and
"recipients" may be individuals or organizations.

  To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy.  The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.

  A "covered work" means either the unmodified Program or a work based
on the Program.

  To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy.  Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.

  To "convey" a work means any kind of propagation that enables other
parties to make or receive copies.  Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.

  An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License.  If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.

  1. Source Code.

  The "source code" for a work means the preferred form of the work
for making modifications to it.  "Object code" means any non-source
form of a work.

  A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.

  The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form.  A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.

  The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities.  However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work.  For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.

  The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.

  The Corresponding Source for a work in source code form is that
same work.

  2. Basic Permissions.

  All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met.  This License explicitly affirms your unlimited
permission to run the unmodified Program.  The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work.  This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.

  You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force.  You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright.  Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.

  Conveying under any other circumstances is permitted solely under
the conditions stated below.  Sublicensing is not allowed; section 10
makes it unnecessary.

  3. Protecting Users' Legal Rights From Anti-Circumvention Law.

  No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.

  When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.

  4. Conveying Verbatim Copies.

  You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.

  You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.

  5. Conveying Modified Source Versions.

  You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:

    a) The work must carry prominent notices stating that you modified
    it, and giving a relevant date.

    b) The work must carry prominent notices stating that it is
    released under this License and any conditions added under section
    7.  This requirement modifies the requirement in section 4 to
    "keep intact all notices".

    c) You must license the entire work, as a whole, under this
    License to anyone who comes into possession of a copy.  This
    License will therefore apply, along with any applicable section 7
    additional terms, to the whole of the work, and all its parts,
    regardless of how they are packaged.  This License gives no
    permission to license the work in any other way, but it does not
    invalidate such permission if you have separately received it.

    d) If the work has interactive user interfaces, each must display
    Appropriate Legal Notices; however, if the Program has interactive
    interfaces that do not display Appropriate Legal Notices, your
    work need not make them do so.

  A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit.  Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.

  6. Conveying Non-Source Forms.

  You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:

    a) Convey the object code in, or embodied in, a physical product
    (including a physical distribution medium), accompanied by the
    Corresponding Source fixed on a durable physical medium
    customarily used for software interchange.

    b) Convey the object code in, or embodied in, a physical product
    (including a physical distribution medium), accompanied by a
    written offer, valid for at least three years and valid for as
    long as you offer spare parts or customer support for that product
    model, to give anyone who possesses the object code either (1) a
    copy of the Corresponding Source for all the software in the
    product that is covered by this License, on a durable physical
    medium customarily used for software interchange, for a price no
    more than your reasonable cost of physically performing this
    conveying of source, or (2) access to copy the
    Corresponding Source from a network server at no charge.

    c) Convey individual copies of the object code with a copy of the
    written offer to provide the Corresponding Source.  This
    alternative is allowed only occasionally and noncommercially, and
    only if you received the object code with such an offer, in accord
    with subsection 6b.

    d) Convey the object code by offering access from a designated
    place (gratis or for a charge), and offer equivalent access to the
    Corresponding Source in the same way through the same place at no
    further charge.  You need not require recipients to copy the
    Corresponding Source along with the object code.  If the place to
    copy the object code is a network server, the Corresponding Source
    may be on a different server (operated by you or a third party)
    that supports equivalent copying facilities, provided you maintain
    clear directions next to the object code saying where to find the
    Corresponding Source.  Regardless of what server hosts the
    Corresponding Source, you remain obligated to ensure that it is
    available for as long as needed to satisfy these requirements.

    e) Convey the object code using peer-to-peer transmission, provided
    you inform other peers where the object code and Corresponding
    Source of the work are being offered to the general public at no
    charge under subsection 6d.

  A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.

  A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling.  In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage.  For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product.  A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.

  "Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source.  The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.

  If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information.  But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).

  The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed.  Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.

  Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.

  7. Additional Terms.

  "Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law.  If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.

  When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it.  (Additional permissions may be written to require their own
removal in certain cases when you modify the work.)  You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.

  Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:

    a) Disclaiming warranty or limiting liability differently from the
    terms of sections 15 and 16 of this License; or

    b) Requiring preservation of specified reasonable legal notices or
    author attributions in that material or in the Appropriate Legal
    Notices displayed by works containing it; or

    c) Prohibiting misrepresentation of the origin of that material, or
    requiring that modified versions of such material be marked in
    reasonable ways as different from the original version; or

    d) Limiting the use for publicity purposes of names of licensors or
    authors of the material; or

    e) Declining to grant rights under trademark law for use of some
    trade names, trademarks, or service marks; or

    f) Requiring indemnification of licensors and authors of that
    material by anyone who conveys the material (or modified versions of
    it) with contractual assumptions of liability to the recipient, for
    any liability that these contractual assumptions directly impose on
    those licensors and authors.

  All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10.  If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term.  If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.

  If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.

  Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.

  8. Termination.

  You may not propagate or modify a covered work except as expressly
provided under this License.  Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).

  However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
prior to 60 days after the cessation.

  Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.

  Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License.  If your rights have been terminated and not permanently
reinstated, you do not qualify to receive new licenses for the same
material under section 10.

  9. Acceptance Not Required for Having Copies.

  You are not required to accept this License in order to receive or
run a copy of the Program.  Ancillary propagation of a covered work
occurring solely as a consequence of using peer-to-peer transmission
to receive a copy likewise does not require acceptance.  However,
nothing other than this License grants you permission to propagate or
modify any covered work.  These actions infringe copyright if you do
not accept this License.  Therefore, by modifying or propagating a
covered work, you indicate your acceptance of this License to do so.

  10. Automatic Licensing of Downstream Recipients.

  Each time you convey a covered work, the recipient automatically
receives a license from the original licensors, to run, modify and
propagate that work, subject to this License.  You are not responsible
for enforcing compliance by third parties with this License.

  An "entity transaction" is a transaction transferring control of an
organization, or substantially all assets of one, or subdividing an
organization, or merging organizations.  If propagation of a covered
work results from an entity transaction, each party to that
transaction who receives a copy of the work also receives whatever
licenses to the work the party's predecessor in interest had or could
give under the previous paragraph, plus a right to possession of the
Corresponding Source of the work from the predecessor in interest, if
the predecessor has it or can get it with reasonable efforts.

  You may not impose any further restrictions on the exercise of the
rights granted or affirmed under this License.  For example, you may
not impose a license fee, royalty, or other charge for exercise of
rights granted under this License, and you may not initiate litigation
(including a cross-claim or counterclaim in a lawsuit) alleging that
any patent claim is infringed by making, using, selling, offering for
sale, or importing the Program or any portion of it.

  11. Patents.

  A "contributor" is a copyright holder who authorizes use under this
License of the Program or a work on which the Program is based.  The
work thus licensed is called the contributor's "contributor version".

  A contributor's "essential patent claims" are all patent claims
owned or controlled by the contributor, whether already acquired or
hereafter acquired, that would be infringed by some manner, permitted
by this License, of making, using, or selling its contributor version,
but do not include claims that would be infringed only as a
consequence of further modification of the contributor version.  For
purposes of this definition, "control" includes the right to grant
patent sublicenses in a manner consistent with the requirements of
this License.

  Each contributor grants you a non-exclusive, worldwide, royalty-free
patent license under the contributor's essential patent claims, to
make, use, sell, offer for sale, import and otherwise run, modify and
propagate the contents of its contributor version.

  In the following three paragraphs, a "patent license" is any express
agreement or commitment, however denominated, not to enforce a patent
(such as an express permission to practice a patent or covenant not to
sue for patent infringement).  To "grant" such a patent license to a
party means to make such an agreement or commitment not to enforce a
patent against the party.

  If you convey a covered work, knowingly relying on a patent license,
and the Corresponding Source of the work is not available for anyone
to copy, free of charge and under the terms of this License, through a
publicly available network server or other readily accessible means,
then you must either (1) cause the Corresponding Source to be so
available, or (2) arrange to deprive yourself of the benefit of the
patent license for this particular work, or (3) arrange, in a manner
consistent with the requirements of this License, to extend the patent
license to downstream recipients.  "Knowingly relying" means you have
actual knowledge that, but for the patent license, your conveying the
covered work in a country, or your recipient's use of the covered work
in a country, would infringe one or more identifiable patents in that
country that you have reason to believe are valid.

  If, pursuant to or in connection with a single transaction or
arrangement, you convey, or propagate by procuring conveyance of, a
covered work, and grant a patent license to some of the parties
receiving the covered work authorizing them to use, propagate, modify
or convey a specific copy of the covered work, then the patent license
you grant is automatically extended to all recipients of the covered
work and works based on it.

  A patent license is "discriminatory" if it does not include within
the scope of its coverage, prohibits the exercise of, or is
conditioned on the non-exercise of one or more of the rights that are
specifically granted under this License.  You may not convey a covered
work if you are a party to an arrangement with a third party that is
in the business of distributing software, under which you make payment
to the third party based on the extent of your activity of conveying
the work, and under which the third party grants, to any of the
parties who would receive the covered work from you, a discriminatory
patent license (a) in connection with copies of the covered work
conveyed by you (or copies made from those copies), or (b) primarily
for and in connection with specific products or compilations that
contain the covered work, unless you entered into that arrangement,
or that patent license was granted, prior to 28 March 2007.

  Nothing in this License shall be construed as excluding or limiting
any implied license or other defenses to infringement that may
otherwise be available to you under applicable patent law.

  12. No Surrender of Others' Freedom.

  If conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License.  If you cannot convey a
covered work so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you may
not convey it at all.  For example, if you agree to terms that obligate you
to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program.

  13. Use with the GNU Affero General Public License.

  Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU Affero General Public License into a single
combined work, and to convey the resulting work.  The terms of this
License will continue to apply to the part which is the covered work,
but the special requirements of the GNU Affero General Public License,
section 13, concerning interaction through a network will apply to the
combination as such.

  14. Revised Versions of this License.

  The Free Software Foundation may publish revised and/or new versions of
the GNU General Public License from time to time.  Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.

  Each version is given a distinguishing version number.  If the
Program specifies that a certain numbered version of the GNU General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation.  If the Program does not specify a version number of the
GNU General Public License, you may choose any version ever published
by the Free Software Foundation.

  If the Program specifies that a proxy can decide which future
versions of the GNU General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.

  Later license versions may give you additional or different
permissions.  However, no additional obligations are imposed on any
author or copyright holder as a result of your choosing to follow a
later version.

  15. Disclaimer of Warranty.

  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.

  16. Limitation of Liability.

  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.

  17. Interpretation of Sections 15 and 16.

  If the disclaimer of warranty and limitation of liability provided
above cannot be given local legal effect according to their terms,
reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.

                     END OF TERMS AND CONDITIONS

            How to Apply These Terms to Your New Programs

  If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.

  To do so, attach the following notices to the program.  It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.

    <one line to give the program's name and a brief idea of what it does.>
    Copyright (C) <year>  <name of author>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.

Also add information on how to contact you by electronic and paper mail.

  If the program does terminal interaction, make it output a short
notice like this when it starts in an interactive mode:

    <program>  Copyright (C) <year>  <name of author>
    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
    This is free software, and you are welcome to redistribute it
    under certain conditions; type `show c' for details.

The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License.  Of course, your program's commands
might be different; for a GUI interface, you would use an "about box".

  You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU GPL, see
<https://www.gnu.org/licenses/>.

  The GNU General Public License does not permit incorporating your program
into proprietary programs.  If your program is a subroutine library, you
may consider it more useful to permit linking proprietary applications with
the library.  If this is what you want to do, use the GNU Lesser General
Public License instead of this License.  But first, please read
<https://www.gnu.org/licenses/why-not-lgpl.html>.


================================================
FILE: kimimaro_cli/__init__.py
================================================
import os

import click
import numpy as np
from osteoid import Skeleton

import kimimaro
from kimimaro.utility import mkdir
import fastremap
from tqdm import tqdm

from . import codecs

class Tuple3(click.ParamType):
  """Click parameter type that parses "a,b,c" into a tuple of three ints."""
  name = 'tuple3'
  def convert(self, value, param, ctx):
    # Values that are not strings are handed back untouched.
    if not isinstance(value, str):
      return value

    try:
      value = tuple(int(piece) for piece in value.split(','))
    except ValueError:
      # `value` is still the raw string here because the assignment failed.
      self.fail(f"'{value}' does not contain a comma delimited list of 3 integers.")

    if len(value) != 3:
      self.fail(f"'{value}' does not contain a comma delimited list of 3 integers.")

    return value


# Root command group for the CLI. The docstring below is rendered by click
# as the top-level help text, so it is written for end users.
@click.group()
def main():
  """
  Skeletonize all labels in a segmented volumetric image
  by applying a TEASAR based algorithm and outputs them
  as SWC.

  Does not accept continuously valued images such as raw
  microscopy images.

  Input File Formats Supported: npy

  This program is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version. Run "kimimaro license" for details.
  """
  pass

@main.command()
@click.argument("src")
@click.option('--scale', type=float, default=4, help="Adds multiple of boundary distance to invalidation zone. (You should set this!)", show_default=True)
@click.option('--const', type=float, default=10, help="Adds constant physical distance to invalidation zone. (You should set this!)", show_default=True)
@click.option('--pdrf-scale', type=int, default=100000, help="Constant multiplier of penalty field.", show_default=True)
@click.option('--pdrf-exponent', type=int, default=4, help="Exponent of penalty field. Powers of two are faster. Too big can cause floating point errors.", show_default=True)
@click.option('--soma-detect', type=float, default=750, help="Perform more expensive check for somas for distance to boundary values above this threshold. e.g. 750 nm", show_default=True)
@click.option('--soma-accept', type=float, default=1100, help="Distance to boundary values above this threshold trigger special soma processing. e.g. 750 nm", show_default=True)
@click.option('--soma-scale', type=float, default=2, help="Adds multiple of boundary distance to invalidation zone around a soma. (You should set this!)", show_default=True)
@click.option('--soma-const', type=float, default=300, help="Adds constant physical distance to invalidation zone around a soma. (You should set this!)", show_default=True)
@click.option('--anisotropy', type=Tuple3(), default="1,1,1", help="Physical size of voxel in x,y,z axes.", show_default=True)
@click.option('--dust', type=int, default=1000, help="Skip connected components with fewer voxels than this.", show_default=True)
@click.option('--progress', is_flag=True, default=False, help="Show progress bar.", show_default=True)
@click.option('--fill-holes/--no-fill-holes', is_flag=True, default=True, help="Fill holes in each connected component. (slower)", show_default=True)
@click.option('--fix-avocados', is_flag=True, default=False, help="Use heuristics to combine nucleii with cell bodies. (slower)", show_default=True)
@click.option('--fix-borders', is_flag=True, default=False, help="Center the skeleton where the shape contacts the border.", show_default=True)
@click.option('--fix-branches/--no-fix-branches', is_flag=True, default=True, help="Improves quality of forked shapes. (slower for highly branched shapes)", show_default=True)
@click.option('--max-paths', type=int, default=None, help="Maximum number of paths to trace per object.", show_default=True)
@click.option('-p', '--parallel', type=int, default=1, help="Number of processes to use.", show_default=True)
@click.option('-o', '--outdir', type=str, default="kimimaro_out", help="Where to write the SWC files.", show_default=True)
@click.option("--cross-section", type=int, default=0, help="Turn on cross section analysis. The integer value gives the normal smoothing window, 0=off.", show_default=True)
def forge(
  src,
  scale, const, pdrf_scale, pdrf_exponent,
  soma_detect, soma_accept, soma_scale, soma_const,
  anisotropy, dust, progress, fill_holes,
  fix_avocados, fix_branches, fix_borders,
  parallel, max_paths, outdir, cross_section,
):
  """Skeletonize an input image and write out SWCs."""
  # Notes on the option declarations above:
  # * --fix-branches defaults to on, so it is declared as an on/off pair
  #   (like --fill-holes); a lone flag with default=True has no reliable
  #   way to be switched off from the command line.
  # * --pdrf-scale is an int option, so its default is an int literal.
  labels = codecs.load(src)

  # The CLI option names are shortened; map them onto the keys that
  # kimimaro.skeletonize receives in teasar_params.
  skels = kimimaro.skeletonize(
    labels,
    teasar_params={
      "scale": scale,
      "const": const,
      "pdrf_scale": pdrf_scale,
      "pdrf_exponent": pdrf_exponent,
      "soma_detection_threshold": soma_detect,
      "soma_acceptance_threshold": soma_accept,
      "soma_invalidation_scale": soma_scale,
      "soma_invalidation_const": soma_const,
      "max_paths": max_paths,
    },
    anisotropy=anisotropy,
    dust_threshold=dust,
    progress=progress,
    fill_holes=fill_holes,
    fix_avocados=fix_avocados,
    fix_branching=fix_branches,
    fix_borders=fix_borders,
    parallel=parallel,
  )

  directory = mkdir(outdir)

  # One SWC file per label, named after the label.
  for label, skel in skels.items():
    fname = os.path.join(directory, f"{label}.swc")
    with open(fname, "wt") as f:
      f.write(skel.to_swc())

  if progress:
    print(f"kimimaro: wrote {len(skels)} skeletons to {directory}")

  # Optional second pass: a positive value doubles as the smoothing window.
  if cross_section > 0:
    skels = kimimaro.cross_sectional_area(
      labels,
      skels,
      anisotropy=anisotropy,
      progress=progress,
      smoothing_window=cross_section,
      fill_holes=fill_holes,
    )

    # The cross section results are saved as .npy files next to the
    # SWC files written above.
    for label, skel in skels.items():
      fname = os.path.join(directory, f"{label}_xs_area.npy")
      np.save(fname, skel.cross_sectional_area)
      fname = os.path.join(directory, f"{label}_xs_area_contacts.npy")
      np.save(fname, skel.cross_sectional_area_contacts)

    if progress:
      print(f"Wrote cross sectional area and border contacts to {directory}")

# Sub-group of `main`; the "from" and "to" conversion commands in this
# module are registered on it via @swc.command(...).
@main.group()
def swc():
  """Utilities for managing SWC files. Use forge to create new skeletons."""
  pass

@swc.command("from")
@click.argument("src", nargs=-1)
def from_image(src):
  """Convert a binary image that has already been skeletonized by a thinning algorithm into an swc."""
  # Each input path is loaded, converted to a skeleton, and written next to
  # the source file with its last extension replaced by ".swc".
  for srcpath in tqdm(src):
    try:
      image = codecs.load(srcpath)
    except ImportError:
      # codecs.load imports optional format libraries lazily.
      print(f"kimimaro: {srcpath} format not installed.")
      return

    skel = kimimaro.extract_skeleton_from_binary_image(image)

    # Derive the output name from the input path (previously `basename`
    # was referenced without ever being assigned, raising NameError).
    basename, _ = os.path.splitext(srcpath)

    with open(f"{basename}.swc", "wt") as f:
      f.write(skel.to_swc())

@swc.command("to")
@click.argument("src", nargs=-1)
@click.option('--format', type=str, default="npy", help="Which format to use. Options: npy, tiff", show_default=True)
def to_image(src, format):
  """Convert an swc into a binary image."""
  if format not in ("npy", "tiff"):
    print(f"kimimaro: invalid format {format}. npy or tiff allowed.")
    # Stop here: nothing below can handle an unknown format.
    return

  for srcpath in tqdm(src):
    with open(srcpath, 'rt') as f:
      skel = Skeleton.from_swc(f.read())

    vertices = np.asarray(skel.vertices)
    if vertices.shape[0] == 0:
      # No points means no bounding box to draw into.
      print(f"kimimaro: {srcpath} contains no vertices. Skipping.")
      continue

    # Voxel coordinates relative to the minimum corner of the bounding box.
    minpt = np.floor(vertices.min(axis=0))
    drawpts = np.floor(vertices - minpt).astype(np.int64)

    # +1 so that the vertex at the maximum coordinate lands inside the image.
    size_x, size_y, size_z = (int(extent) for extent in drawpts.max(axis=0) + 1)

    # The array is laid out (z, y, x), matching the 'ZYX' axes metadata
    # passed to tifffile below.
    image = np.zeros((size_z, size_y, size_x), dtype=bool, order='F')

    # Index with the coordinates themselves: one voxel per vertex.
    image[drawpts[:, 2], drawpts[:, 1], drawpts[:, 0]] = True

    basename, ext = os.path.splitext(srcpath)

    if format == "npy":
      np.save(f"{basename}.npy", image)
    elif format == "tiff":
      try:
        import tifffile
        tifffile.imwrite(f"{basename}.tiff",
                         image.astype(np.float32, copy=False),
                         photometric='minisblack',
                         metadata={'axes': 'ZYX'},
                         imagej=True)
      except ImportError:
        print("kimimaro: tifffile not installed. Run pip install tifffile.")
        return
    else:
      raise ValueError("should never happen")

@main.command()
@click.argument("filename")
@click.option('--port', type=int, default=8080, help="Which port to run the microviewer on for npy files.", show_default=True)
@click.option('--color-by', type=str, default='r', help="For skeleton visualization. r = radius, c = components, x = cross sectional area (if available).", show_default=True)
def view(filename, port, color_by):
  """Visualize a .swc or .npy file."""
  # Imported here rather than at module level, so the other commands do
  # not need microviewer to be importable.
  import microviewer

  basename, ext = os.path.splitext(filename)

  # Dispatch on the file extension: skeletons are shown as objects,
  # label volumes as segmentation images.
  if ext == ".swc":
    with open(filename, "rt") as swc:
      skel = Skeleton.from_swc(swc.read())
    microviewer.objects([ skel ], skeleton_color_by=color_by)
  elif ext == ".npy":
    labels = np.load(filename)
    microviewer.view(labels, seg=True, port=port)
  elif ext == ".ckl":
    import crackle
    labels = crackle.load(filename)
    microviewer.view(labels, seg=True, port=port)
  else:
    # f-string so the offending path is actually interpolated.
    print(f"kimimaro: {filename} was not a .swc, .npy, or .ckl file.")

@main.command()
def license():
  """Prints the license for this library and cli tool."""
  # The LICENSE file is looked up in the same directory as this module.
  package_dir = os.path.dirname(__file__)
  license_path = os.path.join(package_dir, 'LICENSE')
  with open(license_path, 'rt') as handle:
    text = handle.read()
  print(text)


================================================
FILE: kimimaro_cli/codecs.py
================================================
import numpy as np
import os
import gzip

def normalize_file_ext(filename):
  """
  Pick the extension that identifies a file's format.

  Extensions are peeled off the right end of the name one at a time.
  The first '.ckl' or '.cpso' encountered is returned immediately;
  otherwise peeling continues until none remain and the last one
  peeled (the leftmost extension) is returned. A name without any
  extension yields ''.

  e.g. 'a.npy' -> '.npy', 'a.npy.gz' -> '.npy', 'a.ckl.gz' -> '.ckl'
  """
  stem, ext = os.path.splitext(filename)

  while ext not in ('.ckl', '.cpso'):
    stem, inner = os.path.splitext(stem)
    if inner == '':
      # Nothing left to peel; keep the last extension found.
      break
    ext = inner

  return ext

def load(filename):
  """
  Load an image volume from disk, choosing the reader by file extension.

  Supported extensions (as resolved by normalize_file_ext): .ckl, .npy
  (optionally gzip compressed when the name ends in .gz), .nrrd, .nii,
  .tif and .tiff. The reader libraries for every format except npy are
  imported on demand, so an ImportError is raised if the one needed is
  not installed.

  Returns: the result of crackle.aload for .ckl, the array as read for
  .nrrd, and a Fortran ordered numpy array for every other format.

  Raises: ValueError if the extension is not supported.
  """
  ext = normalize_file_ext(filename)

  if ext == ".ckl":
    import crackle
    return crackle.aload(filename)
  elif ext == ".npy":
    if filename.endswith(".gz"):
      with gzip.GzipFile(filename, "rb") as f:
        image = np.load(f)
    else:
      image = np.load(filename)
  elif ext == ".nrrd":
    import nrrd
    image, header = nrrd.read(filename)
    # A 3D array whose first axis has length 3 gets a trailing axis added
    # and its first axis moved to the end.
    if image.shape[0] == 3 and image.ndim == 3:
      image = image[...,np.newaxis]
      image = np.transpose(image, axes=[1,2,3,0])
    return image
  elif ext == ".nii":
    import nibabel as nib
    image = nib.load(filename)
    image = np.array(image.dataobj)
  elif ext in (".tif", ".tiff"):
    import tifffile
    # Read from `filename`; the previous `srcpath` was never defined here.
    image = tifffile.imread(filename)
  else:
    raise ValueError("Data type not supported: " + ext)

  return np.asfortranarray(image)


================================================
FILE: manual_testing/manual_test.py
================================================
import kimimaro
import numpy as np

from PIL import Image 

# Load the test image as a numpy array and print it for manual inspection.
# PIL images have no `asarray` method; the conversion goes through numpy.
img = np.asarray(Image.open('./crossstreet.png'))
print(img)

================================================
FILE: manylinux2010.Dockerfile
================================================
# Builds, tests, and packages manylinux2010 wheels for CPython 3.6, 3.7 and 3.8.
FROM quay.io/pypa/manylinux2010_x86_64
# LABEL replaces the deprecated MAINTAINER instruction.
LABEL maintainer="William Silversmith"

ADD . /kimimaro

WORKDIR "/kimimaro"

# key=value is the supported ENV syntax; the space separated form is legacy.
ENV CC="gcc"
ENV CXX="g++"

# Clear any build artifacts copied in from the host checkout.
RUN rm -rf *.so build __pycache__ dist

# For each interpreter: install dependencies, build in place, run the tests.
RUN /opt/python/cp36-cp36m/bin/pip3.6 install pip --upgrade
RUN /opt/python/cp36-cp36m/bin/pip3.6 install numpy
RUN /opt/python/cp36-cp36m/bin/pip3.6 install -r requirements.txt
RUN /opt/python/cp36-cp36m/bin/python3.6 setup.py develop
RUN /opt/python/cp36-cp36m/bin/python3.6 -m pytest -v -x automated_test.py

RUN /opt/python/cp37-cp37m/bin/pip3.7 install pip --upgrade
RUN /opt/python/cp37-cp37m/bin/pip3.7 install numpy
RUN /opt/python/cp37-cp37m/bin/pip3.7 install -r requirements.txt
RUN /opt/python/cp37-cp37m/bin/python3.7 setup.py develop
RUN /opt/python/cp37-cp37m/bin/python3.7 -m pytest -v -x automated_test.py

RUN /opt/python/cp38-cp38/bin/pip3.8 install pip --upgrade
RUN /opt/python/cp38-cp38/bin/pip3.8 install numpy
RUN /opt/python/cp38-cp38/bin/pip3.8 install -r requirements.txt
RUN /opt/python/cp38-cp38/bin/python3.8 setup.py develop
RUN /opt/python/cp38-cp38/bin/python3.8 -m pytest -v -x automated_test.py

# Build one wheel per interpreter into dist/.
RUN /opt/python/cp36-cp36m/bin/python3.6 setup.py bdist_wheel
RUN /opt/python/cp37-cp37m/bin/python3.7 setup.py bdist_wheel
RUN /opt/python/cp38-cp38/bin/python3.8 setup.py bdist_wheel

# Run auditwheel repair on every built wheel for the manylinux2010 platform.
RUN for whl in `ls dist/*.whl`; do auditwheel repair $whl --plat manylinux2010_x86_64; done

================================================
FILE: manylinux2014.Dockerfile
================================================
# Builds, tests, and packages manylinux2014 wheels for CPython 3.6 through 3.9.
FROM quay.io/pypa/manylinux2014_x86_64
# LABEL replaces the deprecated MAINTAINER instruction.
LABEL maintainer="William Silversmith"

ADD . /kimimaro

WORKDIR "/kimimaro"

# key=value is the supported ENV syntax; the space separated form is legacy.
ENV CC="gcc"
ENV CXX="g++"

# Clear any build artifacts copied in from the host checkout.
RUN rm -rf *.so build __pycache__ dist

# For each interpreter: install dependencies, build in place, run the tests.
RUN /opt/python/cp36-cp36m/bin/pip3.6 install pip --upgrade
RUN /opt/python/cp36-cp36m/bin/pip3.6 install numpy
RUN /opt/python/cp36-cp36m/bin/pip3.6 install -r requirements.txt
RUN /opt/python/cp36-cp36m/bin/python3.6 setup.py develop
RUN /opt/python/cp36-cp36m/bin/python3.6 -m pytest -v -x automated_test.py

RUN /opt/python/cp37-cp37m/bin/pip3.7 install pip --upgrade
RUN /opt/python/cp37-cp37m/bin/pip3.7 install numpy
RUN /opt/python/cp37-cp37m/bin/pip3.7 install -r requirements.txt
RUN /opt/python/cp37-cp37m/bin/python3.7 setup.py develop
RUN /opt/python/cp37-cp37m/bin/python3.7 -m pytest -v -x automated_test.py

RUN /opt/python/cp38-cp38/bin/pip3.8 install pip --upgrade
RUN /opt/python/cp38-cp38/bin/pip3.8 install numpy
RUN /opt/python/cp38-cp38/bin/pip3.8 install -r requirements.txt
RUN /opt/python/cp38-cp38/bin/python3.8 setup.py develop
RUN /opt/python/cp38-cp38/bin/python3.8 -m pytest -v -x automated_test.py

RUN /opt/python/cp39-cp39/bin/pip3.9 install pip --upgrade
RUN /opt/python/cp39-cp39/bin/pip3.9 install numpy
RUN /opt/python/cp39-cp39/bin/pip3.9 install -r requirements.txt
RUN /opt/python/cp39-cp39/bin/python3.9 setup.py develop
RUN /opt/python/cp39-cp39/bin/python3.9 -m pytest -v -x automated_test.py

# Build one wheel per interpreter into dist/. Each wheel is built once;
# the cp36-cp38 builds were previously run a second time to no effect.
RUN /opt/python/cp36-cp36m/bin/python3.6 setup.py bdist_wheel
RUN /opt/python/cp37-cp37m/bin/python3.7 setup.py bdist_wheel
RUN /opt/python/cp38-cp38/bin/python3.8 setup.py bdist_wheel
RUN /opt/python/cp39-cp39/bin/python3.9 setup.py bdist_wheel

# Run auditwheel repair on every built wheel for the manylinux2014 platform.
RUN for whl in `ls dist/*.whl`; do auditwheel repair $whl --plat manylinux2014_x86_64; done


================================================
FILE: pyproject.toml
================================================
[build-system]
requires = [
    "setuptools>=61.0.0",
    "wheel",
    "cython",
    "numpy>=1.16.1"
]
build-backend = "setuptools.build_meta"

[project]
name = "kimimaro"
version = "5.8.1"
authors = [
    {name = "William Silversmith", email = "ws9@princeton.edu"},
    {name = "Alex Bae"},
    {name = "Forrest Collman"},
    {name = "Peter Li"},
    {name = "Nina Shamsi"}
]
description = "Skeletonize densely labeled image volumes."
readme = "README.md"
requires-python = ">=3.9.0,<4.0.0"
license = "GPL-3.0-or-later"
keywords = [
    "volumetric-data",
    "numpy",
    "teasar",
    "skeletonization",
    "centerline",
    "medial-axis-transform",
    "centerline-extraction",
    "computer-vision-algorithms",
    "connectomics",
    "image-processing",
    "biomedical-image-processing",
    "voxel"
]
classifiers = [
    "Intended Audience :: Developers",
    "Development Status :: 5 - Production/Stable",
    "Programming Language :: Python",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
    "Topic :: Scientific/Engineering",
    "Intended Audience :: Science/Research",
    "Operating System :: POSIX",
    "Operating System :: MacOS",
    "Operating System :: Microsoft :: Windows :: Windows 10"
]

# Packages installed alongside kimimaro.
# NOTE(review): "pytest" is listed here as a runtime dependency, while
# requirements-dev.txt also lists it as a development requirement; confirm
# whether it is needed at runtime.
# NOTE(review): the minimum versions of "edt" and "xs3d" here are lower than
# the ones in requirements.txt (edt>=3.0.0, xs3d>=1.11.0); confirm which
# floors are intended.
dependencies = [
    "click",
    "connected-components-3d>=3.16.0",
    "dijkstra3d>=1.15.0",
    "fill-voids>=2.0.0",
    "edt>=2.1.0",
    "fastremap>=1.10.2",
    "networkx",
    "numpy>=1.16.1",
    "osteoid",
    "pathos",
    "pytest",
    "scipy>=1.1.0",
    "tqdm",
    "xs3d>=1.2.0,<2"
]

# Optional extras, installed with e.g. `pip install "kimimaro[view]"`.
[project.optional-dependencies]
accel = [
    "pykdtree",
]
view = [
    "microviewer",
    "crackle-codec",
    "vtk",
]
tif = [
    "tifffile",
]
nii = [
    "nibabel",
]
nrrd = [
    "pynrrd",
]
all_formats = [
    "tifffile",
    "nibabel",
    "pynrrd",
]
# Union of every extra above ("crackle-codec" was previously missing even
# though the "view" extra includes it).
all = [
    "tifffile",
    "nibabel",
    "pynrrd",
    "microviewer",
    "crackle-codec",
    "vtk",
    "pykdtree",
]

[project.urls]
Homepage = "https://github.com/seung-lab/kimimaro/"

[project.scripts]
kimimaro = "kimimaro_cli:main"

[tool.setuptools]
packages = ["kimimaro", "kimimaro_cli"]
include-package-data = true

[tool.setuptools.package-dir]
kimimaro = "kimimaro"
kimimaro_cli = "kimimaro_cli"


================================================
FILE: requirements-dev.txt
================================================
pytest
crackle-codec

================================================
FILE: requirements.txt
================================================
click
connected-components-3d>=3.16.0
crackle-codec>=0.33.0
dijkstra3d>=1.15.0
fill-voids>=2.0.0
edt>=3.0.0
fastremap>=1.10.2
microviewer
networkx
numpy>=1.16.1
osteoid
pathos
posix_ipc
psutil
scipy>=1.1.0
tqdm
xs3d>=1.11.0,<2


================================================
FILE: setup.py
================================================
#!/usr/bin/env python
import os
import setuptools
import sys

class NumpyImport:
  """
  Stand-in for numpy's include directory.

  numpy is only imported when the object is converted to a string or
  to a filesystem path, not when the object is created.
  """
  def __fspath__(self):
    from numpy import get_include

    return get_include()

  # str() and repr() give the same path as os.fspath().
  __repr__ = __fspath__

# Compiler flags for the C++ extension: C++17 with optimizations enabled,
# spelled for the MSVC command line on Windows and GCC/Clang style elsewhere.
if sys.platform == 'win32':
  extra_compile_args = [ '/std:c++17', '/O2' ]
else:
  extra_compile_args = [ '-std=c++17', '-O3' ]

# macOS additionally selects libc++ and a minimum deployment target.
if sys.platform == 'darwin':
  extra_compile_args.extend([ '-stdlib=libc++', '-mmacosx-version-min=10.9' ])

# Only the compiled extension is declared here; no other arguments are
# passed to setup().
setuptools.setup(
    ext_modules=[
      # Cython source built as a C++ extension named kimimaro.skeletontricks.
      setuptools.Extension(
        'kimimaro.skeletontricks',
        sources=[ './ext/skeletontricks/skeletontricks.pyx' ],
        language='c++',
        # str() triggers the numpy import and yields numpy's include directory.
        include_dirs=[ str(NumpyImport()) ],
        extra_compile_args=extra_compile_args,
      ),
    ],
)

================================================
FILE: tox.ini
================================================
[tox]
# Matches pyproject.toml: requires-python is >=3.9 and the classifiers
# list Python 3.9 through 3.13.
envlist = py39,py310,py311,py312,py313

[testenv]
# These environments only run on macOS.
platform = darwin
deps =
	setuptools
	wheel
	cython
	-rrequirements.txt
	oldest-supported-numpy

# Build the extension in place, then produce a wheel.
commands =
	python setup.py develop
	python setup.py bdist_wheel