[
  {
    "path": ".dockerignore",
    "content": "build\n*.egg-info\nbenchmarks\n__pycache__\nmanual_testing\n.eggs\n.git\n.tox\n.pytest_cache"
  },
  {
    "path": ".github/workflows/build_wheel.yml",
    "content": "name: Build Wheels\n\non:\n  workflow_dispatch:\n  push:\n    tags:\n      - '*'\nenv:\n  CIBW_SKIP: pp* *-musllinux* cp36* cp37* cp38*\n\njobs:\n  build_wheels:\n    name: Build wheels on ${{matrix.arch}} for ${{ matrix.os }}\n    runs-on: ${{ matrix.os }}\n    strategy:\n      matrix:\n        os: [ubuntu-latest, windows-latest, macos-latest]\n        arch: [auto]\n        include:\n          - os: ubuntu-latest\n            arch: aarch64\n\n    steps:\n      - uses: actions/checkout@v4\n\n      - name: Set up QEMU\n        if:  ${{ matrix.arch == 'aarch64' }}\n        uses: docker/setup-qemu-action@v1\n\n      - name: Build wheels\n        uses: pypa/cibuildwheel@v3.2.0\n        # to supply options, put them in 'env', like:\n        env:\n          CIBW_ARCHS_LINUX: ${{matrix.arch}}\n          CIBW_BEFORE_BUILD: pip install numpy setuptools wheel cython \n          CIBW_ARCHS_MACOS: \"x86_64 arm64\"\n\n      - name: Upload built wheels\n        uses: actions/upload-artifact@v4\n        with:\n          name: built-wheels-${{ matrix.os }}-${{ matrix.arch }}\n          path: ./wheelhouse/*.whl\n          if-no-files-found: warn"
  },
  {
    "path": ".github/workflows/test.yml",
    "content": "# This workflow installs Python dependencies, builds and installs the wheel, and runs the test suite against a variety of Python versions\n# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions\n\nname: Test Suite\n\non:\n  push:\n    branches: [ master ]\n  pull_request:\n    branches: [ master ]\n\njobs:\n  build:\n\n    runs-on: ubuntu-latest\n    strategy:\n      matrix:\n        python-version: [\"3.9\", \"3.10\", \"3.11\", \"3.12\", \"3.13\"]\n\n    steps:\n    - uses: actions/checkout@v2\n    - name: Set up Python ${{ matrix.python-version }}\n      uses: actions/setup-python@v2\n      with:\n        python-version: ${{ matrix.python-version }}\n    - name: Install dependencies\n      run: |\n        python -m pip install --upgrade pip\n        pip install cython numpy setuptools wheel build\n        pip install -r requirements.txt -r requirements-dev.txt \n        python -m build --wheel\n        pip install dist/*.whl\n    - name: Test with pytest\n      run: |\n        python setup.py develop\n        python -m pytest -v -x automated_test.py\n"
  },
  {
    "path": ".gitignore",
    "content": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\n*.egg-info/\n.installed.cfg\n*.egg\nMANIFEST\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*.cover\n.hypothesis/\n.pytest_cache/\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\nlocal_settings.py\ndb.sqlite3\n\n# Flask stuff:\ninstance/\n.webassets-cache\n\n# Scrapy stuff:\n.scrapy\n\n# Sphinx documentation\ndocs/_build/\n\n# PyBuilder\ntarget/\n\n# Jupyter Notebook\n.ipynb_checkpoints\n\n# pyenv\n.python-version\n\n# celery beat schedule file\ncelerybeat-schedule\n\n# SageMath parsed files\n*.sage.py\n\n# Environments\n.env\n.venv\nenv/\nvenv/\nENV/\nenv.bak/\nvenv.bak/\n\n# Spyder project settings\n.spyderproject\n.spyproject\n\n# Rope project settings\n.ropeproject\n\n# mkdocs documentation\n/site\n\n# mypy\n.mypy_cache/\n\ntest.py\n.DS_Store\n\n# IntelliJ\n.idea/\n\next/skeletontricks/skeletontricks.cpp\n"
  },
  {
    "path": "AUTHORS",
    "content": "Jingpeng Wu <jingpeng.wu@gmail.com>\nWilliam Silversmith <william.silversmith@gmail.com>\n"
  },
  {
    "path": "CITATION.cff",
    "content": "cff-version: 1.1.0\nmessage: \"If you use this software, please cite it as below.\"\nauthors:\n- family-names: \"Silversmith\"\n  given-names: \"William\"\n  orcid: \"https://orcid.org/0000-0002-5485-5341\"\n- family-names: \"Bae\"\n  given-names: \"J. Alexander\"\n  orcid: \"https://orcid.org/0000-0002-4681-6342\"\n- family-names: \"Li\"\n  given-names: \"Peter H.\"\n  orcid: \"https://orcid.org/0000-0001-6193-4454\"\n- family-names: \"Wilson\"\n  given-names: \"A.M.\"\n  orcid: \"https://orcid.org/0000-0002-3822-5200\"\ntitle: \"Kimimaro: Skeletonize densely labeled 3D image segmentations\"\nversion: 3.0.0\ndate-released: 2021-09-29\ndoi: 10.5281/zenodo.5539913 \n"
  },
  {
    "path": "ChangeLog",
    "content": "CHANGES\n=======\n\n2.0.2\n-----\n\n* test: faster execution for cube and solid color tests\n* fix(trace): skip adding DAF if max is 0\n* test: check extremely sparse images (one or two voxels with no dust threshold)\n* chore: drop py35 testing add .dockerignore\n\n2.0.1\n-----\n\n* fix(windows): use np.uintp before casting to size\\_t\n* fix: appveyor needs numpy installed first\n* chore: new build system for binary distribution\n\n2.0.0\n-----\n\n* fix(intake): solid color blocks were causing errors (#56)\n* perf: faster somas (#55)\n* fix: python3.8 compiles cpp code (#52)\n* chore: update travis to use python 3.7 and 3.8\n* add python3.8 test\n\n1.6.0\n-----\n\n* feat: avocado protection (🥑) (#43)\n* chore: update ChangeLog\n\n1.5.0\n-----\n\n* chore: add skeleton for manual testing\n* feat: add fill\\_holes argument (#50)\n\n1.4.2\n-----\n\n* chore: loosen networkx requirement (#49)\n* Update README.md\n* docs: update memory usage diagram for version 1.4.0\n\n1.4.1\n-----\n\n* perf: switch source and target for dijkstra\n\n1.3.3\n-----\n\n* refactor: make type of 0L clear to std::max on Windows\n* Revert \"fix: don't assume vertices are uint32\"\n* fix: don't assume vertices are uint32\n* chore: update ChangeLog\n\n1.3.2\n-----\n\n* fix: several additional algorithms required 64-bit addressable changes\n\n1.3.1\n-----\n\n* chore: bump dijkstra requirement\n* fix: 64-bit addressable \\_roll\\_invalidation\\_cube (#42)\n* docs: shout out to fill\\_voids\n* fix: remove unnecessary PIL import\n\n1.3.0\n-----\n\n* docs: describe max\\_paths in the function docstring\n* fix: soma center was being overriden by fix\\_borders\n* perf: only recompute EDT for soma if some voxels were filled\n* perf: use bidirectional dijkstra on somata (increases peak memory usage)\n\n1.2.1\n-----\n\n* docs: remove non-ascii character from README.md\n* docs: link back to papers using Kimimaro\n\n1.2.0\n-----\n\n* docs: show how to use synapses\\_to\\_targets\n* feat: 
facility for converting synapse centroids into targets (#37)\n* refactor+perf: use new fill-voids package\n\n1.1.0\n-----\n\n* perf: implemented flood fill based binary\\_fill\\_holes (#38)\n\n1.0.4\n-----\n\n* perf: increase postprocess speed (#35)\n* perf: more judicious use of consolidate in postprocess\n\n1.0.3\n-----\n\n* docs: update ChangeLog\n* fix: preserve skeleton id during postprocessing\n\n1.0.2\n-----\n\n* fix: allow multiple invocations of a pathos process pool\n* perf: skip processing if dust\\_threshold larger than image\n\n1.0.1\n-----\n\n* fix: accept any root converable to a tuple\n* fix: progress bars were disrupted in parallel feature\n* docs: upload changelog\n\n1.0.0\n-----\n\n* feat: specify extra\\_targets\\_before and after (#33)\n* docs: fix spelling & grammar\n\n0.7.0\n-----\n\n* docs: add parallel\\_chunk\\_size to README\n* perf+feat: Reduce Parallel Task Starvation + Better Parallel Progress Bar (#32)\n* docs: add example of join\\_close\\_components\n\n0.6.0\n-----\n\n* feat: adds join\\_close\\_components to postprocess (#27)\n* docs: link to tutorial wiki articles\n* docs: add advice on tweaking parameters\n\n0.5.4\n-----\n\n* fix: sometimes get\\_mapping doesn't get everything\n\n0.5.3\n-----\n\n* fix: object\\_ids were being masked instead of mask\\_excepted\n* docs: show performance chart for v0.5.2\n\n0.5.2\n-----\n\n* perf: improve performance of find\\_objects 7x\n\n0.5.1\n-----\n\n* perf: ~20x faster unique(label, return\\_counts=True) (#26)\n* docs: changelog update + small formatting adjustment to example\n\n0.5.0\n-----\n\n* docs: example of how to use postprocess\n* feat: import out-of-core postprocessing logic from Igneous\n* docs: add object\\_ids to example\n* perf: improve speed of skeletontricks.get\\_mapping\n* fix: accept binary images of type bool\n* perf: take advantage of faster segid finding if dust\\_threshold == 0\n* fix: compilation warning for \\_roll\\_invalidation\\_cube\n* test: add some manual 
visualization tests\n* chore: update ChangeLog\n\n0.4.2\n-----\n\n* release: 0.4.2\n* chore: tell PyPI we're using markdown\n* fix: ensure we pick max dbf close to centroid of detected somata\n* chore: update ChangeLog\n* docs: various corrections to the README\n\n0.4.1\n-----\n\n* fix: add defense against setting the dust threshold lower than 1\n* chore: formatting around all\\_labels\n* test: x and y joinability\n* test: show that two 1px overlapping volumes join properly\n* Update README.md\n* feat: accept N-dimensional arrays with trivial dimensions above 3\n* docs: add Google TEASAR run to boslster case for popularity\n* fix: prevent duplicate border targets\n* feat: parallel edt implementation\n* fix: add support for anisotropy to distance calculations\n* test: add distortion to border test\n* wip: propogate anisotropy to fix\\_borders calls\n* fix: cuboid soma processing\n* fix: bump edt to 1.2.4 to correct part of large anisotropy issue\n* perf: faster masking operations with newer fastremap\n* docs: encouraged the use of parallel processing in README.md\n* chore: add GPLv3 classifer to setup.cfg\n* chore: add ChangeLog\n\n0.4.0\n-----\n\n* feat: parallel implementation (Cursed Seal Mode) (#10)\n\n0.3.1\n-----\n\n* fix: INTEGER type did not include all integers\n\n0.3.0\n-----\n\n* docs: updated credits with fix\\_borders\n* feat: add fix\\_borders parameter & max\\_paths parameter (#9)\n* test+fix: remove \"cd python\"\n* docs: add Travis CI badge\n* chore: add Travis CI\n* test: add basic test for skeletonizing diagonal of square and cube\n* perf: improve memory consumption of object masking\n* perf: introduce in\\_place flag to make it safe to modify input data\n* perf: use fastremap's new in\\_place flag for lower memory and perf\n* docs: updated credits\n\n0.2.2\n-----\n\n* fix: accept C order arrays (#7)\n* docs: reduce redundancy in example vs performance\n* docs: add benchmark description\n* docs: added benchmark photo\n* docs: add link to citation 
4\n* docs: use citations 3 and 4\n* docs: described \"roll invalidation cube\"\n* docs: described algorithm in steps\n* docs: describing the algorithm\n\n0.2.1\n-----\n\n* fix: black volumes should return dict not None\n\n0.2.0\n-----\n\n* docs: add PyPI badge\n* feat: fix branching (#1)\n* docs: adding sections to README\n\n0.1.0\n-----\n\n* chore: clean up dockerfile and metadata\n* docs: draft discussion of motivation and usage\n* feat: export DimensionError exception (so it can be caught)\n* refactor: remove path\\_downsample from trace function\n* docs: described parameters of skeletonize function\n* chore: files required for building distributions\n* wip: importing skeletonization procedure\n* Initial commit\n"
  },
  {
    "path": "LICENSE",
    "content": "                    GNU GENERAL PUBLIC LICENSE\n                       Version 3, 29 June 2007\n\n Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>\n Everyone is permitted to copy and distribute verbatim copies\n of this license document, but changing it is not allowed.\n\n                            Preamble\n\n  The GNU General Public License is a free, copyleft license for\nsoftware and other kinds of works.\n\n  The licenses for most software and other practical works are designed\nto take away your freedom to share and change the works.  By contrast,\nthe GNU General Public License is intended to guarantee your freedom to\nshare and change all versions of a program--to make sure it remains free\nsoftware for all its users.  We, the Free Software Foundation, use the\nGNU General Public License for most of our software; it applies also to\nany other work released this way by its authors.  You can apply it to\nyour programs, too.\n\n  When we speak of free software, we are referring to freedom, not\nprice.  Our General Public Licenses are designed to make sure that you\nhave the freedom to distribute copies of free software (and charge for\nthem if you wish), that you receive source code or can get it if you\nwant it, that you can change the software or use pieces of it in new\nfree programs, and that you know you can do these things.\n\n  To protect your rights, we need to prevent others from denying you\nthese rights or asking you to surrender the rights.  Therefore, you have\ncertain responsibilities if you distribute copies of the software, or if\nyou modify it: responsibilities to respect the freedom of others.\n\n  For example, if you distribute copies of such a program, whether\ngratis or for a fee, you must pass on to the recipients the same\nfreedoms that you received.  You must make sure that they, too, receive\nor can get the source code.  
And you must show them these terms so they\nknow their rights.\n\n  Developers that use the GNU GPL protect your rights with two steps:\n(1) assert copyright on the software, and (2) offer you this License\ngiving you legal permission to copy, distribute and/or modify it.\n\n  For the developers' and authors' protection, the GPL clearly explains\nthat there is no warranty for this free software.  For both users' and\nauthors' sake, the GPL requires that modified versions be marked as\nchanged, so that their problems will not be attributed erroneously to\nauthors of previous versions.\n\n  Some devices are designed to deny users access to install or run\nmodified versions of the software inside them, although the manufacturer\ncan do so.  This is fundamentally incompatible with the aim of\nprotecting users' freedom to change the software.  The systematic\npattern of such abuse occurs in the area of products for individuals to\nuse, which is precisely where it is most unacceptable.  Therefore, we\nhave designed this version of the GPL to prohibit the practice for those\nproducts.  If such problems arise substantially in other domains, we\nstand ready to extend this provision to those domains in future versions\nof the GPL, as needed to protect the freedom of users.\n\n  Finally, every program is threatened constantly by software patents.\nStates should not allow patents to restrict development and use of\nsoftware on general-purpose computers, but in those that do, we wish to\navoid the special danger that patents applied to a free program could\nmake it effectively proprietary.  To prevent this, the GPL assures that\npatents cannot be used to render the program non-free.\n\n  The precise terms and conditions for copying, distribution and\nmodification follow.\n\n                       TERMS AND CONDITIONS\n\n  0. 
Definitions.\n\n  \"This License\" refers to version 3 of the GNU General Public License.\n\n  \"Copyright\" also means copyright-like laws that apply to other kinds of\nworks, such as semiconductor masks.\n\n  \"The Program\" refers to any copyrightable work licensed under this\nLicense.  Each licensee is addressed as \"you\".  \"Licensees\" and\n\"recipients\" may be individuals or organizations.\n\n  To \"modify\" a work means to copy from or adapt all or part of the work\nin a fashion requiring copyright permission, other than the making of an\nexact copy.  The resulting work is called a \"modified version\" of the\nearlier work or a work \"based on\" the earlier work.\n\n  A \"covered work\" means either the unmodified Program or a work based\non the Program.\n\n  To \"propagate\" a work means to do anything with it that, without\npermission, would make you directly or secondarily liable for\ninfringement under applicable copyright law, except executing it on a\ncomputer or modifying a private copy.  Propagation includes copying,\ndistribution (with or without modification), making available to the\npublic, and in some countries other activities as well.\n\n  To \"convey\" a work means any kind of propagation that enables other\nparties to make or receive copies.  Mere interaction with a user through\na computer network, with no transfer of a copy, is not conveying.\n\n  An interactive user interface displays \"Appropriate Legal Notices\"\nto the extent that it includes a convenient and prominently visible\nfeature that (1) displays an appropriate copyright notice, and (2)\ntells the user that there is no warranty for the work (except to the\nextent that warranties are provided), that licensees may convey the\nwork under this License, and how to view a copy of this License.  If\nthe interface presents a list of user commands or options, such as a\nmenu, a prominent item in the list meets this criterion.\n\n  1. 
Source Code.\n\n  The \"source code\" for a work means the preferred form of the work\nfor making modifications to it.  \"Object code\" means any non-source\nform of a work.\n\n  A \"Standard Interface\" means an interface that either is an official\nstandard defined by a recognized standards body, or, in the case of\ninterfaces specified for a particular programming language, one that\nis widely used among developers working in that language.\n\n  The \"System Libraries\" of an executable work include anything, other\nthan the work as a whole, that (a) is included in the normal form of\npackaging a Major Component, but which is not part of that Major\nComponent, and (b) serves only to enable use of the work with that\nMajor Component, or to implement a Standard Interface for which an\nimplementation is available to the public in source code form.  A\n\"Major Component\", in this context, means a major essential component\n(kernel, window system, and so on) of the specific operating system\n(if any) on which the executable work runs, or a compiler used to\nproduce the work, or an object code interpreter used to run it.\n\n  The \"Corresponding Source\" for a work in object code form means all\nthe source code needed to generate, install, and (for an executable\nwork) run the object code and to modify the work, including scripts to\ncontrol those activities.  However, it does not include the work's\nSystem Libraries, or general-purpose tools or generally available free\nprograms which are used unmodified in performing those activities but\nwhich are not part of the work.  
For example, Corresponding Source\nincludes interface definition files associated with source files for\nthe work, and the source code for shared libraries and dynamically\nlinked subprograms that the work is specifically designed to require,\nsuch as by intimate data communication or control flow between those\nsubprograms and other parts of the work.\n\n  The Corresponding Source need not include anything that users\ncan regenerate automatically from other parts of the Corresponding\nSource.\n\n  The Corresponding Source for a work in source code form is that\nsame work.\n\n  2. Basic Permissions.\n\n  All rights granted under this License are granted for the term of\ncopyright on the Program, and are irrevocable provided the stated\nconditions are met.  This License explicitly affirms your unlimited\npermission to run the unmodified Program.  The output from running a\ncovered work is covered by this License only if the output, given its\ncontent, constitutes a covered work.  This License acknowledges your\nrights of fair use or other equivalent, as provided by copyright law.\n\n  You may make, run and propagate covered works that you do not\nconvey, without conditions so long as your license otherwise remains\nin force.  You may convey covered works to others for the sole purpose\nof having them make modifications exclusively for you, or provide you\nwith facilities for running those works, provided that you comply with\nthe terms of this License in conveying all material for which you do\nnot control copyright.  Those thus making or running the covered works\nfor you must do so exclusively on your behalf, under your direction\nand control, on terms that prohibit them from making any copies of\nyour copyrighted material outside their relationship with you.\n\n  Conveying under any other circumstances is permitted solely under\nthe conditions stated below.  Sublicensing is not allowed; section 10\nmakes it unnecessary.\n\n  3. 
Protecting Users' Legal Rights From Anti-Circumvention Law.\n\n  No covered work shall be deemed part of an effective technological\nmeasure under any applicable law fulfilling obligations under article\n11 of the WIPO copyright treaty adopted on 20 December 1996, or\nsimilar laws prohibiting or restricting circumvention of such\nmeasures.\n\n  When you convey a covered work, you waive any legal power to forbid\ncircumvention of technological measures to the extent such circumvention\nis effected by exercising rights under this License with respect to\nthe covered work, and you disclaim any intention to limit operation or\nmodification of the work as a means of enforcing, against the work's\nusers, your or third parties' legal rights to forbid circumvention of\ntechnological measures.\n\n  4. Conveying Verbatim Copies.\n\n  You may convey verbatim copies of the Program's source code as you\nreceive it, in any medium, provided that you conspicuously and\nappropriately publish on each copy an appropriate copyright notice;\nkeep intact all notices stating that this License and any\nnon-permissive terms added in accord with section 7 apply to the code;\nkeep intact all notices of the absence of any warranty; and give all\nrecipients a copy of this License along with the Program.\n\n  You may charge any price or no price for each copy that you convey,\nand you may offer support or warranty protection for a fee.\n\n  5. Conveying Modified Source Versions.\n\n  You may convey a work based on the Program, or the modifications to\nproduce it from the Program, in the form of source code under the\nterms of section 4, provided that you also meet all of these conditions:\n\n    a) The work must carry prominent notices stating that you modified\n    it, and giving a relevant date.\n\n    b) The work must carry prominent notices stating that it is\n    released under this License and any conditions added under section\n    7.  
This requirement modifies the requirement in section 4 to\n    \"keep intact all notices\".\n\n    c) You must license the entire work, as a whole, under this\n    License to anyone who comes into possession of a copy.  This\n    License will therefore apply, along with any applicable section 7\n    additional terms, to the whole of the work, and all its parts,\n    regardless of how they are packaged.  This License gives no\n    permission to license the work in any other way, but it does not\n    invalidate such permission if you have separately received it.\n\n    d) If the work has interactive user interfaces, each must display\n    Appropriate Legal Notices; however, if the Program has interactive\n    interfaces that do not display Appropriate Legal Notices, your\n    work need not make them do so.\n\n  A compilation of a covered work with other separate and independent\nworks, which are not by their nature extensions of the covered work,\nand which are not combined with it such as to form a larger program,\nin or on a volume of a storage or distribution medium, is called an\n\"aggregate\" if the compilation and its resulting copyright are not\nused to limit the access or legal rights of the compilation's users\nbeyond what the individual works permit.  Inclusion of a covered work\nin an aggregate does not cause this License to apply to the other\nparts of the aggregate.\n\n  6. 
Conveying Non-Source Forms.\n\n  You may convey a covered work in object code form under the terms\nof sections 4 and 5, provided that you also convey the\nmachine-readable Corresponding Source under the terms of this License,\nin one of these ways:\n\n    a) Convey the object code in, or embodied in, a physical product\n    (including a physical distribution medium), accompanied by the\n    Corresponding Source fixed on a durable physical medium\n    customarily used for software interchange.\n\n    b) Convey the object code in, or embodied in, a physical product\n    (including a physical distribution medium), accompanied by a\n    written offer, valid for at least three years and valid for as\n    long as you offer spare parts or customer support for that product\n    model, to give anyone who possesses the object code either (1) a\n    copy of the Corresponding Source for all the software in the\n    product that is covered by this License, on a durable physical\n    medium customarily used for software interchange, for a price no\n    more than your reasonable cost of physically performing this\n    conveying of source, or (2) access to copy the\n    Corresponding Source from a network server at no charge.\n\n    c) Convey individual copies of the object code with a copy of the\n    written offer to provide the Corresponding Source.  This\n    alternative is allowed only occasionally and noncommercially, and\n    only if you received the object code with such an offer, in accord\n    with subsection 6b.\n\n    d) Convey the object code by offering access from a designated\n    place (gratis or for a charge), and offer equivalent access to the\n    Corresponding Source in the same way through the same place at no\n    further charge.  You need not require recipients to copy the\n    Corresponding Source along with the object code.  
If the place to\n    copy the object code is a network server, the Corresponding Source\n    may be on a different server (operated by you or a third party)\n    that supports equivalent copying facilities, provided you maintain\n    clear directions next to the object code saying where to find the\n    Corresponding Source.  Regardless of what server hosts the\n    Corresponding Source, you remain obligated to ensure that it is\n    available for as long as needed to satisfy these requirements.\n\n    e) Convey the object code using peer-to-peer transmission, provided\n    you inform other peers where the object code and Corresponding\n    Source of the work are being offered to the general public at no\n    charge under subsection 6d.\n\n  A separable portion of the object code, whose source code is excluded\nfrom the Corresponding Source as a System Library, need not be\nincluded in conveying the object code work.\n\n  A \"User Product\" is either (1) a \"consumer product\", which means any\ntangible personal property which is normally used for personal, family,\nor household purposes, or (2) anything designed or sold for incorporation\ninto a dwelling.  In determining whether a product is a consumer product,\ndoubtful cases shall be resolved in favor of coverage.  For a particular\nproduct received by a particular user, \"normally used\" refers to a\ntypical or common use of that class of product, regardless of the status\nof the particular user or of the way in which the particular user\nactually uses, or expects or is expected to use, the product.  
A product\nis a consumer product regardless of whether the product has substantial\ncommercial, industrial or non-consumer uses, unless such uses represent\nthe only significant mode of use of the product.\n\n  \"Installation Information\" for a User Product means any methods,\nprocedures, authorization keys, or other information required to install\nand execute modified versions of a covered work in that User Product from\na modified version of its Corresponding Source.  The information must\nsuffice to ensure that the continued functioning of the modified object\ncode is in no case prevented or interfered with solely because\nmodification has been made.\n\n  If you convey an object code work under this section in, or with, or\nspecifically for use in, a User Product, and the conveying occurs as\npart of a transaction in which the right of possession and use of the\nUser Product is transferred to the recipient in perpetuity or for a\nfixed term (regardless of how the transaction is characterized), the\nCorresponding Source conveyed under this section must be accompanied\nby the Installation Information.  But this requirement does not apply\nif neither you nor any third party retains the ability to install\nmodified object code on the User Product (for example, the work has\nbeen installed in ROM).\n\n  The requirement to provide Installation Information does not include a\nrequirement to continue to provide support service, warranty, or updates\nfor a work that has been modified or installed by the recipient, or for\nthe User Product in which it has been modified or installed.  
Access to a\nnetwork may be denied when the modification itself materially and\nadversely affects the operation of the network or violates the rules and\nprotocols for communication across the network.\n\n  Corresponding Source conveyed, and Installation Information provided,\nin accord with this section must be in a format that is publicly\ndocumented (and with an implementation available to the public in\nsource code form), and must require no special password or key for\nunpacking, reading or copying.\n\n  7. Additional Terms.\n\n  \"Additional permissions\" are terms that supplement the terms of this\nLicense by making exceptions from one or more of its conditions.\nAdditional permissions that are applicable to the entire Program shall\nbe treated as though they were included in this License, to the extent\nthat they are valid under applicable law.  If additional permissions\napply only to part of the Program, that part may be used separately\nunder those permissions, but the entire Program remains governed by\nthis License without regard to the additional permissions.\n\n  When you convey a copy of a covered work, you may at your option\nremove any additional permissions from that copy, or from any part of\nit.  (Additional permissions may be written to require their own\nremoval in certain cases when you modify the work.)  
You may place\nadditional permissions on material, added by you to a covered work,\nfor which you have or can give appropriate copyright permission.\n\n  Notwithstanding any other provision of this License, for material you\nadd to a covered work, you may (if authorized by the copyright holders of\nthat material) supplement the terms of this License with terms:\n\n    a) Disclaiming warranty or limiting liability differently from the\n    terms of sections 15 and 16 of this License; or\n\n    b) Requiring preservation of specified reasonable legal notices or\n    author attributions in that material or in the Appropriate Legal\n    Notices displayed by works containing it; or\n\n    c) Prohibiting misrepresentation of the origin of that material, or\n    requiring that modified versions of such material be marked in\n    reasonable ways as different from the original version; or\n\n    d) Limiting the use for publicity purposes of names of licensors or\n    authors of the material; or\n\n    e) Declining to grant rights under trademark law for use of some\n    trade names, trademarks, or service marks; or\n\n    f) Requiring indemnification of licensors and authors of that\n    material by anyone who conveys the material (or modified versions of\n    it) with contractual assumptions of liability to the recipient, for\n    any liability that these contractual assumptions directly impose on\n    those licensors and authors.\n\n  All other non-permissive additional terms are considered \"further\nrestrictions\" within the meaning of section 10.  If the Program as you\nreceived it, or any part of it, contains a notice stating that it is\ngoverned by this License along with a term that is a further\nrestriction, you may remove that term.  
If a license document contains\na further restriction but permits relicensing or conveying under this\nLicense, you may add to a covered work material governed by the terms\nof that license document, provided that the further restriction does\nnot survive such relicensing or conveying.\n\n  If you add terms to a covered work in accord with this section, you\nmust place, in the relevant source files, a statement of the\nadditional terms that apply to those files, or a notice indicating\nwhere to find the applicable terms.\n\n  Additional terms, permissive or non-permissive, may be stated in the\nform of a separately written license, or stated as exceptions;\nthe above requirements apply either way.\n\n  8. Termination.\n\n  You may not propagate or modify a covered work except as expressly\nprovided under this License.  Any attempt otherwise to propagate or\nmodify it is void, and will automatically terminate your rights under\nthis License (including any patent licenses granted under the third\nparagraph of section 11).\n\n  However, if you cease all violation of this License, then your\nlicense from a particular copyright holder is reinstated (a)\nprovisionally, unless and until the copyright holder explicitly and\nfinally terminates your license, and (b) permanently, if the copyright\nholder fails to notify you of the violation by some reasonable means\nprior to 60 days after the cessation.\n\n  Moreover, your license from a particular copyright holder is\nreinstated permanently if the copyright holder notifies you of the\nviolation by some reasonable means, this is the first time you have\nreceived notice of violation of this License (for any work) from that\ncopyright holder, and you cure the violation prior to 30 days after\nyour receipt of the notice.\n\n  Termination of your rights under this section does not terminate the\nlicenses of parties who have received copies or rights from you under\nthis License.  
If your rights have been terminated and not permanently\nreinstated, you do not qualify to receive new licenses for the same\nmaterial under section 10.\n\n  9. Acceptance Not Required for Having Copies.\n\n  You are not required to accept this License in order to receive or\nrun a copy of the Program.  Ancillary propagation of a covered work\noccurring solely as a consequence of using peer-to-peer transmission\nto receive a copy likewise does not require acceptance.  However,\nnothing other than this License grants you permission to propagate or\nmodify any covered work.  These actions infringe copyright if you do\nnot accept this License.  Therefore, by modifying or propagating a\ncovered work, you indicate your acceptance of this License to do so.\n\n  10. Automatic Licensing of Downstream Recipients.\n\n  Each time you convey a covered work, the recipient automatically\nreceives a license from the original licensors, to run, modify and\npropagate that work, subject to this License.  You are not responsible\nfor enforcing compliance by third parties with this License.\n\n  An \"entity transaction\" is a transaction transferring control of an\norganization, or substantially all assets of one, or subdividing an\norganization, or merging organizations.  If propagation of a covered\nwork results from an entity transaction, each party to that\ntransaction who receives a copy of the work also receives whatever\nlicenses to the work the party's predecessor in interest had or could\ngive under the previous paragraph, plus a right to possession of the\nCorresponding Source of the work from the predecessor in interest, if\nthe predecessor has it or can get it with reasonable efforts.\n\n  You may not impose any further restrictions on the exercise of the\nrights granted or affirmed under this License.  
For example, you may\nnot impose a license fee, royalty, or other charge for exercise of\nrights granted under this License, and you may not initiate litigation\n(including a cross-claim or counterclaim in a lawsuit) alleging that\nany patent claim is infringed by making, using, selling, offering for\nsale, or importing the Program or any portion of it.\n\n  11. Patents.\n\n  A \"contributor\" is a copyright holder who authorizes use under this\nLicense of the Program or a work on which the Program is based.  The\nwork thus licensed is called the contributor's \"contributor version\".\n\n  A contributor's \"essential patent claims\" are all patent claims\nowned or controlled by the contributor, whether already acquired or\nhereafter acquired, that would be infringed by some manner, permitted\nby this License, of making, using, or selling its contributor version,\nbut do not include claims that would be infringed only as a\nconsequence of further modification of the contributor version.  For\npurposes of this definition, \"control\" includes the right to grant\npatent sublicenses in a manner consistent with the requirements of\nthis License.\n\n  Each contributor grants you a non-exclusive, worldwide, royalty-free\npatent license under the contributor's essential patent claims, to\nmake, use, sell, offer for sale, import and otherwise run, modify and\npropagate the contents of its contributor version.\n\n  In the following three paragraphs, a \"patent license\" is any express\nagreement or commitment, however denominated, not to enforce a patent\n(such as an express permission to practice a patent or covenant not to\nsue for patent infringement).  
To \"grant\" such a patent license to a\nparty means to make such an agreement or commitment not to enforce a\npatent against the party.\n\n  If you convey a covered work, knowingly relying on a patent license,\nand the Corresponding Source of the work is not available for anyone\nto copy, free of charge and under the terms of this License, through a\npublicly available network server or other readily accessible means,\nthen you must either (1) cause the Corresponding Source to be so\navailable, or (2) arrange to deprive yourself of the benefit of the\npatent license for this particular work, or (3) arrange, in a manner\nconsistent with the requirements of this License, to extend the patent\nlicense to downstream recipients.  \"Knowingly relying\" means you have\nactual knowledge that, but for the patent license, your conveying the\ncovered work in a country, or your recipient's use of the covered work\nin a country, would infringe one or more identifiable patents in that\ncountry that you have reason to believe are valid.\n\n  If, pursuant to or in connection with a single transaction or\narrangement, you convey, or propagate by procuring conveyance of, a\ncovered work, and grant a patent license to some of the parties\nreceiving the covered work authorizing them to use, propagate, modify\nor convey a specific copy of the covered work, then the patent license\nyou grant is automatically extended to all recipients of the covered\nwork and works based on it.\n\n  A patent license is \"discriminatory\" if it does not include within\nthe scope of its coverage, prohibits the exercise of, or is\nconditioned on the non-exercise of one or more of the rights that are\nspecifically granted under this License.  
You may not convey a covered\nwork if you are a party to an arrangement with a third party that is\nin the business of distributing software, under which you make payment\nto the third party based on the extent of your activity of conveying\nthe work, and under which the third party grants, to any of the\nparties who would receive the covered work from you, a discriminatory\npatent license (a) in connection with copies of the covered work\nconveyed by you (or copies made from those copies), or (b) primarily\nfor and in connection with specific products or compilations that\ncontain the covered work, unless you entered into that arrangement,\nor that patent license was granted, prior to 28 March 2007.\n\n  Nothing in this License shall be construed as excluding or limiting\nany implied license or other defenses to infringement that may\notherwise be available to you under applicable patent law.\n\n  12. No Surrender of Others' Freedom.\n\n  If conditions are imposed on you (whether by court order, agreement or\notherwise) that contradict the conditions of this License, they do not\nexcuse you from the conditions of this License.  If you cannot convey a\ncovered work so as to satisfy simultaneously your obligations under this\nLicense and any other pertinent obligations, then as a consequence you may\nnot convey it at all.  For example, if you agree to terms that obligate you\nto collect a royalty for further conveying from those to whom you convey\nthe Program, the only way you could satisfy both those terms and this\nLicense would be to refrain entirely from conveying the Program.\n\n  13. Use with the GNU Affero General Public License.\n\n  Notwithstanding any other provision of this License, you have\npermission to link or combine any covered work with a work licensed\nunder version 3 of the GNU Affero General Public License into a single\ncombined work, and to convey the resulting work.  
The terms of this\nLicense will continue to apply to the part which is the covered work,\nbut the special requirements of the GNU Affero General Public License,\nsection 13, concerning interaction through a network will apply to the\ncombination as such.\n\n  14. Revised Versions of this License.\n\n  The Free Software Foundation may publish revised and/or new versions of\nthe GNU General Public License from time to time.  Such new versions will\nbe similar in spirit to the present version, but may differ in detail to\naddress new problems or concerns.\n\n  Each version is given a distinguishing version number.  If the\nProgram specifies that a certain numbered version of the GNU General\nPublic License \"or any later version\" applies to it, you have the\noption of following the terms and conditions either of that numbered\nversion or of any later version published by the Free Software\nFoundation.  If the Program does not specify a version number of the\nGNU General Public License, you may choose any version ever published\nby the Free Software Foundation.\n\n  If the Program specifies that a proxy can decide which future\nversions of the GNU General Public License can be used, that proxy's\npublic statement of acceptance of a version permanently authorizes you\nto choose that version for the Program.\n\n  Later license versions may give you additional or different\npermissions.  However, no additional obligations are imposed on any\nauthor or copyright holder as a result of your choosing to follow a\nlater version.\n\n  15. Disclaimer of Warranty.\n\n  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY\nAPPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT\nHOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM \"AS IS\" WITHOUT WARRANTY\nOF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,\nTHE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\nPURPOSE.  
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM\nIS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF\nALL NECESSARY SERVICING, REPAIR OR CORRECTION.\n\n  16. Limitation of Liability.\n\n  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING\nWILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS\nTHE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY\nGENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE\nUSE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF\nDATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD\nPARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),\nEVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF\nSUCH DAMAGES.\n\n  17. Interpretation of Sections 15 and 16.\n\n  If the disclaimer of warranty and limitation of liability provided\nabove cannot be given local legal effect according to their terms,\nreviewing courts shall apply local law that most closely approximates\nan absolute waiver of all civil liability in connection with the\nProgram, unless a warranty or assumption of liability accompanies a\ncopy of the Program in return for a fee.\n\n                     END OF TERMS AND CONDITIONS\n\n            How to Apply These Terms to Your New Programs\n\n  If you develop a new program, and you want it to be of the greatest\npossible use to the public, the best way to achieve this is to make it\nfree software which everyone can redistribute and change under these terms.\n\n  To do so, attach the following notices to the program.  
It is safest\nto attach them to the start of each source file to most effectively\nstate the exclusion of warranty; and each file should have at least\nthe \"copyright\" line and a pointer to where the full notice is found.\n\n    <one line to give the program's name and a brief idea of what it does.>\n    Copyright (C) <year>  <name of author>\n\n    This program is free software: you can redistribute it and/or modify\n    it under the terms of the GNU General Public License as published by\n    the Free Software Foundation, either version 3 of the License, or\n    (at your option) any later version.\n\n    This program is distributed in the hope that it will be useful,\n    but WITHOUT ANY WARRANTY; without even the implied warranty of\n    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n    GNU General Public License for more details.\n\n    You should have received a copy of the GNU General Public License\n    along with this program.  If not, see <https://www.gnu.org/licenses/>.\n\nAlso add information on how to contact you by electronic and paper mail.\n\n  If the program does terminal interaction, make it output a short\nnotice like this when it starts in an interactive mode:\n\n    <program>  Copyright (C) <year>  <name of author>\n    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.\n    This is free software, and you are welcome to redistribute it\n    under certain conditions; type `show c' for details.\n\nThe hypothetical commands `show w' and `show c' should show the appropriate\nparts of the General Public License.  
Of course, your program's commands\nmight be different; for a GUI interface, you would use an \"about box\".\n\n  You should also get your employer (if you work as a programmer) or school,\nif any, to sign a \"copyright disclaimer\" for the program, if necessary.\nFor more information on this, and how to apply and follow the GNU GPL, see\n<https://www.gnu.org/licenses/>.\n\n  The GNU General Public License does not permit incorporating your program\ninto proprietary programs.  If your program is a subroutine library, you\nmay consider it more useful to permit linking proprietary applications with\nthe library.  If this is what you want to do, use the GNU Lesser General\nPublic License instead of this License.  But first, please read\n<https://www.gnu.org/licenses/why-not-lgpl.html>.\n"
  },
  {
    "path": "MANIFEST.in",
    "content": "recursive-include ext *\ninclude LICENSE"
  },
  {
    "path": "README.md",
    "content": "[![PyPI version](https://badge.fury.io/py/kimimaro.svg)](https://badge.fury.io/py/kimimaro)  \n\n# Kimimaro: Skeletonize Densely Labeled Images\n\n```bash\n# Produce SWC files from volumetric images.\nkimimaro forge labels.npy --progress # writes to ./kimimaro_out/\nkimimaro view kimimaro_out/10.swc\n```\n\nRapidly skeletonize all non-zero labels in 2D and 3D numpy arrays using a TEASAR derived method. The returned list of skeletons is in the format used by [cloud-volume](https://github.com/seung-lab/cloud-volume/wiki/Advanced-Topic:-Skeletons). A skeleton is a stick figure 1D representation of a 2D or 3D object that consists of a graph of verticies linked by edges. A skeleton where the verticies also carry a distance to the nearest boundary they were extracted from is called a \"Medial Axis Transform\", which Kimimaro provides.\n\nSkeletons are a compact representation that can be used to visualize objects, trace the connectivity of an object, or otherwise analyze the object's geometry. Kimimaro was designed for use with high resolution neurons extracted from electron microscopy data via AI segmentation, but it can be applied to many different fields.  \n\nOn an Apple Silicon M1 arm64 chip (Firestorm cores 3.2 GHz max frequency), this package processed a 512x512x100 volume with 333 labels in 20 seconds. It processed a 512x512x512 volume (`connectomics.npy`) with 2124 labels in 187 seconds.\n\n<p style=\"font-style: italics;\" align=\"center\">\n<img height=512 width=512 src=\"https://raw.githubusercontent.com/seung-lab/kimimaro/master/mass_skeletonization.png\" alt=\"A Densely Labeled Volume Skeletonized with Kimimaro\" /><br>\nFig. 
1: A Densely Labeled Volume Skeletonized with Kimimaro\n</p>\n\n## `pip` Installation \n\nIf a binary is available for your platform:\n\n```bash\npip install kimimaro \n# installs additional libraries to accelerate some\n# operations like join_close_components\npip install \"kimimaro[accel]\"\n# Makes the kimimaro view command work\npip install \"kimimaro[view]\"\n# Enables TIFF generation on the CLI\npip install \"kimimaro[tif]\"\n# Enables reading NIBABEL, NRRD, TIFF, CRACKLE on the CLI\npip install \"kimimaro[all_formats]\"\n# Install all optional dependencies\npip install \"kimimaro[all]\"\n```\n\nOtherwise, you'll also need a C++ compiler:\n\n```bash\nsudo apt-get install python3-dev g++ # ubuntu linux\n```\n\n## Example\n\n<p style=\"font-style: italic;\" align=\"center\">\n<img height=512 src=\"https://raw.githubusercontent.com/seung-lab/kimimaro/master/kimimaro_512x512x512_benchmark.png\" alt=\"A Densely Labeled Volume Skeletonized with Kimimaro\" /><br>\nFig. 2: Memory Usage on a 512x512x512 Densely Labeled Volume (`connectomics.npy`)\n</p>\n\nFigure 2 shows the memory usage and processing time (~390 seconds, about 6.5 minutes) required when Kimimaro 1.4.0 was applied to a 512x512x512 cutout, *labels*, from a connectomics dataset, `connectomics.npy` containing 2124 connected components. The different sections of the algorithm are depicted. Grossly, the preamble runs for about half a minute, skeletonization for about six minutes, and finalization within seconds. The peak memory usage was about 4.5 GB. The code below was used to process *labels*. The processing of the glia was truncated due to a combination of *fix_borders* and max_paths.  \n\nKimimaro has come a long way. Version 0.2.1 took over 15 minutes and had a Preamble run time twice as long on the same dataset.\n\nOn a Macbook Pro M3, the same settings now complete in 94 seconds (1.6 minutes) on version 5.4.0. 
With xs3d 1.11.0, cross section analysis takes 215 seconds (3.6 minutes).\n\n### Python Interface\n\n```python\n# LISTING 1: Producing Skeletons from a labeled image.\n\nimport kimimaro\n\n# To obtain this 512 MB segmentation sample volume:\n# pip install crackle-codec \n\nimport crackle\nlabels = crackle.load(\"benchmarks/connectomics.npy.ckl.gz\") \n\nskels = kimimaro.skeletonize(\n  labels, \n  teasar_params={\n    \"scale\": 1.5, \n    \"const\": 300, # physical units\n    \"pdrf_scale\": 100000,\n    \"pdrf_exponent\": 4,\n    \"soma_acceptance_threshold\": 3500, # physical units\n    \"soma_detection_threshold\": 750, # physical units\n    \"soma_invalidation_const\": 300, # physical units\n    \"soma_invalidation_scale\": 2,\n    \"max_paths\": 300, # default None\n  },\n  # object_ids=[ ... ], # process only the specified labels\n  # extra_targets_before=[ (27,33,100), (44,45,46) ], # target points in voxels\n  # extra_targets_after=[ (27,33,100), (44,45,46) ], # target points in voxels\n  dust_threshold=1000, # skip connected components with fewer than this many voxels\n  anisotropy=(16,16,40), # default (1,1,1)\n  fix_branching=True, # default True\n  fix_borders=True, # default True\n  fill_holes=False, # default False\n  fix_avocados=False, # default False\n  progress=True, # default False, show progress bar\n  parallel=1, # <= 0 all cpu, 1 single process, 2+ multiprocess\n  parallel_chunk_size=100, # how many skeletons to process before updating progress bar\n)\n\n# LISTING 2: Combining skeletons produced from \n#            adjacent or overlapping images.\n\nimport kimimaro\nfrom osteoid import Skeleton\n\nskels = ... # a set of skeletons produced from the same label id\nskel = Skeleton.simple_merge(skels).consolidate()\nskel = kimimaro.postprocess(\n  skel, \n  dust_threshold=1000, # physical units\n  tick_threshold=3500 # physical units\n)\n\n# LISTING 3: Adding cross sectional area to skeletons\n# Cross section planes are defined by normal vectors. 
Those\n# vectors come from the difference between adjacent vertices.\nskels = ... # one or more skeletons produced from a single image\nskels = kimimaro.cross_sectional_area(\n  labels, skels, \n  anisotropy=(16,16,40), \n  smoothing_window=5, # rolling average window of plane normals\n  progress=True,\n)\nskel = skels[0]\nskel.cross_sectional_area # array of cross sectional areas\nskel.cross_sectional_area_contacts # non-zero if the section contacted the image border\n\n# Split input skeletons into connected components and\n# then join the two nearest vertices within `radius` distance\n# of each other until there is only a single connected component\n# or no pairs of points nearer than `radius` exist. \n# Fuse all remaining components into a single skeleton.\nskel = kimimaro.join_close_components([skel1, skel2], radius=1500) # 1500 units threshold\nskel = kimimaro.join_close_components([skel1, skel2], radius=None) # no threshold\n\n# Given synapse centroids (in voxels) and the SWC integer label you'd \n# like to assign (e.g. for pre-synaptic and post-synaptic) this finds the \n# nearest voxel to the centroid for that label.\n# Input: { label: [ ((x,y,z), swc_label), ... ] }\n# Returns: { (x,y,z): swc_label, ... }\nextra_targets = kimimaro.synapses_to_targets(labels, synapses)\n\n\n# LISTING 4: Drawing a centerline between\n#   preselected points on a binary image.\n#   This is a much simpler option for when\n#   you know exactly what you want, but may\n#   be less efficient for large scale processing.\n\nskel = kimimaro.connect_points(\n  labels == 67301298,\n  start=(3, 215, 202), \n  end=(121, 426, 227),\n  anisotropy=(32,32,40),\n)\n\n# LISTING 5: Using skeletons to oversegment existing\n#  segmentations for integration into proofreading systems \n#  that rely on merging atomic labels. oversegmented_labels \n#  is returned numbered from 1. 
skels is a copy returned \n#  with the property skel.segments that associates a label\n#  to each vertex (labels will not be unique if downsampling \n#  is used)\noversegmented_labels, skels = kimimaro.oversegment(\n  labels, skels, \n  anisotropy=(32,32,40), \n  downsample=10,\n)\n```\n\n`connectomics.npy` is multilabel connectomics data derived from pinky40, a 2018 experimental automated segmentation of ~1.5 million cubic micrometers of mouse visual cortex. It is an early predecessor to the now public pinky100_v185 segmentation that can be found at https://microns-explorer.org/phase1. You will need to run `lzma -d connectomics.npy.lzma` to obtain the 512x512x512 uint32 volume at 32x32x40 nm<sup>3</sup> resolution.  \n\n### CLI Interface\n\nThe CLI supports producing skeletons from a single image as SWCs and viewing the resulting SWC files one at a time. By default, the SWC files are written to `./kimimaro_out/$LABEL.swc`.\n\nHere's an equivalent example to the code above.\n\n```bash\nkimimaro forge labels.npy --scale 4 --const 10 --soma-detect 1100 --soma-accept 3500 --soma-scale 1 --soma-const 300 --anisotropy 16,16,40 --fix-borders --progress \n```\n\nVisualize your data:\n\n```bash\nkimimaro view 1241241.swc # visualize skeleton\nkimimaro view labels.npy # visualize segmentation\n```\n\nIt can also convert binary image skeletons produced by thinning algorithms into SWC files and back. This can be helpful for comparing different skeletonization algorithms or even just using their results.\n\n```bash\nkimimaro swc from binary_image.tiff # -> binary_image.swc\nkimimaro swc to --format tiff binary_image.swc # -> binary_image.tiff or npy\n```\n\n## Tweaking `kimimaro.skeletonize` Parameters\n\nThis algorithm works by finding a root point on a 3D object and then serially tracing paths via Dijkstra's shortest path algorithm through a penalty field to the most distant unvisited point. 
After each pass, there is a sphere (really a circumscribing cube) that expands around each vertex in the current path that marks part of the object as visited.  \n\nFor a visual tutorial on the basics of the skeletonization procedure, check out this wiki article: [A Pictorial Guide to TEASAR Skeletonization](https://github.com/seung-lab/kimimaro/wiki/A-Pictorial-Guide-to-TEASAR-Skeletonization)\n\nFor more detailed information, [read below](https://github.com/seung-lab/kimimaro#ii-skeletonization) or the [TEASAR paper](https://ieeexplore.ieee.org/abstract/document/883951/) (though we [deviate from TEASAR](https://github.com/seung-lab/kimimaro#teasar-derived-algorthm) in a few places). [1]\n\n### `scale` and `const`\n\nUsually, the most important parameters to tweak are `scale` and `const` which control the radius of this invalidation sphere according to the equation `r(x,y,z) = scale * DBF(x,y,z) + const` where the dimensions are physical (e.g. nanometers, i.e. corrected for anisotropy). `DBF(x,y,z)` is the physical distance from the shape boundary at that point.  \n\nCheck out this [wiki article](https://github.com/seung-lab/kimimaro/wiki/Intuition-for-Setting-Parameters-const-and-scale) to help refine your intuition.\n\n### `anisotropy`\n\nRepresents the physical dimension of each voxel. For example, a connectomics dataset might be scanned with an electron microscope at 4nm x 4nm per pixel and stacked in slices 40nm thick. i.e. `anisotropy=(4,4,40)`. You can use any units so long as you are consistent.\n\n### `dust_threshold`\n\nThis threshold culls connected components that are smaller than this many voxels.  \n\n### `extra_targets_after` and `extra_targets_before`  \n\n`extra_targets_after` provides additional voxel targets to trace to after the morphological tracing algorithm completes. For example, you might add known synapse locations to the skeleton.   
\n\n`extra_targets_before` is the same as `extra_targets_after` except that the additional targets are front-loaded and the paths that they cover are invalidated. This may affect the results of subsequent morphological tracing.\n\n### `max_paths`  \n\nLimits the number of paths that can be drawn for the given label. Certain cells, such as glia, that may not be important for the current analysis may be expensive to process and can be aborted early.  \n\n### `pdrf_scale` and `pdrf_exponent`\n\nThe `pdrf_scale` and `pdrf_exponent` represent parameters to the penalty equation that takes the euclidean distance field (**D**) and augments it so that cutting closer to the border is heavily penalized to make Dijkstra take paths that are more centered.   \n\nP<sub>r</sub> = `pdrf_scale` * (1 - **D** / max(**D**)) <sup>`pdrf_exponent`</sup> + (directional gradient < 1.0).  \n\nThe default settings should work fairly well, but under large anisotropies or with cavernous morphologies, it's possible that you might need to tweak them. If you see the skeleton go haywire inside a large area, it could be a collapse of floating point precision.  \n\n### `soma_acceptance_threshold` and `soma_detection_threshold`\n\nWe process somas specially because they do not have a tubular geometry and instead should be represented in a hub and spoke manner. `soma_acceptance_threshold` is the physical radius (e.g. in nanometers) beyond which we classify a connected component of the image as containing a soma. The distance transform's output is depressed by holes in the label, which are frequently produced by segmentation algorithms on somata. We can fill them, but the hole filling algorithm we use is slow so we would like to only apply it occasionally. Therefore, we set a lower threshold, the `soma_detection_threshold`, beyond which we fill the holes and retest the soma.  
\n\n### `soma_invalidation_scale` and `soma_invalidation_const`   \n\nOnce we have classified a region as a soma, we fix the root of the skeletonization algorithm at one of the points of maximum distance from the boundary (usually there is only one). We then mark as visited all voxels around that point in a spherical radius described by `r(x,y,z) = soma_invalidation_scale * DBF(x,y,z) + soma_invalidation_const` where DBF(x,y,z) is the physical distance from the shape boundary at that point. If done correctly, this can prevent skeletons from being drawn to the boundaries of the soma, and instead pulls the skeletons mainly into the processes extending from the cell body.  \n\n### `fix_borders`\n\nThis feature makes it easier to connect the skeletons of adjacent image volumes that do not fit in RAM. If enabled, skeletons will be deterministically drawn to the approximate center of the 2D contact area of each place where the shape contacts the border. This can affect the performance of the operation positively or negatively depending on the shape and number of contacts.  \n\n### `fix_branching`  \n\nYou'll probably never want to disable this, but base TEASAR is infamous for forking the skeleton at branch points way too early. This option makes it preferential to fork at a more reasonable place at a significant performance penalty. \n\n### `fill_holes`\n\n_Warning: This will remove input labels that are deemed to be holes._\n\nIf your segmentation contains artifacts that cause holes to appear in labels, you can preprocess the entire image to eliminate background holes and holes caused by entirely contained inclusions. This option adds a moderate amount of additional processing time at the beginning (perhaps ~30%). \n\n### `fix_avocados`\n\nAvocados are segmentations of cell somata that classify the nucleus separately from the cytoplasm. 
This is a common problem in automatic segmentations due to the visual similarity of a cell membrane and a nuclear membrane combined with insufficient context.  \n\nSkeletonizing an avocado results in a poor skeletonization of the cell soma that will disconnect the nucleus and usually results in too many paths traced around the nucleus. Setting `fix_avocados=True` attempts to detect and fix these problems. Currently we handle non-avocados, avocados, cells with inclusions, and nested avocados. You can see examples [here](https://github.com/seung-lab/kimimaro/pull/43).\n\n### `progress`\n\nShow a progress bar once the skeletonization phase begins.\n\n### `parallel`  \n\nUse a pool of processors to skeletonize faster. Each process allocatable task is the skeletonization of one connected component (so it won't help with a single label that takes a long time to skeletonize). This option also affects the speed of the initial euclidean distance transform, which is parallel enabled and is the most expensive part of the Preamble (described below).  \n\n### `parallel_chunk_size`  \n\nThis only applies when using parallel. This sets the number of skeletons a subprocess will extract before returning control to the main thread, updating the progress bar, and acquiring a new task. If this value is set too low (e.g. < 10-20) the cost of interprocess communication can become significant and even dominant. If it is set too high, task starvation may occur for the other subprocesses if a subprocess gets a particularly hard skeleton and they complete quickly. Progress bar updates will be infrequent if the value is too high as well.  \n\nThe actual chunk size used will be `min(parallel_chunk_size, len(cc_labels) // parallel)`. `cc_labels` represents the number of connected components in the sample.  \n\n### Performance Tips\n\n- If you only need a few labels skeletonized, pass in `object_ids` to bypass processing all the others. 
If `object_ids` contains only a single label, the masking operation will run faster.\n- Larger TEASAR parameters scale and const require processing larger invalidation regions per path.\n- Set `pdrf_exponent` to a small power of two (e.g. 1, 2, 4, 8, 16) for a small speedup.\n- If you are willing to sacrifice the improved branching behavior, you can set `fix_branching=False` for a moderate 1.1x to 1.5x speedup (assuming your TEASAR parameters and data allow branching).\n- If your dataset contains important cells (that may in fact be the seat of consciousness) but they take significant processing power to analyze, you can save them to savor for later by setting `max_paths` to some reasonable level which will abort and proceed to the next label after the algorithm detects that at least that many paths will be needed.\n- Parallel distributes work across connected components and is generally a good idea if you have the cores and memory. Not only does it make single runs proceed faster, but you can also practically use a much larger context; that improves soma processing as they are less likely to be cut off. The Preamble of the algorithm (detailed below) is still single threaded at the moment, so task latency increases with size. \n- If `parallel_chunk_size` is set very low (e.g. < 10) during parallel operation, interprocess communication can become a significant overhead. Try raising this value.  \n\n## Motivation\n\nThe connectomics field commonly generates very large densely labeled volumes of neural tissue. Skeletons are one dimensional representations of two or three dimensional objects. They have many uses, a few of which are visualization of neurons, calculating global topological features, rapidly measuring electrical distances between objects, and imposing tree structures on neurons (useful for computation and user interfaces). There are several ways to compute skeletons and a few ways to define them [4]. 
After some experimentation, we found that the TEASAR [1] approach gave fairly good results. Other approaches include topological thinning (\"onion peeling\") and finding the centerline described by maximally inscribed spheres. Ignacio Arganda-Carreras, an alumnus of the Seung Lab, wrote a topological thinning plugin for Fiji called [Skeletonize3d](https://imagej.net/Skeletonize3D). \n\nThere are several implementations of TEASAR used in the connectomics field [3][5], however it is commonly understood that implementations of TEASAR are slow and can use tens of gigabytes of memory. Our goal to skeletonize all labels in a petavoxel scale image quickly made it clear that existing sparse implementations are impractical. While adapting a sparse approach to a cloud pipeline, we noticed that there are inefficiencies in repeated evaluation of the Euclidean Distance Transform (EDT), the repeated evaluation of the connected components algorithm, in the construction of the graph used by Dijkstra's algorithm where the edges are implied by the spatial relationships between voxels, in the memory cost, quadratic in the number of voxels, of representing a graph that is implicit in the image, in the unnecessarily large data type used to represent relatively small cutouts, and in the repeated downloading of overlapping regions. We also found that the naive implementation of TEASAR's \"rolling invalidation ball\" unnecessarily reevaluated large numbers of voxels in a way that could be loosely characterized as quadratic in the skeleton path length.   \n\nWe further found that commodity implementations of the EDT supported only binary images. We were unable to find any available Python or C++ libraries for performing Dijkstra's shortest path on an image. Commodity implementations of connected components algorithms for images supported only binary images. Therefore, several libraries were devised to remedy these deficits (see Related Projects). 
\n\n## Why TEASAR?\n\nTEASAR: Tree-structure Extraction Algorithm for Accurate and Robust skeletons, a 2000 paper by M. Sato and others [1], is a member of a family of algorithms that transform two and three dimensional structures into a one dimensional \"skeleton\" embedded in that higher dimension. One might conceive of a skeleton as extracting a stick figure drawing from a binary image. This problem is more difficult than it might seem. There are different situations one must consider when making such a drawing. For example, a stick drawing of a banana might merely be a curved centerline and a drawing of a doughnut might be a closed loop. In our case of analyzing neurons, sometimes we want the skeleton to include spines, short protrusions from dendrites that usually have synapses attached, and sometimes we want only to characterize the run length of the main trunk of a neurite.  \n\nAdditionally, data quality issues can be challenging as well. If one is skeletonizing a 2D image of a doughnut, but the angle were sufficiently declinated from the ring's orthogonal axis, would it even be possible to perform this task accurately? In a 3D case, if there are breaks or mergers in the labeling of a neuron, will the algorithm function sensibly? These issues are common in both manual and automatic image segmentations.\n\nIn our problem domain of skeletonizing neurons from anisotropic voxel labels, our chosen algorithm should produce tree structures, handle fine or coarse detail extraction depending on the circumstances, handle voxel anisotropy, and be reasonably efficient in CPU and memory usage. TEASAR fulfills these criteria. Notably, TEASAR doesn't guarantee the centeredness of the skeleton within the shape, but it makes an effort. The basic TEASAR algorithm is known to cut corners around turns and branch too early. A 2001 paper by members of the original TEASAR team describes a method for reducing the early branching issue on page 204, section 4.2.2. 
[2]\n\n## TEASAR Derived Algorithm\n\nWe implemented TEASAR but made several deviations from the published algorithm in order to improve path centeredness, increase performance, handle bulging cell somas, and enable efficient chunked evaluation of large images. We opted not to implement the gradient vector field step from [2] as our implementation is already quite fast. The paper claims a reduction of 70-85% in input voxels, so it might be worth investigating.  \n\nIn order to work with images that contain many labels, our general strategy is to perform as many actions as possible in such a way that all labels are treated in a single pass. Several of the component algorithms (e.g. connected components, euclidean distance transform) in our implementation can take several seconds per pass, so it is important that they not be run hundreds or thousands of times. A large part of the engineering contribution of this package lies in the efficiency of these operations which reduce the runtime from the scale of hours to minutes.  \n\nGiven a 3D labeled voxel array, *I*, with N >= 0 labels, and an ordered triple describing voxel anisotropy *A*, our algorithm can be divided into three phases, the preamble, skeletonization, and finalization in that order.\n\n### I. Preamble\n\nThe Preamble takes a 3D image containing *N* labels and efficiently generates the connected components, distance transform, and bounding boxes needed by the skeletonization phase.\n\n1. To enhance performance, if *N* is 0 return an empty set of skeletons.\n2. Label the M connected components, *I<sub>cc</sub>*, of *I*.\n3. To save memory, renumber the connected components in order from 1 to *M*. Adjust the data type of the new image to the smallest uint type that will contain *M* and overwrite *I<sub>cc</sub>*.\n4. Generate a mapping of the renumbered *I<sub>cc</sub>* to *I* to assign meaningful labels to skeletons later on and delete *I* to save memory.\n5. 
Compute *E*, the multi-label anisotropic Euclidean Distance Transform of *I<sub>cc</sub>* given *A*. *E* treats all interlabel edges as transform edges, but not the boundaries of the image. Black pixels are considered background.\n6. Gather a list, *L<sub>cc</sub>* of unique labels from *I<sub>cc</sub>* and threshold which ones to process based on the number of voxels they represent to remove \"dust\".\n7. In one pass, compute the list of bounding boxes, *B*, corresponding to each label in *L<sub>cc</sub>*.\n\n### II. Skeletonization \n\nIn this phase, we extract the tree structured skeleton from each connected component label. Below, we reference variables defined in the Preamble. For clarity, we omit the soma specific processing and hold `fix_branching=True`. \n\nFor each label *l* in *L<sub>cc</sub>* and *B*...\n\n1. Extract *I<sub>l</sub>*, the cropped binary image tightly enclosing *l* from *I<sub>cc</sub>* using *B<sub>l</sub>*\n2. Using *I<sub>l</sub>* and *B<sub>l</sub>*, extract *E<sub>l</sub>* from *E*. *E<sub>l</sub>* is the cropped tightly enclosed EDT of *l*. This is much faster than recomputing the EDT for each binary image.\n3. Find an arbitrary foreground voxel and using that point as a source, compute the anisotropic euclidean distance field for *I<sub>l</sub>*. The coordinate of the maximum value is now \"the root\" *r*.\n4. From *r*, compute the euclidean distance field and save it as the distance from root field *D<sub>r</sub>*.\n5. Compute the penalized distance from root field *P<sub>r</sub>* = `pdrf_scale` * ((1 - *E<sub>l</sub>* / max(*E<sub>l</sub>*)) ^ `pdrf_exponent`) + *D<sub>r</sub>* / max(*D<sub>r</sub>*). \n6. While *I<sub>l</sub>* contains foreground voxels:\n    1. Identify a target coordinate, *t*, as the foreground voxel with maximum distance in *D<sub>r</sub>* from *r*.\n    2. Draw the shortest path *p* from *r* to *t* considering the voxel values in *P<sub>r</sub>* as edge weights.\n    3. 
For each vertex *v* in *p*, extend an invalidation cube of physical side length computed as `scale` * *E<sub>l</sub>*(*v*) + `const` and convert any foreground pixels in *I<sub>l</sub>* that overlap with these cubes to background pixels.\n    4. (Only if `fix_branching=True`) For each vertex coordinate *v* in *p*, set *P<sub>r</sub>*(*v*) = 0.\n    5. Append *p* to a list of paths for this label.\n7. Using *E<sub>l</sub>*, extract the distance to the nearest boundary each vertex in the skeleton represents.\n8. For each raw skeleton extracted from *I<sub>l</sub>*, translate the vertices by *B<sub>l</sub>* to correct for the translation the cropping operation induced.\n9. Multiply the vertices by the anisotropy *A* to place them in physical space.\n\nIf soma processing is considered, we modify the root (*r*) search process as follows:  \n\n1. If max(*E<sub>l</sub>*) > `soma_detection_threshold`...\n  1. Fill topological holes in *I<sub>l</sub>*. Soma are large regions that often have dust from imperfect automatic labeling methods.\n  2. Recompute *E<sub>l</sub>* from this cleaned up image.\n  3. If max(*E<sub>l</sub>*) > `soma_acceptance_threshold`, divert to soma processing mode.\n2. If in soma processing mode, continue, else go to step 3 in the algorithm above.\n3. Set *r* to the coordinate corresponding to max(*E<sub>l</sub>*)\n4. Create an invalidation sphere of physical radius `soma_invalidation_scale` * max(*E<sub>l</sub>*) + `soma_invalidation_const` and erase foreground voxels from *I<sub>l</sub>* contained within it. This helps prevent errant paths from being drawn all over the soma.\n5. Continue from step 4 in the above algorithm.\n\n### III. Finalization\n\nIn the final phase, we agglomerate the disparate connected component skeletons into single skeletons and assign their labels corresponding to the input image. 
This step is artificially broken out compared to how intermingled its implementation is with skeletonization, but it's conceptually separate.\n\n## Deviations from TEASAR\n\nThere were several places where we took a different approach than called for by the TEASAR authors.\n\n### Using DAF for Targets, PDRF for Pathfinding\n\nThe original TEASAR algorithm defines the Penalized Distance from Root voxel Field (PDRF, *P<sub>r</sub>* above) as:\n\n```\nPDRF = 5000 * (1 - DBF / max(DBF))^16 + DAF\n```\n\nDBF is the Distance from Boundary Field (*E<sub>l</sub>* above) and DAF is the Distance from Any voxel Field (*D<sub>r</sub>* above).  \n\nWe found the addition of the DAF tended to perturb the skeleton path from the centerline better described by the inverted DBF alone. We also found it helpful to modify the constant and exponent to tune cornering behavior. Initially, we completely stripped out the addition of the DAF from the PDRF, but this introduced a different kind of problem. The exponentiation of the PDRF caused floating point values to collapse in wide open spaces. This made the skeletons go crazy as they traced out a path described by floating point errors.  \n\nThe DAF provides a very helpful gradient to follow between the root and the target voxel, we just don't want that gradient to knock the path off the centerline. Therefore, in light of the fact that the PDRF base field is very large, we add the normalized DAF which is just enough to overwhelm floating point errors and provide direction in wide tubes and bulges.  \n\nThe original paper also called for selecting targets using the max(PDRF) foreground values. However, this is a bit strange since the PDRF values are dominated by boundary effects rather than a pure distance metric. 
Therefore, we select targets from the max(DAF) foreground value.\n\n### Zero Weighting Previous Paths (`fix_branching=True`)\n\nThe 2001 skeletonization paper [2] called for correcting early forking by computing a DAF using already computed path vertices as field sources. This allows Dijkstra's algorithm to trace the existing path cost free and diverge from it at a closer point to the target.  \n\nAs we have strongly deemphasized the role of the DAF in Dijkstra path finding, computing this field is unnecessary and we only need to set the PDRF to zero along the path of existing skeletons to achieve this effect. This saves us an expensive repeated DAF calculation per path.  \n\nHowever, we still incur a substantial cost for taking this approach because we had been computing a Dijkstra \"parental field\" that recorded the shortest path to the root from every foreground voxel. We then used this saved result to rapidly compute all paths. However, as this zero weighting modification makes successive calculations dependent upon previous ones, we need to compute Dijkstra's algorithm anew for each path.\n\n### Non-Overlapped Chunked Processing (`fix_borders=True`)\n\nWhen processing large volumes, a sensible approach for mass producing skeletons is to chunk the volume, process the chunks independently, and merge the resulting skeleton fragments at the end. However, this is complicated by the \"edge effect\" induced by a loss of context which makes it impossible to expect the endpoints of skeleton fragments produced by adjacent chunks to align. In contrast, it is easy to join mesh fragments because the vertices of the edge of mesh fragments lie at predictable identical locations given one pixel of overlap.  \n\nPreviously, we had used 50% overlap to join adjacent skeleton fragments which increased the compute cost of skeletonizing a large volume by eight times. 
However, if we could force skeletons to lie at predictable locations on the border, we could use single pixel overlap and copy the simple mesh joining approach. As an (incorrect but useful) intuition for how one might go about this, consider computing the centroid of each connected component on each border plane and adding that as a required path target. This would guarantee that both sides of the plane connect at the same pixel. However, the centroid may not lie inside of non-convex hulls so we have to be more sophisticated and select some real point inside of the shape.\n\nTo this end, we again repurpose the euclidean distance transform and apply it to each of the six planes of connected components and select the maximum value as a mandatory target. This works well for many types of objects that contact a single plane and have a single maximum. However, we must treat the corners of the box and shapes that have multiple maxima.  \n\nTo handle shapes that contact multiple sides of the box, we simply assign targets to all connected components. If this introduces a cycle in post-processing, we already have cycle removing code to handle it in Igneous. If it introduces tiny useless appendages, we also have code to handle this.  \n\nIf a shape has multiple distance transform maxima, it is important to choose the same pixel without needing to communicate between spatially adjacent tasks which may run at different times on different machines. Additionally, the same plane on adjacent tasks has the coordinate system flipped. One simple approach might be to pick the coordinate with minimum x and y (or some other coordinate based criterion) in one of the coordinate frames, but this requires tracking the flips on all six planes and is annoying. Instead, we use a series of coordinate-free topology based filters which is both more fun, effort efficient, and picks something reasonable looking. 
A valid criticism of this approach is that it will fail on a perfectly symmetrical object, but these objects are rare in biological data.  \n\nWe apply a series of filters and pick the point based on the first filter it passes:\n\n1. The voxel closest to the centroid of the current label.\n2. The voxel closest to the centroid of the image plane.\n3. Closest to a corner of the plane.\n4. Closest to an edge of the plane.\n5. The previously found maxima.\n\nIt is important that filter #1 be based on the shape of the label so that kinks are minimized for convex hulls. For example, originally we used only filters two through five, but this caused skeletons for neurites located away from the center of a chunk to suddenly jink towards the center of the chunk at chunk boundaries.\n\n## Related Projects\n\nSeveral classic algorithms had to be specially tuned to make this module possible.  \n\n1. [edt](https://github.com/seung-lab/euclidean-distance-transform-3d): A single pass, multi-label anisotropy supporting euclidean distance transform implementation. \n2. [dijkstra3d](https://github.com/seung-lab/dijkstra3d): Dijkstra's shortest-path algorithm defined on 26-connected 3D images. This avoids the time cost of edge generation and wasted memory of a graph representation.\n3. [connected-components-3d](https://github.com/seung-lab/connected-components-3d): A connected components implementation defined on 26-connected 3D images with multiple labels.\n4. [fastremap](https://github.com/seung-lab/fastremap): Allows high speed renumbering of labels from 1 in a 3D array in order to reduce memory consumption caused by unnecessarily large 32 and 64-bit labels.\n5. [fill_voids](https://github.com/seung-lab/fill_voids): High speed binary_fill_holes.\n6. [xs3d](https://github.com/seung-lab/cross-section): Cross section analysis of 3D images.\n\nThis module was originally designed to be used with CloudVolume and Igneous. \n\n1. 
[CloudVolume](https://github.com/seung-lab/cloud-volume): Serverless client for reading and writing petascale chunked images of neural tissue, meshes, and skeletons.\n2. [Igneous](https://github.com/seung-lab/igneous/tree/master/igneous): Distributed computation for visualizing connectomics datasets.  \n\nSome of the TEASAR modifications used in this package were first demonstrated by Alex Bae.\n\n1. [skeletonization](https://github.com/seung-lab/skeletonization): Python implementation of modified TEASAR for sparse labels.\n\n## Credits  \n\nAlex Bae developed the precursor skeletonization package and several modifications to TEASAR that we use in this package. Alex also developed the postprocessing approach used for stitching skeletons using 50% overlap. Will Silversmith adapted these techniques for mass production, refined several basic algorithms for handling thousands of labels at once, and rewrote them into the Kimimaro package. Will added trickle DAF, zero weighted previously explored paths, and fixing borders to the algorithm. A.M. Wilson and Will designed the nucleus/soma \"avocado\" fuser. Forrest Collman added parameter flexibility and helped tune DAF computation performance. Sven Dorkenwald and Forrest both provided helpful discussions and feedback. Peter Li redesigned the target selection algorithm to avoid bilinear performance on complex cells. \n\n## Acknowledgments  \n\nWe are grateful to our partners in the Seung Lab, the Allen Institute for Brain Science, and the Baylor College of Medicine for providing the data and problems necessitating this library.\n\nThis research was supported by the Intelligence Advanced Research Projects Activity (IARPA) via Department of Interior/ Interior Business Center (DoI/IBC) contract number D16PC0005, NIH/NIMH (U01MH114824, U01MH117072, RF1MH117815), NIH/NINDS (U19NS104648, R01NS104926), NIH/NEI (R01EY027036), and ARO (W911NF-12-1-0594). The U.S. 
Government is authorized to reproduce and distribute reprints for Governmental purposes notwithstanding any copyright annotation thereon. Disclaimer: The views and conclusions contained herein are those of the authors and should not be interpreted as necessarily representing the official policies or endorsements, either expressed or implied, of IARPA, DoI/IBC, or the U.S. Government. We are grateful for assistance from Google, Amazon, and Intel.\n\n## Papers Using Kimimaro\n\nPlease cite Kimimaro using the CITATION.cff file located in this repository.\n\nThe below list is not comprehensive and is sourced from collaborators or found using internet searches and does not constitute an endorsement except to the extent that they used it for their work. \n\n1. A.M. Wilson, R. Schalek, A. Suissa-Peleg, T.R. Jones, S. Knowles-Barley, H. Pfister, J.M. Lichtman. \"Developmental Rewiring between Cerebellar Climbing Fibers and Purkinje Cells Begins with Positive Feedback Synapse Addition\". Cell Reports. Vol. 29, Iss. 9, November 2019. Pgs. 2849-2861.e6 doi: 10.1016/j.celrep.2019.10.081  ([link](https://www.cell.com/cell-reports/fulltext/S2211-1247(19)31403-2))\n2. S. Dorkenwald, N.L. Turner, T. Macrina, K. Lee, R. Lu, J. Wu, A.L. Bodor, A.A. Bleckert, D. Brittain, N. Kemnitz, W.M. Silversmith, D. Ih, J. Zung, A. Zlateski, I. Tartavull, S. Yu, S. Popovych, W. Wong, M. Castro, C. S. Jordan, A.M. Wilson, E. Froudarakis, J. Buchanan, M. Takeno, R. Torres, G. Mahalingam, F. Collman, C. Schneider-Mizell, D.J. Bumbarger, Y. Li, L. Becker, S. Suckow, J. Reimer, A.S. Tolias, N. Ma<span>&ccedil;</span>arico da Costa, R. C. Reid, H.S. Seung. \"Binary and analog variation of synapses between cortical pyramidal neurons\". bioRXiv. December 2019. doi: 10.1101/2019.12.29.890319 ([link](https://www.biorxiv.org/content/10.1101/2019.12.29.890319v1.full))  \n3. N.L. Turner, T. Macrina, J.A. Bae, R. Yang, A.M. Wilson, C. Schneider-Mizell, K. Lee, R. Lu, J. Wu, A.L. Bodor, A.A. Bleckert, D. 
Brittain, E. Froudarakis, S. Dorkenwald, F. Collman, N. Kemnitz, D. Ih, W.M. Silversmith, J. Zung, A. Zlateski, I. Tartavull, S. Yu, S. Popovych, S. Mu, W. Wong, C.S. Jordan, M. Castro, J. Buchanan, D.J. Bumbarger, M. Takeno, R. Torres, G. Mahalingam, L. Elabbady, Y. Li, E. Cobos, P. Zhou, S. Suckow, L. Becker, L. Paninski, F. Polleux, J. Reimer, A.S. Tolias, R.C. Reid, N. Ma<span>&ccedil;</span>arico da Costa, H.S. Seung. \"Multiscale and multimodal reconstruction of cortical structure and function\".\nbioRxiv. October 2020; doi: 10.1101/2020.10.14.338681 ([link](https://www.biorxiv.org/content/10.1101/2020.10.14.338681v3))\n4. P.H. Li, L.F. Lindsey, M. Januszewski, Z. Zheng, A.S. Bates, I. Taisz, M. Tyka, M. Nichols, F. Li, E. Perlman, J. Maitin-Shepard, T. Blakely, L. Leavitt, G. S.X.E. Jefferis, D. Bock, V. Jain. \"Automated Reconstruction of a Serial-Section EM Drosophila Brain with Flood-Filling Networks and Local Realignment\". bioRxiv. October 2020. doi: 10.1101/605634  ([link](https://www.biorxiv.org/content/10.1101/605634v3))\n\n## References \n\n1. M. Sato, I. Bitter, M.A. Bender, A.E. Kaufman, and M. Nakajima. \"TEASAR: Tree-structure Extraction Algorithm for Accurate and Robust Skeletons\". Proc. 8th Pacific Conf. on Computer Graphics and Applications. Oct. 2000. doi: 10.1109/PCCGA.2000.883951 ([link](https://ieeexplore.ieee.org/abstract/document/883951/))\n2. I. Bitter, A.E. Kaufman, and M. Sato. \"Penalized-distance volumetric skeleton algorithm\". IEEE Transactions on Visualization and Computer Graphics Vol. 7, Iss. 3, Jul-Sep 2001. doi: 10.1109/2945.942688 ([link](https://ieeexplore.ieee.org/abstract/document/942688/))\n3. T. Zhao, S. Plaza. \"Automatic Neuron Type Identification by Neurite Localization in the Drosophila Medulla\". Sept. 2014. arXiv:1409.1892 \\[q-bio.NC\\] ([link](https://arxiv.org/abs/1409.1892))\n4. A. Tagliasacchi, T. Delame, M. Spagnuolo, N. Amenta, A. Telea. \"3D Skeletons: A State-of-the-Art Report\". May 2016. 
Computer Graphics Forum. Vol. 35, Iss. 2. doi: 10.1111/cgf.12865 ([link](https://onlinelibrary.wiley.com/doi/full/10.1111/cgf.12865))\n5. P. Li, L. Lindsey, M. Januszewski, Z. Zheng, A. Bates, I. Taisz, M. Tyka, M. Nichols, F. Li, E. Perlman, J. Maitin-Shepard, T. Blakely, L. Leavitt, G. Jefferis, D. Bock, V. Jain. \"Automated Reconstruction of a Serial-Section EM Drosophila Brain with Flood-Filling Networks and Local Realignment\". April 2019. bioRXiv. doi: 10.1101/605634 ([link](https://www.biorxiv.org/content/10.1101/605634v1))\n6. M.M. McKerns, L. Strand, T. Sullivan, A. Fang, M.A.G. Aivazis, \"Building a framework for predictive science\", Proceedings of the 10th Python in Science Conference, 2011; http://arxiv.org/pdf/1202.1056\n7. Michael McKerns and Michael Aivazis, \"pathos: a framework for heterogeneous computing\", 2010- ; http://trac.mystic.cacr.caltech.edu/project/pathos\n"
  },
  {
    "path": "automated_test.py",
    "content": "import pytest\n\nimport edt\nimport numpy as np\nfrom osteoid import Skeleton\n\nimport kimimaro.intake\nimport kimimaro.post\nimport kimimaro.skeletontricks\nfrom kimimaro.utility import moving_average, cross_sectional_area\n\n@pytest.fixture\ndef connectomics_data():\n  import crackle\n  return crackle.load(\"benchmarks/connectomics.npy.ckl.gz\")\n\ndef test_empty_image():\n  labels = np.zeros( (256, 256, 256), dtype=bool)  \n  skels = kimimaro.skeletonize(labels, fix_borders=True)\n\n  assert len(skels) == 0\n\ndef test_very_sparse_image():\n  labels = np.zeros( (64, 64, 64), dtype=bool)  \n  labels[5,5,5] = True\n  labels[6,5,5] = True\n  labels[20,20,20] = True \n  skels = kimimaro.skeletonize(labels, dust_threshold=0)\n  \n  # single voxels don't get skeletonized\n  assert len(skels) == 1\n\ndef test_solid_image():\n  labels = np.ones( (128, 128, 128), dtype=bool)  \n  skels = kimimaro.skeletonize(labels, fix_borders=True)\n\n  assert len(skels) == 1\n\ndef test_binary_image():\n  labels = np.ones( (256, 256, 3), dtype=bool)\n  labels[-1,0] = 0\n  labels[0,-1] = 0\n  \n  skels = kimimaro.skeletonize(labels, fix_borders=False)\n\n  assert len(skels) == 1\n\n@pytest.mark.parametrize('fill_holes', (True, False))\ndef test_square(fill_holes):\n  labels = np.ones( (1000, 1000), dtype=np.uint8)\n  labels[-1,0] = 0\n  labels[0,-1] = 0\n  \n  teasar_params = {\n    \"scale\": 1.5, \n    \"const\": 300,\n    \"pdrf_scale\": 100000,\n    \"pdrf_exponent\": 4,\n    \"soma_acceptance_threshold\": 3500,\n    \"soma_detection_threshold\": 750,\n    \"soma_invalidation_const\": 300,\n    \"soma_invalidation_scale\": 2\n  }\n\n  skels = kimimaro.skeletonize(labels, teasar_params=teasar_params, fix_borders=False, fill_holes=fill_holes)\n\n  assert len(skels) == 1\n\n  skel = skels[1]\n  assert skel.vertices.shape[0] == 1000\n  assert skel.edges.shape[0] == 999\n  assert abs(skel.cable_length() - 999 * np.sqrt(2)) < 0.001\n  assert skel.space == 'physical'\n\n  
labels = np.ones( (1000, 1000), dtype=np.uint8)\n  labels[0,0] = 0\n  labels[-1,-1] = 0\n\n  skels = kimimaro.skeletonize(labels, teasar_params=teasar_params, fix_borders=False, fill_holes=fill_holes)\n\n  assert len(skels) == 1\n\n  skel = skels[1]\n  assert skel.vertices.shape[0] == 1000\n  assert skel.edges.shape[0] == 999\n  assert abs(skel.cable_length() - 999 * np.sqrt(2)) < 0.001\n  assert skel.space == 'physical'\n\ndef test_cube():\n  labels = np.ones( (128, 128, 128), dtype=np.uint8)\n  labels[0, 0, 0] = 0\n  labels[-1, -1, -1] = 0\n  \n  skels = kimimaro.skeletonize(labels, fix_borders=False)\n\n  assert len(skels) == 1\n\n  skel = skels[1]\n  assert skel.vertices.shape[0] == 128\n  assert skel.edges.shape[0] == 127\n  assert abs(skel.cable_length() - 127 * np.sqrt(3)) < 0.001\n  assert skel.space == 'physical'\n\ndef test_find_border_targets():\n  labels = np.zeros( (257, 257), dtype=np.uint8)\n  labels[1:-1,1:-1] = 1 \n\n  dt = edt.edt(labels)\n  targets = kimimaro.skeletontricks.find_border_targets(\n    dt, labels.astype(np.uint32), wx=100, wy=100\n  )\n\n  assert len(targets) == 1\n  assert targets[1] == (128, 128)\n\ndef test_fix_borders_z():\n  labels = np.zeros((256, 256, 256), dtype=np.uint8)\n  labels[ 64:196, 64:196, : ] = 128\n\n  skels = kimimaro.skeletonize(\n    labels,\n    teasar_params={\n      'const': 250,\n      'scale': 10,\n      'pdrf_exponent': 4,\n      'pdrf_scale': 100000,\n    }, \n    anisotropy=(40,32,20),\n    object_ids=None, \n    dust_threshold=1000, \n    progress=True, \n    fix_branching=True, \n    in_place=False, \n    fix_borders=True\n  )\n\n  skel = skels[128]\n\n  assert skel.space == 'physical'\n  skel = skel.voxel_space()\n\n  assert np.all(skel.vertices[:,0] == 129)\n  assert np.all(skel.vertices[:,1] == 129)\n  assert np.all(skel.vertices[:,2] == np.arange(256))\n  assert skel.space == 'voxel'\n\ndef test_fix_borders_x():\n  labels = np.zeros((256, 256, 256), dtype=np.uint8)\n  labels[ :, 64:196, 64:196 ] = 
128\n\n  skels = kimimaro.skeletonize(\n    labels,\n    teasar_params={\n      'const': 250,\n      'scale': 10,\n      'pdrf_exponent': 4,\n      'pdrf_scale': 100000,\n    }, \n    anisotropy=(1,1,1),\n    object_ids=None, \n    dust_threshold=1000, \n    progress=True, \n    fix_branching=True, \n    in_place=False, \n    fix_borders=True\n  )\n\n  skel = skels[128]\n\n  assert np.all(skel.vertices[:,0] == np.arange(256))\n  assert np.all(skel.vertices[:,1] == 129)\n  assert np.all(skel.vertices[:,2] == 129)\n\ndef test_fix_borders_y():\n  labels = np.zeros((256, 256, 256), dtype=np.uint8)\n  labels[ 64:196, :, 64:196 ] = 128\n\n  skels = kimimaro.skeletonize(\n    labels,\n    teasar_params={\n      'const': 250,\n      'scale': 10,\n      'pdrf_exponent': 4,\n      'pdrf_scale': 100000,\n    }, \n    anisotropy=(1,1,1),\n    object_ids=None, \n    dust_threshold=1000, \n    progress=True, \n    fix_branching=True, \n    in_place=False, \n    fix_borders=True\n  )\n\n  skel = skels[128]\n\n  assert np.all(skel.vertices[:,0] == 129)\n  assert np.all(skel.vertices[:,1] == np.arange(256))\n  assert np.all(skel.vertices[:,2] == 129)\n\ndef test_extra_targets():\n  labels = np.zeros((256, 256, 1), dtype=np.uint8)\n  labels[ 64:196, 64:196, : ] = 128\n\n  def skeletonize(labels, **kwargs):\n    return kimimaro.skeletonize(\n      labels,\n      teasar_params={\n        'const': 250,\n        'scale': 10,\n        'pdrf_exponent': 4,\n        'pdrf_scale': 100000,\n      }, \n      anisotropy=(1,1,1),\n      object_ids=None, \n      dust_threshold=1000, \n      progress=True, \n      fix_branching=True, \n      in_place=False, \n      fix_borders=True,\n      **kwargs\n    )[128]\n\n  skel1 = skeletonize(labels)\n  skel2 = skeletonize(labels, extra_targets_after=[ (65, 65, 0) ])\n\n  assert skel1.vertices.size < skel2.vertices.size\n\n  skel3 = skeletonize(labels, extra_targets_before=[ (65, 65, 0) ])\n\n  assert skel3.vertices.size < skel2.vertices.size\n\n\ndef 
test_parallel():\n  labels = np.zeros((256, 256, 128), dtype=np.uint8)\n  labels[ 0:128, 0:128, : ] = 1\n  labels[ 0:128, 128:256, : ] = 2\n  labels[ 128:256, 0:128, : ] = 3\n  labels[ 128:256, 128:256, : ] = 4\n\n  skels = kimimaro.skeletonize(\n    labels,\n    teasar_params={\n      'const': 250,\n      'scale': 10,\n      'pdrf_exponent': 4,\n      'pdrf_scale': 100000,\n    }, \n    anisotropy=(1,1,1),\n    object_ids=None, \n    dust_threshold=1000, \n    progress=True, \n    fix_branching=True, \n    in_place=False, \n    fix_borders=True,\n    parallel=2,\n  )\n\n  assert len(skels) == 4\n\ndef test_dimensions():\n  labels = np.zeros((10,), dtype=np.uint8)\n  skel = kimimaro.skeletonize(labels)\n\n  labels = np.zeros((10,10), dtype=np.uint8)\n  skel = kimimaro.skeletonize(labels)\n\n  labels = np.zeros((10,10,10), dtype=np.uint8)\n  skel = kimimaro.skeletonize(labels)\n\n  labels = np.zeros((10,10,10,1), dtype=np.uint8)\n  skel = kimimaro.skeletonize(labels)\n\n  try:\n    labels = np.zeros((10,10,10,2), dtype=np.uint8)\n    skel = kimimaro.skeletonize(labels)\n    assert False\n  except kimimaro.DimensionError:\n    pass\n\n@pytest.mark.parametrize('axis', ('x','y'))\ndef test_joinability(axis):\n  def skeletionize(labels, fix_borders):\n    return kimimaro.skeletonize(\n      labels,\n      teasar_params={\n        'const': 10,\n        'scale': 10,\n        'pdrf_exponent': 4,\n        'pdrf_scale': 100000,\n      }, \n      anisotropy=(1,1,1),\n      object_ids=None, \n      dust_threshold=0, \n      progress=True, \n      fix_branching=True, \n      in_place=False, \n      fix_borders=fix_borders,\n      parallel=1,\n    )\n\n  labels = np.zeros((256, 256, 20), dtype=np.uint8)\n\n  if axis == 'x':\n    lslice = np.s_[ 32:160, :, : ]\n  elif axis == 'y':\n    lslice = np.s_[ :, 32:160, : ]\n\n  labels = np.zeros((256, 256, 20), dtype=np.uint8)\n  labels[lslice] = 1\n\n  skels1 = skeletionize(labels[:,:,:10], True)\n  skels1 = skels1[1]\n\n  skels2 = 
skeletionize(labels[:,:,9:], True)\n  skels2 = skels2[1]\n  skels2.vertices[:,2] += 9\n\n  skels_fb = skels1.merge(skels2)\n  assert len(skels_fb.components()) == 1\n\n  skels1 = skeletionize(labels[:,:,:10], False)\n  skels1 = skels1[1]\n\n  skels2 = skeletionize(labels[:,:,9:], False)\n  skels2 = skels2[1]\n  skels2.vertices[:,2] += 9\n\n  skels = skels1.merge(skels2)\n  # Ususally this results in 2 connected components,\n  # but random variation in how fp is handled can \n  # result in a merge near the tails.\n  assert not Skeleton.equivalent(skels, skels_fb)\n\ndef test_find_cycle():\n  edges = np.array([\n    [0, 1],\n    [1, 2],\n    [2, 0],\n    [2, 3],\n    [2, 4]\n  ], dtype=np.int32)\n\n  cycle = kimimaro.skeletontricks.find_cycle(edges)\n\n  assert np.all(cycle == np.array([0, 2, 1, 0]))\n\n  edges = np.array([\n    [0, 1],\n    [1, 2],\n    [2, 3],\n    [3, 4], [4, 10], [10, 11], [11, 12], [12, 2],\n    [4, 5],\n    [5, 6],\n    [6, 7],\n  ], dtype=np.int32)\n\n  cycle = kimimaro.skeletontricks.find_cycle(edges)\n  \n  assert np.all(cycle == np.array([\n    2, 12, 11, 10, 4, 3, 2\n  ]))\n\n  # two loops\n  edges = np.array([\n    [0, 1], [0, 20], [20, 21], [21, 22], [22, 23], [23, 21],\n    [1, 2],\n    [2, 3],\n    [3, 4],\n    [4, 5],\n    [5, 6],\n    [6, 7], [7, 10], [10, 11], [11, 6]\n  ], dtype=np.int32)\n\n  cycle = kimimaro.skeletontricks.find_cycle(edges)\n  \n  assert np.all(cycle == np.array([\n    21, 23, 22, 21\n  ])) or np.all(cycle == np.array([ \n    6, 11, 10, 7, 6 \n  ]))\n\n\ndef test_join_close_components_simple():\n  skel = Skeleton([ \n      (0,0,0), (1,0,0), (10,0,0), (11, 0, 0)\n    ], \n    edges=[ (0,1), (2,3) ],\n    radii=[ 0, 1, 2, 3 ],\n    vertex_types=[ 0, 1, 2, 3 ],\n    segid=1337,\n  )\n\n  assert len(skel.components()) == 2\n\n  res = kimimaro.join_close_components(skel, radius=np.inf)\n  assert len(res.components()) == 1\n\n  res = kimimaro.join_close_components(skel, radius=9)\n  assert len(res.components()) == 1\n  
assert np.all(res.edges == [[0,1], [1,2], [2,3]])\n\n  res = kimimaro.join_close_components(skel, radius=8.5)\n  assert len(res.components()) == 2\n\ndef test_join_close_components_complex():\n  skel = Skeleton([ \n      (0,0,0), (1,0,0),    (4,0,0), (6,0,0),        (20,0,0), (21, 0, 0),\n      \n\n      (0,0,5), \n      (0,0,10),\n    ], \n    edges=[ (0,1), (2,3), (4,5), (6,7) ],\n  )\n\n  assert len(skel.components()) == 4\n\n  res = kimimaro.join_close_components(skel, radius=np.inf)\n  assert len(res.components()) == 1\n\n  assert np.all(res.edges == [[0,1], [0,3], [1,2], [3,4], [4,5], [5,6], [6,7]])\n\ndef test_join_close_components_by_radius():\n  skel = Skeleton([ \n      (0,0,0), (1,0,0), (5,0,0), (11, 0, 0)\n    ], \n    edges=[ (0,1), (2,3) ],\n    radii=[ 100, 100, 100, 100 ],\n    vertex_types=[ 0, 1, 2, 3 ],\n    segid=1337,\n  )\n\n  res = kimimaro.join_close_components(skel, restrict_by_radius=False)\n  assert len(res.components()) == 1\n  assert np.all(res.edges == [[0,1], [1,2], [2,3]])\n\n  res = kimimaro.join_close_components(skel, restrict_by_radius=True)\n  assert len(res.components()) == 1\n  assert np.all(res.edges == [[0,1], [1,2], [2,3]])\n\n  skel.radii = np.array([1,1,1,1], dtype=np.float32)\n  res = kimimaro.join_close_components(skel, restrict_by_radius=True)\n  assert len(res.components()) == 2\n  assert np.all(res.edges == [[0,1], [2,3]])\n\n  skel.radii = np.array([1,0.9,3,1], dtype=np.float32)\n  res = kimimaro.join_close_components(skel, restrict_by_radius=True)\n  assert len(res.components()) == 2\n  assert np.all(res.edges == [[0,1], [2,3]])\n\n  skel.radii = np.array([1,1,3,1], dtype=np.float32)\n  res = kimimaro.join_close_components(skel, restrict_by_radius=True)\n  assert len(res.components()) == 1\n  assert np.all(res.edges == [[0,1], [1,2], [2,3]])\n\n\ndef test_fill_all_holes():\n  labels = np.zeros((64, 32, 32), dtype=np.uint32)\n\n  labels[0:32,:,:] = 1\n  labels[32:64,:,:] = 8\n\n  noise = np.random.randint(low=1, 
high=8, size=(30, 30, 30))\n  labels[1:31,1:31,1:31] = noise\n\n  noise = np.random.randint(low=8, high=11, size=(30, 30, 30))\n  labels[33:63,1:31,1:31] = noise\n\n  noise_labels = np.unique(labels)\n  assert set(noise_labels) == set([1,2,3,4,5,6,7,8,9,10])\n\n  result = kimimaro.intake.fill_all_holes(labels)\n\n  filled_labels = np.unique(result)\n  assert set(filled_labels) == set([1,8])\n\ndef test_fix_avocados():\n  labels = np.zeros((256, 256, 256), dtype=np.uint32)\n\n  # fake clipped avocado\n  labels[:50, :40, :30] = 1 \n  labels[:25, :20, :25] = 2\n\n  # double avocado\n  labels[50:100, 40:100, 30:80] = 3\n  labels[60:90, 50:90, 40:70] = 4\n  labels[60:70, 51:89, 41:69] = 5\n\n  # not an avocado\n  labels[200:,200:,200:] = 6 # not a pit\n  labels[150:200,200:,200:] = 7 # not a fruit\n\n  fn = lambda lbls: edt.edt(lbls)\n  dt = fn(labels)\n\n  labels, dbf, remapping = kimimaro.intake.engage_avocado_protection(\n    labels, dt, { 1:1, 2:2, 3:3, 4:4, 5:5, 6:6, 7:7 },\n    soma_detection_threshold=1, \n    edtfn=fn, \n    progress=True\n  )\n\n  uniq = set(np.unique(labels))\n  assert uniq == set([0, 1, 2, 3, 4]) # 0,2,5 renumbered\n  assert np.all(labels[:50, :40, :30] == 1)\n  assert np.all(labels[50:100, 40:100, 30:80] == 2)\n  assert np.all(labels[150:200,200:,200:] == 3)\n  assert np.all(labels[200:,200:,200:] == 4)\n\n\ndef test_cross_sectional_area():\n  labels = np.ones((100,3,3), dtype=bool, order=\"F\")\n\n  vertices = np.array([\n    [x,1,1] for x in range(labels.shape[0])\n  ])\n\n  edges = np.array([\n    [x,x+1] for x in range(labels.shape[0] - 1)\n  ])\n\n  skel = Skeleton(vertices, edges, segid=1)\n  skel = kimimaro.cross_sectional_area(labels, skel, smoothing_window=5)\n\n  assert len(skel.cross_sectional_area == 100)\n  assert np.all(skel.cross_sectional_area == 9)\n\n\ndef test_moving_average():\n\n  data = np.array([])\n  assert np.all(moving_average(data, 1) == data)\n  assert np.all(moving_average(data, 2) == data)\n\n  data = 
np.array([1,1,1,1,1,1,1,1,1,1,1])\n  assert np.all(moving_average(data, 1) == data)\n\n  data = np.array([1,1,1,1,1,1,1,1,1,1,1,1])\n  assert np.all(moving_average(data, 1) == data)\n\n  data = np.array([1,1,1,1,1,10,1,1,1,1,1])\n  assert np.all(moving_average(data, 1) == data)\n\n  data = np.array([1,1,1,1,1,1,1,1,1,1,1])\n  assert np.all(moving_average(data, 2) == data)\n\n  data = np.array([0,1,1,1,1,1,1,1,1,1,0])\n  ans = np.array([\n    0,0.5,1,1,1,1,1,1,1,1,0.5\n  ])\n  assert np.all(moving_average(data, 2) == ans)\n\n  data = np.array([0,1,1,1,1,1,1,1,1,1,0])\n  ans = np.array([\n    1/3,1/3,2/3,1,1,1,1,1,1,1,2/3\n  ])\n  res = moving_average(data, 3)\n  assert np.all(res == ans)\n  assert len(ans) == len(data)\n\ndef test_no_fix_branching(connectomics_data):\n  kimimaro.skeletonize(connectomics_data[:,:,100], fix_branching=False)\n\n\ndef test_remove_row():\n  arr = np.array([\n    [0,1],\n    [1,2],\n    [2,1],\n    [2,2],\n    [2,3],\n    [3,4],\n  ])\n\n  result = kimimaro.post.remove_row(arr, np.array([[1,2]]))\n\n  assert np.all(result == np.array([[0,1],[2,2],[2,3],[3,4]]))\n\n  arr = np.array([\n    []\n  ])\n\n  result = kimimaro.post.remove_row(arr, np.array([[1,2]]))\n\n  assert np.all(result == np.array([]))\n\ndef test_cross_sectional_area():\n  labels = np.ones([100,100,100], dtype=np.uint8)\n  skel = kimimaro.skeletonize(labels, teasar_params={\n    \"pdrf_exponent\": 16,\n\n  })[1]\n\n  xsa_1 = cross_sectional_area(labels, skel, step=1).cross_sectional_area\n  xsa_10 = cross_sectional_area(labels, skel, step=10).cross_sectional_area\n\n  assert np.all(xsa_1[xsa_10 == 0] != xsa_10[xsa_10 == 0])\n  assert np.all(xsa_1[xsa_10 > 0] == xsa_10[xsa_10 > 0])\n  assert np.any(xsa_1 == 10000)\n\n  terminals = skel.terminals()\n  assert np.all(xsa_10[terminals] > 0)\n  assert np.all(xsa_10[terminals] > 0)\n\n  try:\n    cross_sectional_area(labels, skel, step=-1)\n  except AssertionError:\n    pass\n  \ndef test_postprocess():\n  skel = Skeleton([ \n    
  (0,0,0), (1,0,0),    (4,0,0), (6,0,0),        (20,0,0), (21, 0, 0),\n      \n\n      (0,0,5), \n      (0,0,10),\n    ], \n    edges=[ (0,1), (2,3), (4,5), (6,7), (0,7), (1,6) ],\n  )\n\n  res_skel = kimimaro.post.postprocess(skel, dust_threshold=0, tick_threshold=0)\n\n  ans = Skeleton([ \n      (4,0,0), (6,0,0),        (20,0,0), (21, 0, 0),\n    ], \n    edges=[ (0,1), (2,3) ],\n  )\n\n  assert Skeleton.equivalent(res_skel, ans)\n\n\n\n\n"
  },
  {
    "path": "benchmarks/README.md",
    "content": "Benchmarks\n==========\n\nTo open `connectomics.npy.ckl.gz` you must use [`crackle-codec`](https://github.com/seung-lab/crackle).\n\nExcept where noted, these benchmarks were executed on an 2.8 GHz Dual-Core Intel Core i7 with 1600 MHz DDR3 RAM. The data source used was `connectomics.npy` which can be found in this repository. `connectomics.npy` is a 32-bit 512x512x512 cutout of mouse visual cortex at 16nm x 16nm x 40nm resolution that contains 2124 connected components including a partial cell body and a large glia fragment.\n\nBelow, we compared the run time and peak memory usage of Kimimaro across many versions that contained performance significant updates. Due to the annoying length of each run, each value represents a single run, so there is some random perturbation around the true mean that can obscure the value of small improvements. Version 0.4.2 can be considered the first \"feature complete\" version that includes quality improvements like fix_branches, fix_borders, and a reasonable root selected for the cell body.\n\n<p style=\"font-style: italics;\" align=\"center\">\n<img height=512 src=\"https://raw.githubusercontent.com/seung-lab/kimimaro/master/benchmarks/kimimaro-execution-time-by-version.png\" alt=\"Kimimaro Execution Time by Version on connectomics.npy\" /><br>\nFig. 1: Kimimaro Execution Time by Version on `connectomics.npy`\n</p>\n\n<p style=\"font-style: italics;\" align=\"center\">\n<img height=512 src=\"https://raw.githubusercontent.com/seung-lab/kimimaro/master/benchmarks/kimimaro-peak-memory-usage-by-version.png\" alt=\"Kimimaro Peak Memory Usage by Version on connectomics.npy\" /><br>\nFig. 2: Kimimaro Peak Memory Usage by Version on `connectomics.npy`\n</p>\n\n<p style=\"font-style: italics;\" align=\"center\">\n<img height=512 src=\"https://raw.githubusercontent.com/seung-lab/kimimaro/master/benchmarks/kimimaro-memory-profiles-0.1.0-3.0.0.png\" alt=\"Kimimaro Memory Profile Versions 0.3.1 vs. 3.0.0\" /><br>\nFig. 
3: Kimimaro Memory Profile Versions (blue) 0.3.1 (black) 3.0.0. The first hump on the left is processing a soma. The second hump is a glia.\n</p>\n\n\n"
  },
  {
    "path": "benchmarks/benchmark.py",
    "content": "import time\nimport numpy as np\nimport kimimaro\nimport crackle\nimport pickle\n\nlabels = crackle.load(\"connectomics.npy.ckl.gz\")\n\ns = time.time()\nskels = kimimaro.skeletonize(\n  labels, \n  teasar_params={\n    'scale': 1.5,\n    'const': 300, # physical units\n    'pdrf_exponent': 4,\n    'pdrf_scale': 100000,\n    'soma_detection_threshold': 1100, # physical units\n    'soma_acceptance_threshold': 3500, # physical units\n    'soma_invalidation_scale': 1.0,\n    'soma_invalidation_const': 300, # physical units\n    # 'max_paths': 50, # default None\n  },\n  # object_ids=[ ], # process only the specified labels\n  # extra_targets_before=[ (27,33,100), (44,45,46) ], # target points in voxels\n  # extra_targets_after=[ (27,33,100), (44,45,46) ], # target points in voxels\n  # dust_threshold=1000, # skip connected components with fewer than this many voxels\n  anisotropy=(16,16,40), # default True\n  # fix_branching=True, # default True\n  # fix_borders=True, # default True\n  # fill_holes=False, # default False\n  # fix_avocados=False, # default False\n  progress=True, # default False, show progress bar\n  # parallel=1, # <= 0 all cpu, 1 single process, 2+ multiprocess\n  # parallel_chunk_size=100, # how many skeletons to process before updating progress bar\n)\nprint(time.time() - s)\n\n# with open(\"skels.pkl\", \"wb\") as f:\n#   pickle.dump(skels, f)\n\n# with open(\"skels.pkl\", \"rb\") as f:\n#   skels = pickle.load(f)\n\ns = time.time()\nskels = kimimaro.cross_sectional_area(\n  labels, skels,\n  anisotropy=(16,16,40),\n  smoothing_window=7,\n  progress=True,\n  step=1,\n)\nprint(f\"{time.time() - s:.3f}s\")"
  },
  {
    "path": "build_linux.sh",
    "content": "#!/bin/bash\n# Some dependencies don't support manylinux1\ndocker build . -f manylinux2010.Dockerfile --tag seunglab/kimimaro:manylinux2010\ndocker build . -f manylinux2014.Dockerfile --tag seunglab/kimimaro:manylinux2014\ndocker run -v $PWD/dist:/output seunglab/kimimaro:manylinux2010 /bin/bash -c \"cp -r wheelhouse/* /output\"\ndocker run -v $PWD/dist:/output seunglab/kimimaro:manylinux2014 /bin/bash -c \"cp -r wheelhouse/* /output\""
  },
  {
    "path": "ext/skeletontricks/dijkstra_invalidation.hpp",
    "content": "/*\n * This file is part of Kimimaro.\n * \n * Kimimaro is free software: you can redistribute it and/or modify\n * it under the terms of the GNU General Public License as published by\n * the Free Software Foundation, either version 3 of the License, or\n * (at your option) any later version.\n * \n * Kimimaro is distributed in the hope that it will be useful,\n * but WITHOUT ANY WARRANTY; without even the implied warranty of\n * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n * GNU General Public License for more details.\n * \n * You should have received a copy of the GNU General Public License\n * along with Kimimaro.  If not, see <https://www.gnu.org/licenses/>.\n *\n * \n * This algorithm is derived from dijkstra3d: \n * https://github.com/seung-lab/dijkstra3d\n *\n * Author: William Silversmith\n * Affiliation: Seung Lab, Princeton University\n * Date: May 2024\n */\n\n#ifndef DIJKSTRA_INVALIDATION_HPP\n#define DIJKSTRA_INVALIDATION_HPP\n\n#include <algorithm>\n#include <cmath>\n#include <cstdio>\n#include <cstdint>\n#include <functional>\n#include <memory>\n#include <queue>\n#include <vector>\n\n#include \"./libdivide.h\"\n\n#define NHOOD_SIZE 26\n\nnamespace dijkstra_invalidation {\n\n// helper function to compute 2D anisotropy (\"_s\" = \"square\")\ninline float _s(const float wa, const float wb) {\n  return std::sqrt(wa * wa + wb * wb);\n}\n\n// helper function to compute 3D anisotropy (\"_c\" = \"cube\")\ninline float _c(const float wa, const float wb, const float wc) {\n  return std::sqrt(wa * wa + wb * wb + wc * wc);\n}\n\nvoid connectivity_check(int connectivity) {\n  if (connectivity != 6 && connectivity != 18 && connectivity != 26) {\n    throw std::runtime_error(\"Only 6, 18, and 26 connectivities are supported.\");\n  }\n}\n\nvoid compute_neighborhood_helper_6(\n  int *neighborhood, \n  const int x, const int y, const int z,\n  const uint64_t sx, const uint64_t sy, const uint64_t sz\n) {\n\n  const int sxy = sx * 
sy;\n\n  // 6-hood\n  neighborhood[0] = -1 * (x > 0); // -x\n  neighborhood[1] = (x < (static_cast<int>(sx) - 1)); // +x\n  neighborhood[2] = -static_cast<int>(sx) * (y > 0); // -y\n  neighborhood[3] = static_cast<int>(sx) * (y < static_cast<int>(sy) - 1); // +y\n  neighborhood[4] = -sxy * static_cast<int>(z > 0); // -z\n  neighborhood[5] = sxy * (z < static_cast<int>(sz) - 1); // +z\n}\n\nvoid compute_neighborhood_helper_18(\n  int *neighborhood, \n  const int x, const int y, const int z,\n  const uint64_t sx, const uint64_t sy, const uint64_t sz\n) {\n  // 6-hood\n  compute_neighborhood_helper_6(neighborhood, x,y,z, sx,sy,sz);\n\n  // 18-hood\n\n  // xy diagonals\n  neighborhood[6] = (neighborhood[0] + neighborhood[2]) * (neighborhood[0] && neighborhood[2]); // up-left\n  neighborhood[7] = (neighborhood[0] + neighborhood[3]) * (neighborhood[0] && neighborhood[3]); // up-right\n  neighborhood[8] = (neighborhood[1] + neighborhood[2]) * (neighborhood[1] && neighborhood[2]); // down-left\n  neighborhood[9] = (neighborhood[1] + neighborhood[3]) * (neighborhood[1] && neighborhood[3]); // down-right\n\n  // yz diagonals\n  neighborhood[10] = (neighborhood[2] + neighborhood[4]) * (neighborhood[2] && neighborhood[4]); // up-left\n  neighborhood[11] = (neighborhood[2] + neighborhood[5]) * (neighborhood[2] && neighborhood[5]); // up-right\n  neighborhood[12] = (neighborhood[3] + neighborhood[4]) * (neighborhood[3] && neighborhood[4]); // down-left\n  neighborhood[13] = (neighborhood[3] + neighborhood[5]) * (neighborhood[3] && neighborhood[5]); // down-right\n\n  // xz diagonals\n  neighborhood[14] = (neighborhood[0] + neighborhood[4]) * (neighborhood[0] && neighborhood[4]); // up-left\n  neighborhood[15] = (neighborhood[0] + neighborhood[5]) * (neighborhood[0] && neighborhood[5]); // up-right\n  neighborhood[16] = (neighborhood[1] + neighborhood[4]) * (neighborhood[1] && neighborhood[4]); // down-left\n  neighborhood[17] = (neighborhood[1] + neighborhood[5]) * 
(neighborhood[1] && neighborhood[5]); // down-right\n}\n\nvoid compute_neighborhood_helper_26(\n  int *neighborhood, \n  const int x, const int y, const int z,\n  const uint64_t sx, const uint64_t sy, const uint64_t sz\n) {\n  compute_neighborhood_helper_18(neighborhood, x,y,z, sx,sy,sz);\n  \n  // 26-hood\n\n  // Now the eight corners of the cube\n  neighborhood[18] = (neighborhood[0] + neighborhood[2] + neighborhood[4]) * (neighborhood[2] && neighborhood[4]);\n  neighborhood[19] = (neighborhood[1] + neighborhood[2] + neighborhood[4]) * (neighborhood[2] && neighborhood[4]);\n  neighborhood[20] = (neighborhood[0] + neighborhood[3] + neighborhood[4]) * (neighborhood[3] && neighborhood[4]);\n  neighborhood[21] = (neighborhood[0] + neighborhood[2] + neighborhood[5]) * (neighborhood[2] && neighborhood[5]);\n  neighborhood[22] = (neighborhood[1] + neighborhood[3] + neighborhood[4]) * (neighborhood[3] && neighborhood[4]);\n  neighborhood[23] = (neighborhood[1] + neighborhood[2] + neighborhood[5]) * (neighborhood[2] && neighborhood[5]);\n  neighborhood[24] = (neighborhood[0] + neighborhood[3] + neighborhood[5]) * (neighborhood[3] && neighborhood[5]);\n  neighborhood[25] = (neighborhood[1] + neighborhood[3] + neighborhood[5]) * (neighborhood[3] && neighborhood[5]);\n}\n\ninline void compute_neighborhood(\n  int *neighborhood, \n  const int x, const int y, const int z,\n  const uint64_t sx, const uint64_t sy, const uint64_t sz,\n  const int connectivity = 26, const uint32_t* voxel_connectivity_graph = NULL) {\n\n  if (connectivity == 26) {\n    compute_neighborhood_helper_26(neighborhood, x, y, z, sx, sy, sz);\n  }\n  else if (connectivity == 18) {\n    compute_neighborhood_helper_18(neighborhood, x, y, z, sx, sy, sz);\n  }\n  else {\n    compute_neighborhood_helper_6(neighborhood, x, y, z, sx, sy, sz);\n  }\n\n  if (voxel_connectivity_graph == NULL) {\n    return;\n  }\n\n  uint64_t loc = x + sx * (y + sy * z);\n  uint32_t graph = voxel_connectivity_graph[loc];\n\n  // 
graph conventions are defined here:\n  // https://github.com/seung-lab/connected-components-3d/blob/3.2.0/cc3d_graphs.hpp#L73-L92\n\n  // 6-hood\n  neighborhood[0] *= ((graph & 0b000010) > 0); // -x\n  neighborhood[1] *= ((graph & 0b000001) > 0); // +x\n  neighborhood[2] *= ((graph & 0b001000) > 0); // -y\n  neighborhood[3] *= ((graph & 0b000100) > 0); // +y\n  neighborhood[4] *= ((graph & 0b100000) > 0); // -z\n  neighborhood[5] *= ((graph & 0b010000) > 0); // +z\n\n  // 18-hood\n\n  // xy diagonals\n  neighborhood[6] *= ((graph & 0b1000000000) > 0); // up-left -x,-y\n  neighborhood[7] *= ((graph & 0b0010000000) > 0); // up-right -x,+y\n  neighborhood[8] *= ((graph & 0b0100000000) > 0); // down-left +x,-y\n  neighborhood[9] *= ((graph & 0b0001000000) > 0); // down-right +x,+y\n\n  // yz diagonals\n  neighborhood[10] *= ((graph & 0b100000000000000000) > 0); // up-left -y,-z\n  neighborhood[11] *= ((graph & 0b000010000000000000) > 0); // up-right -y,+z\n  neighborhood[12] *= ((graph & 0b010000000000000000) > 0); // down-left +y,-z\n  neighborhood[13] *= ((graph & 0b000001000000000000) > 0); // down-right +y,+z\n\n  // xz diagonals\n  neighborhood[14] *= ((graph & 0b001000000000000000) > 0); // up-left, -x,-z\n  neighborhood[15] *= ((graph & 0b000000100000000000) > 0); // up-right, -x,+z\n  neighborhood[16] *= ((graph & 0b000100000000000000) > 0); // down-left +x,-z\n  neighborhood[17] *= ((graph & 0b000000010000000000) > 0); // down-right +x,+z\n\n  // 26-hood\n\n  // Now the eight corners of the cube\n  neighborhood[18] *= ((graph & 0b10000000000000000000000000) > 0); // -x,-y,-z\n  neighborhood[19] *= ((graph & 0b01000000000000000000000000) > 0); // +x,-y,-z\n  neighborhood[20] *= ((graph & 0b00100000000000000000000000) > 0); // -x,+y,-z\n  neighborhood[21] *= ((graph & 0b00001000000000000000000000) > 0); // -x,-y,+z\n  neighborhood[22] *= ((graph & 0b00010000000000000000000000) > 0); // +x,+y,-z\n  neighborhood[23] *= ((graph & 0b00000100000000000000000000) > 0); 
// +x,-y,+z\n  neighborhood[24] *= ((graph & 0b00000010000000000000000000) > 0); // -x,+y,+z\n  neighborhood[25] *= ((graph & 0b00000001000000000000000000) > 0); // +x,+y,+z\n}\n\n#define DIJKSTRA_3D_PREFETCH_26WAY(field, loc) \\\n  HEDLEYX_PREFETCH(reinterpret_cast<char*>(&field[(loc) - 1]), 0, 1); \\\n  HEDLEYX_PREFETCH(reinterpret_cast<char*>(&field[(loc) + sxy - 1]), 0, 1); \\\n  HEDLEYX_PREFETCH(reinterpret_cast<char*>(&field[(loc) - sxy - 1]), 0, 1); \\\n  HEDLEYX_PREFETCH(reinterpret_cast<char*>(&field[(loc) + sxy + sx - 1]), 0, 1); \\\n  HEDLEYX_PREFETCH(reinterpret_cast<char*>(&field[(loc) + sxy - sx - 1]), 0, 1); \\\n  HEDLEYX_PREFETCH(reinterpret_cast<char*>(&field[(loc) - sxy + sx - 1]), 0, 1); \\\n  HEDLEYX_PREFETCH(reinterpret_cast<char*>(&field[(loc) - sxy - sx - 1]), 0, 1); \\\n  HEDLEYX_PREFETCH(reinterpret_cast<char*>(&field[(loc) + sx - 1]), 0, 1); \\\n  HEDLEYX_PREFETCH(reinterpret_cast<char*>(&field[(loc) - sx - 1]), 0, 1);\n\nclass HeapDistanceNode {\npublic:\n  float dist;\n  uint64_t original_loc;\n  uint64_t value;\n  float max_dist;\n\n  HeapDistanceNode() {\n    dist = 0;\n    value = 0;\n    original_loc = 0;\n    max_dist = 0;\n  }\n\n  HeapDistanceNode (float d, uint64_t o_loc, uint64_t val, float mx_dist) {\n    dist = d;\n    value = val;\n    original_loc = o_loc;\n    max_dist = mx_dist;\n  }\n\n  HeapDistanceNode (const HeapDistanceNode &h) {\n    dist = h.dist;\n    value = h.value;\n    max_dist = h.max_dist;\n    original_loc = h.original_loc;\n  }\n};\n\nstruct HeapDistanceNodeCompare {\n  bool operator()(const HeapDistanceNode &t1, const HeapDistanceNode &t2) const {\n    return t1.dist >= t2.dist;\n  }\n};\n\nint64_t _roll_invalidation_ball(\n  uint8_t* field, // really a boolean field\n  const uint64_t sx, const uint64_t sy, const uint64_t sz, \n  const float wx, const float wy, const float wz, \n  const std::vector<uint64_t> &sources,\n  const std::vector<float> &max_distances,\n  const int connectivity = 26, \n  const 
uint32_t* voxel_connectivity_graph = NULL\n) {\n\n  const uint64_t sxy = sx * sy;\n\n  const libdivide::divider<uint64_t> fast_sx(sx); \n  const libdivide::divider<uint64_t> fast_sxy(sxy); \n\n  const bool power_of_two = !((sx & (sx - 1)) || (sy & (sy - 1))); \n  const int xshift = std::log2(sx); // must use log2 here, not lg/lg2 to avoid fp errors\n  const int yshift = std::log2(sy);\n\n  connectivity_check(connectivity);\n\n  int neighborhood[NHOOD_SIZE] = {};\n\n  std::priority_queue<\n    HeapDistanceNode, std::vector<HeapDistanceNode>, HeapDistanceNodeCompare\n  > queue;\n\n  for (uint64_t i = 0; i < sources.size(); i++) {\n    queue.emplace(0.0, sources[i], sources[i], max_distances[i]);\n  }\n\n  uint64_t loc;\n  uint64_t neighboridx;\n\n  int64_t x, y, z;\n  int64_t orig_x, orig_y, orig_z;\n\n  int64_t invalidated = 0;\n\n  auto xyzfn = [=](uint64_t l, int64_t& x, int64_t& y, int64_t& z) {\n    if (power_of_two) {\n      z = l >> (xshift + yshift);\n      y = (l - (z << (xshift + yshift))) >> xshift;\n      x = l - ((y + (z << yshift)) << xshift);\n    }\n    else {\n      z = l / fast_sxy;\n      y = (l - (z * sxy)) / fast_sx;\n      x = l - sx * (y + z * sy);\n    }\n  };\n\n  while (!queue.empty()) {\n    const float max_dist = queue.top().max_dist;\n    const uint64_t original_loc = queue.top().original_loc;\n    loc = queue.top().value;\n    queue.pop();\n\n    if (!field[loc]) {\n      continue;\n    }\n\n    field[loc] = 0;\n    invalidated++;\n\n    xyzfn(loc, x, y, z);\n    xyzfn(original_loc, orig_x, orig_y, orig_z);\n    compute_neighborhood(neighborhood, x, y, z, sx, sy, sz, connectivity, voxel_connectivity_graph);\n\n    for (int i = 0; i < connectivity; i++) {\n      if (neighborhood[i] == 0) {\n        continue;\n      }\n\n      neighboridx = loc + neighborhood[i];\n      if (field[neighboridx] == 0) {\n        continue;\n      }\n\n      xyzfn(neighboridx, x, y, z);\n      float new_dist = _c(\n        wx * static_cast<float>(x - orig_x), 
\n        wy * static_cast<float>(y - orig_y), \n        wz * static_cast<float>(z - orig_z)\n      );\n\n      if (new_dist < max_dist) { \n        queue.emplace(new_dist, original_loc, neighboridx, max_dist);\n      }\n    }\n  }\n\n  return invalidated;\n}\n\n};\n\n#undef NHOOD_SIZE\n#undef DIJKSTRA_3D_PREFETCH_26WAY\n\n#endif\n"
  },
  {
    "path": "ext/skeletontricks/libdivide.h",
    "content": "// libdivide.h - Optimized integer division\n// https://libdivide.com\n//\n// Copyright (C) 2010 - 2022 ridiculous_fish, <libdivide@ridiculousfish.com>\n// Copyright (C) 2016 - 2022 Kim Walisch, <kim.walisch@gmail.com>\n//\n// libdivide is dual-licensed under the Boost or zlib licenses.\n// You may use libdivide under the terms of either of these.\n// See LICENSE.txt for more details.\n\n#ifndef LIBDIVIDE_H\n#define LIBDIVIDE_H\n\n// *** Version numbers are auto generated - do not edit ***\n#define LIBDIVIDE_VERSION \"5.2.0\"\n#define LIBDIVIDE_VERSION_MAJOR 5\n#define LIBDIVIDE_VERSION_MINOR 2\n#define LIBDIVIDE_VERSION_PATCH 0\n\n#include <stdint.h>\n\n#if !defined(__AVR__) && __STDC_HOSTED__ != 0\n#include <stdio.h>\n#include <stdlib.h>\n#endif\n\n#if defined(_MSC_VER) && (defined(__cplusplus) && (__cplusplus >= 202002L)) || \\\n    (defined(_MSVC_LANG) && (_MSVC_LANG >= 202002L))\n#include <limits.h>\n#include <type_traits>\n#define LIBDIVIDE_VC_CXX20\n#endif\n\n#if defined(LIBDIVIDE_SSE2)\n#include <emmintrin.h>\n#endif\n\n#if defined(LIBDIVIDE_AVX2) || defined(LIBDIVIDE_AVX512)\n#include <immintrin.h>\n#endif\n\n#if defined(LIBDIVIDE_NEON)\n#include <arm_neon.h>\n#endif\n\n// Clang-cl prior to Visual Studio 2022 doesn't include __umulh/__mulh intrinsics\n#if defined(_MSC_VER) && (!defined(__clang__) || _MSC_VER > 1930) && \\\n    (defined(_M_X64) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC))\n#define LIBDIVIDE_MULH_INTRINSICS\n#endif\n\n#if defined(_MSC_VER)\n#if defined(LIBDIVIDE_MULH_INTRINSICS) || !defined(__clang__)\n#include <intrin.h>\n#endif\n#ifndef __clang__\n#pragma warning(push)\n// 4146: unary minus operator applied to unsigned type, result still unsigned\n#pragma warning(disable : 4146)\n\n// 4204: nonstandard extension used : non-constant aggregate initializer\n#pragma warning(disable : 4204)\n#endif\n#define LIBDIVIDE_VC\n#endif\n\n#if !defined(__has_builtin)\n#define __has_builtin(x) 
0\n#endif\n\n#if defined(__SIZEOF_INT128__)\n#define HAS_INT128_T\n// clang-cl on Windows does not yet support 128-bit division\n#if !(defined(__clang__) && defined(LIBDIVIDE_VC))\n#define HAS_INT128_DIV\n#endif\n#endif\n\n#if defined(__x86_64__) || defined(_M_X64)\n#define LIBDIVIDE_X86_64\n#endif\n\n#if defined(__i386__)\n#define LIBDIVIDE_i386\n#endif\n\n#if defined(__GNUC__) || defined(__clang__)\n#define LIBDIVIDE_GCC_STYLE_ASM\n#endif\n\n#if defined(__cplusplus) || defined(LIBDIVIDE_VC)\n#define LIBDIVIDE_FUNCTION __FUNCTION__\n#else\n#define LIBDIVIDE_FUNCTION __func__\n#endif\n\n// Set up forced inlining if possible.\n// We need both the attribute and keyword to avoid \"might not be inlineable\" warnings.\n#ifdef __has_attribute\n#if __has_attribute(always_inline)\n#define LIBDIVIDE_INLINE __attribute__((always_inline)) inline\n#endif\n#endif\n#ifndef LIBDIVIDE_INLINE\n#ifdef _MSC_VER\n#define LIBDIVIDE_INLINE __forceinline\n#else\n#define LIBDIVIDE_INLINE inline\n#endif\n#endif\n\n#if defined(__AVR__) || __STDC_HOSTED__ == 0\n#define LIBDIVIDE_ERROR(msg)\n#else\n#define LIBDIVIDE_ERROR(msg)                                                                     \\\n    do {                                                                                         \\\n        fprintf(stderr, \"libdivide.h:%d: %s(): Error: %s\\n\", __LINE__, LIBDIVIDE_FUNCTION, msg); \\\n        abort();                                                                                 \\\n    } while (0)\n#endif\n\n#if defined(LIBDIVIDE_ASSERTIONS_ON) && !defined(__AVR__) && __STDC_HOSTED__ != 0\n#define LIBDIVIDE_ASSERT(x)                                                           \\\n    do {                                                                              \\\n        if (!(x)) {                                                                   \\\n            fprintf(stderr, \"libdivide.h:%d: %s(): Assertion failed: %s\\n\", __LINE__, \\\n                
LIBDIVIDE_FUNCTION, #x);                                              \\\n            abort();                                                                  \\\n        }                                                                             \\\n    } while (0)\n#else\n#define LIBDIVIDE_ASSERT(x)\n#endif\n\n#ifdef __cplusplus\n\n// For constexpr zero initialization, c++11 might handle things ok,\n// but just limit to at least c++14 to ensure we don't break anyone's code:\n\n// Use https://en.cppreference.com/w/cpp/feature_test#cpp_constexpr\n#if defined(__cpp_constexpr) && (__cpp_constexpr >= 201304L)\n#define LIBDIVIDE_CONSTEXPR constexpr LIBDIVIDE_INLINE\n\n// Supposedly, MSVC might not implement feature test macros right:\n// https://stackoverflow.com/questions/49316752/feature-test-macros-not-working-properly-in-visual-c\n// so check that _MSVC_LANG corresponds to at least c++14, and _MSC_VER corresponds to at least VS\n// 2017 15.0 (for extended constexpr support:\n// https://learn.microsoft.com/en-us/cpp/overview/visual-cpp-language-conformance?view=msvc-170)\n#elif (defined(_MSC_VER) && _MSC_VER >= 1910) && (defined(_MSVC_LANG) && _MSVC_LANG >= 201402L)\n#define LIBDIVIDE_CONSTEXPR constexpr LIBDIVIDE_INLINE\n\n#else\n#define LIBDIVIDE_CONSTEXPR LIBDIVIDE_INLINE\n#endif\n\nnamespace libdivide {\n#endif\n\n#if defined(_MSC_VER) && !defined(__clang__)\n#if defined(LIBDIVIDE_VC_CXX20)\nstatic LIBDIVIDE_CONSTEXPR int __builtin_clz(unsigned x) {\n    if (std::is_constant_evaluated()) {\n        for (int i = 0; i < sizeof(x) * CHAR_BIT; ++i) {\n            if (x >> (sizeof(x) * CHAR_BIT - 1 - i)) return i;\n        }\n        return sizeof(x) * CHAR_BIT;\n    }\n#else\nstatic LIBDIVIDE_INLINE int __builtin_clz(unsigned x) {\n#endif\n#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC)\n    return (int)_CountLeadingZeros(x);\n#elif defined(__AVX2__) || defined(__LZCNT__)\n    return (int)_lzcnt_u32(x);\n#else\n    
unsigned long r;\n    _BitScanReverse(&r, x);\n    return (int)(r ^ 31);\n#endif\n}\n\n#if defined(LIBDIVIDE_VC_CXX20)\nstatic LIBDIVIDE_CONSTEXPR int __builtin_clzll(unsigned long long x) {\n    if (std::is_constant_evaluated()) {\n        for (int i = 0; i < sizeof(x) * CHAR_BIT; ++i) {\n            if (x >> (sizeof(x) * CHAR_BIT - 1 - i)) return i;\n        }\n        return sizeof(x) * CHAR_BIT;\n    }\n#else\nstatic LIBDIVIDE_INLINE int __builtin_clzll(unsigned long long x) {\n#endif\n#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC)\n    return (int)_CountLeadingZeros64(x);\n#elif defined(_WIN64)\n#if defined(__AVX2__) || defined(__LZCNT__)\n    return (int)_lzcnt_u64(x);\n#else\n    unsigned long r;\n    _BitScanReverse64(&r, x);\n    return (int)(r ^ 63);\n#endif\n#else\n    int l = __builtin_clz((unsigned)x) + 32;\n    int h = __builtin_clz((unsigned)(x >> 32));\n    return !!((unsigned)(x >> 32)) ? h : l;\n#endif\n}\n#endif // defined(_MSC_VER) && !defined(__clang__)\n\n// pack divider structs to prevent compilers from padding.\n// This reduces memory usage by up to 43% when using a large\n// array of libdivide dividers and improves performance\n// by up to 10% because of reduced memory bandwidth.\n#pragma pack(push, 1)\n\nstruct libdivide_u16_t {\n    uint16_t magic;\n    uint8_t more;\n};\n\nstruct libdivide_s16_t {\n    int16_t magic;\n    uint8_t more;\n};\n\nstruct libdivide_u32_t {\n    uint32_t magic;\n    uint8_t more;\n};\n\nstruct libdivide_s32_t {\n    int32_t magic;\n    uint8_t more;\n};\n\nstruct libdivide_u64_t {\n    uint64_t magic;\n    uint8_t more;\n};\n\nstruct libdivide_s64_t {\n    int64_t magic;\n    uint8_t more;\n};\n\nstruct libdivide_u16_branchfree_t {\n    uint16_t magic;\n    uint8_t more;\n};\n\nstruct libdivide_s16_branchfree_t {\n    int16_t magic;\n    uint8_t more;\n};\n\nstruct libdivide_u32_branchfree_t {\n    uint32_t magic;\n    uint8_t more;\n};\n\nstruct 
libdivide_s32_branchfree_t {\n    int32_t magic;\n    uint8_t more;\n};\n\nstruct libdivide_u64_branchfree_t {\n    uint64_t magic;\n    uint8_t more;\n};\n\nstruct libdivide_s64_branchfree_t {\n    int64_t magic;\n    uint8_t more;\n};\n\n#pragma pack(pop)\n\n// Explanation of the \"more\" field:\n//\n// * Bits 0-5 is the shift value (for shift path or mult path).\n// * Bit 6 is the add indicator for mult path.\n// * Bit 7 is set if the divisor is negative. We use bit 7 as the negative\n//   divisor indicator so that we can efficiently use sign extension to\n//   create a bitmask with all bits set to 1 (if the divisor is negative)\n//   or 0 (if the divisor is positive).\n//\n// u32: [0-4] shift value\n//      [5] ignored\n//      [6] add indicator\n//      magic number of 0 indicates shift path\n//\n// s32: [0-4] shift value\n//      [5] ignored\n//      [6] add indicator\n//      [7] indicates negative divisor\n//      magic number of 0 indicates shift path\n//\n// u64: [0-5] shift value\n//      [6] add indicator\n//      magic number of 0 indicates shift path\n//\n// s64: [0-5] shift value\n//      [6] add indicator\n//      [7] indicates negative divisor\n//      magic number of 0 indicates shift path\n//\n// In s32 and s64 branchfull modes, the magic number is negated according to\n// whether the divisor is negated. 
In branchfree strategy, it is not negated.\n\nenum {\n    LIBDIVIDE_16_SHIFT_MASK = 0x1F,\n    LIBDIVIDE_32_SHIFT_MASK = 0x1F,\n    LIBDIVIDE_64_SHIFT_MASK = 0x3F,\n    LIBDIVIDE_ADD_MARKER = 0x40,\n    LIBDIVIDE_NEGATIVE_DIVISOR = 0x80\n};\n\nstatic LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_s16_gen(int16_t d);\nstatic LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_u16_gen(uint16_t d);\nstatic LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_s32_gen(int32_t d);\nstatic LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_u32_gen(uint32_t d);\nstatic LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_s64_gen(int64_t d);\nstatic LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_u64_gen(uint64_t d);\n\nstatic LIBDIVIDE_INLINE struct libdivide_s16_branchfree_t libdivide_s16_branchfree_gen(int16_t d);\nstatic LIBDIVIDE_INLINE struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d);\nstatic LIBDIVIDE_INLINE struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d);\nstatic LIBDIVIDE_INLINE struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d);\nstatic LIBDIVIDE_INLINE struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d);\nstatic LIBDIVIDE_INLINE struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d);\n\nstatic LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw(\n    int16_t numer, int16_t magic, uint8_t more);\nstatic LIBDIVIDE_INLINE int16_t libdivide_s16_do(\n    int16_t numer, const struct libdivide_s16_t *denom);\nstatic LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw(\n    uint16_t numer, uint16_t magic, uint8_t more);\nstatic LIBDIVIDE_INLINE uint16_t libdivide_u16_do(\n    uint16_t numer, const struct libdivide_u16_t *denom);\nstatic LIBDIVIDE_INLINE int32_t libdivide_s32_do_raw(\n    int32_t numer, int32_t magic, uint8_t more);\nstatic LIBDIVIDE_INLINE int32_t libdivide_s32_do(\n    int32_t numer, const struct libdivide_s32_t *denom);\nstatic LIBDIVIDE_INLINE 
uint32_t libdivide_u32_do_raw(\n    uint32_t numer, uint32_t magic, uint8_t more);\nstatic LIBDIVIDE_INLINE uint32_t libdivide_u32_do(\n    uint32_t numer, const struct libdivide_u32_t *denom);\nstatic LIBDIVIDE_INLINE int64_t libdivide_s64_do_raw(\n    int64_t numer, int64_t magic, uint8_t more);\nstatic LIBDIVIDE_INLINE int64_t libdivide_s64_do(\n    int64_t numer, const struct libdivide_s64_t *denom);\nstatic LIBDIVIDE_INLINE uint64_t libdivide_u64_do_raw(\n    uint64_t numer, uint64_t magic, uint8_t more);\nstatic LIBDIVIDE_INLINE uint64_t libdivide_u64_do(\n    uint64_t numer, const struct libdivide_u64_t *denom);\n\nstatic LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_do(\n    int16_t numer, const struct libdivide_s16_branchfree_t *denom);\nstatic LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_do(\n    uint16_t numer, const struct libdivide_u16_branchfree_t *denom);\nstatic LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_do(\n    int32_t numer, const struct libdivide_s32_branchfree_t *denom);\nstatic LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_do(\n    uint32_t numer, const struct libdivide_u32_branchfree_t *denom);\nstatic LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_do(\n    int64_t numer, const struct libdivide_s64_branchfree_t *denom);\nstatic LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_do(\n    uint64_t numer, const struct libdivide_u64_branchfree_t *denom);\n\nstatic LIBDIVIDE_INLINE int16_t libdivide_s16_recover(const struct libdivide_s16_t *denom);\nstatic LIBDIVIDE_INLINE uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom);\nstatic LIBDIVIDE_INLINE int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom);\nstatic LIBDIVIDE_INLINE uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom);\nstatic LIBDIVIDE_INLINE int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom);\nstatic LIBDIVIDE_INLINE uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom);\n\nstatic 
LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_recover(\n    const struct libdivide_s16_branchfree_t *denom);\nstatic LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_recover(\n    const struct libdivide_u16_branchfree_t *denom);\nstatic LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_recover(\n    const struct libdivide_s32_branchfree_t *denom);\nstatic LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_recover(\n    const struct libdivide_u32_branchfree_t *denom);\nstatic LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_recover(\n    const struct libdivide_s64_branchfree_t *denom);\nstatic LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_recover(\n    const struct libdivide_u64_branchfree_t *denom);\n\n//////// Internal Utility Functions\n\nstatic LIBDIVIDE_INLINE uint16_t libdivide_mullhi_u16(uint16_t x, uint16_t y) {\n    uint32_t xl = x, yl = y;\n    uint32_t rl = xl * yl;\n    return (uint16_t)(rl >> 16);\n}\n\nstatic LIBDIVIDE_INLINE int16_t libdivide_mullhi_s16(int16_t x, int16_t y) {\n    int32_t xl = x, yl = y;\n    int32_t rl = xl * yl;\n    // needs to be arithmetic shift\n    return (int16_t)(rl >> 16);\n}\n\nstatic LIBDIVIDE_INLINE uint32_t libdivide_mullhi_u32(uint32_t x, uint32_t y) {\n    uint64_t xl = x, yl = y;\n    uint64_t rl = xl * yl;\n    return (uint32_t)(rl >> 32);\n}\n\nstatic LIBDIVIDE_INLINE int32_t libdivide_mullhi_s32(int32_t x, int32_t y) {\n    int64_t xl = x, yl = y;\n    int64_t rl = xl * yl;\n    // needs to be arithmetic shift\n    return (int32_t)(rl >> 32);\n}\n\nstatic LIBDIVIDE_INLINE uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) {\n#if defined(LIBDIVIDE_MULH_INTRINSICS)\n    return __umulh(x, y);\n#elif defined(HAS_INT128_T)\n    __uint128_t xl = x, yl = y;\n    __uint128_t rl = xl * yl;\n    return (uint64_t)(rl >> 64);\n#else\n    // full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64)\n    uint32_t mask = 0xFFFFFFFF;\n    uint32_t x0 = (uint32_t)(x & mask);\n    uint32_t x1 = 
(uint32_t)(x >> 32);\n    uint32_t y0 = (uint32_t)(y & mask);\n    uint32_t y1 = (uint32_t)(y >> 32);\n    uint32_t x0y0_hi = libdivide_mullhi_u32(x0, y0);\n    uint64_t x0y1 = x0 * (uint64_t)y1;\n    uint64_t x1y0 = x1 * (uint64_t)y0;\n    uint64_t x1y1 = x1 * (uint64_t)y1;\n    uint64_t temp = x1y0 + x0y0_hi;\n    uint64_t temp_lo = temp & mask;\n    uint64_t temp_hi = temp >> 32;\n\n    return x1y1 + temp_hi + ((temp_lo + x0y1) >> 32);\n#endif\n}\n\nstatic LIBDIVIDE_INLINE int64_t libdivide_mullhi_s64(int64_t x, int64_t y) {\n#if defined(LIBDIVIDE_MULH_INTRINSICS)\n    return __mulh(x, y);\n#elif defined(HAS_INT128_T)\n    __int128_t xl = x, yl = y;\n    __int128_t rl = xl * yl;\n    return (int64_t)(rl >> 64);\n#else\n    // full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64)\n    uint32_t mask = 0xFFFFFFFF;\n    uint32_t x0 = (uint32_t)(x & mask);\n    uint32_t y0 = (uint32_t)(y & mask);\n    int32_t x1 = (int32_t)(x >> 32);\n    int32_t y1 = (int32_t)(y >> 32);\n    uint32_t x0y0_hi = libdivide_mullhi_u32(x0, y0);\n    int64_t t = x1 * (int64_t)y0 + x0y0_hi;\n    int64_t w1 = x0 * (int64_t)y1 + (t & mask);\n\n    return x1 * (int64_t)y1 + (t >> 32) + (w1 >> 32);\n#endif\n}\n\nstatic LIBDIVIDE_INLINE int16_t libdivide_count_leading_zeros16(uint16_t val) {\n#if defined(__AVR__)\n    // Fast way to count leading zeros\n    // On the AVR 8-bit architecture __builtin_clz() works on a int16_t.\n    return __builtin_clz(val);\n#elif defined(__GNUC__) || __has_builtin(__builtin_clz) || defined(_MSC_VER)\n    // Fast way to count leading zeros\n    return (int16_t)(__builtin_clz(val) - 16);\n#else\n    if (val == 0) return 16;\n    int16_t result = 4;\n    uint16_t hi = 0xFU << 12;\n    while ((val & hi) == 0) {\n        hi >>= 4;\n        result += 4;\n    }\n    while (val & hi) {\n        result -= 1;\n        hi <<= 1;\n    }\n    return result;\n#endif\n}\n\nstatic LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros32(uint32_t val) 
{\n#if defined(__AVR__)\n    // Fast way to count leading zeros\n    return __builtin_clzl(val);\n#elif defined(__GNUC__) || __has_builtin(__builtin_clz) || defined(_MSC_VER)\n    // Fast way to count leading zeros\n    return __builtin_clz(val);\n#else\n    if (val == 0) return 32;\n    int32_t result = 8;\n    uint32_t hi = 0xFFU << 24;\n    while ((val & hi) == 0) {\n        hi >>= 8;\n        result += 8;\n    }\n    while (val & hi) {\n        result -= 1;\n        hi <<= 1;\n    }\n    return result;\n#endif\n}\n\nstatic LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros64(uint64_t val) {\n#if defined(__GNUC__) || __has_builtin(__builtin_clzll) || defined(_MSC_VER)\n    // Fast way to count leading zeros\n    return __builtin_clzll(val);\n#else\n    uint32_t hi = val >> 32;\n    uint32_t lo = val & 0xFFFFFFFF;\n    if (hi != 0) return libdivide_count_leading_zeros32(hi);\n    return 32 + libdivide_count_leading_zeros32(lo);\n#endif\n}\n\n// libdivide_32_div_16_to_16: divides a 32-bit uint {u1, u0} by a 16-bit\n// uint {v}. The result must fit in 16 bits.\n// Returns the quotient directly and the remainder in *r\nstatic LIBDIVIDE_INLINE uint16_t libdivide_32_div_16_to_16(\n    uint16_t u1, uint16_t u0, uint16_t v, uint16_t *r) {\n    uint32_t n = ((uint32_t)u1 << 16) | u0;\n    uint16_t result = (uint16_t)(n / v);\n    *r = (uint16_t)(n - result * (uint32_t)v);\n    return result;\n}\n\n// libdivide_64_div_32_to_32: divides a 64-bit uint {u1, u0} by a 32-bit\n// uint {v}. 
The result must fit in 32 bits.\n// Returns the quotient directly and the remainder in *r\nstatic LIBDIVIDE_INLINE uint32_t libdivide_64_div_32_to_32(\n    uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) {\n#if (defined(LIBDIVIDE_i386) || defined(LIBDIVIDE_X86_64)) && defined(LIBDIVIDE_GCC_STYLE_ASM)\n    uint32_t result;\n    __asm__(\"divl %[v]\" : \"=a\"(result), \"=d\"(*r) : [v] \"r\"(v), \"a\"(u0), \"d\"(u1));\n    return result;\n#else\n    uint64_t n = ((uint64_t)u1 << 32) | u0;\n    uint32_t result = (uint32_t)(n / v);\n    *r = (uint32_t)(n - result * (uint64_t)v);\n    return result;\n#endif\n}\n\n// libdivide_128_div_64_to_64: divides a 128-bit uint {numhi, numlo} by a 64-bit uint {den}. The\n// result must fit in 64 bits. Returns the quotient directly and the remainder in *r\nstatic LIBDIVIDE_INLINE uint64_t libdivide_128_div_64_to_64(\n    uint64_t numhi, uint64_t numlo, uint64_t den, uint64_t *r) {\n    // N.B. resist the temptation to use __uint128_t here.\n    // In LLVM compiler-rt, it performs a 128/128 -> 128 division which is many times slower than\n    // necessary. In gcc it's better but still slower than the divlu implementation, perhaps because\n    // it's not LIBDIVIDE_INLINEd.\n#if defined(LIBDIVIDE_X86_64) && defined(LIBDIVIDE_GCC_STYLE_ASM)\n    uint64_t result;\n    __asm__(\"div %[v]\" : \"=a\"(result), \"=d\"(*r) : [v] \"r\"(den), \"a\"(numlo), \"d\"(numhi));\n    return result;\n#else\n    // We work in base 2**32.\n    // A uint32 holds a single digit. 
A uint64 holds two digits.\n    // Our numerator is conceptually [num3, num2, num1, num0].\n    // Our denominator is [den1, den0].\n    const uint64_t b = ((uint64_t)1 << 32);\n\n    // The high and low digits of our computed quotient.\n    uint32_t q1;\n    uint32_t q0;\n\n    // The normalization shift factor.\n    int shift;\n\n    // The high and low digits of our denominator (after normalizing).\n    // Also the low 2 digits of our numerator (after normalizing).\n    uint32_t den1;\n    uint32_t den0;\n    uint32_t num1;\n    uint32_t num0;\n\n    // A partial remainder.\n    uint64_t rem;\n\n    // The estimated quotient, and its corresponding remainder (unrelated to true remainder).\n    uint64_t qhat;\n    uint64_t rhat;\n\n    // Variables used to correct the estimated quotient.\n    uint64_t c1;\n    uint64_t c2;\n\n    // Check for overflow and divide by 0.\n    if (numhi >= den) {\n        if (r) *r = ~0ull;\n        return ~0ull;\n    }\n\n    // Determine the normalization factor. We multiply den by this, so that its leading digit is at\n    // least half b. In binary this means just shifting left by the number of leading zeros, so that\n    // there's a 1 in the MSB.\n    // We also shift numer by the same amount. This cannot overflow because numhi < den.\n    // The expression (-shift & 63) is the same as (64 - shift), except it avoids the UB of shifting\n    // by 64. The funny bitwise 'and' ensures that numlo does not get shifted into numhi if shift is\n    // 0. clang 11 has an x86 codegen bug here: see LLVM bug 50118. 
The sequence below avoids it.\n    shift = libdivide_count_leading_zeros64(den);\n    den <<= shift;\n    numhi <<= shift;\n    numhi |= (numlo >> (-shift & 63)) & (uint64_t)(-(int64_t)shift >> 63);\n    numlo <<= shift;\n\n    // Extract the low digits of the numerator and both digits of the denominator.\n    num1 = (uint32_t)(numlo >> 32);\n    num0 = (uint32_t)(numlo & 0xFFFFFFFFu);\n    den1 = (uint32_t)(den >> 32);\n    den0 = (uint32_t)(den & 0xFFFFFFFFu);\n\n    // We wish to compute q1 = [n3 n2 n1] / [d1 d0].\n    // Estimate q1 as [n3 n2] / [d1], and then correct it.\n    // Note while qhat may be 2 digits, q1 is always 1 digit.\n    qhat = numhi / den1;\n    rhat = numhi % den1;\n    c1 = qhat * den0;\n    c2 = rhat * b + num1;\n    if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1;\n    q1 = (uint32_t)qhat;\n\n    // Compute the true (partial) remainder.\n    rem = numhi * b + num1 - q1 * den;\n\n    // We wish to compute q0 = [rem1 rem0 n0] / [d1 d0].\n    // Estimate q0 as [rem1 rem0] / [d1] and correct it.\n    qhat = rem / den1;\n    rhat = rem % den1;\n    c1 = qhat * den0;\n    c2 = rhat * b + num0;\n    if (c1 > c2) qhat -= (c1 - c2 > den) ? 
2 : 1;\n    q0 = (uint32_t)qhat;\n\n    // Return remainder if requested.\n    if (r) *r = (rem * b + num0 - q0 * den) >> shift;\n    return ((uint64_t)q1 << 32) | q0;\n#endif\n}\n\n#if !(defined(HAS_INT128_T) && \\\n      defined(HAS_INT128_DIV))\n\n// Bitshift a u128 in place, left (signed_shift > 0) or right (signed_shift < 0)\nstatic LIBDIVIDE_INLINE void libdivide_u128_shift(\n    uint64_t *u1, uint64_t *u0, int32_t signed_shift) {\n    if (signed_shift > 0) {\n        uint32_t shift = signed_shift;\n        *u1 <<= shift;\n        *u1 |= *u0 >> (64 - shift);\n        *u0 <<= shift;\n    } else if (signed_shift < 0) {\n        uint32_t shift = -signed_shift;\n        *u0 >>= shift;\n        *u0 |= *u1 << (64 - shift);\n        *u1 >>= shift;\n    }\n}\n\n#endif\n\n// Computes a 128 / 128 -> 64 bit division, with a 128 bit remainder.\nstatic LIBDIVIDE_INLINE uint64_t libdivide_128_div_128_to_64(\n    uint64_t u_hi, uint64_t u_lo, uint64_t v_hi, uint64_t v_lo, uint64_t *r_hi, uint64_t *r_lo) {\n#if defined(HAS_INT128_T) && defined(HAS_INT128_DIV)\n    __uint128_t ufull = u_hi;\n    __uint128_t vfull = v_hi;\n    ufull = (ufull << 64) | u_lo;\n    vfull = (vfull << 64) | v_lo;\n    uint64_t res = (uint64_t)(ufull / vfull);\n    __uint128_t remainder = ufull - (vfull * res);\n    *r_lo = (uint64_t)remainder;\n    *r_hi = (uint64_t)(remainder >> 64);\n    return res;\n#else\n    // Adapted from \"Unsigned Doubleword Division\" in Hacker's Delight\n    // We want to compute u / v\n    typedef struct {\n        uint64_t hi;\n        uint64_t lo;\n    } u128_t;\n    u128_t u = {u_hi, u_lo};\n    u128_t v = {v_hi, v_lo};\n\n    if (v.hi == 0) {\n        // divisor v is a 64 bit value, so we just need one 128/64 division\n        // Note that we are simpler than Hacker's Delight here, because we know\n        // the quotient fits in 64 bits whereas Hacker's Delight demands a full\n        // 128 bit quotient\n        *r_hi = 0;\n        return 
libdivide_128_div_64_to_64(u.hi, u.lo, v.lo, r_lo);\n    }\n    // Here v >= 2**64\n    // We know that v.hi != 0, so count leading zeros is OK\n    // We have 0 <= n <= 63\n    uint32_t n = libdivide_count_leading_zeros64(v.hi);\n\n    // Normalize the divisor so its MSB is 1\n    u128_t v1t = v;\n    libdivide_u128_shift(&v1t.hi, &v1t.lo, n);\n    uint64_t v1 = v1t.hi;  // i.e. v1 = v1t >> 64\n\n    // To ensure no overflow\n    u128_t u1 = u;\n    libdivide_u128_shift(&u1.hi, &u1.lo, -1);\n\n    // Get quotient from divide unsigned insn.\n    uint64_t rem_ignored;\n    uint64_t q1 = libdivide_128_div_64_to_64(u1.hi, u1.lo, v1, &rem_ignored);\n\n    // Undo normalization and division of u by 2.\n    u128_t q0 = {0, q1};\n    libdivide_u128_shift(&q0.hi, &q0.lo, n);\n    libdivide_u128_shift(&q0.hi, &q0.lo, -63);\n\n    // Make q0 correct or too small by 1\n    // Equivalent to `if (q0 != 0) q0 = q0 - 1;`\n    if (q0.hi != 0 || q0.lo != 0) {\n        q0.hi -= (q0.lo == 0);  // borrow\n        q0.lo -= 1;\n    }\n\n    // Now q0 is correct.\n    // Compute q0 * v as q0v\n    // = (q0.hi << 64 + q0.lo) * (v.hi << 64 + v.lo)\n    // = (q0.hi * v.hi << 128) + (q0.hi * v.lo << 64) +\n    //   (q0.lo * v.hi <<  64) + q0.lo * v.lo)\n    // Each term is 128 bit\n    // High half of full product (upper 128 bits!) 
are dropped\n    u128_t q0v = {0, 0};\n    q0v.hi = q0.hi * v.lo + q0.lo * v.hi + libdivide_mullhi_u64(q0.lo, v.lo);\n    q0v.lo = q0.lo * v.lo;\n\n    // Compute u - q0v as u_q0v\n    // This is the remainder\n    u128_t u_q0v = u;\n    u_q0v.hi -= q0v.hi + (u.lo < q0v.lo);  // second term is borrow\n    u_q0v.lo -= q0v.lo;\n\n    // Check if u_q0v >= v\n    // This checks if our remainder is larger than the divisor\n    if ((u_q0v.hi > v.hi) || (u_q0v.hi == v.hi && u_q0v.lo >= v.lo)) {\n        // Increment q0\n        q0.lo += 1;\n        q0.hi += (q0.lo == 0);  // carry\n\n        // Subtract v from remainder\n        u_q0v.hi -= v.hi + (u_q0v.lo < v.lo);\n        u_q0v.lo -= v.lo;\n    }\n\n    *r_hi = u_q0v.hi;\n    *r_lo = u_q0v.lo;\n\n    LIBDIVIDE_ASSERT(q0.hi == 0);\n    return q0.lo;\n#endif\n}\n\n////////// UINT16\n\nstatic LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_internal_u16_gen(\n    uint16_t d, int branchfree) {\n    if (d == 0) {\n        LIBDIVIDE_ERROR(\"divider must be != 0\");\n    }\n\n    struct libdivide_u16_t result;\n    uint8_t floor_log_2_d = (uint8_t)(15 - libdivide_count_leading_zeros16(d));\n\n    // Power of 2\n    if ((d & (d - 1)) == 0) {\n        // We need to subtract 1 from the shift value in case of an unsigned\n        // branchfree divider because there is a hardcoded right shift by 1\n        // in its division algorithm. 
Because of this we also need to add back\n        // 1 in its recovery algorithm.\n        result.magic = 0;\n        result.more = (uint8_t)(floor_log_2_d - (branchfree != 0));\n    } else {\n        uint8_t more;\n        uint16_t rem, proposed_m;\n        proposed_m = libdivide_32_div_16_to_16((uint16_t)1 << floor_log_2_d, 0, d, &rem);\n\n        LIBDIVIDE_ASSERT(rem > 0 && rem < d);\n        const uint16_t e = d - rem;\n\n        // This power works if e < 2**floor_log_2_d.\n        if (!branchfree && (e < ((uint16_t)1 << floor_log_2_d))) {\n            // This power works\n            more = floor_log_2_d;\n        } else {\n            // We have to use the general 17-bit algorithm.  We need to compute\n            // (2**power) / d. However, we already have (2**(power-1))/d and\n            // its remainder.  By doubling both, and then correcting the\n            // remainder, we can compute the larger division.\n            // don't care about overflow here - in fact, we expect it\n            proposed_m += proposed_m;\n            const uint16_t twice_rem = rem + rem;\n            if (twice_rem >= d || twice_rem < rem) proposed_m += 1;\n            more = floor_log_2_d | LIBDIVIDE_ADD_MARKER;\n        }\n        result.magic = 1 + proposed_m;\n        result.more = more;\n        // result.more's shift should in general be ceil_log_2_d. But if we\n        // used the smaller power, we subtract one from the shift because we're\n        // using the smaller power. If we're using the larger power, we\n        // subtract one from the shift because it's taken care of by the add\n        // indicator. 
So floor_log_2_d happens to be correct in both cases.\n    }\n    return result;\n}\n\nstatic LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_u16_gen(uint16_t d) {\n    return libdivide_internal_u16_gen(d, 0);\n}\n\nstatic LIBDIVIDE_INLINE struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d) {\n    if (d == 1) {\n        LIBDIVIDE_ERROR(\"branchfree divider must be != 1\");\n    }\n    struct libdivide_u16_t tmp = libdivide_internal_u16_gen(d, 1);\n    struct libdivide_u16_branchfree_t ret = {\n        tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_16_SHIFT_MASK)};\n    return ret;\n}\n\n// The original libdivide_u16_do takes a const pointer. However, this cannot be used\n// with a compile time constant libdivide_u16_t: it will generate a warning about\n// taking the address of a temporary. Hence this overload.\nstatic LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw(uint16_t numer, uint16_t magic, uint8_t more) {\n    if (!magic) {\n        return numer >> more;\n    } else {\n        uint16_t q = libdivide_mullhi_u16(numer, magic);\n        if (more & LIBDIVIDE_ADD_MARKER) {\n            uint16_t t = ((numer - q) >> 1) + q;\n            return t >> (more & LIBDIVIDE_16_SHIFT_MASK);\n        } else {\n            // All upper bits are 0,\n            // don't need to mask them off.\n            return q >> more;\n        }\n    }\n}\n\nstatic LIBDIVIDE_INLINE uint16_t libdivide_u16_do(uint16_t numer, const struct libdivide_u16_t *denom) {\n    return libdivide_u16_do_raw(numer, denom->magic, denom->more);\n}\n\nstatic LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_do(\n    uint16_t numer, const struct libdivide_u16_branchfree_t *denom) {\n    uint16_t q = libdivide_mullhi_u16(numer, denom->magic);\n    uint16_t t = ((numer - q) >> 1) + q;\n    return t >> denom->more;\n}\n\nstatic LIBDIVIDE_INLINE uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom) {\n    uint8_t more = denom->more;\n    uint8_t shift = more & 
LIBDIVIDE_16_SHIFT_MASK;\n\n    if (!denom->magic) {\n        return (uint16_t)1 << shift;\n    } else if (!(more & LIBDIVIDE_ADD_MARKER)) {\n        // We compute q = n/d = n*m / 2^(16 + shift)\n        // Therefore we have d = 2^(16 + shift) / m\n        // We need to ceil it.\n        // We know d is not a power of 2, so m is not a power of 2,\n        // so we can just add 1 to the floor\n        uint16_t hi_dividend = (uint16_t)1 << shift;\n        uint16_t rem_ignored;\n        return 1 + libdivide_32_div_16_to_16(hi_dividend, 0, denom->magic, &rem_ignored);\n    } else {\n        // Here we wish to compute d = 2^(16+shift+1)/(m+2^16).\n        // Notice (m + 2^16) is a 17 bit number. Use 32 bit division for now\n        // Also note that shift may be as high as 15, so shift + 1 will\n        // overflow. So we have to compute it as 2^(16+shift)/(m+2^16), and\n        // then double the quotient and remainder.\n        uint32_t half_n = (uint32_t)1 << (16 + shift);\n        uint32_t d = ((uint32_t)1 << 16) | denom->magic;\n        // Note that the quotient is guaranteed <= 16 bits, but the remainder\n        // may need 17!\n        uint16_t half_q = (uint16_t)(half_n / d);\n        uint32_t rem = half_n % d;\n        // We computed 2^(16+shift)/(m+2^16)\n        // Need to double it, and then add 1 to the quotient if doubling the\n        // remainder would increase the quotient.\n        // Note that rem<<1 cannot overflow, since rem < d and d is 17 bits\n        uint16_t full_q = half_q + half_q + ((rem << 1) >= d);\n\n        // We rounded down in gen (hence +1)\n        return full_q + 1;\n    }\n}\n\nstatic LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_recover(const struct libdivide_u16_branchfree_t *denom) {\n    uint8_t more = denom->more;\n    uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;\n\n    if (!denom->magic) {\n        return (uint16_t)1 << (shift + 1);\n    } else {\n        // Here we wish to compute d = 2^(16+shift+1)/(m+2^16).\n       
 // Notice (m + 2^16) is a 17 bit number. Use 32 bit division for now\n        // Also note that shift may be as high as 15, so shift + 1 will\n        // overflow. So we have to compute it as 2^(16+shift)/(m+2^16), and\n        // then double the quotient and remainder.\n        uint32_t half_n = (uint32_t)1 << (16 + shift);\n        uint32_t d = ((uint32_t)1 << 16) | denom->magic;\n        // Note that the quotient is guaranteed <= 16 bits, but the remainder\n        // may need 17!\n        uint16_t half_q = (uint16_t)(half_n / d);\n        uint32_t rem = half_n % d;\n        // We computed 2^(16+shift)/(m+2^16)\n        // Need to double it, and then add 1 to the quotient if doubling the\n        // remainder would increase the quotient.\n        // Note that rem<<1 cannot overflow, since rem < d and d is 17 bits\n        uint16_t full_q = half_q + half_q + ((rem << 1) >= d);\n\n        // We rounded down in gen (hence +1)\n        return full_q + 1;\n    }\n}\n\n////////// UINT32\n\nstatic LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_internal_u32_gen(\n    uint32_t d, int branchfree) {\n    if (d == 0) {\n        LIBDIVIDE_ERROR(\"divider must be != 0\");\n    }\n\n    struct libdivide_u32_t result;\n    uint32_t floor_log_2_d = 31 - libdivide_count_leading_zeros32(d);\n\n    // Power of 2\n    if ((d & (d - 1)) == 0) {\n        // We need to subtract 1 from the shift value in case of an unsigned\n        // branchfree divider because there is a hardcoded right shift by 1\n        // in its division algorithm. 
Because of this we also need to add back\n        // 1 in its recovery algorithm.\n        result.magic = 0;\n        result.more = (uint8_t)(floor_log_2_d - (branchfree != 0));\n    } else {\n        uint8_t more;\n        uint32_t rem, proposed_m;\n        proposed_m = libdivide_64_div_32_to_32((uint32_t)1 << floor_log_2_d, 0, d, &rem);\n\n        LIBDIVIDE_ASSERT(rem > 0 && rem < d);\n        const uint32_t e = d - rem;\n\n        // This power works if e < 2**floor_log_2_d.\n        if (!branchfree && (e < ((uint32_t)1 << floor_log_2_d))) {\n            // This power works\n            more = (uint8_t)floor_log_2_d;\n        } else {\n            // We have to use the general 33-bit algorithm.  We need to compute\n            // (2**power) / d. However, we already have (2**(power-1))/d and\n            // its remainder.  By doubling both, and then correcting the\n            // remainder, we can compute the larger division.\n            // don't care about overflow here - in fact, we expect it\n            proposed_m += proposed_m;\n            const uint32_t twice_rem = rem + rem;\n            if (twice_rem >= d || twice_rem < rem) proposed_m += 1;\n            more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);\n        }\n        result.magic = 1 + proposed_m;\n        result.more = more;\n        // result.more's shift should in general be ceil_log_2_d. But if we\n        // used the smaller power, we subtract one from the shift because we're\n        // using the smaller power. If we're using the larger power, we\n        // subtract one from the shift because it's taken care of by the add\n        // indicator. 
So floor_log_2_d happens to be correct in both cases.\n    }\n    return result;\n}\n\nstatic LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_u32_gen(uint32_t d) {\n    return libdivide_internal_u32_gen(d, 0);\n}\n\nstatic LIBDIVIDE_INLINE struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d) {\n    if (d == 1) {\n        LIBDIVIDE_ERROR(\"branchfree divider must be != 1\");\n    }\n    struct libdivide_u32_t tmp = libdivide_internal_u32_gen(d, 1);\n    struct libdivide_u32_branchfree_t ret = {\n        tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_32_SHIFT_MASK)};\n    return ret;\n}\n\nstatic LIBDIVIDE_INLINE uint32_t libdivide_u32_do_raw(uint32_t numer, uint32_t magic, uint8_t more) {\n    if (!magic) {\n        return numer >> more;\n    } else {\n        uint32_t q = libdivide_mullhi_u32(numer, magic);\n        if (more & LIBDIVIDE_ADD_MARKER) {\n            uint32_t t = ((numer - q) >> 1) + q;\n            return t >> (more & LIBDIVIDE_32_SHIFT_MASK);\n        } else {\n            // All upper bits are 0,\n            // don't need to mask them off.\n            return q >> more;\n        }\n    }\n}\n\nstatic LIBDIVIDE_INLINE uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) {\n    return libdivide_u32_do_raw(numer, denom->magic, denom->more);\n}\n\nstatic LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_do(\n    uint32_t numer, const struct libdivide_u32_branchfree_t *denom) {\n    uint32_t q = libdivide_mullhi_u32(numer, denom->magic);\n    uint32_t t = ((numer - q) >> 1) + q;\n    return t >> denom->more;\n}\n\nstatic LIBDIVIDE_INLINE uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom) {\n    uint8_t more = denom->more;\n    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;\n\n    if (!denom->magic) {\n        return (uint32_t)1 << shift;\n    } else if (!(more & LIBDIVIDE_ADD_MARKER)) {\n        // We compute q = n/d = n*m / 2^(32 + shift)\n        // Therefore we have d = 2^(32 + shift) / 
m\n        // We need to ceil it.\n        // We know d is not a power of 2, so m is not a power of 2,\n        // so we can just add 1 to the floor\n        uint32_t hi_dividend = (uint32_t)1 << shift;\n        uint32_t rem_ignored;\n        return 1 + libdivide_64_div_32_to_32(hi_dividend, 0, denom->magic, &rem_ignored);\n    } else {\n        // Here we wish to compute d = 2^(32+shift+1)/(m+2^32).\n        // Notice (m + 2^32) is a 33 bit number. Use 64 bit division for now\n        // Also note that shift may be as high as 31, so shift + 1 will\n        // overflow. So we have to compute it as 2^(32+shift)/(m+2^32), and\n        // then double the quotient and remainder.\n        uint64_t half_n = (uint64_t)1 << (32 + shift);\n        uint64_t d = ((uint64_t)1 << 32) | denom->magic;\n        // Note that the quotient is guaranteed <= 32 bits, but the remainder\n        // may need 33!\n        uint32_t half_q = (uint32_t)(half_n / d);\n        uint64_t rem = half_n % d;\n        // We computed 2^(32+shift)/(m+2^32)\n        // Need to double it, and then add 1 to the quotient if doubling the\n        // remainder would increase the quotient.\n        // Note that rem<<1 cannot overflow, since rem < d and d is 33 bits\n        uint32_t full_q = half_q + half_q + ((rem << 1) >= d);\n\n        // We rounded down in gen (hence +1)\n        return full_q + 1;\n    }\n}\n\nstatic LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_t *denom) {\n    uint8_t more = denom->more;\n    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;\n\n    if (!denom->magic) {\n        return (uint32_t)1 << (shift + 1);\n    } else {\n        // Here we wish to compute d = 2^(32+shift+1)/(m+2^32).\n        // Notice (m + 2^32) is a 33 bit number. Use 64 bit division for now\n        // Also note that shift may be as high as 31, so shift + 1 will\n        // overflow. 
So we have to compute it as 2^(32+shift)/(m+2^32), and\n        // then double the quotient and remainder.\n        uint64_t half_n = (uint64_t)1 << (32 + shift);\n        uint64_t d = ((uint64_t)1 << 32) | denom->magic;\n        // Note that the quotient is guaranteed <= 32 bits, but the remainder\n        // may need 33!\n        uint32_t half_q = (uint32_t)(half_n / d);\n        uint64_t rem = half_n % d;\n        // We computed 2^(32+shift)/(m+2^32)\n        // Need to double it, and then add 1 to the quotient if doubling the\n        // remainder would increase the quotient.\n        // Note that rem<<1 cannot overflow, since rem < d and d is 33 bits\n        uint32_t full_q = half_q + half_q + ((rem << 1) >= d);\n\n        // We rounded down in gen (hence +1)\n        return full_q + 1;\n    }\n}\n\n////////// UINT64\n\nstatic LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_internal_u64_gen(\n    uint64_t d, int branchfree) {\n    if (d == 0) {\n        LIBDIVIDE_ERROR(\"divider must be != 0\");\n    }\n\n    struct libdivide_u64_t result;\n    uint32_t floor_log_2_d = 63 - libdivide_count_leading_zeros64(d);\n\n    // Power of 2\n    if ((d & (d - 1)) == 0) {\n        // We need to subtract 1 from the shift value in case of an unsigned\n        // branchfree divider because there is a hardcoded right shift by 1\n        // in its division algorithm. 
Because of this we also need to add back\n        // 1 in its recovery algorithm.\n        result.magic = 0;\n        result.more = (uint8_t)(floor_log_2_d - (branchfree != 0));\n    } else {\n        uint64_t proposed_m, rem;\n        uint8_t more;\n        // (1 << (64 + floor_log_2_d)) / d\n        proposed_m = libdivide_128_div_64_to_64((uint64_t)1 << floor_log_2_d, 0, d, &rem);\n\n        LIBDIVIDE_ASSERT(rem > 0 && rem < d);\n        const uint64_t e = d - rem;\n\n        // This power works if e < 2**floor_log_2_d.\n        if (!branchfree && e < ((uint64_t)1 << floor_log_2_d)) {\n            // This power works\n            more = (uint8_t)floor_log_2_d;\n        } else {\n            // We have to use the general 65-bit algorithm.  We need to compute\n            // (2**power) / d. However, we already have (2**(power-1))/d and\n            // its remainder. By doubling both, and then correcting the\n            // remainder, we can compute the larger division.\n            // don't care about overflow here - in fact, we expect it\n            proposed_m += proposed_m;\n            const uint64_t twice_rem = rem + rem;\n            if (twice_rem >= d || twice_rem < rem) proposed_m += 1;\n            more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);\n        }\n        result.magic = 1 + proposed_m;\n        result.more = more;\n        // result.more's shift should in general be ceil_log_2_d. But if we\n        // used the smaller power, we subtract one from the shift because we're\n        // using the smaller power. If we're using the larger power, we\n        // subtract one from the shift because it's taken care of by the add\n        // indicator. 
So floor_log_2_d happens to be correct in both cases,\n        // which is why we do it outside of the if statement.\n    }\n    return result;\n}\n\nstatic LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_u64_gen(uint64_t d) {\n    return libdivide_internal_u64_gen(d, 0);\n}\n\nstatic LIBDIVIDE_INLINE struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d) {\n    if (d == 1) {\n        LIBDIVIDE_ERROR(\"branchfree divider must be != 1\");\n    }\n    struct libdivide_u64_t tmp = libdivide_internal_u64_gen(d, 1);\n    struct libdivide_u64_branchfree_t ret = {\n        tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_64_SHIFT_MASK)};\n    return ret;\n}\n\nstatic LIBDIVIDE_INLINE uint64_t libdivide_u64_do_raw(uint64_t numer, uint64_t magic, uint8_t more) {\n   if (!magic) {\n        return numer >> more;\n    } else {\n        uint64_t q = libdivide_mullhi_u64(numer, magic);\n        if (more & LIBDIVIDE_ADD_MARKER) {\n            uint64_t t = ((numer - q) >> 1) + q;\n            return t >> (more & LIBDIVIDE_64_SHIFT_MASK);\n        } else {\n            // All upper bits are 0,\n            // don't need to mask them off.\n            return q >> more;\n        }\n    }\n}\n\nstatic LIBDIVIDE_INLINE uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) {\n    return libdivide_u64_do_raw(numer, denom->magic, denom->more);\n}\n\nstatic LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_do(\n    uint64_t numer, const struct libdivide_u64_branchfree_t *denom) {\n    uint64_t q = libdivide_mullhi_u64(numer, denom->magic);\n    uint64_t t = ((numer - q) >> 1) + q;\n    return t >> denom->more;\n}\n\nstatic LIBDIVIDE_INLINE uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom) {\n    uint8_t more = denom->more;\n    uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;\n\n    if (!denom->magic) {\n        return (uint64_t)1 << shift;\n    } else if (!(more & LIBDIVIDE_ADD_MARKER)) {\n        // We compute q = n/d = n*m / 
2^(64 + shift)\n        // Therefore we have d = 2^(64 + shift) / m\n        // We need to ceil it.\n        // We know d is not a power of 2, so m is not a power of 2,\n        // so we can just add 1 to the floor\n        uint64_t hi_dividend = (uint64_t)1 << shift;\n        uint64_t rem_ignored;\n        return 1 + libdivide_128_div_64_to_64(hi_dividend, 0, denom->magic, &rem_ignored);\n    } else {\n        // Here we wish to compute d = 2^(64+shift+1)/(m+2^64).\n        // Notice (m + 2^64) is a 65 bit number. This gets hairy. See\n        // libdivide_u32_recover for more on what we do here.\n        // TODO: do something better than 128 bit math\n\n        // Full n is a (potentially) 129 bit value\n        // half_n is a 128 bit value\n        // Compute the hi half of half_n. Low half is 0.\n        uint64_t half_n_hi = (uint64_t)1 << shift, half_n_lo = 0;\n        // d is a 65 bit value. The high bit is always set to 1.\n        const uint64_t d_hi = 1, d_lo = denom->magic;\n        // Note that the quotient is guaranteed <= 64 bits,\n        // but the remainder may need 65!\n        uint64_t r_hi, r_lo;\n        uint64_t half_q =\n            libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo);\n        // We computed 2^(64+shift)/(m+2^64)\n        // Double the remainder ('dr') and check if that is larger than d\n        // Note that d is a 65 bit value, so r1 is small and so r1 + r1\n        // cannot overflow\n        uint64_t dr_lo = r_lo + r_lo;\n        uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo);  // last term is carry\n        int dr_exceeds_d = (dr_hi > d_hi) || (dr_hi == d_hi && dr_lo >= d_lo);\n        uint64_t full_q = half_q + half_q + (dr_exceeds_d ? 
1 : 0);\n        return full_q + 1;\n    }\n}\n\nstatic LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_t *denom) {\n    uint8_t more = denom->more;\n    uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;\n\n    if (!denom->magic) {\n        return (uint64_t)1 << (shift + 1);\n    } else {\n        // Here we wish to compute d = 2^(64+shift+1)/(m+2^64).\n        // Notice (m + 2^64) is a 65 bit number. This gets hairy. See\n        // libdivide_u32_recover for more on what we do here.\n        // TODO: do something better than 128 bit math\n\n        // Full n is a (potentially) 129 bit value\n        // half_n is a 128 bit value\n        // Compute the hi half of half_n. Low half is 0.\n        uint64_t half_n_hi = (uint64_t)1 << shift, half_n_lo = 0;\n        // d is a 65 bit value. The high bit is always set to 1.\n        const uint64_t d_hi = 1, d_lo = denom->magic;\n        // Note that the quotient is guaranteed <= 64 bits,\n        // but the remainder may need 65!\n        uint64_t r_hi, r_lo;\n        uint64_t half_q =\n            libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo);\n        // We computed 2^(64+shift)/(m+2^64)\n        // Double the remainder ('dr') and check if that is larger than d\n        // Note that d is a 65 bit value, so r1 is small and so r1 + r1\n        // cannot overflow\n        uint64_t dr_lo = r_lo + r_lo;\n        uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo);  // last term is carry\n        int dr_exceeds_d = (dr_hi > d_hi) || (dr_hi == d_hi && dr_lo >= d_lo);\n        uint64_t full_q = half_q + half_q + (dr_exceeds_d ? 
1 : 0);\n        return full_q + 1;\n    }\n}\n\n////////// SINT16\n\nstatic LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_internal_s16_gen(\n    int16_t d, int branchfree) {\n    if (d == 0) {\n        LIBDIVIDE_ERROR(\"divider must be != 0\");\n    }\n\n    struct libdivide_s16_t result;\n\n    // If d is a power of 2, or negative a power of 2, we have to use a shift.\n    // This is especially important because the magic algorithm fails for -1.\n    // To check if d is a power of 2 or its inverse, it suffices to check\n    // whether its absolute value has exactly one bit set. This works even for\n    // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set\n    // and is a power of 2.\n    uint16_t ud = (uint16_t)d;\n    uint16_t absD = (d < 0) ? -ud : ud;\n    uint16_t floor_log_2_d = 15 - libdivide_count_leading_zeros16(absD);\n    // check if exactly one bit is set,\n    // don't care if absD is 0 since that's divide by zero\n    if ((absD & (absD - 1)) == 0) {\n        // Branchfree and normal paths are exactly the same\n        result.magic = 0;\n        result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0));\n    } else {\n        LIBDIVIDE_ASSERT(floor_log_2_d >= 1);\n\n        uint8_t more;\n        // the dividend here is 2**(floor_log_2_d + 15), so the low 16 bit word\n        // is 0 and the high word is 2**(floor_log_2_d - 1)\n        uint16_t rem, proposed_m;\n        proposed_m = libdivide_32_div_16_to_16((uint16_t)1 << (floor_log_2_d - 1), 0, absD, &rem);\n        const uint16_t e = absD - rem;\n\n        // We are going to start with a power of floor_log_2_d - 1.\n        // This works if e < 2**floor_log_2_d.\n        if (!branchfree && e < ((uint16_t)1 << floor_log_2_d)) {\n            // This power works\n            more = (uint8_t)(floor_log_2_d - 1);\n        } else {\n            // We need to go one higher. 
This should not make proposed_m\n            // overflow, but it will make it negative when interpreted as an\n            // int16_t.\n            proposed_m += proposed_m;\n            const uint16_t twice_rem = rem + rem;\n            if (twice_rem >= absD || twice_rem < rem) proposed_m += 1;\n            more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);\n        }\n\n        proposed_m += 1;\n        int16_t magic = (int16_t)proposed_m;\n\n        // Mark if we are negative. Note we only negate the magic number in the\n        // branchfull case.\n        if (d < 0) {\n            more |= LIBDIVIDE_NEGATIVE_DIVISOR;\n            if (!branchfree) {\n                magic = -magic;\n            }\n        }\n\n        result.more = more;\n        result.magic = magic;\n    }\n    return result;\n}\n\nstatic LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_s16_gen(int16_t d) {\n    return libdivide_internal_s16_gen(d, 0);\n}\n\nstatic LIBDIVIDE_INLINE struct libdivide_s16_branchfree_t libdivide_s16_branchfree_gen(int16_t d) {\n    struct libdivide_s16_t tmp = libdivide_internal_s16_gen(d, 1);\n    struct libdivide_s16_branchfree_t result = {tmp.magic, tmp.more};\n    return result;\n}\n\n// The original libdivide_s16_do takes a const pointer. However, this cannot be used\n// with a compile time constant libdivide_s16_t: it will generate a warning about\n// taking the address of a temporary. 
Hence this overload.\nstatic LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw(int16_t numer, int16_t magic, uint8_t more) {\n    uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;\n\n    if (!magic) {\n        uint16_t sign = (int8_t)more >> 7;\n        uint16_t mask = ((uint16_t)1 << shift) - 1;\n        uint16_t uq = numer + ((numer >> 15) & mask);\n        int16_t q = (int16_t)uq;\n        q >>= shift;\n        q = (q ^ sign) - sign;\n        return q;\n    } else {\n        uint16_t uq = (uint16_t)libdivide_mullhi_s16(numer, magic);\n        if (more & LIBDIVIDE_ADD_MARKER) {\n            // must be arithmetic shift and then sign extend\n            int16_t sign = (int8_t)more >> 7;\n            // q += (more < 0 ? -numer : numer)\n            // cast required to avoid UB\n            uq += ((uint16_t)numer ^ sign) - sign;\n        }\n        int16_t q = (int16_t)uq;\n        q >>= shift;\n        q += (q < 0);\n        return q;\n    }\n}\n\nstatic LIBDIVIDE_INLINE int16_t libdivide_s16_do(int16_t numer, const struct libdivide_s16_t *denom) {\n    return libdivide_s16_do_raw(numer, denom->magic, denom->more);\n}\n\nstatic LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_do(int16_t numer, const struct libdivide_s16_branchfree_t *denom) {\n    uint8_t more = denom->more;\n    uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;\n    // must be arithmetic shift and then sign extend\n    int16_t sign = (int8_t)more >> 7;\n    int16_t magic = denom->magic;\n    int16_t q = libdivide_mullhi_s16(numer, magic);\n    q += numer;\n\n    // If q is non-negative, we have nothing to do\n    // If q is negative, we want to add either (2**shift)-1 if d is a power of\n    // 2, or (2**shift) if it is not a power of 2\n    uint16_t is_power_of_2 = (magic == 0);\n    uint16_t q_sign = (uint16_t)(q >> 15);\n    q += q_sign & (((uint16_t)1 << shift) - is_power_of_2);\n\n    // Now arithmetic right shift\n    q >>= shift;\n    // Negate if needed\n    q = (q ^ sign) - sign;\n\n    return 
q;\n}\n\nstatic LIBDIVIDE_INLINE int16_t libdivide_s16_recover(const struct libdivide_s16_t *denom) {\n    uint8_t more = denom->more;\n    uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;\n    if (!denom->magic) {\n        uint16_t absD = (uint16_t)1 << shift;\n        if (more & LIBDIVIDE_NEGATIVE_DIVISOR) {\n            absD = -absD;\n        }\n        return (int16_t)absD;\n    } else {\n        // Unsigned math is much easier\n        // We negate the magic number only in the branchfull case, and we don't\n        // know which case we're in. However we have enough information to\n        // determine the correct sign of the magic number. The divisor was\n        // negative if LIBDIVIDE_NEGATIVE_DIVISOR is set. If ADD_MARKER is set,\n        // the magic number's sign is opposite that of the divisor.\n        // We want to compute the positive magic number.\n        int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR);\n        int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0;\n\n        // Handle the power of 2 case (including branchfree)\n        if (denom->magic == 0) {\n            int16_t result = (uint16_t)1 << shift;\n            return negative_divisor ? -result : result;\n        }\n\n        uint16_t d = (uint16_t)(magic_was_negated ? -denom->magic : denom->magic);\n        uint32_t n = (uint32_t)1 << (16 + shift);  // this shift cannot exceed 30\n        uint16_t q = (uint16_t)(n / d);\n        int16_t result = (int16_t)q;\n        result += 1;\n        return negative_divisor ? 
-result : result;\n    }\n}\n\nstatic LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_recover(const struct libdivide_s16_branchfree_t *denom) {\n    const struct libdivide_s16_t den = {denom->magic, denom->more};\n    return libdivide_s16_recover(&den);\n}\n\n////////// SINT32\n\nstatic LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_internal_s32_gen(\n    int32_t d, int branchfree) {\n    if (d == 0) {\n        LIBDIVIDE_ERROR(\"divider must be != 0\");\n    }\n\n    struct libdivide_s32_t result;\n\n    // If d is a power of 2, or negative a power of 2, we have to use a shift.\n    // This is especially important because the magic algorithm fails for -1.\n    // To check if d is a power of 2 or its inverse, it suffices to check\n    // whether its absolute value has exactly one bit set. This works even for\n    // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set\n    // and is a power of 2.\n    uint32_t ud = (uint32_t)d;\n    uint32_t absD = (d < 0) ? -ud : ud;\n    uint32_t floor_log_2_d = 31 - libdivide_count_leading_zeros32(absD);\n    // check if exactly one bit is set,\n    // don't care if absD is 0 since that's divide by zero\n    if ((absD & (absD - 1)) == 0) {\n        // Branchfree and normal paths are exactly the same\n        result.magic = 0;\n        result.more = (uint8_t)(floor_log_2_d | (d < 0 ? 
LIBDIVIDE_NEGATIVE_DIVISOR : 0));\n    } else {\n        LIBDIVIDE_ASSERT(floor_log_2_d >= 1);\n\n        uint8_t more;\n        // the dividend here is 2**(floor_log_2_d + 31), so the low 32 bit word\n        // is 0 and the high word is 2**(floor_log_2_d - 1)\n        uint32_t rem, proposed_m;\n        proposed_m = libdivide_64_div_32_to_32((uint32_t)1 << (floor_log_2_d - 1), 0, absD, &rem);\n        const uint32_t e = absD - rem;\n\n        // We are going to start with a power of floor_log_2_d - 1.\n        // This works if e < 2**floor_log_2_d.\n        if (!branchfree && e < ((uint32_t)1 << floor_log_2_d)) {\n            // This power works\n            more = (uint8_t)(floor_log_2_d - 1);\n        } else {\n            // We need to go one higher. This should not make proposed_m\n            // overflow, but it will make it negative when interpreted as an\n            // int32_t.\n            proposed_m += proposed_m;\n            const uint32_t twice_rem = rem + rem;\n            if (twice_rem >= absD || twice_rem < rem) proposed_m += 1;\n            more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);\n        }\n\n        proposed_m += 1;\n        int32_t magic = (int32_t)proposed_m;\n\n        // Mark if we are negative. 
Note we only negate the magic number in the\n        // branchfull case.\n        if (d < 0) {\n            more |= LIBDIVIDE_NEGATIVE_DIVISOR;\n            if (!branchfree) {\n                magic = -magic;\n            }\n        }\n\n        result.more = more;\n        result.magic = magic;\n    }\n    return result;\n}\n\nstatic LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_s32_gen(int32_t d) {\n    return libdivide_internal_s32_gen(d, 0);\n}\n\nstatic LIBDIVIDE_INLINE struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d) {\n    struct libdivide_s32_t tmp = libdivide_internal_s32_gen(d, 1);\n    struct libdivide_s32_branchfree_t result = {tmp.magic, tmp.more};\n    return result;\n}\n\nstatic LIBDIVIDE_INLINE int32_t libdivide_s32_do_raw(int32_t numer, int32_t magic, uint8_t more) {\n    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;\n\n    if (!magic) {\n        uint32_t sign = (int8_t)more >> 7;\n        uint32_t mask = ((uint32_t)1 << shift) - 1;\n        uint32_t uq = numer + ((numer >> 31) & mask);\n        int32_t q = (int32_t)uq;\n        q >>= shift;\n        q = (q ^ sign) - sign;\n        return q;\n    } else {\n        uint32_t uq = (uint32_t)libdivide_mullhi_s32(numer, magic);\n        if (more & LIBDIVIDE_ADD_MARKER) {\n            // must be arithmetic shift and then sign extend\n            int32_t sign = (int8_t)more >> 7;\n            // q += (more < 0 ? 
-numer : numer)\n            // cast required to avoid UB\n            uq += ((uint32_t)numer ^ sign) - sign;\n        }\n        int32_t q = (int32_t)uq;\n        q >>= shift;\n        q += (q < 0);\n        return q;\n    }\n}\n\nstatic LIBDIVIDE_INLINE int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) {\n    return libdivide_s32_do_raw(numer, denom->magic, denom->more);\n}\n\nstatic LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_do(int32_t numer, const struct libdivide_s32_branchfree_t *denom) {\n    uint8_t more = denom->more;\n    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;\n    // must be arithmetic shift and then sign extend\n    int32_t sign = (int8_t)more >> 7;\n    int32_t magic = denom->magic;\n    int32_t q = libdivide_mullhi_s32(numer, magic);\n    q += numer;\n\n    // If q is non-negative, we have nothing to do\n    // If q is negative, we want to add either (2**shift)-1 if d is a power of\n    // 2, or (2**shift) if it is not a power of 2\n    uint32_t is_power_of_2 = (magic == 0);\n    uint32_t q_sign = (uint32_t)(q >> 31);\n    q += q_sign & (((uint32_t)1 << shift) - is_power_of_2);\n\n    // Now arithmetic right shift\n    q >>= shift;\n    // Negate if needed\n    q = (q ^ sign) - sign;\n\n    return q;\n}\n\nstatic LIBDIVIDE_INLINE int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom) {\n    uint8_t more = denom->more;\n    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;\n    if (!denom->magic) {\n        uint32_t absD = (uint32_t)1 << shift;\n        if (more & LIBDIVIDE_NEGATIVE_DIVISOR) {\n            absD = -absD;\n        }\n        return (int32_t)absD;\n    } else {\n        // Unsigned math is much easier\n        // We negate the magic number only in the branchfull case, and we don't\n        // know which case we're in. However we have enough information to\n        // determine the correct sign of the magic number. 
The divisor was\n        // negative if LIBDIVIDE_NEGATIVE_DIVISOR is set. If ADD_MARKER is set,\n        // the magic number's sign is opposite that of the divisor.\n        // We want to compute the positive magic number.\n        int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR);\n        int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0;\n\n        // Handle the power of 2 case (including branchfree)\n        if (denom->magic == 0) {\n            int32_t result = (uint32_t)1 << shift;\n            return negative_divisor ? -result : result;\n        }\n\n        uint32_t d = (uint32_t)(magic_was_negated ? -denom->magic : denom->magic);\n        uint64_t n = (uint64_t)1 << (32 + shift);  // this shift cannot exceed 30\n        uint32_t q = (uint32_t)(n / d);\n        int32_t result = (int32_t)q;\n        result += 1;\n        return negative_divisor ? -result : result;\n    }\n}\n\nstatic LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_recover(const struct libdivide_s32_branchfree_t *denom) {\n    const struct libdivide_s32_t den = {denom->magic, denom->more};\n    return libdivide_s32_recover(&den);\n}\n\n////////// SINT64\n\nstatic LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_internal_s64_gen(\n    int64_t d, int branchfree) {\n    if (d == 0) {\n        LIBDIVIDE_ERROR(\"divider must be != 0\");\n    }\n\n    struct libdivide_s64_t result;\n\n    // If d is a power of 2, or negative a power of 2, we have to use a shift.\n    // This is especially important because the magic algorithm fails for -1.\n    // To check if d is a power of 2 or its inverse, it suffices to check\n    // whether its absolute value has exactly one bit set.  This works even for\n    // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set\n    // and is a power of 2.\n    uint64_t ud = (uint64_t)d;\n    uint64_t absD = (d < 0) ? 
-ud : ud;\n    uint32_t floor_log_2_d = 63 - libdivide_count_leading_zeros64(absD);\n    // check if exactly one bit is set,\n    // don't care if absD is 0 since that's divide by zero\n    if ((absD & (absD - 1)) == 0) {\n        // Branchfree and non-branchfree cases are the same\n        result.magic = 0;\n        result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0));\n    } else {\n        // the dividend here is 2**(floor_log_2_d + 63), so the low 64 bit word\n        // is 0 and the high word is 2**(floor_log_2_d - 1)\n        uint8_t more;\n        uint64_t rem, proposed_m;\n        proposed_m = libdivide_128_div_64_to_64((uint64_t)1 << (floor_log_2_d - 1), 0, absD, &rem);\n        const uint64_t e = absD - rem;\n\n        // We are going to start with a power of floor_log_2_d - 1.\n        // This works if e < 2**floor_log_2_d.\n        if (!branchfree && e < ((uint64_t)1 << floor_log_2_d)) {\n            // This power works\n            more = (uint8_t)(floor_log_2_d - 1);\n        } else {\n            // We need to go one higher. This should not make proposed_m\n            // overflow, but it will make it negative when interpreted as an\n            // int64_t.\n            proposed_m += proposed_m;\n            const uint64_t twice_rem = rem + rem;\n            if (twice_rem >= absD || twice_rem < rem) proposed_m += 1;\n            // note that we only set the LIBDIVIDE_NEGATIVE_DIVISOR bit if we\n            // also set ADD_MARKER; this is an annoying optimization that\n            // enables algorithm #4 to avoid the mask. 
However we always set it\n            // in the branchfree case\n            more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);\n        }\n        proposed_m += 1;\n        int64_t magic = (int64_t)proposed_m;\n\n        // Mark if we are negative\n        if (d < 0) {\n            more |= LIBDIVIDE_NEGATIVE_DIVISOR;\n            if (!branchfree) {\n                magic = -magic;\n            }\n        }\n\n        result.more = more;\n        result.magic = magic;\n    }\n    return result;\n}\n\nstatic LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_s64_gen(int64_t d) {\n    return libdivide_internal_s64_gen(d, 0);\n}\n\nstatic LIBDIVIDE_INLINE struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d) {\n    struct libdivide_s64_t tmp = libdivide_internal_s64_gen(d, 1);\n    struct libdivide_s64_branchfree_t ret = {tmp.magic, tmp.more};\n    return ret;\n}\n\nstatic LIBDIVIDE_INLINE int64_t libdivide_s64_do_raw(int64_t numer, int64_t magic, uint8_t more) {\n    uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;\n\n    if (!magic) {  // shift path\n        uint64_t mask = ((uint64_t)1 << shift) - 1;\n        uint64_t uq = numer + ((numer >> 63) & mask);\n        int64_t q = (int64_t)uq;\n        q >>= shift;\n        // must be arithmetic shift and then sign-extend\n        int64_t sign = (int8_t)more >> 7;\n        q = (q ^ sign) - sign;\n        return q;\n    } else {\n        uint64_t uq = (uint64_t)libdivide_mullhi_s64(numer, magic);\n        if (more & LIBDIVIDE_ADD_MARKER) {\n            // must be arithmetic shift and then sign extend\n            int64_t sign = (int8_t)more >> 7;\n            // q += (more < 0 ? 
-numer : numer)\n            // cast required to avoid UB\n            uq += ((uint64_t)numer ^ sign) - sign;\n        }\n        int64_t q = (int64_t)uq;\n        q >>= shift;\n        q += (q < 0);\n        return q;\n    }\n}\n\nstatic LIBDIVIDE_INLINE int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) {\n    return libdivide_s64_do_raw(numer, denom->magic, denom->more);\n}\n\nstatic LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_branchfree_t *denom) {\n    uint8_t more = denom->more;\n    uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;\n    // must be arithmetic shift and then sign extend\n    int64_t sign = (int8_t)more >> 7;\n    int64_t magic = denom->magic;\n    int64_t q = libdivide_mullhi_s64(numer, magic);\n    q += numer;\n\n    // If q is non-negative, we have nothing to do.\n    // If q is negative, we want to add either (2**shift)-1 if d is a power of\n    // 2, or (2**shift) if it is not a power of 2.\n    uint64_t is_power_of_2 = (magic == 0);\n    uint64_t q_sign = (uint64_t)(q >> 63);\n    q += q_sign & (((uint64_t)1 << shift) - is_power_of_2);\n\n    // Arithmetic right shift\n    q >>= shift;\n    // Negate if needed\n    q = (q ^ sign) - sign;\n\n    return q;\n}\n\nstatic LIBDIVIDE_INLINE int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom) {\n    uint8_t more = denom->more;\n    uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;\n    if (denom->magic == 0) {  // shift path\n        uint64_t absD = (uint64_t)1 << shift;\n        if (more & LIBDIVIDE_NEGATIVE_DIVISOR) {\n            absD = -absD;\n        }\n        return (int64_t)absD;\n    } else {\n        // Unsigned math is much easier\n        int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR);\n        int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0;\n\n        uint64_t d = (uint64_t)(magic_was_negated ? 
-denom->magic : denom->magic);\n        uint64_t n_hi = (uint64_t)1 << shift, n_lo = 0;\n        uint64_t rem_ignored;\n        uint64_t q = libdivide_128_div_64_to_64(n_hi, n_lo, d, &rem_ignored);\n        int64_t result = (int64_t)(q + 1);\n        if (negative_divisor) {\n            result = -result;\n        }\n        return result;\n    }\n}\n\nstatic LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t *denom) {\n    const struct libdivide_s64_t den = {denom->magic, denom->more};\n    return libdivide_s64_recover(&den);\n}\n\n// Simplest possible vector type division: treat the vector type as an array\n// of underlying native type.\n//\n// Use a union to read a vector via pointer-to-integer, without violating strict\n// aliasing.\n#define SIMPLE_VECTOR_DIVISION(IntT, VecT, Algo)                          \\\n    const size_t count = sizeof(VecT) / sizeof(IntT);                     \\\n    union type_pun_vec {                                                  \\\n        VecT vec;                                                         \\\n        IntT arr[sizeof(VecT) / sizeof(IntT)];                            \\\n    };                                                                    \\\n    union type_pun_vec result;                                            \\\n    union type_pun_vec input;                                             \\\n    input.vec = numers;                                                   \\\n    for (size_t loop = 0; loop < count; ++loop) {                         \\\n        result.arr[loop] = libdivide_##Algo##_do(input.arr[loop], denom); \\\n    }                                                                     \\\n    return result.vec;\n\n#if defined(LIBDIVIDE_NEON)\n\nstatic LIBDIVIDE_INLINE uint16x8_t libdivide_u16_do_vec128(\n    uint16x8_t numers, const struct libdivide_u16_t *denom);\nstatic LIBDIVIDE_INLINE int16x8_t libdivide_s16_do_vec128(\n    int16x8_t numers, const 
struct libdivide_s16_t *denom);\nstatic LIBDIVIDE_INLINE uint32x4_t libdivide_u32_do_vec128(\n    uint32x4_t numers, const struct libdivide_u32_t *denom);\nstatic LIBDIVIDE_INLINE int32x4_t libdivide_s32_do_vec128(\n    int32x4_t numers, const struct libdivide_s32_t *denom);\nstatic LIBDIVIDE_INLINE uint64x2_t libdivide_u64_do_vec128(\n    uint64x2_t numers, const struct libdivide_u64_t *denom);\nstatic LIBDIVIDE_INLINE int64x2_t libdivide_s64_do_vec128(\n    int64x2_t numers, const struct libdivide_s64_t *denom);\n\nstatic LIBDIVIDE_INLINE uint16x8_t libdivide_u16_branchfree_do_vec128(\n    uint16x8_t numers, const struct libdivide_u16_branchfree_t *denom);\nstatic LIBDIVIDE_INLINE int16x8_t libdivide_s16_branchfree_do_vec128(\n    int16x8_t numers, const struct libdivide_s16_branchfree_t *denom);\nstatic LIBDIVIDE_INLINE uint32x4_t libdivide_u32_branchfree_do_vec128(\n    uint32x4_t numers, const struct libdivide_u32_branchfree_t *denom);\nstatic LIBDIVIDE_INLINE int32x4_t libdivide_s32_branchfree_do_vec128(\n    int32x4_t numers, const struct libdivide_s32_branchfree_t *denom);\nstatic LIBDIVIDE_INLINE uint64x2_t libdivide_u64_branchfree_do_vec128(\n    uint64x2_t numers, const struct libdivide_u64_branchfree_t *denom);\nstatic LIBDIVIDE_INLINE int64x2_t libdivide_s64_branchfree_do_vec128(\n    int64x2_t numers, const struct libdivide_s64_branchfree_t *denom);\n\n//////// Internal Utility Functions\n\n// Logical right shift by runtime value.\n// NEON implements right shift as left shifts by negative values.\nstatic LIBDIVIDE_INLINE uint32x4_t libdivide_u32_neon_srl(uint32x4_t v, uint8_t amt) {\n    int32_t wamt = (int32_t)(amt);\n    return vshlq_u32(v, vdupq_n_s32(-wamt));\n}\n\nstatic LIBDIVIDE_INLINE uint64x2_t libdivide_u64_neon_srl(uint64x2_t v, uint8_t amt) {\n    int64_t wamt = (int64_t)(amt);\n    return vshlq_u64(v, vdupq_n_s64(-wamt));\n}\n\n// Arithmetic right shift by runtime value.\nstatic LIBDIVIDE_INLINE int32x4_t libdivide_s32_neon_sra(int32x4_t 
v, uint8_t amt) {\n    int32_t wamt = (int32_t)(amt);\n    return vshlq_s32(v, vdupq_n_s32(-wamt));\n}\n\nstatic LIBDIVIDE_INLINE int64x2_t libdivide_s64_neon_sra(int64x2_t v, uint8_t amt) {\n    int64_t wamt = (int64_t)(amt);\n    return vshlq_s64(v, vdupq_n_s64(-wamt));\n}\n\nstatic LIBDIVIDE_INLINE int64x2_t libdivide_s64_signbits(int64x2_t v) { return vshrq_n_s64(v, 63); }\n\nstatic LIBDIVIDE_INLINE uint32x4_t libdivide_mullhi_u32_vec128(uint32x4_t a, uint32_t b) {\n    // Desire is [x0, x1, x2, x3]\n    uint32x4_t w1 = vreinterpretq_u32_u64(vmull_n_u32(vget_low_u32(a), b));  // [_, x0, _, x1]\n    uint32x4_t w2 = vreinterpretq_u32_u64(vmull_high_n_u32(a, b));           //[_, x2, _, x3]\n    return vuzp2q_u32(w1, w2);                                               // [x0, x1, x2, x3]\n}\n\nstatic LIBDIVIDE_INLINE int32x4_t libdivide_mullhi_s32_vec128(int32x4_t a, int32_t b) {\n    int32x4_t w1 = vreinterpretq_s32_s64(vmull_n_s32(vget_low_s32(a), b));  // [_, x0, _, x1]\n    int32x4_t w2 = vreinterpretq_s32_s64(vmull_high_n_s32(a, b));           //[_, x2, _, x3]\n    return vuzp2q_s32(w1, w2);                                              // [x0, x1, x2, x3]\n}\n\nstatic LIBDIVIDE_INLINE uint64x2_t libdivide_mullhi_u64_vec128(uint64x2_t x, uint64_t sy) {\n    // full 128 bits product is:\n    // x0*y0 + (x0*y1 << 32) + (x1*y0 << 32) + (x1*y1 << 64)\n    // Note x0,y0,x1,y1 are all conceptually uint32, products are 32x32->64.\n\n    // Get low and high words. 
x0 contains low 32 bits, x1 is high 32 bits.\n    uint64x2_t y = vdupq_n_u64(sy);\n    uint32x2_t x0 = vmovn_u64(x);\n    uint32x2_t y0 = vmovn_u64(y);\n    uint32x2_t x1 = vshrn_n_u64(x, 32);\n    uint32x2_t y1 = vshrn_n_u64(y, 32);\n\n    // Compute x0*y0.\n    uint64x2_t x0y0 = vmull_u32(x0, y0);\n    uint64x2_t x0y0_hi = vshrq_n_u64(x0y0, 32);\n\n    // Compute other intermediate products.\n    uint64x2_t temp = vmlal_u32(x0y0_hi, x1, y0);  // temp = x0y0_hi + x1*y0;\n    // We want to split temp into its low 32 bits and high 32 bits, both\n    // in the low half of 64 bit registers.\n    // Use shifts to avoid needing a reg for the mask.\n    uint64x2_t temp_lo = vshrq_n_u64(vshlq_n_u64(temp, 32), 32);  // temp_lo = temp & 0xFFFFFFFF;\n    uint64x2_t temp_hi = vshrq_n_u64(temp, 32);                   // temp_hi = temp >> 32;\n\n    temp_lo = vmlal_u32(temp_lo, x0, y1);  // temp_lo += x0*y1\n    temp_lo = vshrq_n_u64(temp_lo, 32);    // temp_lo >>= 32\n    temp_hi = vmlal_u32(temp_hi, x1, y1);  // temp_hi += x1*y1\n    uint64x2_t result = vaddq_u64(temp_hi, temp_lo);\n    return result;\n}\n\nstatic LIBDIVIDE_INLINE int64x2_t libdivide_mullhi_s64_vec128(int64x2_t x, int64_t sy) {\n    int64x2_t p = vreinterpretq_s64_u64(\n        libdivide_mullhi_u64_vec128(vreinterpretq_u64_s64(x), (uint64_t)(sy)));\n    int64x2_t y = vdupq_n_s64(sy);\n    int64x2_t t1 = vandq_s64(libdivide_s64_signbits(x), y);\n    int64x2_t t2 = vandq_s64(libdivide_s64_signbits(y), x);\n    p = vsubq_s64(p, t1);\n    p = vsubq_s64(p, t2);\n    return p;\n}\n\n////////// UINT16\n\nuint16x8_t libdivide_u16_do_vec128(uint16x8_t numers, const struct libdivide_u16_t *denom){\n    SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16)}\n\nuint16x8_t libdivide_u16_branchfree_do_vec128(\n    uint16x8_t numers, const struct libdivide_u16_branchfree_t *denom){\n    SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16_branchfree)}\n\n////////// UINT32\n\nuint32x4_t libdivide_u32_do_vec128(uint32x4_t numers, 
const struct libdivide_u32_t *denom) {\n    uint8_t more = denom->more;\n    if (!denom->magic) {\n        return libdivide_u32_neon_srl(numers, more);\n    } else {\n        uint32x4_t q = libdivide_mullhi_u32_vec128(numers, denom->magic);\n        if (more & LIBDIVIDE_ADD_MARKER) {\n            // uint32_t t = ((numer - q) >> 1) + q;\n            // return t >> denom->shift;\n            // Note we can use halving-subtract to avoid the shift.\n            uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;\n            uint32x4_t t = vaddq_u32(vhsubq_u32(numers, q), q);\n            return libdivide_u32_neon_srl(t, shift);\n        } else {\n            return libdivide_u32_neon_srl(q, more);\n        }\n    }\n}\n\nuint32x4_t libdivide_u32_branchfree_do_vec128(\n    uint32x4_t numers, const struct libdivide_u32_branchfree_t *denom) {\n    uint32x4_t q = libdivide_mullhi_u32_vec128(numers, denom->magic);\n    uint32x4_t t = vaddq_u32(vhsubq_u32(numers, q), q);\n    return libdivide_u32_neon_srl(t, denom->more);\n}\n\n////////// UINT64\n\nuint64x2_t libdivide_u64_do_vec128(uint64x2_t numers, const struct libdivide_u64_t *denom) {\n    uint8_t more = denom->more;\n    if (!denom->magic) {\n        return libdivide_u64_neon_srl(numers, more);\n    } else {\n        uint64x2_t q = libdivide_mullhi_u64_vec128(numers, denom->magic);\n        if (more & LIBDIVIDE_ADD_MARKER) {\n            // uint64_t t = ((numer - q) >> 1) + q;\n            // return t >> denom->shift;\n            // No 64-bit halving subtracts in NEON :(\n            uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;\n            uint64x2_t t = vaddq_u64(vshrq_n_u64(vsubq_u64(numers, q), 1), q);\n            return libdivide_u64_neon_srl(t, shift);\n        } else {\n            return libdivide_u64_neon_srl(q, more);\n        }\n    }\n}\n\nuint64x2_t libdivide_u64_branchfree_do_vec128(\n    uint64x2_t numers, const struct libdivide_u64_branchfree_t *denom) {\n    uint64x2_t q = 
libdivide_mullhi_u64_vec128(numers, denom->magic);\n    uint64x2_t t = vaddq_u64(vshrq_n_u64(vsubq_u64(numers, q), 1), q);\n    return libdivide_u64_neon_srl(t, denom->more);\n}\n\n////////// SINT16\n\nint16x8_t libdivide_s16_do_vec128(int16x8_t numers, const struct libdivide_s16_t *denom){\n    SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16)}\n\nint16x8_t libdivide_s16_branchfree_do_vec128(\n    int16x8_t numers, const struct libdivide_s16_branchfree_t *denom){\n    SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16_branchfree)}\n\n////////// SINT32\n\nint32x4_t libdivide_s32_do_vec128(int32x4_t numers, const struct libdivide_s32_t *denom) {\n    uint8_t more = denom->more;\n    if (!denom->magic) {\n        uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;\n        uint32_t mask = ((uint32_t)1 << shift) - 1;\n        int32x4_t roundToZeroTweak = vdupq_n_s32((int)mask);\n        // q = numer + ((numer >> 31) & roundToZeroTweak);\n        int32x4_t q = vaddq_s32(numers, vandq_s32(vshrq_n_s32(numers, 31), roundToZeroTweak));\n        q = libdivide_s32_neon_sra(q, shift);\n        int32x4_t sign = vdupq_n_s32((int8_t)more >> 7);\n        // q = (q ^ sign) - sign;\n        q = vsubq_s32(veorq_s32(q, sign), sign);\n        return q;\n    } else {\n        int32x4_t q = libdivide_mullhi_s32_vec128(numers, denom->magic);\n        if (more & LIBDIVIDE_ADD_MARKER) {\n            // must be arithmetic shift\n            int32x4_t sign = vdupq_n_s32((int8_t)more >> 7);\n            // q += ((numer ^ sign) - sign);\n            q = vaddq_s32(q, vsubq_s32(veorq_s32(numers, sign), sign));\n        }\n        // q >>= shift\n        q = libdivide_s32_neon_sra(q, more & LIBDIVIDE_32_SHIFT_MASK);\n        q = vaddq_s32(\n            q, vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(q), 31)));  // q += (q < 0)\n        return q;\n    }\n}\n\nint32x4_t libdivide_s32_branchfree_do_vec128(\n    int32x4_t numers, const struct libdivide_s32_branchfree_t *denom) {\n    int32_t magic = 
denom->magic;\n    uint8_t more = denom->more;\n    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;\n    // must be arithmetic shift\n    int32x4_t sign = vdupq_n_s32((int8_t)more >> 7);\n    int32x4_t q = libdivide_mullhi_s32_vec128(numers, magic);\n    q = vaddq_s32(q, numers);  // q += numers\n\n    // If q is non-negative, we have nothing to do\n    // If q is negative, we want to add either (2**shift)-1 if d is\n    // a power of 2, or (2**shift) if it is not a power of 2\n    uint32_t is_power_of_2 = (magic == 0);\n    int32x4_t q_sign = vshrq_n_s32(q, 31);  // q_sign = q >> 31\n    int32x4_t mask = vdupq_n_s32(((uint32_t)1 << shift) - is_power_of_2);\n    q = vaddq_s32(q, vandq_s32(q_sign, mask));  // q = q + (q_sign & mask)\n    q = libdivide_s32_neon_sra(q, shift);       // q >>= shift\n    q = vsubq_s32(veorq_s32(q, sign), sign);    // q = (q ^ sign) - sign\n    return q;\n}\n\n////////// SINT64\n\nint64x2_t libdivide_s64_do_vec128(int64x2_t numers, const struct libdivide_s64_t *denom) {\n    uint8_t more = denom->more;\n    int64_t magic = denom->magic;\n    if (magic == 0) {  // shift path\n        uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;\n        uint64_t mask = ((uint64_t)1 << shift) - 1;\n        int64x2_t roundToZeroTweak = vdupq_n_s64(mask);  // TODO: no need to sign extend\n        // q = numer + ((numer >> 63) & roundToZeroTweak);\n        int64x2_t q =\n            vaddq_s64(numers, vandq_s64(libdivide_s64_signbits(numers), roundToZeroTweak));\n        q = libdivide_s64_neon_sra(q, shift);\n        // q = (q ^ sign) - sign;\n        int64x2_t sign = vreinterpretq_s64_s8(vdupq_n_s8((int8_t)more >> 7));\n        q = vsubq_s64(veorq_s64(q, sign), sign);\n        return q;\n    } else {\n        int64x2_t q = libdivide_mullhi_s64_vec128(numers, magic);\n        if (more & LIBDIVIDE_ADD_MARKER) {\n            // must be arithmetic shift\n            int64x2_t sign = vdupq_n_s64((int8_t)more >> 7);  // TODO: no need to widen\n            // q 
+= ((numer ^ sign) - sign);\n            q = vaddq_s64(q, vsubq_s64(veorq_s64(numers, sign), sign));\n        }\n        // q >>= denom->mult_path.shift\n        q = libdivide_s64_neon_sra(q, more & LIBDIVIDE_64_SHIFT_MASK);\n        q = vaddq_s64(\n            q, vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(q), 63)));  // q += (q < 0)\n        return q;\n    }\n}\n\nint64x2_t libdivide_s64_branchfree_do_vec128(\n    int64x2_t numers, const struct libdivide_s64_branchfree_t *denom) {\n    int64_t magic = denom->magic;\n    uint8_t more = denom->more;\n    uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;\n    // must be arithmetic shift\n    int64x2_t sign = vdupq_n_s64((int8_t)more >> 7);  // TODO: avoid sign extend\n\n    // libdivide_mullhi_s64(numers, magic);\n    int64x2_t q = libdivide_mullhi_s64_vec128(numers, magic);\n    q = vaddq_s64(q, numers);  // q += numers\n\n    // If q is non-negative, we have nothing to do.\n    // If q is negative, we want to add either (2**shift)-1 if d is\n    // a power of 2, or (2**shift) if it is not a power of 2.\n    uint32_t is_power_of_2 = (magic == 0);\n    int64x2_t q_sign = libdivide_s64_signbits(q);  // q_sign = q >> 63\n    int64x2_t mask = vdupq_n_s64(((uint64_t)1 << shift) - is_power_of_2);\n    q = vaddq_s64(q, vandq_s64(q_sign, mask));  // q = q + (q_sign & mask)\n    q = libdivide_s64_neon_sra(q, shift);       // q >>= shift\n    q = vsubq_s64(veorq_s64(q, sign), sign);    // q = (q ^ sign) - sign\n    return q;\n}\n\n#endif\n\n#if defined(LIBDIVIDE_AVX512)\n\nstatic LIBDIVIDE_INLINE __m512i libdivide_u16_do_vec512(\n    __m512i numers, const struct libdivide_u16_t *denom);\nstatic LIBDIVIDE_INLINE __m512i libdivide_s16_do_vec512(\n    __m512i numers, const struct libdivide_s16_t *denom);\nstatic LIBDIVIDE_INLINE __m512i libdivide_u32_do_vec512(\n    __m512i numers, const struct libdivide_u32_t *denom);\nstatic LIBDIVIDE_INLINE __m512i libdivide_s32_do_vec512(\n    __m512i numers, const struct 
libdivide_s32_t *denom);\nstatic LIBDIVIDE_INLINE __m512i libdivide_u64_do_vec512(\n    __m512i numers, const struct libdivide_u64_t *denom);\nstatic LIBDIVIDE_INLINE __m512i libdivide_s64_do_vec512(\n    __m512i numers, const struct libdivide_s64_t *denom);\n\nstatic LIBDIVIDE_INLINE __m512i libdivide_u16_branchfree_do_vec512(\n    __m512i numers, const struct libdivide_u16_branchfree_t *denom);\nstatic LIBDIVIDE_INLINE __m512i libdivide_s16_branchfree_do_vec512(\n    __m512i numers, const struct libdivide_s16_branchfree_t *denom);\nstatic LIBDIVIDE_INLINE __m512i libdivide_u32_branchfree_do_vec512(\n    __m512i numers, const struct libdivide_u32_branchfree_t *denom);\nstatic LIBDIVIDE_INLINE __m512i libdivide_s32_branchfree_do_vec512(\n    __m512i numers, const struct libdivide_s32_branchfree_t *denom);\nstatic LIBDIVIDE_INLINE __m512i libdivide_u64_branchfree_do_vec512(\n    __m512i numers, const struct libdivide_u64_branchfree_t *denom);\nstatic LIBDIVIDE_INLINE __m512i libdivide_s64_branchfree_do_vec512(\n    __m512i numers, const struct libdivide_s64_branchfree_t *denom);\n\n//////// Internal Utility Functions\n\nstatic LIBDIVIDE_INLINE __m512i libdivide_s64_signbits_vec512(__m512i v) {\n    ;\n    return _mm512_srai_epi64(v, 63);\n}\n\nstatic LIBDIVIDE_INLINE __m512i libdivide_s64_shift_right_vec512(__m512i v, int amt) {\n    return _mm512_srai_epi64(v, amt);\n}\n\n// Here, b is assumed to contain one 32-bit value repeated.\nstatic LIBDIVIDE_INLINE __m512i libdivide_mullhi_u32_vec512(__m512i a, __m512i b) {\n    __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epu32(a, b), 32);\n    __m512i a1X3X = _mm512_srli_epi64(a, 32);\n    __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0);\n    __m512i hi_product_Z1Z3 = _mm512_and_si512(_mm512_mul_epu32(a1X3X, b), mask);\n    return _mm512_or_si512(hi_product_0Z2Z, hi_product_Z1Z3);\n}\n\n// b is one 32-bit value repeated.\nstatic LIBDIVIDE_INLINE __m512i 
libdivide_mullhi_s32_vec512(__m512i a, __m512i b) {\n    __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epi32(a, b), 32);\n    __m512i a1X3X = _mm512_srli_epi64(a, 32);\n    __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0);\n    __m512i hi_product_Z1Z3 = _mm512_and_si512(_mm512_mul_epi32(a1X3X, b), mask);\n    return _mm512_or_si512(hi_product_0Z2Z, hi_product_Z1Z3);\n}\n\n// Here, y is assumed to contain one 64-bit value repeated.\nstatic LIBDIVIDE_INLINE __m512i libdivide_mullhi_u64_vec512(__m512i x, __m512i y) {\n    // see m128i variant for comments.\n    __m512i x0y0 = _mm512_mul_epu32(x, y);\n    __m512i x0y0_hi = _mm512_srli_epi64(x0y0, 32);\n\n    __m512i x1 = _mm512_shuffle_epi32(x, (_MM_PERM_ENUM)_MM_SHUFFLE(3, 3, 1, 1));\n    __m512i y1 = _mm512_shuffle_epi32(y, (_MM_PERM_ENUM)_MM_SHUFFLE(3, 3, 1, 1));\n\n    __m512i x0y1 = _mm512_mul_epu32(x, y1);\n    __m512i x1y0 = _mm512_mul_epu32(x1, y);\n    __m512i x1y1 = _mm512_mul_epu32(x1, y1);\n\n    __m512i mask = _mm512_set1_epi64(0xFFFFFFFF);\n    __m512i temp = _mm512_add_epi64(x1y0, x0y0_hi);\n    __m512i temp_lo = _mm512_and_si512(temp, mask);\n    __m512i temp_hi = _mm512_srli_epi64(temp, 32);\n\n    temp_lo = _mm512_srli_epi64(_mm512_add_epi64(temp_lo, x0y1), 32);\n    temp_hi = _mm512_add_epi64(x1y1, temp_hi);\n    return _mm512_add_epi64(temp_lo, temp_hi);\n}\n\n// y is one 64-bit value repeated.\nstatic LIBDIVIDE_INLINE __m512i libdivide_mullhi_s64_vec512(__m512i x, __m512i y) {\n    __m512i p = libdivide_mullhi_u64_vec512(x, y);\n    __m512i t1 = _mm512_and_si512(libdivide_s64_signbits_vec512(x), y);\n    __m512i t2 = _mm512_and_si512(libdivide_s64_signbits_vec512(y), x);\n    p = _mm512_sub_epi64(p, t1);\n    p = _mm512_sub_epi64(p, t2);\n    return p;\n}\n\n////////// UINT16\n\n__m512i libdivide_u16_do_vec512(__m512i numers, const struct libdivide_u16_t *denom){\n    SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16)}\n\n__m512i 
libdivide_u16_branchfree_do_vec512(\n    __m512i numers, const struct libdivide_u16_branchfree_t *denom){\n    SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16_branchfree)}\n\n////////// UINT32\n\n__m512i libdivide_u32_do_vec512(__m512i numers, const struct libdivide_u32_t *denom) {\n    uint8_t more = denom->more;\n    if (!denom->magic) {\n        return _mm512_srli_epi32(numers, more);\n    } else {\n        __m512i q = libdivide_mullhi_u32_vec512(numers, _mm512_set1_epi32(denom->magic));\n        if (more & LIBDIVIDE_ADD_MARKER) {\n            // uint32_t t = ((numer - q) >> 1) + q;\n            // return t >> denom->shift;\n            uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;\n            __m512i t = _mm512_add_epi32(_mm512_srli_epi32(_mm512_sub_epi32(numers, q), 1), q);\n            return _mm512_srli_epi32(t, shift);\n        } else {\n            return _mm512_srli_epi32(q, more);\n        }\n    }\n}\n\n__m512i libdivide_u32_branchfree_do_vec512(\n    __m512i numers, const struct libdivide_u32_branchfree_t *denom) {\n    __m512i q = libdivide_mullhi_u32_vec512(numers, _mm512_set1_epi32(denom->magic));\n    __m512i t = _mm512_add_epi32(_mm512_srli_epi32(_mm512_sub_epi32(numers, q), 1), q);\n    return _mm512_srli_epi32(t, denom->more);\n}\n\n////////// UINT64\n\n__m512i libdivide_u64_do_vec512(__m512i numers, const struct libdivide_u64_t *denom) {\n    uint8_t more = denom->more;\n    if (!denom->magic) {\n        return _mm512_srli_epi64(numers, more);\n    } else {\n        __m512i q = libdivide_mullhi_u64_vec512(numers, _mm512_set1_epi64(denom->magic));\n        if (more & LIBDIVIDE_ADD_MARKER) {\n            // uint32_t t = ((numer - q) >> 1) + q;\n            // return t >> denom->shift;\n            uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;\n            __m512i t = _mm512_add_epi64(_mm512_srli_epi64(_mm512_sub_epi64(numers, q), 1), q);\n            return _mm512_srli_epi64(t, shift);\n        } else {\n            return 
_mm512_srli_epi64(q, more);\n        }\n    }\n}\n\n__m512i libdivide_u64_branchfree_do_vec512(\n    __m512i numers, const struct libdivide_u64_branchfree_t *denom) {\n    __m512i q = libdivide_mullhi_u64_vec512(numers, _mm512_set1_epi64(denom->magic));\n    __m512i t = _mm512_add_epi64(_mm512_srli_epi64(_mm512_sub_epi64(numers, q), 1), q);\n    return _mm512_srli_epi64(t, denom->more);\n}\n\n////////// SINT16\n\n__m512i libdivide_s16_do_vec512(__m512i numers, const struct libdivide_s16_t *denom){\n    SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16)}\n\n__m512i libdivide_s16_branchfree_do_vec512(\n    __m512i numers, const struct libdivide_s16_branchfree_t *denom){\n    SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16_branchfree)}\n\n////////// SINT32\n\n__m512i libdivide_s32_do_vec512(__m512i numers, const struct libdivide_s32_t *denom) {\n    uint8_t more = denom->more;\n    if (!denom->magic) {\n        uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;\n        uint32_t mask = ((uint32_t)1 << shift) - 1;\n        __m512i roundToZeroTweak = _mm512_set1_epi32(mask);\n        // q = numer + ((numer >> 31) & roundToZeroTweak);\n        __m512i q = _mm512_add_epi32(\n            numers, _mm512_and_si512(_mm512_srai_epi32(numers, 31), roundToZeroTweak));\n        q = _mm512_srai_epi32(q, shift);\n        __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);\n        // q = (q ^ sign) - sign;\n        q = _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign);\n        return q;\n    } else {\n        __m512i q = libdivide_mullhi_s32_vec512(numers, _mm512_set1_epi32(denom->magic));\n        if (more & LIBDIVIDE_ADD_MARKER) {\n            // must be arithmetic shift\n            __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);\n            // q += ((numer ^ sign) - sign);\n            q = _mm512_add_epi32(q, _mm512_sub_epi32(_mm512_xor_si512(numers, sign), sign));\n        }\n        // q >>= shift\n        q = _mm512_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK);\n        q = 
_mm512_add_epi32(q, _mm512_srli_epi32(q, 31));  // q += (q < 0)\n        return q;\n    }\n}\n\n__m512i libdivide_s32_branchfree_do_vec512(\n    __m512i numers, const struct libdivide_s32_branchfree_t *denom) {\n    int32_t magic = denom->magic;\n    uint8_t more = denom->more;\n    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;\n    // must be arithmetic shift\n    __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);\n    __m512i q = libdivide_mullhi_s32_vec512(numers, _mm512_set1_epi32(magic));\n    q = _mm512_add_epi32(q, numers);  // q += numers\n\n    // If q is non-negative, we have nothing to do\n    // If q is negative, we want to add either (2**shift)-1 if d is\n    // a power of 2, or (2**shift) if it is not a power of 2\n    uint32_t is_power_of_2 = (magic == 0);\n    __m512i q_sign = _mm512_srai_epi32(q, 31);  // q_sign = q >> 31\n    __m512i mask = _mm512_set1_epi32(((uint32_t)1 << shift) - is_power_of_2);\n    q = _mm512_add_epi32(q, _mm512_and_si512(q_sign, mask));  // q = q + (q_sign & mask)\n    q = _mm512_srai_epi32(q, shift);                          // q >>= shift\n    q = _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign);    // q = (q ^ sign) - sign\n    return q;\n}\n\n////////// SINT64\n\n__m512i libdivide_s64_do_vec512(__m512i numers, const struct libdivide_s64_t *denom) {\n    uint8_t more = denom->more;\n    int64_t magic = denom->magic;\n    if (magic == 0) {  // shift path\n        uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;\n        uint64_t mask = ((uint64_t)1 << shift) - 1;\n        __m512i roundToZeroTweak = _mm512_set1_epi64(mask);\n        // q = numer + ((numer >> 63) & roundToZeroTweak);\n        __m512i q = _mm512_add_epi64(\n            numers, _mm512_and_si512(libdivide_s64_signbits_vec512(numers), roundToZeroTweak));\n        q = libdivide_s64_shift_right_vec512(q, shift);\n        __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);\n        // q = (q ^ sign) - sign;\n        q = _mm512_sub_epi64(_mm512_xor_si512(q, 
sign), sign);\n        return q;\n    } else {\n        __m512i q = libdivide_mullhi_s64_vec512(numers, _mm512_set1_epi64(magic));\n        if (more & LIBDIVIDE_ADD_MARKER) {\n            // must be arithmetic shift\n            __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);\n            // q += ((numer ^ sign) - sign);\n            q = _mm512_add_epi64(q, _mm512_sub_epi64(_mm512_xor_si512(numers, sign), sign));\n        }\n        // q >>= denom->mult_path.shift\n        q = libdivide_s64_shift_right_vec512(q, more & LIBDIVIDE_64_SHIFT_MASK);\n        q = _mm512_add_epi64(q, _mm512_srli_epi64(q, 63));  // q += (q < 0)\n        return q;\n    }\n}\n\n__m512i libdivide_s64_branchfree_do_vec512(\n    __m512i numers, const struct libdivide_s64_branchfree_t *denom) {\n    int64_t magic = denom->magic;\n    uint8_t more = denom->more;\n    uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;\n    // must be arithmetic shift\n    __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);\n\n    // libdivide_mullhi_s64(numers, magic);\n    __m512i q = libdivide_mullhi_s64_vec512(numers, _mm512_set1_epi64(magic));\n    q = _mm512_add_epi64(q, numers);  // q += numers\n\n    // If q is non-negative, we have nothing to do.\n    // If q is negative, we want to add either (2**shift)-1 if d is\n    // a power of 2, or (2**shift) if it is not a power of 2.\n    uint32_t is_power_of_2 = (magic == 0);\n    __m512i q_sign = libdivide_s64_signbits_vec512(q);  // q_sign = q >> 63\n    __m512i mask = _mm512_set1_epi64(((uint64_t)1 << shift) - is_power_of_2);\n    q = _mm512_add_epi64(q, _mm512_and_si512(q_sign, mask));  // q = q + (q_sign & mask)\n    q = libdivide_s64_shift_right_vec512(q, shift);           // q >>= shift\n    q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign);    // q = (q ^ sign) - sign\n    return q;\n}\n\n#endif\n\n#if defined(LIBDIVIDE_AVX2)\n\nstatic LIBDIVIDE_INLINE __m256i libdivide_u16_do_vec256(\n    __m256i numers, const struct libdivide_u16_t 
*denom);\nstatic LIBDIVIDE_INLINE __m256i libdivide_s16_do_vec256(\n    __m256i numers, const struct libdivide_s16_t *denom);\nstatic LIBDIVIDE_INLINE __m256i libdivide_u32_do_vec256(\n    __m256i numers, const struct libdivide_u32_t *denom);\nstatic LIBDIVIDE_INLINE __m256i libdivide_s32_do_vec256(\n    __m256i numers, const struct libdivide_s32_t *denom);\nstatic LIBDIVIDE_INLINE __m256i libdivide_u64_do_vec256(\n    __m256i numers, const struct libdivide_u64_t *denom);\nstatic LIBDIVIDE_INLINE __m256i libdivide_s64_do_vec256(\n    __m256i numers, const struct libdivide_s64_t *denom);\n\nstatic LIBDIVIDE_INLINE __m256i libdivide_u16_branchfree_do_vec256(\n    __m256i numers, const struct libdivide_u16_branchfree_t *denom);\nstatic LIBDIVIDE_INLINE __m256i libdivide_s16_branchfree_do_vec256(\n    __m256i numers, const struct libdivide_s16_branchfree_t *denom);\nstatic LIBDIVIDE_INLINE __m256i libdivide_u32_branchfree_do_vec256(\n    __m256i numers, const struct libdivide_u32_branchfree_t *denom);\nstatic LIBDIVIDE_INLINE __m256i libdivide_s32_branchfree_do_vec256(\n    __m256i numers, const struct libdivide_s32_branchfree_t *denom);\nstatic LIBDIVIDE_INLINE __m256i libdivide_u64_branchfree_do_vec256(\n    __m256i numers, const struct libdivide_u64_branchfree_t *denom);\nstatic LIBDIVIDE_INLINE __m256i libdivide_s64_branchfree_do_vec256(\n    __m256i numers, const struct libdivide_s64_branchfree_t *denom);\n\n//////// Internal Utility Functions\n\n// Implementation of _mm256_srai_epi64(v, 63) (from AVX512).\nstatic LIBDIVIDE_INLINE __m256i libdivide_s64_signbits_vec256(__m256i v) {\n    __m256i hiBitsDuped = _mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1));\n    __m256i signBits = _mm256_srai_epi32(hiBitsDuped, 31);\n    return signBits;\n}\n\n// Implementation of _mm256_srai_epi64 (from AVX512).\nstatic LIBDIVIDE_INLINE __m256i libdivide_s64_shift_right_vec256(__m256i v, int amt) {\n    const int b = 64 - amt;\n    __m256i m = _mm256_set1_epi64x((uint64_t)1 << (b 
- 1));\n    __m256i x = _mm256_srli_epi64(v, amt);\n    __m256i result = _mm256_sub_epi64(_mm256_xor_si256(x, m), m);\n    return result;\n}\n\n// Here, b is assumed to contain one 32-bit value repeated.\nstatic LIBDIVIDE_INLINE __m256i libdivide_mullhi_u32_vec256(__m256i a, __m256i b) {\n    __m256i hi_product_0Z2Z = _mm256_srli_epi64(_mm256_mul_epu32(a, b), 32);\n    __m256i a1X3X = _mm256_srli_epi64(a, 32);\n    __m256i mask = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0);\n    __m256i hi_product_Z1Z3 = _mm256_and_si256(_mm256_mul_epu32(a1X3X, b), mask);\n    return _mm256_or_si256(hi_product_0Z2Z, hi_product_Z1Z3);\n}\n\n// b is one 32-bit value repeated.\nstatic LIBDIVIDE_INLINE __m256i libdivide_mullhi_s32_vec256(__m256i a, __m256i b) {\n    __m256i hi_product_0Z2Z = _mm256_srli_epi64(_mm256_mul_epi32(a, b), 32);\n    __m256i a1X3X = _mm256_srli_epi64(a, 32);\n    __m256i mask = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0);\n    __m256i hi_product_Z1Z3 = _mm256_and_si256(_mm256_mul_epi32(a1X3X, b), mask);\n    return _mm256_or_si256(hi_product_0Z2Z, hi_product_Z1Z3);\n}\n\n// Here, y is assumed to contain one 64-bit value repeated.\nstatic LIBDIVIDE_INLINE __m256i libdivide_mullhi_u64_vec256(__m256i x, __m256i y) {\n    // see m128i variant for comments.\n    __m256i x0y0 = _mm256_mul_epu32(x, y);\n    __m256i x0y0_hi = _mm256_srli_epi64(x0y0, 32);\n\n    __m256i x1 = _mm256_shuffle_epi32(x, _MM_SHUFFLE(3, 3, 1, 1));\n    __m256i y1 = _mm256_shuffle_epi32(y, _MM_SHUFFLE(3, 3, 1, 1));\n\n    __m256i x0y1 = _mm256_mul_epu32(x, y1);\n    __m256i x1y0 = _mm256_mul_epu32(x1, y);\n    __m256i x1y1 = _mm256_mul_epu32(x1, y1);\n\n    __m256i mask = _mm256_set1_epi64x(0xFFFFFFFF);\n    __m256i temp = _mm256_add_epi64(x1y0, x0y0_hi);\n    __m256i temp_lo = _mm256_and_si256(temp, mask);\n    __m256i temp_hi = _mm256_srli_epi64(temp, 32);\n\n    temp_lo = _mm256_srli_epi64(_mm256_add_epi64(temp_lo, x0y1), 32);\n    temp_hi = _mm256_add_epi64(x1y1, temp_hi);\n    return 
_mm256_add_epi64(temp_lo, temp_hi);\n}\n\n// y is one 64-bit value repeated.\nstatic LIBDIVIDE_INLINE __m256i libdivide_mullhi_s64_vec256(__m256i x, __m256i y) {\n    __m256i p = libdivide_mullhi_u64_vec256(x, y);\n    __m256i t1 = _mm256_and_si256(libdivide_s64_signbits_vec256(x), y);\n    __m256i t2 = _mm256_and_si256(libdivide_s64_signbits_vec256(y), x);\n    p = _mm256_sub_epi64(p, t1);\n    p = _mm256_sub_epi64(p, t2);\n    return p;\n}\n\n////////// UINT16\n\n__m256i libdivide_u16_do_vec256(__m256i numers, const struct libdivide_u16_t *denom) {\n    uint8_t more = denom->more;\n    if (!denom->magic) {\n        return _mm256_srli_epi16(numers, more);\n    } else {\n        __m256i q = _mm256_mulhi_epu16(numers, _mm256_set1_epi16(denom->magic));\n        if (more & LIBDIVIDE_ADD_MARKER) {\n            __m256i t = _mm256_adds_epu16(_mm256_srli_epi16(_mm256_subs_epu16(numers, q), 1), q);\n            return _mm256_srli_epi16(t, (more & LIBDIVIDE_16_SHIFT_MASK));\n        } else {\n            return _mm256_srli_epi16(q, more);\n        }\n    }\n}\n\n__m256i libdivide_u16_branchfree_do_vec256(\n    __m256i numers, const struct libdivide_u16_branchfree_t *denom) {\n    __m256i q = _mm256_mulhi_epu16(numers, _mm256_set1_epi16(denom->magic));\n    __m256i t = _mm256_adds_epu16(_mm256_srli_epi16(_mm256_subs_epu16(numers, q), 1), q);\n    return _mm256_srli_epi16(t, denom->more);\n}\n\n////////// UINT32\n\n__m256i libdivide_u32_do_vec256(__m256i numers, const struct libdivide_u32_t *denom) {\n    uint8_t more = denom->more;\n    if (!denom->magic) {\n        return _mm256_srli_epi32(numers, more);\n    } else {\n        __m256i q = libdivide_mullhi_u32_vec256(numers, _mm256_set1_epi32(denom->magic));\n        if (more & LIBDIVIDE_ADD_MARKER) {\n            // uint32_t t = ((numer - q) >> 1) + q;\n            // return t >> denom->shift;\n            uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;\n            __m256i t = 
_mm256_add_epi32(_mm256_srli_epi32(_mm256_sub_epi32(numers, q), 1), q);\n            return _mm256_srli_epi32(t, shift);\n        } else {\n            return _mm256_srli_epi32(q, more);\n        }\n    }\n}\n\n__m256i libdivide_u32_branchfree_do_vec256(\n    __m256i numers, const struct libdivide_u32_branchfree_t *denom) {\n    __m256i q = libdivide_mullhi_u32_vec256(numers, _mm256_set1_epi32(denom->magic));\n    __m256i t = _mm256_add_epi32(_mm256_srli_epi32(_mm256_sub_epi32(numers, q), 1), q);\n    return _mm256_srli_epi32(t, denom->more);\n}\n\n////////// UINT64\n\n__m256i libdivide_u64_do_vec256(__m256i numers, const struct libdivide_u64_t *denom) {\n    uint8_t more = denom->more;\n    if (!denom->magic) {\n        return _mm256_srli_epi64(numers, more);\n    } else {\n        __m256i q = libdivide_mullhi_u64_vec256(numers, _mm256_set1_epi64x(denom->magic));\n        if (more & LIBDIVIDE_ADD_MARKER) {\n            // uint32_t t = ((numer - q) >> 1) + q;\n            // return t >> denom->shift;\n            uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;\n            __m256i t = _mm256_add_epi64(_mm256_srli_epi64(_mm256_sub_epi64(numers, q), 1), q);\n            return _mm256_srli_epi64(t, shift);\n        } else {\n            return _mm256_srli_epi64(q, more);\n        }\n    }\n}\n\n__m256i libdivide_u64_branchfree_do_vec256(\n    __m256i numers, const struct libdivide_u64_branchfree_t *denom) {\n    __m256i q = libdivide_mullhi_u64_vec256(numers, _mm256_set1_epi64x(denom->magic));\n    __m256i t = _mm256_add_epi64(_mm256_srli_epi64(_mm256_sub_epi64(numers, q), 1), q);\n    return _mm256_srli_epi64(t, denom->more);\n}\n\n////////// SINT16\n\n__m256i libdivide_s16_do_vec256(__m256i numers, const struct libdivide_s16_t *denom) {\n    uint8_t more = denom->more;\n    if (!denom->magic) {\n        uint16_t shift = more & LIBDIVIDE_16_SHIFT_MASK;\n        uint16_t mask = ((uint16_t)1 << shift) - 1;\n        __m256i roundToZeroTweak = _mm256_set1_epi16(mask);\n 
       // q = numer + ((numer >> 15) & roundToZeroTweak);\n        __m256i q = _mm256_add_epi16(\n            numers, _mm256_and_si256(_mm256_srai_epi16(numers, 15), roundToZeroTweak));\n        q = _mm256_srai_epi16(q, shift);\n        __m256i sign = _mm256_set1_epi16((int8_t)more >> 7);\n        // q = (q ^ sign) - sign;\n        q = _mm256_sub_epi16(_mm256_xor_si256(q, sign), sign);\n        return q;\n    } else {\n        __m256i q = _mm256_mulhi_epi16(numers, _mm256_set1_epi16(denom->magic));\n        if (more & LIBDIVIDE_ADD_MARKER) {\n            // must be arithmetic shift\n            __m256i sign = _mm256_set1_epi16((int8_t)more >> 7);\n            // q += ((numer ^ sign) - sign);\n            q = _mm256_add_epi16(q, _mm256_sub_epi16(_mm256_xor_si256(numers, sign), sign));\n        }\n        // q >>= shift\n        q = _mm256_srai_epi16(q, more & LIBDIVIDE_16_SHIFT_MASK);\n        q = _mm256_add_epi16(q, _mm256_srli_epi16(q, 15));  // q += (q < 0)\n        return q;\n    }\n}\n\n__m256i libdivide_s16_branchfree_do_vec256(\n    __m256i numers, const struct libdivide_s16_branchfree_t *denom) {\n    int16_t magic = denom->magic;\n    uint8_t more = denom->more;\n    uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;\n    // must be arithmetic shift\n    __m256i sign = _mm256_set1_epi16((int8_t)more >> 7);\n    __m256i q = _mm256_mulhi_epi16(numers, _mm256_set1_epi16(magic));\n    q = _mm256_add_epi16(q, numers);  // q += numers\n\n    // If q is non-negative, we have nothing to do\n    // If q is negative, we want to add either (2**shift)-1 if d is\n    // a power of 2, or (2**shift) if it is not a power of 2\n    uint16_t is_power_of_2 = (magic == 0);\n    __m256i q_sign = _mm256_srai_epi16(q, 15);  // q_sign = q >> 15\n    __m256i mask = _mm256_set1_epi16(((uint16_t)1 << shift) - is_power_of_2);\n    q = _mm256_add_epi16(q, _mm256_and_si256(q_sign, mask));  // q = q + (q_sign & mask)\n    q = _mm256_srai_epi16(q, shift);                          // q >>= 
shift\n    q = _mm256_sub_epi16(_mm256_xor_si256(q, sign), sign);    // q = (q ^ sign) - sign\n    return q;\n}\n\n////////// SINT32\n\n__m256i libdivide_s32_do_vec256(__m256i numers, const struct libdivide_s32_t *denom) {\n    uint8_t more = denom->more;\n    if (!denom->magic) {\n        uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;\n        uint32_t mask = ((uint32_t)1 << shift) - 1;\n        __m256i roundToZeroTweak = _mm256_set1_epi32(mask);\n        // q = numer + ((numer >> 31) & roundToZeroTweak);\n        __m256i q = _mm256_add_epi32(\n            numers, _mm256_and_si256(_mm256_srai_epi32(numers, 31), roundToZeroTweak));\n        q = _mm256_srai_epi32(q, shift);\n        __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);\n        // q = (q ^ sign) - sign;\n        q = _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign);\n        return q;\n    } else {\n        __m256i q = libdivide_mullhi_s32_vec256(numers, _mm256_set1_epi32(denom->magic));\n        if (more & LIBDIVIDE_ADD_MARKER) {\n            // must be arithmetic shift\n            __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);\n            // q += ((numer ^ sign) - sign);\n            q = _mm256_add_epi32(q, _mm256_sub_epi32(_mm256_xor_si256(numers, sign), sign));\n        }\n        // q >>= shift\n        q = _mm256_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK);\n        q = _mm256_add_epi32(q, _mm256_srli_epi32(q, 31));  // q += (q < 0)\n        return q;\n    }\n}\n\n__m256i libdivide_s32_branchfree_do_vec256(\n    __m256i numers, const struct libdivide_s32_branchfree_t *denom) {\n    int32_t magic = denom->magic;\n    uint8_t more = denom->more;\n    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;\n    // must be arithmetic shift\n    __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);\n    __m256i q = libdivide_mullhi_s32_vec256(numers, _mm256_set1_epi32(magic));\n    q = _mm256_add_epi32(q, numers);  // q += numers\n\n    // If q is non-negative, we have nothing to do\n    // If q 
is negative, we want to add either (2**shift)-1 if d is\n    // a power of 2, or (2**shift) if it is not a power of 2\n    uint32_t is_power_of_2 = (magic == 0);\n    __m256i q_sign = _mm256_srai_epi32(q, 31);  // q_sign = q >> 31\n    __m256i mask = _mm256_set1_epi32(((uint32_t)1 << shift) - is_power_of_2);\n    q = _mm256_add_epi32(q, _mm256_and_si256(q_sign, mask));  // q = q + (q_sign & mask)\n    q = _mm256_srai_epi32(q, shift);                          // q >>= shift\n    q = _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign);    // q = (q ^ sign) - sign\n    return q;\n}\n\n////////// SINT64\n\n__m256i libdivide_s64_do_vec256(__m256i numers, const struct libdivide_s64_t *denom) {\n    uint8_t more = denom->more;\n    int64_t magic = denom->magic;\n    if (magic == 0) {  // shift path\n        uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;\n        uint64_t mask = ((uint64_t)1 << shift) - 1;\n        __m256i roundToZeroTweak = _mm256_set1_epi64x(mask);\n        // q = numer + ((numer >> 63) & roundToZeroTweak);\n        __m256i q = _mm256_add_epi64(\n            numers, _mm256_and_si256(libdivide_s64_signbits_vec256(numers), roundToZeroTweak));\n        q = libdivide_s64_shift_right_vec256(q, shift);\n        __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);\n        // q = (q ^ sign) - sign;\n        q = _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign);\n        return q;\n    } else {\n        __m256i q = libdivide_mullhi_s64_vec256(numers, _mm256_set1_epi64x(magic));\n        if (more & LIBDIVIDE_ADD_MARKER) {\n            // must be arithmetic shift\n            __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);\n            // q += ((numer ^ sign) - sign);\n            q = _mm256_add_epi64(q, _mm256_sub_epi64(_mm256_xor_si256(numers, sign), sign));\n        }\n        // q >>= denom->mult_path.shift\n        q = libdivide_s64_shift_right_vec256(q, more & LIBDIVIDE_64_SHIFT_MASK);\n        q = _mm256_add_epi64(q, _mm256_srli_epi64(q, 63));  // q += 
(q < 0)\n        return q;\n    }\n}\n\n__m256i libdivide_s64_branchfree_do_vec256(\n    __m256i numers, const struct libdivide_s64_branchfree_t *denom) {\n    int64_t magic = denom->magic;\n    uint8_t more = denom->more;\n    uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;\n    // must be arithmetic shift\n    __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);\n\n    // libdivide_mullhi_s64(numers, magic);\n    __m256i q = libdivide_mullhi_s64_vec256(numers, _mm256_set1_epi64x(magic));\n    q = _mm256_add_epi64(q, numers);  // q += numers\n\n    // If q is non-negative, we have nothing to do.\n    // If q is negative, we want to add either (2**shift)-1 if d is\n    // a power of 2, or (2**shift) if it is not a power of 2.\n    uint32_t is_power_of_2 = (magic == 0);\n    __m256i q_sign = libdivide_s64_signbits_vec256(q);  // q_sign = q >> 63\n    __m256i mask = _mm256_set1_epi64x(((uint64_t)1 << shift) - is_power_of_2);\n    q = _mm256_add_epi64(q, _mm256_and_si256(q_sign, mask));  // q = q + (q_sign & mask)\n    q = libdivide_s64_shift_right_vec256(q, shift);           // q >>= shift\n    q = _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign);    // q = (q ^ sign) - sign\n    return q;\n}\n\n#endif\n\n#if defined(LIBDIVIDE_SSE2)\n\nstatic LIBDIVIDE_INLINE __m128i libdivide_u16_do_vec128(\n    __m128i numers, const struct libdivide_u16_t *denom);\nstatic LIBDIVIDE_INLINE __m128i libdivide_s16_do_vec128(\n    __m128i numers, const struct libdivide_s16_t *denom);\nstatic LIBDIVIDE_INLINE __m128i libdivide_u32_do_vec128(\n    __m128i numers, const struct libdivide_u32_t *denom);\nstatic LIBDIVIDE_INLINE __m128i libdivide_s32_do_vec128(\n    __m128i numers, const struct libdivide_s32_t *denom);\nstatic LIBDIVIDE_INLINE __m128i libdivide_u64_do_vec128(\n    __m128i numers, const struct libdivide_u64_t *denom);\nstatic LIBDIVIDE_INLINE __m128i libdivide_s64_do_vec128(\n    __m128i numers, const struct libdivide_s64_t *denom);\n\nstatic LIBDIVIDE_INLINE __m128i 
libdivide_u16_branchfree_do_vec128(\n    __m128i numers, const struct libdivide_u16_branchfree_t *denom);\nstatic LIBDIVIDE_INLINE __m128i libdivide_s16_branchfree_do_vec128(\n    __m128i numers, const struct libdivide_s16_branchfree_t *denom);\nstatic LIBDIVIDE_INLINE __m128i libdivide_u32_branchfree_do_vec128(\n    __m128i numers, const struct libdivide_u32_branchfree_t *denom);\nstatic LIBDIVIDE_INLINE __m128i libdivide_s32_branchfree_do_vec128(\n    __m128i numers, const struct libdivide_s32_branchfree_t *denom);\nstatic LIBDIVIDE_INLINE __m128i libdivide_u64_branchfree_do_vec128(\n    __m128i numers, const struct libdivide_u64_branchfree_t *denom);\nstatic LIBDIVIDE_INLINE __m128i libdivide_s64_branchfree_do_vec128(\n    __m128i numers, const struct libdivide_s64_branchfree_t *denom);\n\n//////// Internal Utility Functions\n\n// Implementation of _mm_srai_epi64(v, 63) (from AVX512).\nstatic LIBDIVIDE_INLINE __m128i libdivide_s64_signbits_vec128(__m128i v) {\n    __m128i hiBitsDuped = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1));\n    __m128i signBits = _mm_srai_epi32(hiBitsDuped, 31);\n    return signBits;\n}\n\n// Implementation of _mm_srai_epi64 (from AVX512).\nstatic LIBDIVIDE_INLINE __m128i libdivide_s64_shift_right_vec128(__m128i v, int amt) {\n    const int b = 64 - amt;\n    __m128i m = _mm_set1_epi64x((uint64_t)1 << (b - 1));\n    __m128i x = _mm_srli_epi64(v, amt);\n    __m128i result = _mm_sub_epi64(_mm_xor_si128(x, m), m);\n    return result;\n}\n\n// Here, b is assumed to contain one 32-bit value repeated.\nstatic LIBDIVIDE_INLINE __m128i libdivide_mullhi_u32_vec128(__m128i a, __m128i b) {\n    __m128i hi_product_0Z2Z = _mm_srli_epi64(_mm_mul_epu32(a, b), 32);\n    __m128i a1X3X = _mm_srli_epi64(a, 32);\n    __m128i mask = _mm_set_epi32(-1, 0, -1, 0);\n    __m128i hi_product_Z1Z3 = _mm_and_si128(_mm_mul_epu32(a1X3X, b), mask);\n    return _mm_or_si128(hi_product_0Z2Z, hi_product_Z1Z3);\n}\n\n// SSE2 does not have a signed multiplication 
instruction, but we can convert\n// unsigned to signed pretty efficiently. Again, b is just a 32 bit value\n// repeated four times.\nstatic LIBDIVIDE_INLINE __m128i libdivide_mullhi_s32_vec128(__m128i a, __m128i b) {\n    __m128i p = libdivide_mullhi_u32_vec128(a, b);\n    // t1 = (a >> 31) & y, arithmetic shift\n    __m128i t1 = _mm_and_si128(_mm_srai_epi32(a, 31), b);\n    __m128i t2 = _mm_and_si128(_mm_srai_epi32(b, 31), a);\n    p = _mm_sub_epi32(p, t1);\n    p = _mm_sub_epi32(p, t2);\n    return p;\n}\n\n// Here, y is assumed to contain one 64-bit value repeated.\nstatic LIBDIVIDE_INLINE __m128i libdivide_mullhi_u64_vec128(__m128i x, __m128i y) {\n    // full 128 bits product is:\n    // x0*y0 + (x0*y1 << 32) + (x1*y0 << 32) + (x1*y1 << 64)\n    // Note x0,y0,x1,y1 are all conceptually uint32, products are 32x32->64.\n\n    // Compute x0*y0.\n    // Note x1, y1 are ignored by mul_epu32.\n    __m128i x0y0 = _mm_mul_epu32(x, y);\n    __m128i x0y0_hi = _mm_srli_epi64(x0y0, 32);\n\n    // Get x1, y1 in the low bits.\n    // We could shuffle or right shift. 
Shuffles are preferred as they preserve\n    // the source register for the next computation.\n    __m128i x1 = _mm_shuffle_epi32(x, _MM_SHUFFLE(3, 3, 1, 1));\n    __m128i y1 = _mm_shuffle_epi32(y, _MM_SHUFFLE(3, 3, 1, 1));\n\n    // No need to mask off top 32 bits for mul_epu32.\n    __m128i x0y1 = _mm_mul_epu32(x, y1);\n    __m128i x1y0 = _mm_mul_epu32(x1, y);\n    __m128i x1y1 = _mm_mul_epu32(x1, y1);\n\n    // Mask here selects low bits only.\n    __m128i mask = _mm_set1_epi64x(0xFFFFFFFF);\n    __m128i temp = _mm_add_epi64(x1y0, x0y0_hi);\n    __m128i temp_lo = _mm_and_si128(temp, mask);\n    __m128i temp_hi = _mm_srli_epi64(temp, 32);\n\n    temp_lo = _mm_srli_epi64(_mm_add_epi64(temp_lo, x0y1), 32);\n    temp_hi = _mm_add_epi64(x1y1, temp_hi);\n    return _mm_add_epi64(temp_lo, temp_hi);\n}\n\n// y is one 64-bit value repeated.\nstatic LIBDIVIDE_INLINE __m128i libdivide_mullhi_s64_vec128(__m128i x, __m128i y) {\n    __m128i p = libdivide_mullhi_u64_vec128(x, y);\n    __m128i t1 = _mm_and_si128(libdivide_s64_signbits_vec128(x), y);\n    __m128i t2 = _mm_and_si128(libdivide_s64_signbits_vec128(y), x);\n    p = _mm_sub_epi64(p, t1);\n    p = _mm_sub_epi64(p, t2);\n    return p;\n}\n\n////////// UINT16\n\n__m128i libdivide_u16_do_vec128(__m128i numers, const struct libdivide_u16_t *denom) {\n    uint8_t more = denom->more;\n    if (!denom->magic) {\n        return _mm_srli_epi16(numers, more);\n    } else {\n        __m128i q = _mm_mulhi_epu16(numers, _mm_set1_epi16(denom->magic));\n        if (more & LIBDIVIDE_ADD_MARKER) {\n            __m128i t = _mm_adds_epu16(_mm_srli_epi16(_mm_subs_epu16(numers, q), 1), q);\n            return _mm_srli_epi16(t, (more & LIBDIVIDE_16_SHIFT_MASK));\n        } else {\n            return _mm_srli_epi16(q, more);\n        }\n    }\n}\n\n__m128i libdivide_u16_branchfree_do_vec128(\n    __m128i numers, const struct libdivide_u16_branchfree_t *denom) {\n    __m128i q = _mm_mulhi_epu16(numers, _mm_set1_epi16(denom->magic));\n    
__m128i t = _mm_adds_epu16(_mm_srli_epi16(_mm_subs_epu16(numers, q), 1), q);\n    return _mm_srli_epi16(t, denom->more);\n}\n\n////////// UINT32\n\n__m128i libdivide_u32_do_vec128(__m128i numers, const struct libdivide_u32_t *denom) {\n    uint8_t more = denom->more;\n    if (!denom->magic) {\n        return _mm_srli_epi32(numers, more);\n    } else {\n        __m128i q = libdivide_mullhi_u32_vec128(numers, _mm_set1_epi32(denom->magic));\n        if (more & LIBDIVIDE_ADD_MARKER) {\n            // uint32_t t = ((numer - q) >> 1) + q;\n            // return t >> denom->shift;\n            uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;\n            __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q);\n            return _mm_srli_epi32(t, shift);\n        } else {\n            return _mm_srli_epi32(q, more);\n        }\n    }\n}\n\n__m128i libdivide_u32_branchfree_do_vec128(\n    __m128i numers, const struct libdivide_u32_branchfree_t *denom) {\n    __m128i q = libdivide_mullhi_u32_vec128(numers, _mm_set1_epi32(denom->magic));\n    __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q);\n    return _mm_srli_epi32(t, denom->more);\n}\n\n////////// UINT64\n\n__m128i libdivide_u64_do_vec128(__m128i numers, const struct libdivide_u64_t *denom) {\n    uint8_t more = denom->more;\n    if (!denom->magic) {\n        return _mm_srli_epi64(numers, more);\n    } else {\n        __m128i q = libdivide_mullhi_u64_vec128(numers, _mm_set1_epi64x(denom->magic));\n        if (more & LIBDIVIDE_ADD_MARKER) {\n            // uint64_t t = ((numer - q) >> 1) + q;\n            // return t >> denom->shift;\n            uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;\n            __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q);\n            return _mm_srli_epi64(t, shift);\n        } else {\n            return _mm_srli_epi64(q, more);\n        }\n    }\n}\n\n__m128i libdivide_u64_branchfree_do_vec128(\n    __m128i numers, 
const struct libdivide_u64_branchfree_t *denom) {\n    __m128i q = libdivide_mullhi_u64_vec128(numers, _mm_set1_epi64x(denom->magic));\n    __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q);\n    return _mm_srli_epi64(t, denom->more);\n}\n\n////////// SINT16\n\n__m128i libdivide_s16_do_vec128(__m128i numers, const struct libdivide_s16_t *denom) {\n    uint8_t more = denom->more;\n    if (!denom->magic) {\n        uint16_t shift = more & LIBDIVIDE_16_SHIFT_MASK;\n        uint16_t mask = ((uint16_t)1 << shift) - 1;\n        __m128i roundToZeroTweak = _mm_set1_epi16(mask);\n        // q = numer + ((numer >> 15) & roundToZeroTweak);\n        __m128i q =\n            _mm_add_epi16(numers, _mm_and_si128(_mm_srai_epi16(numers, 15), roundToZeroTweak));\n        q = _mm_srai_epi16(q, shift);\n        __m128i sign = _mm_set1_epi16((int8_t)more >> 7);\n        // q = (q ^ sign) - sign;\n        q = _mm_sub_epi16(_mm_xor_si128(q, sign), sign);\n        return q;\n    } else {\n        __m128i q = _mm_mulhi_epi16(numers, _mm_set1_epi16(denom->magic));\n        if (more & LIBDIVIDE_ADD_MARKER) {\n            // must be arithmetic shift\n            __m128i sign = _mm_set1_epi16((int8_t)more >> 7);\n            // q += ((numer ^ sign) - sign);\n            q = _mm_add_epi16(q, _mm_sub_epi16(_mm_xor_si128(numers, sign), sign));\n        }\n        // q >>= shift\n        q = _mm_srai_epi16(q, more & LIBDIVIDE_16_SHIFT_MASK);\n        q = _mm_add_epi16(q, _mm_srli_epi16(q, 15));  // q += (q < 0)\n        return q;\n    }\n}\n\n__m128i libdivide_s16_branchfree_do_vec128(\n    __m128i numers, const struct libdivide_s16_branchfree_t *denom) {\n    int16_t magic = denom->magic;\n    uint8_t more = denom->more;\n    uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;\n    // must be arithmetic shift\n    __m128i sign = _mm_set1_epi16((int8_t)more >> 7);\n    __m128i q = _mm_mulhi_epi16(numers, _mm_set1_epi16(magic));\n    q = _mm_add_epi16(q, numers);  // q += 
numers\n\n    // If q is non-negative, we have nothing to do\n    // If q is negative, we want to add either (2**shift)-1 if d is\n    // a power of 2, or (2**shift) if it is not a power of 2\n    uint16_t is_power_of_2 = (magic == 0);\n    __m128i q_sign = _mm_srai_epi16(q, 15);  // q_sign = q >> 15\n    __m128i mask = _mm_set1_epi16(((uint16_t)1 << shift) - is_power_of_2);\n    q = _mm_add_epi16(q, _mm_and_si128(q_sign, mask));  // q = q + (q_sign & mask)\n    q = _mm_srai_epi16(q, shift);                       // q >>= shift\n    q = _mm_sub_epi16(_mm_xor_si128(q, sign), sign);    // q = (q ^ sign) - sign\n    return q;\n}\n\n////////// SINT32\n\n__m128i libdivide_s32_do_vec128(__m128i numers, const struct libdivide_s32_t *denom) {\n    uint8_t more = denom->more;\n    if (!denom->magic) {\n        uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;\n        uint32_t mask = ((uint32_t)1 << shift) - 1;\n        __m128i roundToZeroTweak = _mm_set1_epi32(mask);\n        // q = numer + ((numer >> 31) & roundToZeroTweak);\n        __m128i q =\n            _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak));\n        q = _mm_srai_epi32(q, shift);\n        __m128i sign = _mm_set1_epi32((int8_t)more >> 7);\n        // q = (q ^ sign) - sign;\n        q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign);\n        return q;\n    } else {\n        __m128i q = libdivide_mullhi_s32_vec128(numers, _mm_set1_epi32(denom->magic));\n        if (more & LIBDIVIDE_ADD_MARKER) {\n            // must be arithmetic shift\n            __m128i sign = _mm_set1_epi32((int8_t)more >> 7);\n            // q += ((numer ^ sign) - sign);\n            q = _mm_add_epi32(q, _mm_sub_epi32(_mm_xor_si128(numers, sign), sign));\n        }\n        // q >>= shift\n        q = _mm_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK);\n        q = _mm_add_epi32(q, _mm_srli_epi32(q, 31));  // q += (q < 0)\n        return q;\n    }\n}\n\n__m128i libdivide_s32_branchfree_do_vec128(\n    
__m128i numers, const struct libdivide_s32_branchfree_t *denom) {\n    int32_t magic = denom->magic;\n    uint8_t more = denom->more;\n    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;\n    // must be arithmetic shift\n    __m128i sign = _mm_set1_epi32((int8_t)more >> 7);\n    __m128i q = libdivide_mullhi_s32_vec128(numers, _mm_set1_epi32(magic));\n    q = _mm_add_epi32(q, numers);  // q += numers\n\n    // If q is non-negative, we have nothing to do\n    // If q is negative, we want to add either (2**shift)-1 if d is\n    // a power of 2, or (2**shift) if it is not a power of 2\n    uint32_t is_power_of_2 = (magic == 0);\n    __m128i q_sign = _mm_srai_epi32(q, 31);  // q_sign = q >> 31\n    __m128i mask = _mm_set1_epi32(((uint32_t)1 << shift) - is_power_of_2);\n    q = _mm_add_epi32(q, _mm_and_si128(q_sign, mask));  // q = q + (q_sign & mask)\n    q = _mm_srai_epi32(q, shift);                       // q >>= shift\n    q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign);    // q = (q ^ sign) - sign\n    return q;\n}\n\n////////// SINT64\n\n__m128i libdivide_s64_do_vec128(__m128i numers, const struct libdivide_s64_t *denom) {\n    uint8_t more = denom->more;\n    int64_t magic = denom->magic;\n    if (magic == 0) {  // shift path\n        uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;\n        uint64_t mask = ((uint64_t)1 << shift) - 1;\n        __m128i roundToZeroTweak = _mm_set1_epi64x(mask);\n        // q = numer + ((numer >> 63) & roundToZeroTweak);\n        __m128i q = _mm_add_epi64(\n            numers, _mm_and_si128(libdivide_s64_signbits_vec128(numers), roundToZeroTweak));\n        q = libdivide_s64_shift_right_vec128(q, shift);\n        __m128i sign = _mm_set1_epi32((int8_t)more >> 7);\n        // q = (q ^ sign) - sign;\n        q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign);\n        return q;\n    } else {\n        __m128i q = libdivide_mullhi_s64_vec128(numers, _mm_set1_epi64x(magic));\n        if (more & LIBDIVIDE_ADD_MARKER) {\n            // must be 
arithmetic shift\n            __m128i sign = _mm_set1_epi32((int8_t)more >> 7);\n            // q += ((numer ^ sign) - sign);\n            q = _mm_add_epi64(q, _mm_sub_epi64(_mm_xor_si128(numers, sign), sign));\n        }\n        // q >>= denom->mult_path.shift\n        q = libdivide_s64_shift_right_vec128(q, more & LIBDIVIDE_64_SHIFT_MASK);\n        q = _mm_add_epi64(q, _mm_srli_epi64(q, 63));  // q += (q < 0)\n        return q;\n    }\n}\n\n__m128i libdivide_s64_branchfree_do_vec128(\n    __m128i numers, const struct libdivide_s64_branchfree_t *denom) {\n    int64_t magic = denom->magic;\n    uint8_t more = denom->more;\n    uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;\n    // must be arithmetic shift\n    __m128i sign = _mm_set1_epi32((int8_t)more >> 7);\n\n    // libdivide_mullhi_s64(numers, magic);\n    __m128i q = libdivide_mullhi_s64_vec128(numers, _mm_set1_epi64x(magic));\n    q = _mm_add_epi64(q, numers);  // q += numers\n\n    // If q is non-negative, we have nothing to do.\n    // If q is negative, we want to add either (2**shift)-1 if d is\n    // a power of 2, or (2**shift) if it is not a power of 2.\n    uint32_t is_power_of_2 = (magic == 0);\n    __m128i q_sign = libdivide_s64_signbits_vec128(q);  // q_sign = q >> 63\n    __m128i mask = _mm_set1_epi64x(((uint64_t)1 << shift) - is_power_of_2);\n    q = _mm_add_epi64(q, _mm_and_si128(q_sign, mask));  // q = q + (q_sign & mask)\n    q = libdivide_s64_shift_right_vec128(q, shift);     // q >>= shift\n    q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign);    // q = (q ^ sign) - sign\n    return q;\n}\n\n#endif\n\n////////// C++ stuff\n\n#ifdef __cplusplus\n\nenum Branching {\n    BRANCHFULL,  // use branching algorithms\n    BRANCHFREE   // use branchfree algorithms\n};\n\nnamespace detail {\nenum Signedness {\n    SIGNED,\n    UNSIGNED,\n};\n\n#if defined(LIBDIVIDE_NEON)\n// Helper to deduce NEON vector type for integral type.\ntemplate <int _WIDTH, Signedness _SIGN>\nstruct NeonVec {};\n\ntemplate 
<>\nstruct NeonVec<16, UNSIGNED> {\n    typedef uint16x8_t type;\n};\n\ntemplate <>\nstruct NeonVec<16, SIGNED> {\n    typedef int16x8_t type;\n};\n\ntemplate <>\nstruct NeonVec<32, UNSIGNED> {\n    typedef uint32x4_t type;\n};\n\ntemplate <>\nstruct NeonVec<32, SIGNED> {\n    typedef int32x4_t type;\n};\n\ntemplate <>\nstruct NeonVec<64, UNSIGNED> {\n    typedef uint64x2_t type;\n};\n\ntemplate <>\nstruct NeonVec<64, SIGNED> {\n    typedef int64x2_t type;\n};\n\ntemplate <typename T>\nstruct NeonVecFor {\n    // See 'class divider' for an explanation of these template parameters.\n    typedef typename NeonVec<sizeof(T) * 8, (((T)0 >> 0) > (T)(-1) ? SIGNED : UNSIGNED)>::type type;\n};\n\n#define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE)                    \\\n    LIBDIVIDE_INLINE typename NeonVecFor<INT_TYPE>::type divide( \\\n        typename NeonVecFor<INT_TYPE>::type n) const {           \\\n        return libdivide_##ALGO##_do_vec128(n, &denom);          \\\n    }\n#else\n#define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE)\n#endif\n\n#if defined(LIBDIVIDE_SSE2)\n#define LIBDIVIDE_DIVIDE_SSE2(ALGO)                     \\\n    LIBDIVIDE_INLINE __m128i divide(__m128i n) const {  \\\n        return libdivide_##ALGO##_do_vec128(n, &denom); \\\n    }\n#else\n#define LIBDIVIDE_DIVIDE_SSE2(ALGO)\n#endif\n\n#if defined(LIBDIVIDE_AVX2)\n#define LIBDIVIDE_DIVIDE_AVX2(ALGO)                     \\\n    LIBDIVIDE_INLINE __m256i divide(__m256i n) const {  \\\n        return libdivide_##ALGO##_do_vec256(n, &denom); \\\n    }\n#else\n#define LIBDIVIDE_DIVIDE_AVX2(ALGO)\n#endif\n\n#if defined(LIBDIVIDE_AVX512)\n#define LIBDIVIDE_DIVIDE_AVX512(ALGO)                   \\\n    LIBDIVIDE_INLINE __m512i divide(__m512i n) const {  \\\n        return libdivide_##ALGO##_do_vec512(n, &denom); \\\n    }\n#else\n#define LIBDIVIDE_DIVIDE_AVX512(ALGO)\n#endif\n\n// The DISPATCHER_GEN() macro generates C++ methods (for the given integer\n// and algorithm types) that redirect to libdivide's C 
API.\n#define DISPATCHER_GEN(T, ALGO)                                                       \\\n    libdivide_##ALGO##_t denom;                                                       \\\n    LIBDIVIDE_INLINE dispatcher() {}                                                  \\\n    explicit LIBDIVIDE_CONSTEXPR dispatcher(decltype(nullptr)) : denom{} {}              \\\n    LIBDIVIDE_INLINE dispatcher(T d) : denom(libdivide_##ALGO##_gen(d)) {}            \\\n    LIBDIVIDE_INLINE T divide(T n) const { return libdivide_##ALGO##_do(n, &denom); } \\\n    LIBDIVIDE_INLINE T recover() const { return libdivide_##ALGO##_recover(&denom); } \\\n    LIBDIVIDE_DIVIDE_NEON(ALGO, T)                                                    \\\n    LIBDIVIDE_DIVIDE_SSE2(ALGO)                                                       \\\n    LIBDIVIDE_DIVIDE_AVX2(ALGO)                                                       \\\n    LIBDIVIDE_DIVIDE_AVX512(ALGO)\n\n// The dispatcher selects a specific division algorithm for a given\n// width, signedness, and ALGO using partial template specialization.\ntemplate <int _WIDTH, Signedness _SIGN, Branching _ALGO>\nstruct dispatcher {};\n\ntemplate <>\nstruct dispatcher<16, SIGNED, BRANCHFULL> {\n    DISPATCHER_GEN(int16_t, s16)\n};\ntemplate <>\nstruct dispatcher<16, SIGNED, BRANCHFREE> {\n    DISPATCHER_GEN(int16_t, s16_branchfree)\n};\ntemplate <>\nstruct dispatcher<16, UNSIGNED, BRANCHFULL> {\n    DISPATCHER_GEN(uint16_t, u16)\n};\ntemplate <>\nstruct dispatcher<16, UNSIGNED, BRANCHFREE> {\n    DISPATCHER_GEN(uint16_t, u16_branchfree)\n};\ntemplate <>\nstruct dispatcher<32, SIGNED, BRANCHFULL> {\n    DISPATCHER_GEN(int32_t, s32)\n};\ntemplate <>\nstruct dispatcher<32, SIGNED, BRANCHFREE> {\n    DISPATCHER_GEN(int32_t, s32_branchfree)\n};\ntemplate <>\nstruct dispatcher<32, UNSIGNED, BRANCHFULL> {\n    DISPATCHER_GEN(uint32_t, u32)\n};\ntemplate <>\nstruct dispatcher<32, UNSIGNED, BRANCHFREE> {\n    DISPATCHER_GEN(uint32_t, 
u32_branchfree)\n};\ntemplate <>\nstruct dispatcher<64, SIGNED, BRANCHFULL> {\n    DISPATCHER_GEN(int64_t, s64)\n};\ntemplate <>\nstruct dispatcher<64, SIGNED, BRANCHFREE> {\n    DISPATCHER_GEN(int64_t, s64_branchfree)\n};\ntemplate <>\nstruct dispatcher<64, UNSIGNED, BRANCHFULL> {\n    DISPATCHER_GEN(uint64_t, u64)\n};\ntemplate <>\nstruct dispatcher<64, UNSIGNED, BRANCHFREE> {\n    DISPATCHER_GEN(uint64_t, u64_branchfree)\n};\n}  // namespace detail\n\n#if defined(LIBDIVIDE_NEON)\n// Allow NeonVecFor outside of detail namespace.\ntemplate <typename T>\nstruct NeonVecFor {\n    typedef typename detail::NeonVecFor<T>::type type;\n};\n#endif\n\n// This is the main divider class for use by the user (C++ API).\n// The actual division algorithm is selected using the dispatcher struct\n// based on the integer width and algorithm template parameters.\ntemplate <typename T, Branching ALGO = BRANCHFULL>\nclass divider {\n   private:\n    // Dispatch based on the size and signedness.\n    // We avoid using type_traits as it's not available in AVR.\n    // Detect signedness by checking if T(-1) is less than T(0).\n    // Also throw in a shift by 0, which prevents floating point types from being passed.\n    typedef detail::dispatcher<sizeof(T) * 8,\n        (((T)0 >> 0) > (T)(-1) ? 
detail::SIGNED : detail::UNSIGNED), ALGO>\n        dispatcher_t;\n\n   public:\n    // We leave the default constructor empty so that creating\n    // an array of dividers and then initializing them\n    // later doesn't slow us down.\n    divider() {}\n\n    // constexpr zero-initialization to allow for use w/ static constinit\n    explicit LIBDIVIDE_CONSTEXPR divider(decltype(nullptr)) : div(nullptr) {}\n\n    // Constructor that takes the divisor as a parameter\n    LIBDIVIDE_INLINE divider(T d) : div(d) {}\n\n    // Divides n by the divisor\n    LIBDIVIDE_INLINE T divide(T n) const { return div.divide(n); }\n\n    // Recovers the divisor, returns the value that was\n    // used to initialize this divider object.\n    T recover() const { return div.recover(); }\n\n    bool operator==(const divider<T, ALGO> &other) const {\n        return div.denom.magic == other.div.denom.magic && div.denom.more == other.div.denom.more;\n    }\n\n    bool operator!=(const divider<T, ALGO> &other) const { return !(*this == other); }\n\n    // Vector variants treat the input as packed integer values with the same type as the divider\n    // (e.g. 
s32, u32, s64, u64) and divides each of them by the divider, returning the packed\n    // quotients.\n#if defined(LIBDIVIDE_SSE2)\n    LIBDIVIDE_INLINE __m128i divide(__m128i n) const { return div.divide(n); }\n#endif\n#if defined(LIBDIVIDE_AVX2)\n    LIBDIVIDE_INLINE __m256i divide(__m256i n) const { return div.divide(n); }\n#endif\n#if defined(LIBDIVIDE_AVX512)\n    LIBDIVIDE_INLINE __m512i divide(__m512i n) const { return div.divide(n); }\n#endif\n#if defined(LIBDIVIDE_NEON)\n    LIBDIVIDE_INLINE typename NeonVecFor<T>::type divide(typename NeonVecFor<T>::type n) const {\n        return div.divide(n);\n    }\n#endif\n\n   private:\n    // Storage for the actual divisor\n    dispatcher_t div;\n};\n\n// Overload of operator / for scalar division\ntemplate <typename T, Branching ALGO>\nLIBDIVIDE_INLINE T operator/(T n, const divider<T, ALGO> &div) {\n    return div.divide(n);\n}\n\n// Overload of operator /= for scalar division\ntemplate <typename T, Branching ALGO>\nLIBDIVIDE_INLINE T &operator/=(T &n, const divider<T, ALGO> &div) {\n    n = div.divide(n);\n    return n;\n}\n\n// Overloads for vector types.\n#if defined(LIBDIVIDE_SSE2)\ntemplate <typename T, Branching ALGO>\nLIBDIVIDE_INLINE __m128i operator/(__m128i n, const divider<T, ALGO> &div) {\n    return div.divide(n);\n}\n\ntemplate <typename T, Branching ALGO>\nLIBDIVIDE_INLINE __m128i operator/=(__m128i &n, const divider<T, ALGO> &div) {\n    n = div.divide(n);\n    return n;\n}\n#endif\n#if defined(LIBDIVIDE_AVX2)\ntemplate <typename T, Branching ALGO>\nLIBDIVIDE_INLINE __m256i operator/(__m256i n, const divider<T, ALGO> &div) {\n    return div.divide(n);\n}\n\ntemplate <typename T, Branching ALGO>\nLIBDIVIDE_INLINE __m256i operator/=(__m256i &n, const divider<T, ALGO> &div) {\n    n = div.divide(n);\n    return n;\n}\n#endif\n#if defined(LIBDIVIDE_AVX512)\ntemplate <typename T, Branching ALGO>\nLIBDIVIDE_INLINE __m512i operator/(__m512i n, const divider<T, ALGO> &div) {\n    return 
div.divide(n);\n}\n\ntemplate <typename T, Branching ALGO>\nLIBDIVIDE_INLINE __m512i operator/=(__m512i &n, const divider<T, ALGO> &div) {\n    n = div.divide(n);\n    return n;\n}\n#endif\n\n#if defined(LIBDIVIDE_NEON)\ntemplate <typename T, Branching ALGO>\nLIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/(\n    typename NeonVecFor<T>::type n, const divider<T, ALGO> &div) {\n    return div.divide(n);\n}\n\ntemplate <typename T, Branching ALGO>\nLIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/=(\n    typename NeonVecFor<T>::type &n, const divider<T, ALGO> &div) {\n    n = div.divide(n);\n    return n;\n}\n#endif\n\n#if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900)\n// libdivide::branchfree_divider<T>\ntemplate <typename T>\nusing branchfree_divider = divider<T, BRANCHFREE>;\n#endif\n\n}  // namespace libdivide\n\n#endif  // __cplusplus\n\n#if defined(_MSC_VER) && !defined(__clang__)\n#pragma warning(pop)\n#endif\n\n#endif  // LIBDIVIDE_H\n"
  },
  {
    "path": "ext/skeletontricks/skeletontricks.hpp",
    "content": "/*\n * This file is part of Kimimaro.\n * \n * Kimimaro is free software: you can redistribute it and/or modify\n * it under the terms of the GNU General Public License as published by\n * the Free Software Foundation, either version 3 of the License, or\n * (at your option) any later version.\n * \n * Kimimaro is distributed in the hope that it will be useful,\n * but WITHOUT ANY WARRANTY; without even the implied warranty of\n * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n * GNU General Public License for more details.\n * \n * You should have received a copy of the GNU General Public License\n * along with Kimimaro.  If not, see <https://www.gnu.org/licenses/>.\n *\n * \n * Author: William Silversmith\n * Affiliation: Seung Lab, Princeton University\n * Date: September 2018 - April 2025\n */\n\n#include <algorithm>\n#include <cmath>\n#include <cstdio>\n#include <cstdint>\n#include <queue>\n#include <vector>\n#include <stack>\n#include <unordered_map>\n#include <unordered_set>\n#include <string>\n#include <utility>\n\n#include \"unordered_dense.hpp\"\n\n#ifndef SKELETONTRICKS_HPP\n#define SKELETONTRICKS_HPP\n\nnamespace skeletontricks {\n\nsize_t _roll_invalidation_cube(\n  uint8_t* labels, float* DBF,\n  const int64_t sx, const int64_t sy, const int64_t sz,\n  const float wx, const float wy, const float wz,\n  size_t* path, const size_t path_size,\n  const float scale, const float constant\n) {\n\n  if (path_size == 0) {\n    return 0;\n  }\n\n  const size_t sxy = sx * sy;\n  const size_t voxels = sxy * sz;\n\n  int64_t minx, maxx, miny, maxy, minz, maxz;\n  int64_t x, y, z;\n\n  int64_t global_minx = sx;\n  int64_t global_maxx = 0;\n  int64_t global_miny = sy;\n  int64_t global_maxy = 0;\n  int64_t global_minz = sz;\n  int64_t global_maxz = 0;\n\n  std::vector<int16_t> topology(voxels);\n  \n  const bool power_of_two = !((sx & (sx - 1)) || (sy & (sy - 1))); \n  const int xshift = std::log2(sx); // must use log2 here, not 
lg/lg2 to avoid fp errors\n  const int yshift = std::log2(sy);\n\n  size_t loc;\n  float radius;\n\n  // First pass: compute topology\n  for (size_t i = 0; i < path_size; i++) {\n    loc = path[i];\n    radius = scale * DBF[loc] + constant;\n\n    if (power_of_two) {\n      z = loc >> (xshift + yshift);\n      y = (loc - (z << (xshift + yshift))) >> xshift;\n      x = loc - ((y + (z << yshift)) << xshift);\n    }\n    else {\n      z = loc / sxy;\n      y = (loc - (z * sxy)) / sx;\n      x = loc - sx * (y + z * sy);\n    }\n\n    const int64_t ZERO = 0;\n\n    minx = std::max(ZERO,    static_cast<int64_t>(x - (radius / wx)));\n    maxx = std::min(sx-1, static_cast<int64_t>(0.5 + (x + (radius / wx))));\n    miny = std::max(ZERO,    static_cast<int64_t>(y - (radius / wy)));\n    maxy = std::min(sy-1, static_cast<int64_t>(0.5 + (y + (radius / wy))));\n    minz = std::max(ZERO,    static_cast<int64_t>(z - (radius / wz)));\n    maxz = std::min(sz-1, static_cast<int64_t>(0.5 + (z + (radius / wz))));\n\n    global_minx = std::min(global_minx, minx);\n    global_maxx = std::max(global_maxx, maxx);\n    global_miny = std::min(global_miny, miny);\n    global_maxy = std::max(global_maxy, maxy);\n    global_minz = std::min(global_minz, minz);\n    global_maxz = std::max(global_maxz, maxz);\n\n    for (y = miny; y <= maxy; y++) {\n      for (z = minz; z <= maxz; z++) {\n        topology[minx + sx * y + sxy * z] += 1;\n        topology[maxx + sx * y + sxy * z] -= 1;\n      }\n    }\n  }\n\n  // Second pass: invalidate labels\n  int coloring;\n  size_t invalidated = 0;\n  size_t yzoffset;\n  for (z = global_minz; z <= global_maxz; z++) {\n    for (y = global_miny; y <= global_maxy; y++) {\n      yzoffset = sx * y + sxy * z;\n\n      coloring = 0;\n      for (x = global_minx; x <= global_maxx; x++) {\n        coloring += topology[x + yzoffset];\n        if (coloring > 0 || topology[x + yzoffset]) {\n          invalidated += static_cast<size_t>(labels[x + yzoffset] > 0); // 
convert non-bool vals\n          labels[x + yzoffset] = 0;\n        }\n      }\n    }\n  }\n\n  return invalidated;\n}\n\ntemplate <typename T>\ninline size_t max(T* edges, const size_t size) {\n  if (size == 0) {\n    return 0;\n  }\n\n  size_t mx = edges[0];\n  for (size_t i = 0; i < size; i++) {\n    if (static_cast<size_t>(edges[i]) > mx) {\n      mx = static_cast<size_t>(edges[i]);\n    }\n  }\n\n  return mx;\n}\n\ntemplate <typename T>\nvoid printvec(std::vector<T> vec) {\n  for (T v : vec) {\n    printf(\"%d, \", v);\n  }\n  printf(\"\\n\");\n}\n\ntemplate <typename T>\nvoid printstack(std::stack<T> stack) {\n  while (!stack.empty()) {\n    printf(\"%d, \", stack.top());\n    stack.pop();\n  }\n\n  printf(\"\\n\");\n}\n\ntemplate <typename T>\nstd::vector<T> stack2vec(std::stack<T> stk) {\n  std::vector<T> vec;\n  vec.reserve(stk.size());\n\n  while (!stk.empty()) {\n    vec.push_back(stk.top());\n    stk.pop();\n  }\n\n  std::reverse(vec.begin(), vec.end());\n\n  return vec;\n}\n\n// Ne = size of edges / 2\n// Nv = number of vertices (max of edge values)\ntemplate <typename T>\nstd::vector<T> _find_cycle(const T* edges, const size_t Ne) {\n  if (Ne == 0) {\n    return std::vector<T>(0);\n  }\n\n  size_t Nv = max(edges, Ne * 2) + 1; // +1 to ensure zero is counted\n\n  std::vector< ankerl::unordered_dense::set<T> > index(Nv);\n  index.reserve(Nv);\n\n  // NB: consolidate handles the trivial loops (e1 == e2)\n  //     and deduplication of edges\n  for (size_t i = 0; i < 2 * Ne; i += 2) {\n    T e1 = edges[i];\n    T e2 = edges[i+1];\n\n    index[e1].insert(e2);\n    index[e2].insert(e1);\n  }\n\n  T root = edges[0];\n  T node = -1;\n  T parent = -1;\n  uint32_t depth = -1;\n\n  std::stack<T> stack;\n  std::stack<T> parents;\n  std::stack<uint32_t> depth_stack;\n  std::stack<T> path;\n\n  stack.push(root);\n  parents.push(-1);\n  depth_stack.push(0);\n  \n  std::vector<bool> visited(Nv, false);\n\n  while (!stack.empty()) {\n    node = stack.top();\n    parent 
= parents.top();\n    depth = depth_stack.top();\n\n    stack.pop();\n    parents.pop();\n    depth_stack.pop();\n\n    while (path.size() > depth) {\n      path.pop();\n    }\n\n    path.push(node);\n\n    if (visited[node]) {\n      break;\n    }\n    visited[node] = true;\n\n    for (T child : index[node]) {\n      if (child == parent) {\n        continue;\n      }\n\n      stack.push(child);\n      parents.push(node);\n      depth_stack.push(depth + 1);\n    }\n  }\n\n  if (path.size() <= 1) {\n    return std::vector<T>(0);\n  }\n\n  // cast stack to vector w/ zero copy\n  std::vector<T> vec_path = stack2vec<T>(path);\n\n  // Find start of loop. Since a cycle was detected,\n  // the last node found started the cycle. We need\n  // to trim the path leading up to that connection.\n  size_t i;\n  for (i = 0; i < vec_path.size() - 1; i++) {\n    if (vec_path[i] == node) {\n      break;\n    }\n  }\n\n  if (vec_path.size() - i < 3) {\n    return std::vector<T>(0);\n  }\n\n  return std::vector<T>(vec_path.begin() + i, vec_path.end());\n}\n\n// Had trouble returning an unordered_map< pair<int,int>, float>\n// to python, so I decided to just pack two uint32s into a uint64\n// and unpack them on the other side.\nstd::unordered_map<uint64_t, float> _create_distance_graph(\n  float* vertices, size_t Nv, \n  uint32_t* edges, size_t Ne, uint32_t start_node,\n  std::vector<int32_t> critical_points_vec\n) {\n\n  std::vector< std::vector<uint32_t> > tree(Nv);\n  tree.reserve(Nv);\n\n  std::vector<bool> critical_points(Nv, false);\n  for (uint32_t edge : critical_points_vec) {\n    critical_points[edge] = true;\n  }\n\n  for (size_t i = 0; i < Ne; i++) {\n    uint32_t e1 = edges[2*i];\n    uint32_t e2 = edges[2*i + 1];\n\n    tree[e1].push_back(e2);\n    tree[e2].push_back(e1);\n  }\n\n  std::unordered_map<uint64_t, float> distgraph;\n\n  std::stack<uint32_t> stack;\n  std::stack<int32_t> parents;\n  std::stack<float> dist_stack;\n  std::stack<uint32_t> root_stack;\n\n  
stack.push(start_node);\n  parents.push(-1);\n  dist_stack.push(0.0);\n  root_stack.push(start_node);\n\n  uint32_t node, root;\n  int32_t parent;\n  float dist;\n\n  uint64_t key = 0;\n\n  std::vector<bool> visited(Nv, false);\n\n  while (!stack.empty()) {\n    node = stack.top();\n    dist = dist_stack.top();\n    root = root_stack.top();\n    parent = parents.top();\n\n    if (visited[node]) {\n      throw std::runtime_error(std::string(\"Cycle detected. Node: \") + std::to_string(node));\n    }\n    visited[node] = true;\n\n    stack.pop();\n    dist_stack.pop();\n    root_stack.pop();\n    parents.pop();\n\n    if (critical_points[node] && node != root) {\n      key = (root < node)\n        ? static_cast<uint64_t>(root) | (static_cast<uint64_t>(node) << 32)\n        : static_cast<uint64_t>(node) | (static_cast<uint64_t>(root) << 32);\n\n      distgraph[key] = dist;\n      dist = 0.0;\n      root = node;\n    }\n\n    for (int32_t child : tree[node]) {\n      if (static_cast<int32_t>(child) == parent) {\n        continue;\n      }\n\n      float dx = vertices[3*node + 0] - vertices[3*child + 0];\n      float dy = vertices[3*node + 1] - vertices[3*child + 1];\n      float dz = vertices[3*node + 2] - vertices[3*child + 2];\n\n      dx *= dx;\n      dy *= dy;\n      dz *= dz;\n\n      stack.push(child);\n      parents.push(static_cast<int32_t>(node));\n      dist_stack.push(\n        dist + sqrt(dx + dy + dz)\n      );\n      root_stack.push(root);\n    }\n  }\n\n  return distgraph;\n}\n\n// extracting skeletons from binary images produced by\n// other thinning based skeletonization algorithms\n\ninline void compute_neighborhood(\n  int *neighborhood, \n  const int x, const int y, const int z,\n  const uint64_t sx, const uint64_t sy, const uint64_t sz,\n  const int connectivity = 26\n) {\n\n  const int sxy = sx * sy;\n\n  const int plus_x = (x < (static_cast<int>(sx) - 1)); // +x\n  const int minus_x = -1 * (x > 0); // -x\n  const int plus_y = static_cast<int>(sx) 
* (y < static_cast<int>(sy) - 1); // +y\n  const int minus_y = -static_cast<int>(sx) * (y > 0); // -y\n  const int minus_z = -sxy * static_cast<int>(z > 0); // -z\n\n  // 6-hood\n  neighborhood[0] = minus_x;\n  neighborhood[1] = minus_y;\n  neighborhood[2] = minus_z;\n  \n  // 18-hood\n\n  // xy diagonals\n  neighborhood[3] = (connectivity > 6) * (minus_x + minus_y) * (minus_x && minus_y); // up-left\n  neighborhood[4] = (connectivity > 6) * (plus_x + minus_y) * (plus_x && minus_y); // up-right\n\n  // xz diagonals\n  neighborhood[5] = (connectivity > 6) * (minus_x + minus_z) * (minus_x && minus_z); // down-left\n  neighborhood[6] = (connectivity > 6) * (plus_x + minus_z) * (plus_x && minus_z); // down-right\n\n  // yz diagonals\n  neighborhood[7] = (connectivity > 6) * (minus_y + minus_z) * (minus_y && minus_z); // down-left\n  neighborhood[8] = (connectivity > 6) * (plus_y + minus_z) * (plus_y && minus_z); // down-right\n\n  // 26-hood\n\n  // Now the four -z corners of the cube\n  neighborhood[9] = (connectivity > 18) * (minus_x + minus_y + minus_z) * (minus_y && minus_z);\n  neighborhood[10] = (connectivity > 18) * (plus_x + minus_y + minus_z) * (minus_y && minus_z);\n  neighborhood[11] = (connectivity > 18) * (minus_x + plus_y + minus_z) * (plus_y && minus_z);\n  neighborhood[12] = (connectivity > 18) * (plus_x + plus_y + minus_z) * (plus_y && minus_z);\n}\n\nstruct pair_hash {\n  inline std::size_t operator()(const std::pair<uint64_t,uint64_t> & v) const {\n    return v.first * 31 + v.second; // arbitrary hash fn\n  }\n};\n\nstd::unordered_set<std::pair<uint64_t, uint64_t>, pair_hash> \n_extract_edges_from_binary_image(\n  const uint8_t* image,\n  const uint64_t sx, const uint64_t sy, const uint64_t sz,\n  const int connectivity = 26\n) {\n\n  const uint64_t sxy = sx * sy;\n\n  std::unordered_set<std::pair<uint64_t, uint64_t>, pair_hash> edges;\n  edges.reserve(sx * sy * sz / 100);\n\n  int neighborhood[13];\n  uint64_t neighboridx = 0;\n\n  for (uint64_t z = 
0; z < sz; z++) {\n    for (uint64_t y = 0; y < sy; y++) {\n      for (uint64_t x = 0; x < sx; x++) {\n        uint64_t loc = x + sx * y + sxy * z;\n        if (image[loc] == 0) {\n          continue;\n        }\n\n        compute_neighborhood(neighborhood, x, y, z, sx, sy, sz, connectivity);\n\n        for (int i = 0; i < 13; i++) {\n          if (neighborhood[i] == 0) {\n            continue;\n          }\n\n          neighboridx = loc + neighborhood[i];\n          if (image[neighboridx] == 0) {\n            continue;\n          }\n\n          if (loc <= neighboridx) {\n            edges.emplace(std::make_pair(loc, neighboridx));\n          } \n          else {\n            edges.emplace(std::make_pair(neighboridx, loc));\n          }\n        }\n      }\n    }\n  }\n\n  return edges;\n}\n\n};\n\n#endif\n"
  },
  {
    "path": "ext/skeletontricks/skeletontricks.pyx",
    "content": "# cython: language_level=3\n\"\"\"\nCertain operations have to be fast for the skeletonization\nprocedure. The ones that didn't fit elsewhere have a home here.\n\nAuthor: William Silversmith\nAffiliation: Seung Lab, Princeton Neuroscience Institute\nDate: August 2018 - May 2024\n\n*****************************************************************\nThis file is part of Kimimaro.\n\nKimimaro is free software: you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation, either version 3 of the License, or\n(at your option) any later version.\n\nKimimaro is distributed in the hope that it will be useful,\nbut WITHOUT ANY WARRANTY; without even the implied warranty of\nMERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\nGNU General Public License for more details.\n\nYou should have received a copy of the GNU General Public License\nalong with Kimimaro.  If not, see <https://www.gnu.org/licenses/>.\n*****************************************************************\n\"\"\"\ncimport cython\nfrom libc.stdlib cimport calloc, free\nfrom libc.stdint cimport (\n  int8_t, int16_t, int32_t, int64_t,\n  uint8_t, uint16_t, uint32_t, uint64_t\n)\nfrom libcpp cimport bool\nfrom cpython cimport array \nimport array\nimport sys\n\nfrom libcpp.vector cimport vector\nfrom libcpp.unordered_map cimport unordered_map\nfrom libcpp.unordered_set cimport unordered_set\nfrom libcpp.utility cimport pair as cpp_pair\n\ncimport numpy as cnp\nimport numpy as np\n\ncnp.import_array()\n\nfrom collections import defaultdict\n\ncdef float INFINITY = float('inf')\n\nctypedef fused UINT:\n  uint8_t\n  uint16_t\n  uint32_t\n  uint64_t\n  unsigned char\n\nctypedef fused INTEGER: \n  int8_t\n  int16_t\n  int32_t\n  int64_t\n  UINT\n\ncdef extern from \"dijkstra_invalidation.hpp\" namespace \"dijkstra_invalidation\":\n  cdef int64_t _roll_invalidation_ball(\n    uint8_t* field,\n    uint64_t sx, uint64_t 
sy, uint64_t sz, \n    float wx, float wy, float wz, \n    vector[uint64_t] sources,\n    vector[float] max_distances,\n    int connectivity,\n    uint32_t* voxel_connectivity_graph\n  )\n\ncdef extern from \"skeletontricks.hpp\" namespace \"skeletontricks\":\n  cdef size_t _roll_invalidation_cube(\n    uint8_t* labels, float* DBF,\n    int64_t sx, int64_t sy, int64_t sz,\n    float wx, float wy, float wz,\n    size_t* path, size_t path_size,\n    float scale, float constant\n  )\n\n  cdef vector[T] _find_cycle[T](T* edges, size_t Ne)\n  \n  cdef unordered_map[ uint64_t, float ] _create_distance_graph(\n    float* vertices, size_t Nv, \n    uint32_t* edges, size_t Ne, uint32_t start_node,\n    vector[int32_t] critical_points_vec\n  )\n\n  cdef struct pair_hash:\n    size_t __call__(cpp_pair[uint64_t,uint64_t] v)\n  cdef unordered_set[ cpp_pair[uint64_t, uint64_t], pair_hash ] _extract_edges_from_binary_image(\n    uint8_t* image, \n    uint64_t sx, uint64_t sy, uint64_t sz,\n    int connectivity\n  )\n\ndef find_cycle(cnp.ndarray[int32_t, ndim=2] edges):\n  \"\"\"\n  Given a graph of edges that are a single connected component,\n  find a cycle via depth first search.\n\n  Returns: list of edges in a cycle (empty list if no cycle is found)\n  \"\"\"\n  if edges.size == 0:\n    return np.zeros((0,), dtype=np.uint32)\n\n  edges = np.ascontiguousarray(edges)\n\n  cdef cnp.ndarray[int32_t, ndim=1] elist = np.array(\n    _find_cycle[int32_t](\n      <int32_t*>&edges[0,0], <size_t>(edges.size // 2)\n    ),\n    dtype=np.int32\n  )\n  return elist\n\ndef create_distance_graph(skeleton):\n  \"\"\"\n  Creates the distance \"supergraph\" from a single connected component \n  skeleton as described in _remove_ticks.\n\n  Returns: a distance \"supergraph\" describing the physical distance\n    between the critical points in the skeleton's structure.\n\n  Example skeleton with output:\n\n      60nm   60nm   60nm     \n    1------2------3------4\n      30nm |  70nm \\\n           
5        ----6\n\n  { \n    (1,2): 60,  \n    (2,3): 60,\n    (2,5): 30,\n    (3,4): 60,\n    (3,6): 70,\n  }\n  \"\"\"\n  cdef cnp.ndarray[float, ndim=2] vertices = skeleton.vertices\n  cdef cnp.ndarray[uint32_t, ndim=2] edges = skeleton.edges\n\n  unique_nodes, unique_counts = np.unique(edges, return_counts=True)\n  terminal_nodes = unique_nodes[ unique_counts == 1 ]\n  branch_nodes = set(unique_nodes[ unique_counts >= 3 ])\n  \n  critical_points = set(terminal_nodes)\n  critical_points.update(branch_nodes)\n\n  res = _create_distance_graph(\n    <float*>&vertices[0,0], vertices.shape[0],\n    <uint32_t*>&edges[0,0], edges.shape[0], terminal_nodes[0],\n    list(critical_points)\n  )\n  cdef dict supergraph = res\n\n  cdef dict real_supergraph = {}\n  cdef uint64_t key = 0\n  cdef int32_t e1, e2\n\n  for key in supergraph.keys():\n    e2 = <int32_t>(key & 0xffffffff)\n    e1 = <int32_t>(key >> 32)\n    real_supergraph[ (e1, e2) ] = supergraph[key]\n\n  return real_supergraph\n\n\n@cython.boundscheck(False)\n@cython.wraparound(False)  # turn off negative index wrapping for entire function\n@cython.nonecheck(False)\ndef inf2zero(cnp.ndarray[float, cast=True, ndim=3] field):\n  \"\"\"\n  inf2zero(cnp.ndarray[float, cast=True, ndim=3] field)\n\n  Convert infinities to zeros.\n\n  Returns: field\n  \"\"\"\n  cdef size_t sx, sy, sz \n  cdef size_t  x,  y,  z\n\n  sx = field.shape[0]\n  sy = field.shape[1]\n  sz = field.shape[2]\n\n  for z in range(0, sz):\n    for y in range(0, sy):\n      for x in range(0, sx):\n        if field[x,y,z] == INFINITY:\n          field[x,y,z] = 0\n\n  return field\n\n@cython.boundscheck(False)\n@cython.wraparound(False)  # turn off negative index wrapping for entire function\n@cython.nonecheck(False)\ndef zero2inf(cnp.ndarray[float, cast=True, ndim=3] field):\n  \"\"\"\n  zero2inf(cnp.ndarray[float, cast=True, ndim=3] field)\n\n  Convert zeros to positive infinities.\n\n  Returns: field\n  \"\"\"\n  cdef size_t sx, sy, sz \n  cdef size_t  
x,  y,  z\n\n  sx = field.shape[0]\n  sy = field.shape[1]\n  sz = field.shape[2]\n\n  for z in range(0, sz):\n    for y in range(0, sy):\n      for x in range(0, sx):\n        if (field[x,y,z] == 0):\n          field[x,y,z] = INFINITY\n\n  return field\n\n@cython.boundscheck(False)  \n@cython.wraparound(False)  # turn off negative index wrapping for entire function \n@cython.nonecheck(False)  \ndef zero_out_all_except(cnp.ndarray[INTEGER, cast=True, ndim=3] field, INTEGER leave_alone): \n  \"\"\"\n  zero_out_all_except(cnp.ndarray[INTEGER, cast=True, ndim=3] field, INTEGER leave_alone)\n\n  Change all values in field to zero except `leave_alone`.\n\n  Returns: field\n  \"\"\"\n  cdef size_t sx, sy, sz   \n  cdef size_t  x,  y,  z \n\n  sx = field.shape[0]  \n  sy = field.shape[1] \n  sz = field.shape[2] \n\n  for z in range(0, sz): \n    for y in range(0, sy):  \n      for x in range(0, sx):  \n        if (field[x,y,z] != leave_alone): \n          field[x,y,z] = 0  \n\n  return field  \n\n@cython.boundscheck(False)\n@cython.wraparound(False)  # turn off negative index wrapping for entire function\n@cython.nonecheck(False)\ndef finite_max(cnp.ndarray[float, cast=True, ndim=3] field):\n  \"\"\"\n  float finite_max(cnp.ndarray[float, cast=True, ndim=3] field)\n\n  Given a field of floats that may include infinities, find the \n  largest finite value.\n  \"\"\"\n  cdef size_t sx, sy, sz \n  cdef size_t  x,  y,  z\n\n  sx = field.shape[0]\n  sy = field.shape[1]\n  sz = field.shape[2]\n\n  cdef float maximum = -INFINITY\n  for z in range(0, sz):\n    for y in range(0, sy):\n      for x in range(0, sx):\n        if (field[x,y,z] > maximum) and (field[x,y,z] < +INFINITY):\n          maximum = field[x,y,z]\n\n  return maximum\n\n@cython.boundscheck(False)\n@cython.wraparound(False)  # turn off negative index wrapping for entire function\n@cython.nonecheck(False)\ndef finite_min(cnp.ndarray[float, cast=True, ndim=3] field):\n  \"\"\"\n  float finite_min(cnp.ndarray[float, 
cast=True, ndim=3] field)\n\n  Given a field of floats that may include infinities, find the \n  minimum finite value.\n  \"\"\"\n  cdef size_t sx, sy, sz \n  cdef size_t  x,  y,  z\n\n  sx = field.shape[0]\n  sy = field.shape[1]\n  sz = field.shape[2]\n\n  cdef float minimum = -INFINITY\n  for z in range(0, sz):\n    for y in range(0, sy):\n      for x in range(0, sx):\n        if (field[x,y,z] < minimum) and (field[x,y,z] > -INFINITY):\n          minimum = field[x,y,z]\n\n  return minimum\n\n@cython.boundscheck(False)\n@cython.wraparound(False)  # turn off negative index wrapping for entire function\n@cython.nonecheck(False)\ndef first_label(cnp.ndarray[uint8_t, cast=True, ndim=3] labels):\n  \"\"\"\n  uint8_t first_label(cnp.ndarray[uint8_t, cast=True, ndim=3] labels)\n\n  Scan through labels to find the first non-zero value and return it.\n  \"\"\"\n  cdef size_t sx, sy, sz \n  cdef size_t  x,  y,  z\n\n  sx = labels.shape[0]\n  sy = labels.shape[1]\n  sz = labels.shape[2]\n\n  for z in range(0, sz):\n    for y in range(0, sy):\n      for x in range(0, sx):\n        if labels[x,y,z]:\n          return (x,y,z)\n\n  return None\n\n@cython.boundscheck(False)\n@cython.wraparound(False)  # turn off negative index wrapping for entire function\n@cython.nonecheck(False)\ndef find_target(\n    cnp.ndarray[uint8_t, cast=True, ndim=3] labels, \n    cnp.ndarray[float, ndim=3] PDRF\n  ):\n  \"\"\"\n  find_target(ndarray[uint8_t, cast=True, ndim=3] labels, ndarray[float, ndim=3] PDRF)\n\n  Given a binary image and a coregistered map of values to it, \n  find the coordinate of the voxel corresponding to the first\n  instance of the maximum map value.\n\n  Returns: (x, y, z)\n  \"\"\"\n  cdef size_t x,y,z\n  cdef size_t sx, sy, sz\n\n  sx = labels.shape[0]\n  sy = labels.shape[1]\n  sz = labels.shape[2]\n\n  cdef int64_t mx, my, mz\n\n  mx = -1\n  my = -1\n  mz = -1\n\n  cdef float maxpdrf = -INFINITY\n  for x in range(0, sx):\n    for y in range(0, sy):\n      for z in 
range(0, sz):\n        if labels[x,y,z] and PDRF[x,y,z] > maxpdrf:\n          maxpdrf = PDRF[x,y,z]\n          mx = x\n          my = y\n          mz = z\n\n  return (mx, my, mz)\n\n@cython.boundscheck(False)\n@cython.wraparound(False)  # turn off negative index wrapping for entire function\n@cython.nonecheck(False)\n@cython.binding(True)\ndef roll_invalidation_ball_inside_component(\n    cnp.ndarray[uint8_t, cast=True, ndim=3] labels, \n    cnp.ndarray[float, ndim=3] DBF, \n    float scale, \n    float constant,\n    anisotropy,\n    path,\n    voxel_connectivity_graph = None,\n    connectivity = 26,\n):\n  cdef int64_t sx, sy, sz \n  sx = labels.shape[0]\n  sy = labels.shape[1]\n  sz = labels.shape[2]\n\n  cdef size_t sxy = sx * sy\n\n  cdef float wx, wy, wz\n  (wx, wy, wz) = anisotropy\n\n  max_distances = [ \n    (scale * DBF[x,y,z] + constant) for (x,y,z) in path \n  ]\n\n  path = [ \n    coord[0] + sx * coord[1] + sxy * coord[2] \n    for coord in path if tuple(coord)\n  ]\n\n  cdef uint32_t* vcg = NULL\n  cdef cnp.ndarray[uint32_t, ndim=3] vcg_arr\n\n  if isinstance(voxel_connectivity_graph, np.ndarray):\n    vcg_arr = voxel_connectivity_graph\n    vcg = <uint32_t*>&vcg_arr[0,0,0]\n\n  invalidated = _roll_invalidation_ball(\n    <uint8_t*>&labels[0,0,0],\n    sx, sy, sz, \n    wx, wy, wz,\n    path, max_distances,\n    connectivity, \n    vcg\n  )\n\n  return (invalidated, labels)\n\n@cython.boundscheck(False)\n@cython.wraparound(False)  # turn off negative index wrapping for entire function\n@cython.nonecheck(False)\n@cython.binding(True)\ndef roll_invalidation_ball(\n    cnp.ndarray[uint8_t, cast=True, ndim=3] labels, \n    cnp.ndarray[float, ndim=3] DBF, \n    path, float scale, float const,\n    anisotropy=(1,1,1),\n    invalid_vertices={},\n  ):\n  \"\"\"\n  Given an anisotropic binary image, its distance transform, and a path \n  traversing the binary image, erase the voxels surrounding the path\n  in a sphere around each vertex on the path 
corresponding to the \n  equation: \n\n  r = scale * DBF[x,y,z] + const\n\n  Returns: modified labels\n  \"\"\"\n  cdef int64_t sx, sy, sz \n  sx = labels.shape[0]\n  sy = labels.shape[1]\n  sz = labels.shape[2]\n\n  cdef float wx, wy, wz\n  (wx, wy, wz) = anisotropy\n    \n  cdef float radius, dist\n  cdef int64_t minx, maxx, miny, maxy, minz, maxz\n\n  cdef int64_t x,y,z\n  cdef int64_t x0, y0, z0\n\n  cdef size_t invalidated = 0\n\n  for coord in path:\n    if tuple(coord) in invalid_vertices:\n      continue\n\n    (x0, y0, z0) = coord\n    radius = DBF[x0,y0,z0] * scale + const # physical units (e.g. nm)\n\n    minx = max(0,  <int64_t>(0.5 + (x0 - (radius / wx))))\n    maxx = min(sx, <int64_t>(0.5 + (x0 + (radius / wx))))\n    miny = max(0,  <int64_t>(0.5 + (y0 - (radius / wy))))\n    maxy = min(sy, <int64_t>(0.5 + (y0 + (radius / wy))))\n    minz = max(0,  <int64_t>(0.5 + (z0 - (radius / wz))))\n    maxz = min(sz, <int64_t>(0.5 + (z0 + (radius / wz))))\n\n    radius *= radius \n\n    for x in range(minx, maxx):\n      for y in range(miny, maxy):\n        for z in range(minz, maxz):\n          if not labels[x,y,z]:\n            continue \n\n          dist = (wx * (x - x0)) ** 2 + (wy * (y - y0)) ** 2 + (wz * (z - z0)) ** 2\n          if dist <= radius:\n            invalidated += 1\n            labels[x,y,z] = 0\n\n  return invalidated, labels\n\n@cython.boundscheck(False)\n@cython.wraparound(False)  # turn off negative index wrapping for entire function\n@cython.nonecheck(False)\n@cython.binding(True)\ndef get_mapping(\n    cnp.ndarray[INTEGER, ndim=3] orig_labels, \n    cnp.ndarray[UINT, ndim=3] cc_labels\n  ):\n  \"\"\"\n  Given a set of possibly not connected labels \n  and an image containing their labeled connected components, \n  produce a dictionary containing the inverse of this mapping.\n\n  Returns: { $CC_LABEL: $ORIGINAL_LABEL }\n  \"\"\"\n\n  cdef size_t sx, sy, sz \n  sx = orig_labels.shape[0]\n  sy = orig_labels.shape[1]\n  sz = 
orig_labels.shape[2]\n\n  cdef size_t x,y,z \n\n  remap = {}\n\n  if orig_labels.size == 0:\n    return remap\n\n  cdef UINT last_label = cc_labels[0,0,0]\n  remap[cc_labels[0,0,0]] = orig_labels[0,0,0]\n\n  for z in range(sz):\n    for y in range(sy):\n      for x in range(sx):\n        if last_label == cc_labels[x,y,z]:\n          continue\n        remap[cc_labels[x,y,z]] = orig_labels[x,y,z]\n        last_label = cc_labels[x,y,z]\n\n  return remap\n\n@cython.binding(True)\ndef compute_centroids(\n    cnp.ndarray[UINT, ndim=2] labels,\n    float wx, float wy\n  ):\n  \"\"\"\n  Compute the centroid for every label on a 2D image at once.\n\n  Returns: { $segid: (x, y), ... }\n  \"\"\"\n\n  cdef float[:] xsum = np.zeros( (labels.size,), dtype=np.float32)\n  cdef float[:] ysum = np.zeros( (labels.size,), dtype=np.float32)\n  cdef uint32_t[:] labelct = np.zeros( (labels.size,), dtype=np.uint32)\n\n  cdef size_t sx, sy\n  sx = labels.shape[0]\n  sy = labels.shape[1]\n\n  cdef size_t x, y\n  cdef uint32_t label = 0\n\n  for x in range(sx):\n    for y in range(sy):\n      label = labels[x,y]\n      if label == 0:\n        continue\n\n      xsum[label] += x \n      ysum[label] += y \n      labelct[label] += 1\n\n  result = {}\n\n  cdef float cx = wx * sx / 2\n  cdef float cy = wy * sy / 2\n\n  cdef float px, py\n\n  for label in range(labels.size):\n    if labelct[label] == 0:\n      continue\n\n    px = wx * <float>xsum[label] / <float>labelct[label]\n    py = wy * <float>ysum[label] / <float>labelct[label]\n\n    # Since we don't know which coordinate frame we \n    # are using, round toward the center of the image\n    # to ensure we get the same pixel every time.\n    if px - cx >= 0:\n      px = px # will be truncated towards center\n    else:\n      px = px + wx\n\n    if py - cy >= 0:\n      py = py # will be truncated towards center\n    else:\n      py = py + wy\n\n    result[label] = (<int>(px / wx), <int>(py / wy))\n\n  return 
result\n\n@cython.binding(True)\ndef find_border_targets(\n    cnp.ndarray[float, ndim=2] dt,\n    cnp.ndarray[UINT, ndim=2] cc_labels,\n    float wx, float wy\n  ):\n  \"\"\"\n  Given a set of connected components that lie within \n  a plane and their distance transform, return a map of\n  label ID to the coordinate of its maximum distance \n  transform value. If there are multiple maxima, we \n  disambiguate based on topological criteria that are\n  coordinate frame independent in order to avoid dealing\n  with issues that come from the six rotated frames and\n  their mirrored partners.\n\n  The purpose of this function is to fix the edge effect\n  the standard TEASAR algorithm generates and ensure that\n  we can trivially join skeletons from adjacent chunks.  \n\n  Rotating the (x,y) pairs into their appropriate frame\n  is performed in the function that calls this one.\n\n  Returns: { $SEGID: (x, y), ... }\n  \"\"\"\n  cdef size_t sx, sy\n  sx = dt.shape[0]\n  sy = dt.shape[1]\n\n  cdef size_t x, y\n\n  mx = defaultdict(float)\n  pts = {}\n\n  cdef UINT label = 0\n  cdef dict centroids = compute_centroids(cc_labels, wx, wy)\n  cdef float px, py\n  cdef float centx, centy\n\n  for y in range(sy):\n    for x in range(sx):\n      label = cc_labels[x,y]\n      if label == 0:\n        continue\n      elif dt[x,y] == 0:\n        continue\n      elif dt[x,y] > mx[label]:\n        mx[label] = dt[x,y]\n        pts[label] = (x,y)\n      elif mx[label] == dt[x,y]:\n        px, py = pts[label]\n        centx, centy = centroids[label]\n        pts[label] = compute_tiebreaker_maxima(\n          px, py, x, y, \n          centx, centy,\n          sx, sy, wx, wy\n        )\n\n  return pts\n\ndef compute_tiebreaker_maxima(\n    float px, float py, \n    float x, float y, \n    float centx, float centy,\n  
  float sx, float sy,\n    float wx, float wy\n  )\n\n  This function breaks ties for `find_border_targets`.\n\n  (px,py): A previously found distance transform maxima \n  (x,y): The coordinate of the newly found maxima\n  (sx,sy): The length and width of the image plane.\n  (wx,wy): Weighting for anisotropy.\n  (centx, centy): The centroid of the current label.\n\n  We use the following topological criteria to achieve\n  a coordinate frame-free voxel selection. We pick\n  the result of the first criterion that is satisfied.\n\n  1) Pick the voxel closest to the centroid of the label.\n  2) The voxel closest to the centroid of the plane.\n  3) Closest to a corner of the plane.\n  4) Closest to an edge of the plane.\n  5) The previous maxima.\n\n  The worst case would be an annulus drawn around the center,\n  which would result in four equally eligible pixels....\n\n  Hopefully this won't happen too often...\n\n  Returns: some (x, y)\n  \"\"\"\n  cdef float cx = wx * sx / 2.0\n  cdef float cy = wy * sy / 2.0\n\n  cdef float dist1 = distsq(px,py, centx,centy, wx,wy)\n  cdef float dist2 = distsq( x, y, centx,centy, wx,wy)\n\n  if dist2 < dist1:\n    return (x, y)\n  elif dist1 == dist2:\n    dist1 = distsq(px,py, cx,cy, wx,wy)\n    dist2 = distsq( x, y, cx,cy, wx,wy)\n    if dist2 < dist1:\n      return (x,y)\n    elif dist1 == dist2:\n      dist1 = cornerness(px, py, sx, sy, wx,wy)\n      dist2 = cornerness( x,  y, sx, sy, wx,wy)\n      if dist2 < dist1:\n        return (x, y)\n      elif dist1 == dist2:\n        dist1 = edgeness(px, py, sx, sy, wx,wy)\n        dist2 = edgeness( x,  y, sx, sy, wx,wy)\n        if dist2 < dist1:\n          return (x, y)\n\n  return (px, py)\n\ncdef float edgeness(\n    float x, float y, float sx, float sy,\n    float wx, float wy\n  ):\n  \"\"\"\n  float edgeness(float x, float y, float sx, float sy)\n\n  Nearness of (x,y) to the edge of an image of size (sx,sy).\n  \"\"\"\n  return min(\n    wx * (x - 0.5),\n    wx * (sx - 0.5 - 
x),\n    wy * (y - 0.5),\n    wy * (sy - 0.5 - y)\n  )\n\ncdef float cornerness(\n    float x, float y, float sx, float sy,\n    float wx, float wy\n  ):\n  \"\"\"\n  float cornerness(\n      float x, float y, float sx, float sy,\n      float wx, float wy\n  )\n\n  Nearness of (x,y) to a corner of an image of size (sx,sy).\n  \"\"\"\n  return min( \n    distsq(x,y,-0.5,-0.5, wx, wy), \n    distsq(x,y,sx-0.5,-0.5, wx, wy),\n    distsq(x,y,sx-0.5,sy-0.5, wx, wy),\n    distsq(x,y,-0.5,sy-0.5, wx, wy)\n  )\n\ncdef float distsq(\n    float p1x, float p1y, \n    float p2x, float p2y, \n    float wx, float wy\n  ):\n\n  p1x = wx * (p1x - p2x)\n  p1y = wy * (p1y - p2y)\n  return p1x * p1x + p1y * p1y \n\n@cython.boundscheck(False)\n@cython.wraparound(False)  # turn off negative index wrapping for entire function\n@cython.nonecheck(False)\n@cython.binding(True)\ndef roll_invalidation_cube(\n    cnp.ndarray[uint8_t, cast=True, ndim=3] labels, \n    cnp.ndarray[float, ndim=3] DBF, \n    path, float scale, float const,\n    anisotropy=(1,1,1),\n    invalid_vertices={},\n  ):\n  \"\"\"\n  Given an anisotropic binary image, its distance transform, and a path \n  traversing the binary image, erase the voxels surrounding the path\n  in a cube around each vertex. 
In contrast to `roll_invalidation_ball`,\n  this function runs in time linear in the number of image pixels.\n  \"\"\"\n  cdef int64_t sx, sy, sz \n  sx = labels.shape[0]\n  sy = labels.shape[1]\n  sz = labels.shape[2]\n\n  cdef size_t sxy = sx * sy\n\n  cdef float wx, wy, wz\n  (wx, wy, wz) = anisotropy\n\n  path = [ \n    coord[0] + sx * coord[1] + sxy * coord[2] \n    for coord in path if tuple(coord) not in invalid_vertices \n  ]\n  path = np.array(path, dtype=np.uintp)\n\n  cdef size_t[:] pathview = path\n\n  cdef size_t invalidated = _roll_invalidation_cube(\n    <uint8_t*>&labels[0,0,0], <float*>&DBF[0,0,0],\n    sx, sy, sz, \n    wx, wy, wz,\n    <size_t*>&pathview[0], path.size,\n    scale, const\n  )\n\n  return invalidated, labels\n\n@cython.boundscheck(False)\n@cython.wraparound(False)  # turn off negative index wrapping for entire function\n@cython.nonecheck(False)\ndef find_cycle_cython(cnp.ndarray[int32_t, ndim=2] edges):\n  \"\"\"\n  Given a graph of edges that are a single connected component,\n  find a cycle via depth first search.\n\n  Returns: list of edges in a cycle (empty list if no cycle is found)\n  \"\"\"\n  index = defaultdict(set)\n  visited = defaultdict(int)\n\n  if edges.size == 0:\n    return np.array([], dtype=np.int32)\n\n  for e1, e2 in edges:\n    index[e1].add(e2)\n    index[e2].add(e1)\n\n  cdef int root = edges[0,0]\n  cdef int node = -1\n  cdef int child = -1\n  cdef int parent = -1\n  cdef int depth = -1\n  cdef int i = 0\n\n  cdef list stack = [root]\n  cdef list parents = [-1]\n  cdef list depth_stack = [0]\n  cdef list path = []\n\n  while stack:\n    node = stack.pop()\n    parent = parents.pop()\n    depth = depth_stack.pop()\n\n    for i in range(len(path) - depth):\n      path.pop()\n\n    path.append(node)\n\n    if visited[node] == 1:\n      break\n\n    visited[node] = 1\n\n    for child in index[node]:\n      if child != parent:\n        stack.append(child)\n        parents.append(node)\n        
depth_stack.append(depth + 1)\n\n  if len(path) <= 1:\n    return np.array([], dtype=np.int32)\n  \n  for i in range(len(path) - 1):\n    if path[i] == node:\n      break\n\n  path = path[i:]\n\n  if len(path) < 3:\n    return np.array([], dtype=np.int32)\n\n  return np.array(path, dtype=np.int32)\n\ndef find_avocado_fruit(\n  cnp.ndarray[INTEGER, ndim=3] labels, \n  size_t cx, size_t cy, size_t cz,\n  INTEGER background = 0\n):\n  \"\"\"\n  Tests to see if the current coordinate is inside \n  the nucleus of a somata that has been assigned\n  to a separate label from the rest of the cell.\n\n  Returns: (pit, fruit)\n  \"\"\"\n  cdef size_t sx, sy, sz\n  sx, sy, sz = labels.shape[:3]\n  cdef size_t voxels = sx * sy * sz \n\n  if cx >= sx or cy >= sy or cz >= sz:\n    raise ValueError(\n      \"<{},{},{}> must be be contained within shape <{},{},{}>\".format(\n        cx,cy,cz,sx,sy,sz\n    ))\n\n  cdef size_t x, y, z \n  cdef INTEGER label = labels[cx, cy, cz]\n  cdef list changes = [ None ] * 6\n\n  for x in range(cx, sx):\n    if labels[x,cy,cz] == background:\n      break\n    elif labels[x,cy,cz] != label:\n      changes[0] = labels[x,cy,cz]\n      break\n\n  for x in range(cx, 0, -1):\n    if labels[x,cy,cz] == background:\n      break\n    elif labels[x,cy,cz] != label:\n      changes[1] = labels[x,cy,cz]\n      break\n\n  for y in range(cy, sy):\n    if labels[cx,y,cz] == background:\n      break\n    if labels[cx,y,cz] != label:\n      changes[2] = labels[cx,y,cz]\n      break\n\n  for y in range(cy, 0, -1):\n    if labels[cx,y,cz] == background:\n      break\n    if labels[cx,y,cz] != label:\n      changes[3] = labels[cx,y,cz]\n      break\n\n  for z in range(cz, sz):\n    if labels[cx,cy,z] == background:\n      break\n    if labels[cx,cy,z] != label:\n      changes[4] = labels[cx,cy,z]\n      break\n\n  for z in range(cz, 0, -1):\n    if labels[cx,cy,z] == background:\n      break\n    if labels[cx,cy,z] != label:\n      changes[5] = labels[cx,cy,z]\n     
 break\n\n  changes = [ _ for _ in changes if _ is not None ]\n\n  # Too little info to make a decision\n  if len(changes) < 3:\n    return (label, label)\n\n  if len(changes) > 3: # if more than 3, allow one non-match\n    allowed_differences = 1\n  else: # allow no non-matches (we're in a corner)\n    allowed_differences = 0\n\n  uniq, cts = np.unique(changes, return_counts=True)\n  candidate_fruit_index = np.argmax(cts)\n  differences = len(changes) - cts[candidate_fruit_index]\n\n  # it's not an avocado if there's lots of\n  # labels surrounding the candidate \"pit\"\n  if differences > allowed_differences:\n    return (label, label)\n  \n  return (label, uniq[candidate_fruit_index])\n\nclass CachedTargetFinder:\n  def __init__(self, mask: np.ndarray, daf: np.ndarray):\n    \"\"\"\n    From DAF, compute a sorted list of the maximum values\n    so that finding them becomes very fast.\n    \"\"\"\n    mask_indices = np.flatnonzero(mask.ravel(order='F'))\n    if mask.size < np.iinfo(np.uint32).max:\n      mask_indices = mask_indices.astype(np.uint32, copy=False)\n    daf_sort = np.argsort(daf.ravel(order='F')[mask_indices])\n    daf_sort = np.flip(daf_sort)\n    self.daf_indices = mask_indices[daf_sort]\n\n  def find_target(self, mask: np.ndarray):\n    \"\"\"\n    Find the coordinate of a voxel corresponding \n    the maximum map value.\n\n    Returns: (x, y, z)\n    \"\"\"\n    first_positive_index = self.first_label_indexed(\n      mask.ravel(order='F'), self.daf_indices\n    )\n    if first_positive_index is None:\n      self.daf_indices = self.daf_indices[self.daf_indices.size:]  # Clear it.\n      return None\n\n    # This tells us mask positions daf_indices[0:first_positive_index] are now\n    # zeroed out. 
We assume that this is permanent, so we don't need to search\n    # those positions again next time.\n    self.daf_indices = self.daf_indices[first_positive_index:]\n\n    return np.unravel_index(self.daf_indices[0], mask.shape, order='F')\n\n  @cython.boundscheck(False)\n  @cython.wraparound(False)  # turn off negative index wrapping for entire function\n  @cython.nonecheck(False)\n  def first_label_indexed(self, uint8_t[:] labels not None, INTEGER[:] indices not None):\n    \"\"\"\n    Returns: first i for which labels[indices[i]] is non-zero.\n    \"\"\"\n    cdef size_t length = indices.size\n    cdef size_t i = 0\n    cdef INTEGER label_index\n\n    for i in range(length):\n      label_index = indices[i]\n      if labels[label_index]:\n        return i\n\n    return None  \n\ndef extract_edges_from_binary_image(uint8_t[:,:,:] binimg, int connectivity = 26):\n  cdef uint64_t sx, sy, sz\n  sx, sy, sz = tuple(binimg.shape)[:3]\n\n  cdef uint64_t sxy = sx * sy\n\n  binimg = np.asfortranarray(binimg)\n  cdef unordered_set[cpp_pair[uint64_t,uint64_t], pair_hash] edges = _extract_edges_from_binary_image(\n    &binimg[0,0,0], \n    sx, sy, sz, \n    connectivity\n  )\n\n  numbering = {}\n  cdef int64_t i = 0\n  for edge in edges:\n    for v in (edge.first, edge.second):\n      if v not in numbering:\n        numbering[v] = i\n        i += 1\n\n  inumbering = { v:k for k,v in numbering.items() }\n  vertices = []\n\n  cdef uint64_t loc, x, y, z\n  for i in range(len(inumbering)):\n    loc = inumbering[i]\n    z = loc // sxy\n    y = (loc - z * sxy) // sx\n    x = loc - z * sxy - y * sx\n    vertices.append((x,y,z))\n\n  int_edges = []\n  for v1,v2 in edges:\n    int_edges.append((numbering[v1], numbering[v2]))\n\n  vertices = np.array(vertices, dtype=np.uint32)\n  int_edges = np.array(int_edges, dtype=np.uint32)\n\n  return (vertices, int_edges)\n\n\n\n"
  },
  {
    "path": "ext/skeletontricks/unordered_dense.hpp",
    "content": "///////////////////////// ankerl::unordered_dense::{map, set} /////////////////////////\n\n// A fast & densely stored hashmap and hashset based on robin-hood backward shift deletion.\n// Version 4.5.0\n// https://github.com/martinus/unordered_dense\n//\n// Licensed under the MIT License <http://opensource.org/licenses/MIT>.\n// SPDX-License-Identifier: MIT\n// Copyright (c) 2022-2024 Martin Leitner-Ankerl <martin.ankerl@gmail.com>\n//\n// Permission is hereby granted, free of charge, to any person obtaining a copy\n// of this software and associated documentation files (the \"Software\"), to deal\n// in the Software without restriction, including without limitation the rights\n// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n// copies of the Software, and to permit persons to whom the Software is\n// furnished to do so, subject to the following conditions:\n//\n// The above copyright notice and this permission notice shall be included in all\n// copies or substantial portions of the Software.\n//\n// THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE\n// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n// SOFTWARE.\n\n#ifndef ANKERL_UNORDERED_DENSE_H\n#define ANKERL_UNORDERED_DENSE_H\n\n// see https://semver.org/spec/v2.0.0.html\n#define ANKERL_UNORDERED_DENSE_VERSION_MAJOR 4 // NOLINT(cppcoreguidelines-macro-usage) incompatible API changes\n#define ANKERL_UNORDERED_DENSE_VERSION_MINOR 5 // NOLINT(cppcoreguidelines-macro-usage) backwards compatible functionality\n#define ANKERL_UNORDERED_DENSE_VERSION_PATCH 0 // NOLINT(cppcoreguidelines-macro-usage) backwards compatible bug fixes\n\n// API versioning with inline namespace, see https://www.foonathan.net/2018/11/inline-namespaces/\n\n// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)\n#define ANKERL_UNORDERED_DENSE_VERSION_CONCAT1(major, minor, patch) v##major##_##minor##_##patch\n// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)\n#define ANKERL_UNORDERED_DENSE_VERSION_CONCAT(major, minor, patch) ANKERL_UNORDERED_DENSE_VERSION_CONCAT1(major, minor, patch)\n#define ANKERL_UNORDERED_DENSE_NAMESPACE   \\\n    ANKERL_UNORDERED_DENSE_VERSION_CONCAT( \\\n        ANKERL_UNORDERED_DENSE_VERSION_MAJOR, ANKERL_UNORDERED_DENSE_VERSION_MINOR, ANKERL_UNORDERED_DENSE_VERSION_PATCH)\n\n#if defined(_MSVC_LANG)\n#    define ANKERL_UNORDERED_DENSE_CPP_VERSION _MSVC_LANG\n#else\n#    define ANKERL_UNORDERED_DENSE_CPP_VERSION __cplusplus\n#endif\n\n#if defined(__GNUC__)\n// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)\n#    define ANKERL_UNORDERED_DENSE_PACK(decl) decl __attribute__((__packed__))\n#elif defined(_MSC_VER)\n// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)\n#    define ANKERL_UNORDERED_DENSE_PACK(decl) __pragma(pack(push, 1)) decl __pragma(pack(pop))\n#endif\n\n// exceptions\n#if defined(__cpp_exceptions) || defined(__EXCEPTIONS) || defined(_CPPUNWIND)\n#    define 
ANKERL_UNORDERED_DENSE_HAS_EXCEPTIONS() 1 // NOLINT(cppcoreguidelines-macro-usage)\n#else\n#    define ANKERL_UNORDERED_DENSE_HAS_EXCEPTIONS() 0 // NOLINT(cppcoreguidelines-macro-usage)\n#endif\n#ifdef _MSC_VER\n#    define ANKERL_UNORDERED_DENSE_NOINLINE __declspec(noinline)\n#else\n#    define ANKERL_UNORDERED_DENSE_NOINLINE __attribute__((noinline))\n#endif\n\n// defined in unordered_dense.cpp\n#if !defined(ANKERL_UNORDERED_DENSE_EXPORT)\n#    define ANKERL_UNORDERED_DENSE_EXPORT\n#endif\n\n#if ANKERL_UNORDERED_DENSE_CPP_VERSION < 201703L\n#    error ankerl::unordered_dense requires C++17 or higher\n#else\n#    include <array>            // for array\n#    include <cstdint>          // for uint64_t, uint32_t, uint8_t, UINT64_C\n#    include <cstring>          // for size_t, memcpy, memset\n#    include <functional>       // for equal_to, hash\n#    include <initializer_list> // for initializer_list\n#    include <iterator>         // for pair, distance\n#    include <limits>           // for numeric_limits\n#    include <memory>           // for allocator, allocator_traits, shared_ptr\n#    include <optional>         // for optional\n#    include <stdexcept>        // for out_of_range\n#    include <string>           // for basic_string\n#    include <string_view>      // for basic_string_view, hash\n#    include <tuple>            // for forward_as_tuple\n#    include <type_traits>      // for enable_if_t, declval, conditional_t, ena...\n#    include <utility>          // for forward, exchange, pair, as_const, piece...\n#    include <vector>           // for vector\n#    if ANKERL_UNORDERED_DENSE_HAS_EXCEPTIONS() == 0\n#        include <cstdlib> // for abort\n#    endif\n\n#    if defined(__has_include) && !defined(ANKERL_UNORDERED_DENSE_DISABLE_PMR)\n#        if __has_include(<memory_resource>)\n#            define ANKERL_UNORDERED_DENSE_PMR std::pmr // NOLINT(cppcoreguidelines-macro-usage)\n#            include <memory_resource>                  // for 
polymorphic_allocator\n#        elif __has_include(<experimental/memory_resource>)\n#            define ANKERL_UNORDERED_DENSE_PMR std::experimental::pmr // NOLINT(cppcoreguidelines-macro-usage)\n#            include <experimental/memory_resource>                   // for polymorphic_allocator\n#        endif\n#    endif\n\n#    if defined(_MSC_VER) && defined(_M_X64)\n#        include <intrin.h>\n#        pragma intrinsic(_umul128)\n#    endif\n\n#    if defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__)\n#        define ANKERL_UNORDERED_DENSE_LIKELY(x) __builtin_expect(x, 1)   // NOLINT(cppcoreguidelines-macro-usage)\n#        define ANKERL_UNORDERED_DENSE_UNLIKELY(x) __builtin_expect(x, 0) // NOLINT(cppcoreguidelines-macro-usage)\n#    else\n#        define ANKERL_UNORDERED_DENSE_LIKELY(x) (x)   // NOLINT(cppcoreguidelines-macro-usage)\n#        define ANKERL_UNORDERED_DENSE_UNLIKELY(x) (x) // NOLINT(cppcoreguidelines-macro-usage)\n#    endif\n\nnamespace ankerl::unordered_dense {\ninline namespace ANKERL_UNORDERED_DENSE_NAMESPACE {\n\nnamespace detail {\n\n#    if ANKERL_UNORDERED_DENSE_HAS_EXCEPTIONS()\n\n// make sure this is not inlined as it is slow and dramatically enlarges code, thus making other\n// inlinings more difficult. 
Throws are also generally the slow path.\n[[noreturn]] inline ANKERL_UNORDERED_DENSE_NOINLINE void on_error_key_not_found() {\n    throw std::out_of_range(\"ankerl::unordered_dense::map::at(): key not found\");\n}\n[[noreturn]] inline ANKERL_UNORDERED_DENSE_NOINLINE void on_error_bucket_overflow() {\n    throw std::overflow_error(\"ankerl::unordered_dense: reached max bucket size, cannot increase size\");\n}\n[[noreturn]] inline ANKERL_UNORDERED_DENSE_NOINLINE void on_error_too_many_elements() {\n    throw std::out_of_range(\"ankerl::unordered_dense::map::replace(): too many elements\");\n}\n\n#    else\n\n[[noreturn]] inline void on_error_key_not_found() {\n    abort();\n}\n[[noreturn]] inline void on_error_bucket_overflow() {\n    abort();\n}\n[[noreturn]] inline void on_error_too_many_elements() {\n    abort();\n}\n\n#    endif\n\n} // namespace detail\n\n// hash ///////////////////////////////////////////////////////////////////////\n\n// This is a stripped-down implementation of wyhash: https://github.com/wangyi-fudan/wyhash\n// No big-endian support (because different values on different machines don't matter),\n// hardcodes seed and the secret, reformats the code, and clang-tidy fixes.\nnamespace detail::wyhash {\n\ninline void mum(uint64_t* a, uint64_t* b) {\n#    if defined(__SIZEOF_INT128__)\n    __uint128_t r = *a;\n    r *= *b;\n    *a = static_cast<uint64_t>(r);\n    *b = static_cast<uint64_t>(r >> 64U);\n#    elif defined(_MSC_VER) && defined(_M_X64)\n    *a = _umul128(*a, *b, b);\n#    else\n    uint64_t ha = *a >> 32U;\n    uint64_t hb = *b >> 32U;\n    uint64_t la = static_cast<uint32_t>(*a);\n    uint64_t lb = static_cast<uint32_t>(*b);\n    uint64_t hi{};\n    uint64_t lo{};\n    uint64_t rh = ha * hb;\n    uint64_t rm0 = ha * lb;\n    uint64_t rm1 = hb * la;\n    uint64_t rl = la * lb;\n    uint64_t t = rl + (rm0 << 32U);\n    auto c = static_cast<uint64_t>(t < rl);\n    lo = t + (rm1 << 32U);\n    c += static_cast<uint64_t>(lo < t);\n    hi = 
rh + (rm0 >> 32U) + (rm1 >> 32U) + c;\n    *a = lo;\n    *b = hi;\n#    endif\n}\n\n// multiply and xor mix function, aka MUM\n[[nodiscard]] inline auto mix(uint64_t a, uint64_t b) -> uint64_t {\n    mum(&a, &b);\n    return a ^ b;\n}\n\n// read functions. WARNING: we don't care about endianness, so results are different on big endian!\n[[nodiscard]] inline auto r8(const uint8_t* p) -> uint64_t {\n    uint64_t v{};\n    std::memcpy(&v, p, 8U);\n    return v;\n}\n\n[[nodiscard]] inline auto r4(const uint8_t* p) -> uint64_t {\n    uint32_t v{};\n    std::memcpy(&v, p, 4);\n    return v;\n}\n\n// reads 1, 2, or 3 bytes\n[[nodiscard]] inline auto r3(const uint8_t* p, size_t k) -> uint64_t {\n    return (static_cast<uint64_t>(p[0]) << 16U) | (static_cast<uint64_t>(p[k >> 1U]) << 8U) | p[k - 1];\n}\n\n[[maybe_unused]] [[nodiscard]] inline auto hash(void const* key, size_t len) -> uint64_t {\n    static constexpr auto secret = std::array{UINT64_C(0xa0761d6478bd642f),\n                                              UINT64_C(0xe7037ed1a0b428db),\n                                              UINT64_C(0x8ebc6af09c88c6e3),\n                                              UINT64_C(0x589965cc75374cc3)};\n\n    auto const* p = static_cast<uint8_t const*>(key);\n    uint64_t seed = secret[0];\n    uint64_t a{};\n    uint64_t b{};\n    if (ANKERL_UNORDERED_DENSE_LIKELY(len <= 16)) {\n        if (ANKERL_UNORDERED_DENSE_LIKELY(len >= 4)) {\n            a = (r4(p) << 32U) | r4(p + ((len >> 3U) << 2U));\n            b = (r4(p + len - 4) << 32U) | r4(p + len - 4 - ((len >> 3U) << 2U));\n        } else if (ANKERL_UNORDERED_DENSE_LIKELY(len > 0)) {\n            a = r3(p, len);\n            b = 0;\n        } else {\n            a = 0;\n            b = 0;\n        }\n    } else {\n        size_t i = len;\n        if (ANKERL_UNORDERED_DENSE_UNLIKELY(i > 48)) {\n            uint64_t see1 = seed;\n            uint64_t see2 = seed;\n            do {\n                seed = mix(r8(p) ^ secret[1], 
r8(p + 8) ^ seed);\n                see1 = mix(r8(p + 16) ^ secret[2], r8(p + 24) ^ see1);\n                see2 = mix(r8(p + 32) ^ secret[3], r8(p + 40) ^ see2);\n                p += 48;\n                i -= 48;\n            } while (ANKERL_UNORDERED_DENSE_LIKELY(i > 48));\n            seed ^= see1 ^ see2;\n        }\n        while (ANKERL_UNORDERED_DENSE_UNLIKELY(i > 16)) {\n            seed = mix(r8(p) ^ secret[1], r8(p + 8) ^ seed);\n            i -= 16;\n            p += 16;\n        }\n        a = r8(p + i - 16);\n        b = r8(p + i - 8);\n    }\n\n    return mix(secret[1] ^ len, mix(a ^ secret[1], b ^ seed));\n}\n\n[[nodiscard]] inline auto hash(uint64_t x) -> uint64_t {\n    return detail::wyhash::mix(x, UINT64_C(0x9E3779B97F4A7C15));\n}\n\n} // namespace detail::wyhash\n\nANKERL_UNORDERED_DENSE_EXPORT template <typename T, typename Enable = void>\nstruct hash {\n    auto operator()(T const& obj) const noexcept(noexcept(std::declval<std::hash<T>>().operator()(std::declval<T const&>())))\n        -> uint64_t {\n        return std::hash<T>{}(obj);\n    }\n};\n\ntemplate <typename T>\nstruct hash<T, typename std::hash<T>::is_avalanching> {\n    using is_avalanching = void;\n    auto operator()(T const& obj) const noexcept(noexcept(std::declval<std::hash<T>>().operator()(std::declval<T const&>())))\n        -> uint64_t {\n        return std::hash<T>{}(obj);\n    }\n};\n\ntemplate <typename CharT>\nstruct hash<std::basic_string<CharT>> {\n    using is_avalanching = void;\n    auto operator()(std::basic_string<CharT> const& str) const noexcept -> uint64_t {\n        return detail::wyhash::hash(str.data(), sizeof(CharT) * str.size());\n    }\n};\n\ntemplate <typename CharT>\nstruct hash<std::basic_string_view<CharT>> {\n    using is_avalanching = void;\n    auto operator()(std::basic_string_view<CharT> const& sv) const noexcept -> uint64_t {\n        return detail::wyhash::hash(sv.data(), sizeof(CharT) * sv.size());\n    }\n};\n\ntemplate <class T>\nstruct 
hash<T*> {\n    using is_avalanching = void;\n    auto operator()(T* ptr) const noexcept -> uint64_t {\n        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)\n        return detail::wyhash::hash(reinterpret_cast<uintptr_t>(ptr));\n    }\n};\n\ntemplate <class T>\nstruct hash<std::unique_ptr<T>> {\n    using is_avalanching = void;\n    auto operator()(std::unique_ptr<T> const& ptr) const noexcept -> uint64_t {\n        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)\n        return detail::wyhash::hash(reinterpret_cast<uintptr_t>(ptr.get()));\n    }\n};\n\ntemplate <class T>\nstruct hash<std::shared_ptr<T>> {\n    using is_avalanching = void;\n    auto operator()(std::shared_ptr<T> const& ptr) const noexcept -> uint64_t {\n        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)\n        return detail::wyhash::hash(reinterpret_cast<uintptr_t>(ptr.get()));\n    }\n};\n\ntemplate <typename Enum>\nstruct hash<Enum, typename std::enable_if<std::is_enum<Enum>::value>::type> {\n    using is_avalanching = void;\n    auto operator()(Enum e) const noexcept -> uint64_t {\n        using underlying = typename std::underlying_type_t<Enum>;\n        return detail::wyhash::hash(static_cast<underlying>(e));\n    }\n};\n\ntemplate <typename... Args>\nstruct tuple_hash_helper {\n    // Converts the value into 64bit. If it is an integral type, just cast it. 
Mixing is doing the rest.\n    // If it isn't an integral we need to hash it.\n    template <typename Arg>\n    [[nodiscard]] constexpr static auto to64(Arg const& arg) -> uint64_t {\n        if constexpr (std::is_integral_v<Arg> || std::is_enum_v<Arg>) {\n            return static_cast<uint64_t>(arg);\n        } else {\n            return hash<Arg>{}(arg);\n        }\n    }\n\n    [[nodiscard]] static auto mix64(uint64_t state, uint64_t v) -> uint64_t {\n        return detail::wyhash::mix(state + v, uint64_t{0x9ddfea08eb382d69});\n    }\n\n    // Creates a buffer that holds all the data from each element of the tuple. If possible we memcpy the data directly. If\n    // not, we hash the object and use this for the array. Size of the array is known at compile time, and memcpy is optimized\n    // away, so filling the buffer is highly efficient. Finally, call wyhash with this buffer.\n    template <typename T, std::size_t... Idx>\n    [[nodiscard]] static auto calc_hash(T const& t, std::index_sequence<Idx...>) noexcept -> uint64_t {\n        auto h = uint64_t{};\n        ((h = mix64(h, to64(std::get<Idx>(t)))), ...);\n        return h;\n    }\n};\n\ntemplate <typename... 
Args>\nstruct hash<std::tuple<Args...>> : tuple_hash_helper<Args...> {\n    using is_avalanching = void;\n    auto operator()(std::tuple<Args...> const& t) const noexcept -> uint64_t {\n        return tuple_hash_helper<Args...>::calc_hash(t, std::index_sequence_for<Args...>{});\n    }\n};\n\ntemplate <typename A, typename B>\nstruct hash<std::pair<A, B>> : tuple_hash_helper<A, B> {\n    using is_avalanching = void;\n    auto operator()(std::pair<A, B> const& t) const noexcept -> uint64_t {\n        return tuple_hash_helper<A, B>::calc_hash(t, std::index_sequence_for<A, B>{});\n    }\n};\n\n// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)\n#    define ANKERL_UNORDERED_DENSE_HASH_STATICCAST(T)                    \\\n        template <>                                                      \\\n        struct hash<T> {                                                 \\\n            using is_avalanching = void;                                 \\\n            auto operator()(T const& obj) const noexcept -> uint64_t {   \\\n                return detail::wyhash::hash(static_cast<uint64_t>(obj)); \\\n            }                                                            \\\n        }\n\n#    if defined(__GNUC__) && !defined(__clang__)\n#        pragma GCC diagnostic push\n#        pragma GCC diagnostic ignored \"-Wuseless-cast\"\n#    endif\n// see https://en.cppreference.com/w/cpp/utility/hash\nANKERL_UNORDERED_DENSE_HASH_STATICCAST(bool);\nANKERL_UNORDERED_DENSE_HASH_STATICCAST(char);\nANKERL_UNORDERED_DENSE_HASH_STATICCAST(signed char);\nANKERL_UNORDERED_DENSE_HASH_STATICCAST(unsigned char);\n#    if ANKERL_UNORDERED_DENSE_CPP_VERSION >= 202002L && defined(__cpp_char8_t)\nANKERL_UNORDERED_DENSE_HASH_STATICCAST(char8_t);\n#    
endif\nANKERL_UNORDERED_DENSE_HASH_STATICCAST(char16_t);\nANKERL_UNORDERED_DENSE_HASH_STATICCAST(char32_t);\nANKERL_UNORDERED_DENSE_HASH_STATICCAST(wchar_t);\nANKERL_UNORDERED_DENSE_HASH_STATICCAST(short);\nANKERL_UNORDERED_DENSE_HASH_STATICCAST(unsigned short);\nANKERL_UNORDERED_DENSE_HASH_STATICCAST(int);\nANKERL_UNORDERED_DENSE_HASH_STATICCAST(unsigned int);\nANKERL_UNORDERED_DENSE_HASH_STATICCAST(long);\nANKERL_UNORDERED_DENSE_HASH_STATICCAST(long long);\nANKERL_UNORDERED_DENSE_HASH_STATICCAST(unsigned long);\nANKERL_UNORDERED_DENSE_HASH_STATICCAST(unsigned long long);\n\n#    if defined(__GNUC__) && !defined(__clang__)\n#        pragma GCC diagnostic pop\n#    endif\n\n// bucket_type //////////////////////////////////////////////////////////\n\nnamespace bucket_type {\n\nstruct standard {\n    static constexpr uint32_t dist_inc = 1U << 8U;             // skip 1 byte fingerprint\n    static constexpr uint32_t fingerprint_mask = dist_inc - 1; // mask for 1 byte of fingerprint\n\n    uint32_t m_dist_and_fingerprint; // upper 3 byte: distance to original bucket. lower byte: fingerprint from hash\n    uint32_t m_value_idx;            // index into the m_values vector.\n};\n\nANKERL_UNORDERED_DENSE_PACK(struct big {\n    static constexpr uint32_t dist_inc = 1U << 8U;             // skip 1 byte fingerprint\n    static constexpr uint32_t fingerprint_mask = dist_inc - 1; // mask for 1 byte of fingerprint\n\n    uint32_t m_dist_and_fingerprint; // upper 3 byte: distance to original bucket. lower byte: fingerprint from hash\n    size_t m_value_idx;              // index into the m_values vector.\n});\n\n} // namespace bucket_type\n\nnamespace detail {\n\nstruct nonesuch {};\nstruct default_container_t {};\n\ntemplate <class Default, class AlwaysVoid, template <class...> class Op, class... Args>\nstruct detector {\n    using value_t = std::false_type;\n    using type = Default;\n};\n\ntemplate <class Default, template <class...> class Op, class... 
Args>\nstruct detector<Default, std::void_t<Op<Args...>>, Op, Args...> {\n    using value_t = std::true_type;\n    using type = Op<Args...>;\n};\n\ntemplate <template <class...> class Op, class... Args>\nusing is_detected = typename detail::detector<detail::nonesuch, void, Op, Args...>::value_t;\n\ntemplate <template <class...> class Op, class... Args>\nconstexpr bool is_detected_v = is_detected<Op, Args...>::value;\n\ntemplate <typename T>\nusing detect_avalanching = typename T::is_avalanching;\n\ntemplate <typename T>\nusing detect_is_transparent = typename T::is_transparent;\n\ntemplate <typename T>\nusing detect_iterator = typename T::iterator;\n\ntemplate <typename T>\nusing detect_reserve = decltype(std::declval<T&>().reserve(size_t{}));\n\n// enable_if helpers\n\ntemplate <typename Mapped>\nconstexpr bool is_map_v = !std::is_void_v<Mapped>;\n\n// clang-format off\ntemplate <typename Hash, typename KeyEqual>\nconstexpr bool is_transparent_v = is_detected_v<detect_is_transparent, Hash> && is_detected_v<detect_is_transparent, KeyEqual>;\n// clang-format on\n\ntemplate <typename From, typename To1, typename To2>\nconstexpr bool is_neither_convertible_v = !std::is_convertible_v<From, To1> && !std::is_convertible_v<From, To2>;\n\ntemplate <typename T>\nconstexpr bool has_reserve = is_detected_v<detect_reserve, T>;\n\n// base type for map has mapped_type\ntemplate <class T>\nstruct base_table_type_map {\n    using mapped_type = T;\n};\n\n// base type for set doesn't have mapped_type\nstruct base_table_type_set {};\n\n} // namespace detail\n\n// Very much like std::deque, but faster for indexing (in most cases). As of now this doesn't implement the full std::vector\n// API, but merely what's necessary to work as an underlying container for ankerl::unordered_dense::{map, set}.\n// It allocates blocks of equal size and puts them into the m_blocks vector. 
That means it can grow simply by adding a new\n// block to the back of m_blocks, and doesn't double its size like an std::vector. The disadvantage is that memory is not\n// linear and thus there is one more indirection necessary for indexing.\ntemplate <typename T, typename Allocator = std::allocator<T>, size_t MaxSegmentSizeBytes = 4096>\nclass segmented_vector {\n    template <bool IsConst>\n    class iter_t;\n\npublic:\n    using allocator_type = Allocator;\n    using pointer = typename std::allocator_traits<allocator_type>::pointer;\n    using const_pointer = typename std::allocator_traits<allocator_type>::const_pointer;\n    using difference_type = typename std::allocator_traits<allocator_type>::difference_type;\n    using value_type = T;\n    using size_type = std::size_t;\n    using reference = T&;\n    using const_reference = T const&;\n    using iterator = iter_t<false>;\n    using const_iterator = iter_t<true>;\n\nprivate:\n    using vec_alloc = typename std::allocator_traits<Allocator>::template rebind_alloc<pointer>;\n    std::vector<pointer, vec_alloc> m_blocks{};\n    size_t m_size{};\n\n    // Calculates the maximum number for x in  (s << x) <= max_val\n    static constexpr auto num_bits_closest(size_t max_val, size_t s) -> size_t {\n        auto f = size_t{0};\n        while (s << (f + 1) <= max_val) {\n            ++f;\n        }\n        return f;\n    }\n\n    using self_t = segmented_vector<T, Allocator, MaxSegmentSizeBytes>;\n    static constexpr auto num_bits = num_bits_closest(MaxSegmentSizeBytes, sizeof(T));\n    static constexpr auto num_elements_in_block = 1U << num_bits;\n    static constexpr auto mask = num_elements_in_block - 1U;\n\n    /**\n     * Iterator class doubles as const_iterator and iterator\n     */\n    template <bool IsConst>\n    class iter_t {\n        using ptr_t = typename std::conditional_t<IsConst, segmented_vector::const_pointer const*, segmented_vector::pointer*>;\n        ptr_t m_data{};\n        size_t 
m_idx{};\n\n        template <bool B>\n        friend class iter_t;\n\n    public:\n        using difference_type = segmented_vector::difference_type;\n        using value_type = T;\n        using reference = typename std::conditional_t<IsConst, value_type const&, value_type&>;\n        using pointer = typename std::conditional_t<IsConst, segmented_vector::const_pointer, segmented_vector::pointer>;\n        using iterator_category = std::forward_iterator_tag;\n\n        iter_t() noexcept = default;\n\n        template <bool OtherIsConst, typename = typename std::enable_if<IsConst && !OtherIsConst>::type>\n        // NOLINTNEXTLINE(google-explicit-constructor,hicpp-explicit-conversions)\n        constexpr iter_t(iter_t<OtherIsConst> const& other) noexcept\n            : m_data(other.m_data)\n            , m_idx(other.m_idx) {}\n\n        constexpr iter_t(ptr_t data, size_t idx) noexcept\n            : m_data(data)\n            , m_idx(idx) {}\n\n        template <bool OtherIsConst, typename = typename std::enable_if<IsConst && !OtherIsConst>::type>\n        constexpr auto operator=(iter_t<OtherIsConst> const& other) noexcept -> iter_t& {\n            m_data = other.m_data;\n            m_idx = other.m_idx;\n            return *this;\n        }\n\n        constexpr auto operator++() noexcept -> iter_t& {\n            ++m_idx;\n            return *this;\n        }\n\n        constexpr auto operator++(int) noexcept -> iter_t {\n            iter_t prev(*this);\n            this->operator++();\n            return prev;\n        }\n\n        constexpr auto operator+(difference_type diff) noexcept -> iter_t {\n            return {m_data, static_cast<size_t>(static_cast<difference_type>(m_idx) + diff)};\n        }\n\n        template <bool OtherIsConst>\n        constexpr auto operator-(iter_t<OtherIsConst> const& other) noexcept -> difference_type {\n            return static_cast<difference_type>(m_idx) - static_cast<difference_type>(other.m_idx);\n        }\n\n        
constexpr auto operator*() const noexcept -> reference {\n            return m_data[m_idx >> num_bits][m_idx & mask];\n        }\n\n        constexpr auto operator->() const noexcept -> pointer {\n            return &m_data[m_idx >> num_bits][m_idx & mask];\n        }\n\n        template <bool O>\n        constexpr auto operator==(iter_t<O> const& o) const noexcept -> bool {\n            return m_idx == o.m_idx;\n        }\n\n        template <bool O>\n        constexpr auto operator!=(iter_t<O> const& o) const noexcept -> bool {\n            return !(*this == o);\n        }\n    };\n\n    // slow path: need to allocate a new segment every once in a while\n    void increase_capacity() {\n        auto ba = Allocator(m_blocks.get_allocator());\n        pointer block = std::allocator_traits<Allocator>::allocate(ba, num_elements_in_block);\n        m_blocks.push_back(block);\n    }\n\n    // Moves everything from other\n    void append_everything_from(segmented_vector&& other) {\n        reserve(size() + other.size());\n        for (auto&& o : other) {\n            emplace_back(std::move(o));\n        }\n    }\n\n    // Copies everything from other\n    void append_everything_from(segmented_vector const& other) {\n        reserve(size() + other.size());\n        for (auto const& o : other) {\n            emplace_back(o);\n        }\n    }\n\n    void dealloc() {\n        auto ba = Allocator(m_blocks.get_allocator());\n        for (auto ptr : m_blocks) {\n            std::allocator_traits<Allocator>::deallocate(ba, ptr, num_elements_in_block);\n        }\n    }\n\n    [[nodiscard]] static constexpr auto calc_num_blocks_for_capacity(size_t capacity) {\n        return (capacity + num_elements_in_block - 1U) / num_elements_in_block;\n    }\n\npublic:\n    segmented_vector() = default;\n\n    // NOLINTNEXTLINE(google-explicit-constructor,hicpp-explicit-conversions)\n    segmented_vector(Allocator alloc)\n        : m_blocks(vec_alloc(alloc)) {}\n\n    
segmented_vector(segmented_vector&& other, Allocator alloc)\n        : segmented_vector(alloc) {\n        *this = std::move(other);\n    }\n\n    segmented_vector(segmented_vector const& other, Allocator alloc)\n        : m_blocks(vec_alloc(alloc)) {\n        append_everything_from(other);\n    }\n\n    segmented_vector(segmented_vector&& other) noexcept\n        : segmented_vector(std::move(other), get_allocator()) {}\n\n    segmented_vector(segmented_vector const& other) {\n        append_everything_from(other);\n    }\n\n    auto operator=(segmented_vector const& other) -> segmented_vector& {\n        if (this == &other) {\n            return *this;\n        }\n        clear();\n        append_everything_from(other);\n        return *this;\n    }\n\n    auto operator=(segmented_vector&& other) noexcept -> segmented_vector& {\n        clear();\n        dealloc();\n        if (other.get_allocator() == get_allocator()) {\n            m_blocks = std::move(other.m_blocks);\n            m_size = std::exchange(other.m_size, {});\n        } else {\n            // make sure to construct with other's allocator!\n            m_blocks = std::vector<pointer, vec_alloc>(vec_alloc(other.get_allocator()));\n            append_everything_from(std::move(other));\n        }\n        return *this;\n    }\n\n    ~segmented_vector() {\n        clear();\n        dealloc();\n    }\n\n    [[nodiscard]] constexpr auto size() const -> size_t {\n        return m_size;\n    }\n\n    [[nodiscard]] constexpr auto capacity() const -> size_t {\n        return m_blocks.size() * num_elements_in_block;\n    }\n\n    // Indexing is highly performance critical\n    [[nodiscard]] constexpr auto operator[](size_t i) const noexcept -> T const& {\n        return m_blocks[i >> num_bits][i & mask];\n    }\n\n    [[nodiscard]] constexpr auto operator[](size_t i) noexcept -> T& {\n        return m_blocks[i >> num_bits][i & mask];\n    }\n\n    [[nodiscard]] constexpr auto begin() -> iterator {\n        
return {m_blocks.data(), 0U};\n    }\n    [[nodiscard]] constexpr auto begin() const -> const_iterator {\n        return {m_blocks.data(), 0U};\n    }\n    [[nodiscard]] constexpr auto cbegin() const -> const_iterator {\n        return {m_blocks.data(), 0U};\n    }\n\n    [[nodiscard]] constexpr auto end() -> iterator {\n        return {m_blocks.data(), m_size};\n    }\n    [[nodiscard]] constexpr auto end() const -> const_iterator {\n        return {m_blocks.data(), m_size};\n    }\n    [[nodiscard]] constexpr auto cend() const -> const_iterator {\n        return {m_blocks.data(), m_size};\n    }\n\n    [[nodiscard]] constexpr auto back() -> reference {\n        return operator[](m_size - 1);\n    }\n    [[nodiscard]] constexpr auto back() const -> const_reference {\n        return operator[](m_size - 1);\n    }\n\n    void pop_back() {\n        back().~T();\n        --m_size;\n    }\n\n    [[nodiscard]] auto empty() const {\n        return 0 == m_size;\n    }\n\n    void reserve(size_t new_capacity) {\n        m_blocks.reserve(calc_num_blocks_for_capacity(new_capacity));\n        while (new_capacity > capacity()) {\n            increase_capacity();\n        }\n    }\n\n    [[nodiscard]] auto get_allocator() const -> allocator_type {\n        return allocator_type{m_blocks.get_allocator()};\n    }\n\n    template <class... Args>\n    auto emplace_back(Args&&... 
args) -> reference {\n        if (m_size == capacity()) {\n            increase_capacity();\n        }\n        auto* ptr = static_cast<void*>(&operator[](m_size));\n        auto& ref = *new (ptr) T(std::forward<Args>(args)...);\n        ++m_size;\n        return ref;\n    }\n\n    void clear() {\n        if constexpr (!std::is_trivially_destructible_v<T>) {\n            for (size_t i = 0, s = size(); i < s; ++i) {\n                operator[](i).~T();\n            }\n        }\n        m_size = 0;\n    }\n\n    void shrink_to_fit() {\n        auto ba = Allocator(m_blocks.get_allocator());\n        auto num_blocks_required = calc_num_blocks_for_capacity(m_size);\n        while (m_blocks.size() > num_blocks_required) {\n            std::allocator_traits<Allocator>::deallocate(ba, m_blocks.back(), num_elements_in_block);\n            m_blocks.pop_back();\n        }\n        m_blocks.shrink_to_fit();\n    }\n};\n\nnamespace detail {\n\n// This is it, the table. Doubles as map and set, and uses `void` for T when it's used as a set.\ntemplate <class Key,\n          class T, // when void, treat it as a set.\n          class Hash,\n          class KeyEqual,\n          class AllocatorOrContainer,\n          class Bucket,\n          class BucketContainer,\n          bool IsSegmented>\nclass table : public std::conditional_t<is_map_v<T>, base_table_type_map<T>, base_table_type_set> {\n    using underlying_value_type = typename std::conditional_t<is_map_v<T>, std::pair<Key, T>, Key>;\n    using underlying_container_type = std::conditional_t<IsSegmented,\n                                                         segmented_vector<underlying_value_type, AllocatorOrContainer>,\n                                                         std::vector<underlying_value_type, AllocatorOrContainer>>;\n\npublic:\n    using value_container_type = std::\n        conditional_t<is_detected_v<detect_iterator, AllocatorOrContainer>, AllocatorOrContainer, underlying_container_type>;\n\nprivate:\n    
using bucket_alloc =\n        typename std::allocator_traits<typename value_container_type::allocator_type>::template rebind_alloc<Bucket>;\n    using default_bucket_container_type =\n        std::conditional_t<IsSegmented, segmented_vector<Bucket, bucket_alloc>, std::vector<Bucket, bucket_alloc>>;\n\n    using bucket_container_type = std::conditional_t<std::is_same_v<BucketContainer, detail::default_container_t>,\n                                                     default_bucket_container_type,\n                                                     BucketContainer>;\n\n    static constexpr uint8_t initial_shifts = 64 - 2; // 2^(64-m_shift) number of buckets\n    static constexpr float default_max_load_factor = 0.8F;\n\npublic:\n    using key_type = Key;\n    using value_type = typename value_container_type::value_type;\n    using size_type = typename value_container_type::size_type;\n    using difference_type = typename value_container_type::difference_type;\n    using hasher = Hash;\n    using key_equal = KeyEqual;\n    using allocator_type = typename value_container_type::allocator_type;\n    using reference = typename value_container_type::reference;\n    using const_reference = typename value_container_type::const_reference;\n    using pointer = typename value_container_type::pointer;\n    using const_pointer = typename value_container_type::const_pointer;\n    using const_iterator = typename value_container_type::const_iterator;\n    using iterator = std::conditional_t<is_map_v<T>, typename value_container_type::iterator, const_iterator>;\n    using bucket_type = Bucket;\n\nprivate:\n    using value_idx_type = decltype(Bucket::m_value_idx);\n    using dist_and_fingerprint_type = decltype(Bucket::m_dist_and_fingerprint);\n\n    static_assert(std::is_trivially_destructible_v<Bucket>, \"assert there's no need to call destructor / std::destroy\");\n    static_assert(std::is_trivially_copyable_v<Bucket>, \"assert we can just memset / memcpy\");\n\n    
value_container_type m_values{}; // Contains all the key-value pairs in one densely stored container. No holes.\n    bucket_container_type m_buckets{};\n    size_t m_max_bucket_capacity = 0;\n    float m_max_load_factor = default_max_load_factor;\n    Hash m_hash{};\n    KeyEqual m_equal{};\n    uint8_t m_shifts = initial_shifts;\n\n    [[nodiscard]] auto next(value_idx_type bucket_idx) const -> value_idx_type {\n        return ANKERL_UNORDERED_DENSE_UNLIKELY(bucket_idx + 1U == bucket_count())\n                   ? 0\n                   : static_cast<value_idx_type>(bucket_idx + 1U);\n    }\n\n    // Helper to access bucket through pointer types\n    [[nodiscard]] static constexpr auto at(bucket_container_type& bucket, size_t offset) -> Bucket& {\n        return bucket[offset];\n    }\n\n    [[nodiscard]] static constexpr auto at(const bucket_container_type& bucket, size_t offset) -> const Bucket& {\n        return bucket[offset];\n    }\n\n    // use the dist_inc and dist_dec functions so that uint16_t types work without warning\n    [[nodiscard]] static constexpr auto dist_inc(dist_and_fingerprint_type x) -> dist_and_fingerprint_type {\n        return static_cast<dist_and_fingerprint_type>(x + Bucket::dist_inc);\n    }\n\n    [[nodiscard]] static constexpr auto dist_dec(dist_and_fingerprint_type x) -> dist_and_fingerprint_type {\n        return static_cast<dist_and_fingerprint_type>(x - Bucket::dist_inc);\n    }\n\n    // The goal of mixed_hash is to always produce a high quality 64bit hash.\n    template <typename K>\n    [[nodiscard]] constexpr auto mixed_hash(K const& key) const -> uint64_t {\n        if constexpr (is_detected_v<detect_avalanching, Hash>) {\n            // we know that the hash is good because is_avalanching.\n            if constexpr (sizeof(decltype(m_hash(key))) < sizeof(uint64_t)) {\n                // 32bit hash and is_avalanching => multiply with a constant to avalanche bits upwards\n                return m_hash(key) * 
UINT64_C(0x9ddfea08eb382d69);\n            } else {\n                // 64bit and is_avalanching => only use the hash itself.\n                return m_hash(key);\n            }\n        } else {\n            // not is_avalanching => apply wyhash\n            return wyhash::hash(m_hash(key));\n        }\n    }\n\n    [[nodiscard]] constexpr auto dist_and_fingerprint_from_hash(uint64_t hash) const -> dist_and_fingerprint_type {\n        return Bucket::dist_inc | (static_cast<dist_and_fingerprint_type>(hash) & Bucket::fingerprint_mask);\n    }\n\n    [[nodiscard]] constexpr auto bucket_idx_from_hash(uint64_t hash) const -> value_idx_type {\n        return static_cast<value_idx_type>(hash >> m_shifts);\n    }\n\n    [[nodiscard]] static constexpr auto get_key(value_type const& vt) -> key_type const& {\n        if constexpr (is_map_v<T>) {\n            return vt.first;\n        } else {\n            return vt;\n        }\n    }\n\n    template <typename K>\n    [[nodiscard]] auto next_while_less(K const& key) const -> Bucket {\n        auto hash = mixed_hash(key);\n        auto dist_and_fingerprint = dist_and_fingerprint_from_hash(hash);\n        auto bucket_idx = bucket_idx_from_hash(hash);\n\n        while (dist_and_fingerprint < at(m_buckets, bucket_idx).m_dist_and_fingerprint) {\n            dist_and_fingerprint = dist_inc(dist_and_fingerprint);\n            bucket_idx = next(bucket_idx);\n        }\n        return {dist_and_fingerprint, bucket_idx};\n    }\n\n    void place_and_shift_up(Bucket bucket, value_idx_type place) {\n        while (0 != at(m_buckets, place).m_dist_and_fingerprint) {\n            bucket = std::exchange(at(m_buckets, place), bucket);\n            bucket.m_dist_and_fingerprint = dist_inc(bucket.m_dist_and_fingerprint);\n            place = next(place);\n        }\n        at(m_buckets, place) = bucket;\n    }\n\n    [[nodiscard]] static constexpr auto calc_num_buckets(uint8_t shifts) -> size_t {\n        return (std::min)(max_bucket_count(), 
size_t{1} << (64U - shifts));\n    }\n\n    [[nodiscard]] constexpr auto calc_shifts_for_size(size_t s) const -> uint8_t {\n        auto shifts = initial_shifts;\n        while (shifts > 0 && static_cast<size_t>(static_cast<float>(calc_num_buckets(shifts)) * max_load_factor()) < s) {\n            --shifts;\n        }\n        return shifts;\n    }\n\n    // assumes m_values has data, m_buckets=m_buckets_end=nullptr, m_shifts is INITIAL_SHIFTS\n    void copy_buckets(table const& other) {\n        // assumes m_values already has the correct data copied over.\n        if (empty()) {\n            // when empty, at least allocate the initial buckets and clear them.\n            allocate_buckets_from_shift();\n            clear_buckets();\n        } else {\n            m_shifts = other.m_shifts;\n            allocate_buckets_from_shift();\n            if constexpr (IsSegmented || !std::is_same_v<BucketContainer, default_container_t>) {\n                for (auto i = 0UL; i < bucket_count(); ++i) {\n                    at(m_buckets, i) = at(other.m_buckets, i);\n                }\n            } else {\n                std::memcpy(m_buckets.data(), other.m_buckets.data(), sizeof(Bucket) * bucket_count());\n            }\n        }\n    }\n\n    /**\n     * True when no element can be added any more without increasing the size\n     */\n    [[nodiscard]] auto is_full() const -> bool {\n        return size() > m_max_bucket_capacity;\n    }\n\n    void deallocate_buckets() {\n        m_buckets.clear();\n        m_buckets.shrink_to_fit();\n        m_max_bucket_capacity = 0;\n    }\n\n    void allocate_buckets_from_shift() {\n        auto num_buckets = calc_num_buckets(m_shifts);\n        if constexpr (IsSegmented || !std::is_same_v<BucketContainer, default_container_t>) {\n            if constexpr (has_reserve<bucket_container_type>) {\n                m_buckets.reserve(num_buckets);\n            }\n            for (size_t i = m_buckets.size(); i < num_buckets; ++i) {\n         
       m_buckets.emplace_back();\n            }\n        } else {\n            m_buckets.resize(num_buckets);\n        }\n        if (num_buckets == max_bucket_count()) {\n            // reached the maximum, make sure we can use each bucket\n            m_max_bucket_capacity = max_bucket_count();\n        } else {\n            m_max_bucket_capacity = static_cast<value_idx_type>(static_cast<float>(num_buckets) * max_load_factor());\n        }\n    }\n\n    void clear_buckets() {\n        if constexpr (IsSegmented || !std::is_same_v<BucketContainer, default_container_t>) {\n            for (auto&& e : m_buckets) {\n                std::memset(&e, 0, sizeof(e));\n            }\n        } else {\n            std::memset(m_buckets.data(), 0, sizeof(Bucket) * bucket_count());\n        }\n    }\n\n    void clear_and_fill_buckets_from_values() {\n        clear_buckets();\n        for (value_idx_type value_idx = 0, end_idx = static_cast<value_idx_type>(m_values.size()); value_idx < end_idx;\n             ++value_idx) {\n            auto const& key = get_key(m_values[value_idx]);\n            auto [dist_and_fingerprint, bucket] = next_while_less(key);\n\n            // we know for certain that key has not yet been inserted, so no need to check it.\n            place_and_shift_up({dist_and_fingerprint, value_idx}, bucket);\n        }\n    }\n\n    void increase_size() {\n        if (m_max_bucket_capacity == max_bucket_count()) {\n            // remove the value again, we can't add it!\n            m_values.pop_back();\n            on_error_bucket_overflow();\n        }\n        --m_shifts;\n        if constexpr (!IsSegmented || std::is_same_v<BucketContainer, default_container_t>) {\n            deallocate_buckets();\n        }\n        allocate_buckets_from_shift();\n        clear_and_fill_buckets_from_values();\n    }\n\n    template <typename Op>\n    void do_erase(value_idx_type bucket_idx, Op handle_erased_value) {\n        auto const value_idx_to_remove = at(m_buckets, 
bucket_idx).m_value_idx;\n\n        // shift down until either empty or an element with correct spot is found\n        auto next_bucket_idx = next(bucket_idx);\n        while (at(m_buckets, next_bucket_idx).m_dist_and_fingerprint >= Bucket::dist_inc * 2) {\n            at(m_buckets, bucket_idx) = {dist_dec(at(m_buckets, next_bucket_idx).m_dist_and_fingerprint),\n                                         at(m_buckets, next_bucket_idx).m_value_idx};\n            bucket_idx = std::exchange(next_bucket_idx, next(next_bucket_idx));\n        }\n        at(m_buckets, bucket_idx) = {};\n        handle_erased_value(std::move(m_values[value_idx_to_remove]));\n\n        // update m_values\n        if (value_idx_to_remove != m_values.size() - 1) {\n            // no luck, we'll have to replace the value with the last one and update the index accordingly\n            auto& val = m_values[value_idx_to_remove];\n            val = std::move(m_values.back());\n\n            // update the values_idx of the moved entry. 
No need to play the info game, just look until we find the values_idx\n            auto mh = mixed_hash(get_key(val));\n            bucket_idx = bucket_idx_from_hash(mh);\n\n            auto const values_idx_back = static_cast<value_idx_type>(m_values.size() - 1);\n            while (values_idx_back != at(m_buckets, bucket_idx).m_value_idx) {\n                bucket_idx = next(bucket_idx);\n            }\n            at(m_buckets, bucket_idx).m_value_idx = value_idx_to_remove;\n        }\n        m_values.pop_back();\n    }\n\n    template <typename K, typename Op>\n    auto do_erase_key(K&& key, Op handle_erased_value) -> size_t {\n        if (empty()) {\n            return 0;\n        }\n\n        auto [dist_and_fingerprint, bucket_idx] = next_while_less(key);\n\n        while (dist_and_fingerprint == at(m_buckets, bucket_idx).m_dist_and_fingerprint &&\n               !m_equal(key, get_key(m_values[at(m_buckets, bucket_idx).m_value_idx]))) {\n            dist_and_fingerprint = dist_inc(dist_and_fingerprint);\n            bucket_idx = next(bucket_idx);\n        }\n\n        if (dist_and_fingerprint != at(m_buckets, bucket_idx).m_dist_and_fingerprint) {\n            return 0;\n        }\n        do_erase(bucket_idx, handle_erased_value);\n        return 1;\n    }\n\n    template <class K, class M>\n    auto do_insert_or_assign(K&& key, M&& mapped) -> std::pair<iterator, bool> {\n        auto it_isinserted = try_emplace(std::forward<K>(key), std::forward<M>(mapped));\n        if (!it_isinserted.second) {\n            it_isinserted.first->second = std::forward<M>(mapped);\n        }\n        return it_isinserted;\n    }\n\n    template <typename... Args>\n    auto do_place_element(dist_and_fingerprint_type dist_and_fingerprint, value_idx_type bucket_idx, Args&&... args)\n        -> std::pair<iterator, bool> {\n\n        // emplace the new value. 
If that throws an exception, no harm done; index is still in a valid state\n        m_values.emplace_back(std::forward<Args>(args)...);\n\n        auto value_idx = static_cast<value_idx_type>(m_values.size() - 1);\n        if (ANKERL_UNORDERED_DENSE_UNLIKELY(is_full())) {\n            increase_size();\n        } else {\n            place_and_shift_up({dist_and_fingerprint, value_idx}, bucket_idx);\n        }\n\n        // place element and shift up until we find an empty spot\n        return {begin() + static_cast<difference_type>(value_idx), true};\n    }\n\n    template <typename K, typename... Args>\n    auto do_try_emplace(K&& key, Args&&... args) -> std::pair<iterator, bool> {\n        auto hash = mixed_hash(key);\n        auto dist_and_fingerprint = dist_and_fingerprint_from_hash(hash);\n        auto bucket_idx = bucket_idx_from_hash(hash);\n\n        while (true) {\n            auto* bucket = &at(m_buckets, bucket_idx);\n            if (dist_and_fingerprint == bucket->m_dist_and_fingerprint) {\n                if (m_equal(key, get_key(m_values[bucket->m_value_idx]))) {\n                    return {begin() + static_cast<difference_type>(bucket->m_value_idx), false};\n                }\n            } else if (dist_and_fingerprint > bucket->m_dist_and_fingerprint) {\n                return do_place_element(dist_and_fingerprint,\n                                        bucket_idx,\n                                        std::piecewise_construct,\n                                        std::forward_as_tuple(std::forward<K>(key)),\n                                        std::forward_as_tuple(std::forward<Args>(args)...));\n            }\n            dist_and_fingerprint = dist_inc(dist_and_fingerprint);\n            bucket_idx = next(bucket_idx);\n        }\n    }\n\n    template <typename K>\n    auto do_find(K const& key) -> iterator {\n        if (ANKERL_UNORDERED_DENSE_UNLIKELY(empty())) {\n            return end();\n        }\n\n        auto mh = 
mixed_hash(key);\n        auto dist_and_fingerprint = dist_and_fingerprint_from_hash(mh);\n        auto bucket_idx = bucket_idx_from_hash(mh);\n        auto* bucket = &at(m_buckets, bucket_idx);\n\n        // unrolled loop. *Always* check a few directly, then enter the loop. This is faster.\n        if (dist_and_fingerprint == bucket->m_dist_and_fingerprint && m_equal(key, get_key(m_values[bucket->m_value_idx]))) {\n            return begin() + static_cast<difference_type>(bucket->m_value_idx);\n        }\n        dist_and_fingerprint = dist_inc(dist_and_fingerprint);\n        bucket_idx = next(bucket_idx);\n        bucket = &at(m_buckets, bucket_idx);\n\n        if (dist_and_fingerprint == bucket->m_dist_and_fingerprint && m_equal(key, get_key(m_values[bucket->m_value_idx]))) {\n            return begin() + static_cast<difference_type>(bucket->m_value_idx);\n        }\n        dist_and_fingerprint = dist_inc(dist_and_fingerprint);\n        bucket_idx = next(bucket_idx);\n        bucket = &at(m_buckets, bucket_idx);\n\n        while (true) {\n            if (dist_and_fingerprint == bucket->m_dist_and_fingerprint) {\n                if (m_equal(key, get_key(m_values[bucket->m_value_idx]))) {\n                    return begin() + static_cast<difference_type>(bucket->m_value_idx);\n                }\n            } else if (dist_and_fingerprint > bucket->m_dist_and_fingerprint) {\n                return end();\n            }\n            dist_and_fingerprint = dist_inc(dist_and_fingerprint);\n            bucket_idx = next(bucket_idx);\n            bucket = &at(m_buckets, bucket_idx);\n        }\n    }\n\n    template <typename K>\n    auto do_find(K const& key) const -> const_iterator {\n        return const_cast<table*>(this)->do_find(key); // NOLINT(cppcoreguidelines-pro-type-const-cast)\n    }\n\n    template <typename K, typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>\n    auto do_at(K const& key) -> Q& {\n        if (auto it = find(key); 
ANKERL_UNORDERED_DENSE_LIKELY(end() != it)) {\n            return it->second;\n        }\n        on_error_key_not_found();\n    }\n\n    template <typename K, typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>\n    auto do_at(K const& key) const -> Q const& {\n        return const_cast<table*>(this)->at(key); // NOLINT(cppcoreguidelines-pro-type-const-cast)\n    }\n\npublic:\n    explicit table(size_t bucket_count,\n                   Hash const& hash = Hash(),\n                   KeyEqual const& equal = KeyEqual(),\n                   allocator_type const& alloc_or_container = allocator_type())\n        : m_values(alloc_or_container)\n        , m_buckets(alloc_or_container)\n        , m_hash(hash)\n        , m_equal(equal) {\n        if (0 != bucket_count) {\n            reserve(bucket_count);\n        } else {\n            allocate_buckets_from_shift();\n            clear_buckets();\n        }\n    }\n\n    table()\n        : table(0) {}\n\n    table(size_t bucket_count, allocator_type const& alloc)\n        : table(bucket_count, Hash(), KeyEqual(), alloc) {}\n\n    table(size_t bucket_count, Hash const& hash, allocator_type const& alloc)\n        : table(bucket_count, hash, KeyEqual(), alloc) {}\n\n    explicit table(allocator_type const& alloc)\n        : table(0, Hash(), KeyEqual(), alloc) {}\n\n    template <class InputIt>\n    table(InputIt first,\n          InputIt last,\n          size_type bucket_count = 0,\n          Hash const& hash = Hash(),\n          KeyEqual const& equal = KeyEqual(),\n          allocator_type const& alloc = allocator_type())\n        : table(bucket_count, hash, equal, alloc) {\n        insert(first, last);\n    }\n\n    template <class InputIt>\n    table(InputIt first, InputIt last, size_type bucket_count, allocator_type const& alloc)\n        : table(first, last, bucket_count, Hash(), KeyEqual(), alloc) {}\n\n    template <class InputIt>\n    table(InputIt first, InputIt last, size_type bucket_count, Hash const& hash, 
allocator_type const& alloc)\n        : table(first, last, bucket_count, hash, KeyEqual(), alloc) {}\n\n    table(table const& other)\n        : table(other, other.m_values.get_allocator()) {}\n\n    table(table const& other, allocator_type const& alloc)\n        : m_values(other.m_values, alloc)\n        , m_max_load_factor(other.m_max_load_factor)\n        , m_hash(other.m_hash)\n        , m_equal(other.m_equal) {\n        copy_buckets(other);\n    }\n\n    table(table&& other) noexcept\n        : table(std::move(other), other.m_values.get_allocator()) {}\n\n    table(table&& other, allocator_type const& alloc) noexcept\n        : m_values(alloc) {\n        *this = std::move(other);\n    }\n\n    table(std::initializer_list<value_type> ilist,\n          size_t bucket_count = 0,\n          Hash const& hash = Hash(),\n          KeyEqual const& equal = KeyEqual(),\n          allocator_type const& alloc = allocator_type())\n        : table(bucket_count, hash, equal, alloc) {\n        insert(ilist);\n    }\n\n    table(std::initializer_list<value_type> ilist, size_type bucket_count, allocator_type const& alloc)\n        : table(ilist, bucket_count, Hash(), KeyEqual(), alloc) {}\n\n    table(std::initializer_list<value_type> init, size_type bucket_count, Hash const& hash, allocator_type const& alloc)\n        : table(init, bucket_count, hash, KeyEqual(), alloc) {}\n\n    ~table() {}\n\n    auto operator=(table const& other) -> table& {\n        if (&other != this) {\n            deallocate_buckets(); // deallocate before m_values is set (might have another allocator)\n            m_values = other.m_values;\n            m_max_load_factor = other.m_max_load_factor;\n            m_hash = other.m_hash;\n            m_equal = other.m_equal;\n            m_shifts = initial_shifts;\n            copy_buckets(other);\n        }\n        return *this;\n    }\n\n    auto operator=(table&& other) noexcept(noexcept(std::is_nothrow_move_assignable_v<value_container_type> &&\n        
                                            std::is_nothrow_move_assignable_v<Hash> &&\n                                                    std::is_nothrow_move_assignable_v<KeyEqual>)) -> table& {\n        if (&other != this) {\n            deallocate_buckets(); // deallocate before m_values is set (might have another allocator)\n            m_values = std::move(other.m_values);\n            other.m_values.clear();\n\n            // we can only reuse m_buckets when both maps have the same allocator!\n            if (get_allocator() == other.get_allocator()) {\n                m_buckets = std::move(other.m_buckets);\n                other.m_buckets.clear();\n                m_max_bucket_capacity = std::exchange(other.m_max_bucket_capacity, 0);\n                m_shifts = std::exchange(other.m_shifts, initial_shifts);\n                m_max_load_factor = std::exchange(other.m_max_load_factor, default_max_load_factor);\n                m_hash = std::exchange(other.m_hash, {});\n                m_equal = std::exchange(other.m_equal, {});\n                other.allocate_buckets_from_shift();\n                other.clear_buckets();\n            } else {\n                // set max_load_factor *before* copying the other's buckets, so we have the same\n                // behavior\n                m_max_load_factor = other.m_max_load_factor;\n\n                // copy_buckets sets m_buckets, m_num_buckets, m_max_bucket_capacity, m_shifts\n                copy_buckets(other);\n                // clear's the other's buckets so other is now already usable.\n                other.clear_buckets();\n                m_hash = other.m_hash;\n                m_equal = other.m_equal;\n            }\n            // map \"other\" is now already usable, it's empty.\n        }\n        return *this;\n    }\n\n    auto operator=(std::initializer_list<value_type> ilist) -> table& {\n        clear();\n        insert(ilist);\n        return *this;\n    }\n\n    auto get_allocator() const 
noexcept -> allocator_type {\n        return m_values.get_allocator();\n    }\n\n    // iterators //////////////////////////////////////////////////////////////\n\n    auto begin() noexcept -> iterator {\n        return m_values.begin();\n    }\n\n    auto begin() const noexcept -> const_iterator {\n        return m_values.begin();\n    }\n\n    auto cbegin() const noexcept -> const_iterator {\n        return m_values.cbegin();\n    }\n\n    auto end() noexcept -> iterator {\n        return m_values.end();\n    }\n\n    auto cend() const noexcept -> const_iterator {\n        return m_values.cend();\n    }\n\n    auto end() const noexcept -> const_iterator {\n        return m_values.end();\n    }\n\n    // capacity ///////////////////////////////////////////////////////////////\n\n    [[nodiscard]] auto empty() const noexcept -> bool {\n        return m_values.empty();\n    }\n\n    [[nodiscard]] auto size() const noexcept -> size_t {\n        return m_values.size();\n    }\n\n    [[nodiscard]] static constexpr auto max_size() noexcept -> size_t {\n        if constexpr ((std::numeric_limits<value_idx_type>::max)() == (std::numeric_limits<size_t>::max)()) {\n            return size_t{1} << (sizeof(value_idx_type) * 8 - 1);\n        } else {\n            return size_t{1} << (sizeof(value_idx_type) * 8);\n        }\n    }\n\n    // modifiers //////////////////////////////////////////////////////////////\n\n    void clear() {\n        m_values.clear();\n        clear_buckets();\n    }\n\n    auto insert(value_type const& value) -> std::pair<iterator, bool> {\n        return emplace(value);\n    }\n\n    auto insert(value_type&& value) -> std::pair<iterator, bool> {\n        return emplace(std::move(value));\n    }\n\n    template <class P, std::enable_if_t<std::is_constructible_v<value_type, P&&>, bool> = true>\n    auto insert(P&& value) -> std::pair<iterator, bool> {\n        return emplace(std::forward<P>(value));\n    }\n\n    auto insert(const_iterator /*hint*/, 
value_type const& value) -> iterator {\n        return insert(value).first;\n    }\n\n    auto insert(const_iterator /*hint*/, value_type&& value) -> iterator {\n        return insert(std::move(value)).first;\n    }\n\n    template <class P, std::enable_if_t<std::is_constructible_v<value_type, P&&>, bool> = true>\n    auto insert(const_iterator /*hint*/, P&& value) -> iterator {\n        return insert(std::forward<P>(value)).first;\n    }\n\n    template <class InputIt>\n    void insert(InputIt first, InputIt last) {\n        while (first != last) {\n            insert(*first);\n            ++first;\n        }\n    }\n\n    void insert(std::initializer_list<value_type> ilist) {\n        insert(ilist.begin(), ilist.end());\n    }\n\n    // nonstandard API: *this is emptied.\n    // Also see \"A Standard flat_map\" https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/p0429r9.pdf\n    auto extract() && -> value_container_type {\n        return std::move(m_values);\n    }\n\n    // nonstandard API:\n    // Discards the internally held container and replaces it with the one passed. Erases non-unique elements.\n    auto replace(value_container_type&& container) {\n        if (ANKERL_UNORDERED_DENSE_UNLIKELY(container.size() > max_size())) {\n            on_error_too_many_elements();\n        }\n        auto shifts = calc_shifts_for_size(container.size());\n        if (0 == bucket_count() || shifts < m_shifts || container.get_allocator() != m_values.get_allocator()) {\n            m_shifts = shifts;\n            deallocate_buckets();\n            allocate_buckets_from_shift();\n        }\n        clear_buckets();\n\n        m_values = std::move(container);\n\n        // can't use clear_and_fill_buckets_from_values() because container elements might not be unique\n        auto value_idx = value_idx_type{};\n\n        // loop until we reach the end of the container. 
duplicated entries will be replaced with back().\n        while (value_idx != static_cast<value_idx_type>(m_values.size())) {\n            auto const& key = get_key(m_values[value_idx]);\n\n            auto hash = mixed_hash(key);\n            auto dist_and_fingerprint = dist_and_fingerprint_from_hash(hash);\n            auto bucket_idx = bucket_idx_from_hash(hash);\n\n            bool key_found = false;\n            while (true) {\n                auto const& bucket = at(m_buckets, bucket_idx);\n                if (dist_and_fingerprint > bucket.m_dist_and_fingerprint) {\n                    break;\n                }\n                if (dist_and_fingerprint == bucket.m_dist_and_fingerprint &&\n                    m_equal(key, get_key(m_values[bucket.m_value_idx]))) {\n                    key_found = true;\n                    break;\n                }\n                dist_and_fingerprint = dist_inc(dist_and_fingerprint);\n                bucket_idx = next(bucket_idx);\n            }\n\n            if (key_found) {\n                if (value_idx != static_cast<value_idx_type>(m_values.size() - 1)) {\n                    m_values[value_idx] = std::move(m_values.back());\n                }\n                m_values.pop_back();\n            } else {\n                place_and_shift_up({dist_and_fingerprint, value_idx}, bucket_idx);\n                ++value_idx;\n            }\n        }\n    }\n\n    template <class M, typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>\n    auto insert_or_assign(Key const& key, M&& mapped) -> std::pair<iterator, bool> {\n        return do_insert_or_assign(key, std::forward<M>(mapped));\n    }\n\n    template <class M, typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>\n    auto insert_or_assign(Key&& key, M&& mapped) -> std::pair<iterator, bool> {\n        return do_insert_or_assign(std::move(key), std::forward<M>(mapped));\n    }\n\n    template <typename K,\n              typename M,\n              typename Q = 
T,\n              typename H = Hash,\n              typename KE = KeyEqual,\n              std::enable_if_t<is_map_v<Q> && is_transparent_v<H, KE>, bool> = true>\n    auto insert_or_assign(K&& key, M&& mapped) -> std::pair<iterator, bool> {\n        return do_insert_or_assign(std::forward<K>(key), std::forward<M>(mapped));\n    }\n\n    template <class M, typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>\n    auto insert_or_assign(const_iterator /*hint*/, Key const& key, M&& mapped) -> iterator {\n        return do_insert_or_assign(key, std::forward<M>(mapped)).first;\n    }\n\n    template <class M, typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>\n    auto insert_or_assign(const_iterator /*hint*/, Key&& key, M&& mapped) -> iterator {\n        return do_insert_or_assign(std::move(key), std::forward<M>(mapped)).first;\n    }\n\n    template <typename K,\n              typename M,\n              typename Q = T,\n              typename H = Hash,\n              typename KE = KeyEqual,\n              std::enable_if_t<is_map_v<Q> && is_transparent_v<H, KE>, bool> = true>\n    auto insert_or_assign(const_iterator /*hint*/, K&& key, M&& mapped) -> iterator {\n        return do_insert_or_assign(std::forward<K>(key), std::forward<M>(mapped)).first;\n    }\n\n    // Single arguments for unordered_set can be used without having to construct the value_type\n    template <class K,\n              typename Q = T,\n              typename H = Hash,\n              typename KE = KeyEqual,\n              std::enable_if_t<!is_map_v<Q> && is_transparent_v<H, KE>, bool> = true>\n    auto emplace(K&& key) -> std::pair<iterator, bool> {\n        auto hash = mixed_hash(key);\n        auto dist_and_fingerprint = dist_and_fingerprint_from_hash(hash);\n        auto bucket_idx = bucket_idx_from_hash(hash);\n\n        while (dist_and_fingerprint <= at(m_buckets, bucket_idx).m_dist_and_fingerprint) {\n            if (dist_and_fingerprint == at(m_buckets, 
bucket_idx).m_dist_and_fingerprint &&\n                m_equal(key, m_values[at(m_buckets, bucket_idx).m_value_idx])) {\n                // found it, return without ever actually creating anything\n                return {begin() + static_cast<difference_type>(at(m_buckets, bucket_idx).m_value_idx), false};\n            }\n            dist_and_fingerprint = dist_inc(dist_and_fingerprint);\n            bucket_idx = next(bucket_idx);\n        }\n\n        // value is new, insert element first, so when exception happens we are in a valid state\n        return do_place_element(dist_and_fingerprint, bucket_idx, std::forward<K>(key));\n    }\n\n    template <class... Args>\n    auto emplace(Args&&... args) -> std::pair<iterator, bool> {\n        // we have to instantiate the value_type to be able to access the key.\n        // 1. emplace_back the object so it is constructed. 2. If the key is already there, pop it later in the loop.\n        auto& key = get_key(m_values.emplace_back(std::forward<Args>(args)...));\n        auto hash = mixed_hash(key);\n        auto dist_and_fingerprint = dist_and_fingerprint_from_hash(hash);\n        auto bucket_idx = bucket_idx_from_hash(hash);\n\n        while (dist_and_fingerprint <= at(m_buckets, bucket_idx).m_dist_and_fingerprint) {\n            if (dist_and_fingerprint == at(m_buckets, bucket_idx).m_dist_and_fingerprint &&\n                m_equal(key, get_key(m_values[at(m_buckets, bucket_idx).m_value_idx]))) {\n                m_values.pop_back(); // value was already there, so get rid of it\n                return {begin() + static_cast<difference_type>(at(m_buckets, bucket_idx).m_value_idx), false};\n            }\n            dist_and_fingerprint = dist_inc(dist_and_fingerprint);\n            bucket_idx = next(bucket_idx);\n        }\n\n        // value is new, place the bucket and shift up until we find an empty spot\n        auto value_idx = static_cast<value_idx_type>(m_values.size() - 1);\n        if 
(ANKERL_UNORDERED_DENSE_UNLIKELY(is_full())) {\n            // increase_size just rehashes all the data we have in m_values\n            increase_size();\n        } else {\n            // place element and shift up until we find an empty spot\n            place_and_shift_up({dist_and_fingerprint, value_idx}, bucket_idx);\n        }\n        return {begin() + static_cast<difference_type>(value_idx), true};\n    }\n\n    template <class... Args>\n    auto emplace_hint(const_iterator /*hint*/, Args&&... args) -> iterator {\n        return emplace(std::forward<Args>(args)...).first;\n    }\n\n    template <class... Args, typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>\n    auto try_emplace(Key const& key, Args&&... args) -> std::pair<iterator, bool> {\n        return do_try_emplace(key, std::forward<Args>(args)...);\n    }\n\n    template <class... Args, typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>\n    auto try_emplace(Key&& key, Args&&... args) -> std::pair<iterator, bool> {\n        return do_try_emplace(std::move(key), std::forward<Args>(args)...);\n    }\n\n    template <class... Args, typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>\n    auto try_emplace(const_iterator /*hint*/, Key const& key, Args&&... args) -> iterator {\n        return do_try_emplace(key, std::forward<Args>(args)...).first;\n    }\n\n    template <class... Args, typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>\n    auto try_emplace(const_iterator /*hint*/, Key&& key, Args&&... args) -> iterator {\n        return do_try_emplace(std::move(key), std::forward<Args>(args)...).first;\n    }\n\n    template <\n        typename K,\n        typename... Args,\n        typename Q = T,\n        typename H = Hash,\n        typename KE = KeyEqual,\n        std::enable_if_t<is_map_v<Q> && is_transparent_v<H, KE> && is_neither_convertible_v<K&&, iterator, const_iterator>,\n                         bool> = true>\n    auto try_emplace(K&& key, Args&&... 
args) -> std::pair<iterator, bool> {\n        return do_try_emplace(std::forward<K>(key), std::forward<Args>(args)...);\n    }\n\n    template <\n        typename K,\n        typename... Args,\n        typename Q = T,\n        typename H = Hash,\n        typename KE = KeyEqual,\n        std::enable_if_t<is_map_v<Q> && is_transparent_v<H, KE> && is_neither_convertible_v<K&&, iterator, const_iterator>,\n                         bool> = true>\n    auto try_emplace(const_iterator /*hint*/, K&& key, Args&&... args) -> iterator {\n        return do_try_emplace(std::forward<K>(key), std::forward<Args>(args)...).first;\n    }\n\n    auto erase(iterator it) -> iterator {\n        auto hash = mixed_hash(get_key(*it));\n        auto bucket_idx = bucket_idx_from_hash(hash);\n\n        auto const value_idx_to_remove = static_cast<value_idx_type>(it - cbegin());\n        while (at(m_buckets, bucket_idx).m_value_idx != value_idx_to_remove) {\n            bucket_idx = next(bucket_idx);\n        }\n\n        do_erase(bucket_idx, [](value_type&& /*unused*/) {\n        });\n        return begin() + static_cast<difference_type>(value_idx_to_remove);\n    }\n\n    auto extract(iterator it) -> value_type {\n        auto hash = mixed_hash(get_key(*it));\n        auto bucket_idx = bucket_idx_from_hash(hash);\n\n        auto const value_idx_to_remove = static_cast<value_idx_type>(it - cbegin());\n        while (at(m_buckets, bucket_idx).m_value_idx != value_idx_to_remove) {\n            bucket_idx = next(bucket_idx);\n        }\n\n        auto tmp = std::optional<value_type>{};\n        do_erase(bucket_idx, [&tmp](value_type&& val) {\n            tmp = std::move(val);\n        });\n        return std::move(tmp).value();\n    }\n\n    template <typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>\n    auto erase(const_iterator it) -> iterator {\n        return erase(begin() + (it - cbegin()));\n    }\n\n    template <typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>\n    
auto extract(const_iterator it) -> value_type {\n        return extract(begin() + (it - cbegin()));\n    }\n\n    auto erase(const_iterator first, const_iterator last) -> iterator {\n        auto const idx_first = first - cbegin();\n        auto const idx_last = last - cbegin();\n        auto const first_to_last = std::distance(first, last);\n        auto const last_to_end = std::distance(last, cend());\n\n        // remove elements from left to right which moves elements from the end back\n        auto const mid = idx_first + (std::min)(first_to_last, last_to_end);\n        auto idx = idx_first;\n        while (idx != mid) {\n            erase(begin() + idx);\n            ++idx;\n        }\n\n        // all elements from the right are moved, now remove the last element until all done\n        idx = idx_last;\n        while (idx != mid) {\n            --idx;\n            erase(begin() + idx);\n        }\n\n        return begin() + idx_first;\n    }\n\n    auto erase(Key const& key) -> size_t {\n        return do_erase_key(key, [](value_type&& /*unused*/) {\n        });\n    }\n\n    auto extract(Key const& key) -> std::optional<value_type> {\n        auto tmp = std::optional<value_type>{};\n        do_erase_key(key, [&tmp](value_type&& val) {\n            tmp = std::move(val);\n        });\n        return tmp;\n    }\n\n    template <class K, class H = Hash, class KE = KeyEqual, std::enable_if_t<is_transparent_v<H, KE>, bool> = true>\n    auto erase(K&& key) -> size_t {\n        return do_erase_key(std::forward<K>(key), [](value_type&& /*unused*/) {\n        });\n    }\n\n    template <class K, class H = Hash, class KE = KeyEqual, std::enable_if_t<is_transparent_v<H, KE>, bool> = true>\n    auto extract(K&& key) -> std::optional<value_type> {\n        auto tmp = std::optional<value_type>{};\n        do_erase_key(std::forward<K>(key), [&tmp](value_type&& val) {\n            tmp = std::move(val);\n        });\n        return tmp;\n    }\n\n    void swap(table& other) 
noexcept(noexcept(std::is_nothrow_swappable_v<value_container_type> &&\n                                              std::is_nothrow_swappable_v<Hash> && std::is_nothrow_swappable_v<KeyEqual>)) {\n        using std::swap;\n        swap(other, *this);\n    }\n\n    // lookup /////////////////////////////////////////////////////////////////\n\n    template <typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>\n    auto at(key_type const& key) -> Q& {\n        return do_at(key);\n    }\n\n    template <typename K,\n              typename Q = T,\n              typename H = Hash,\n              typename KE = KeyEqual,\n              std::enable_if_t<is_map_v<Q> && is_transparent_v<H, KE>, bool> = true>\n    auto at(K const& key) -> Q& {\n        return do_at(key);\n    }\n\n    template <typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>\n    auto at(key_type const& key) const -> Q const& {\n        return do_at(key);\n    }\n\n    template <typename K,\n              typename Q = T,\n              typename H = Hash,\n              typename KE = KeyEqual,\n              std::enable_if_t<is_map_v<Q> && is_transparent_v<H, KE>, bool> = true>\n    auto at(K const& key) const -> Q const& {\n        return do_at(key);\n    }\n\n    template <typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>\n    auto operator[](Key const& key) -> Q& {\n        return try_emplace(key).first->second;\n    }\n\n    template <typename Q = T, std::enable_if_t<is_map_v<Q>, bool> = true>\n    auto operator[](Key&& key) -> Q& {\n        return try_emplace(std::move(key)).first->second;\n    }\n\n    template <typename K,\n              typename Q = T,\n              typename H = Hash,\n              typename KE = KeyEqual,\n              std::enable_if_t<is_map_v<Q> && is_transparent_v<H, KE>, bool> = true>\n    auto operator[](K&& key) -> Q& {\n        return try_emplace(std::forward<K>(key)).first->second;\n    }\n\n    auto count(Key const& key) const -> size_t {\n      
  return find(key) == end() ? 0 : 1;\n    }\n\n    template <class K, class H = Hash, class KE = KeyEqual, std::enable_if_t<is_transparent_v<H, KE>, bool> = true>\n    auto count(K const& key) const -> size_t {\n        return find(key) == end() ? 0 : 1;\n    }\n\n    auto find(Key const& key) -> iterator {\n        return do_find(key);\n    }\n\n    auto find(Key const& key) const -> const_iterator {\n        return do_find(key);\n    }\n\n    template <class K, class H = Hash, class KE = KeyEqual, std::enable_if_t<is_transparent_v<H, KE>, bool> = true>\n    auto find(K const& key) -> iterator {\n        return do_find(key);\n    }\n\n    template <class K, class H = Hash, class KE = KeyEqual, std::enable_if_t<is_transparent_v<H, KE>, bool> = true>\n    auto find(K const& key) const -> const_iterator {\n        return do_find(key);\n    }\n\n    auto contains(Key const& key) const -> bool {\n        return find(key) != end();\n    }\n\n    template <class K, class H = Hash, class KE = KeyEqual, std::enable_if_t<is_transparent_v<H, KE>, bool> = true>\n    auto contains(K const& key) const -> bool {\n        return find(key) != end();\n    }\n\n    auto equal_range(Key const& key) -> std::pair<iterator, iterator> {\n        auto it = do_find(key);\n        return {it, it == end() ? end() : it + 1};\n    }\n\n    auto equal_range(const Key& key) const -> std::pair<const_iterator, const_iterator> {\n        auto it = do_find(key);\n        return {it, it == end() ? end() : it + 1};\n    }\n\n    template <class K, class H = Hash, class KE = KeyEqual, std::enable_if_t<is_transparent_v<H, KE>, bool> = true>\n    auto equal_range(K const& key) -> std::pair<iterator, iterator> {\n        auto it = do_find(key);\n        return {it, it == end() ? 
end() : it + 1};\n    }\n\n    template <class K, class H = Hash, class KE = KeyEqual, std::enable_if_t<is_transparent_v<H, KE>, bool> = true>\n    auto equal_range(K const& key) const -> std::pair<const_iterator, const_iterator> {\n        auto it = do_find(key);\n        return {it, it == end() ? end() : it + 1};\n    }\n\n    // bucket interface ///////////////////////////////////////////////////////\n\n    auto bucket_count() const noexcept -> size_t { // NOLINT(modernize-use-nodiscard)\n        return m_buckets.size();\n    }\n\n    static constexpr auto max_bucket_count() noexcept -> size_t { // NOLINT(modernize-use-nodiscard)\n        return max_size();\n    }\n\n    // hash policy ////////////////////////////////////////////////////////////\n\n    [[nodiscard]] auto load_factor() const -> float {\n        return bucket_count() ? static_cast<float>(size()) / static_cast<float>(bucket_count()) : 0.0F;\n    }\n\n    [[nodiscard]] auto max_load_factor() const -> float {\n        return m_max_load_factor;\n    }\n\n    void max_load_factor(float ml) {\n        m_max_load_factor = ml;\n        if (bucket_count() != max_bucket_count()) {\n            m_max_bucket_capacity = static_cast<value_idx_type>(static_cast<float>(bucket_count()) * max_load_factor());\n        }\n    }\n\n    void rehash(size_t count) {\n        count = (std::min)(count, max_size());\n        auto shifts = calc_shifts_for_size((std::max)(count, size()));\n        if (shifts != m_shifts) {\n            m_shifts = shifts;\n            deallocate_buckets();\n            m_values.shrink_to_fit();\n            allocate_buckets_from_shift();\n            clear_and_fill_buckets_from_values();\n        }\n    }\n\n    void reserve(size_t capa) {\n        capa = (std::min)(capa, max_size());\n        if constexpr (has_reserve<value_container_type>) {\n            // std::deque doesn't have reserve(). 
Make sure we only call when available\n            m_values.reserve(capa);\n        }\n        auto shifts = calc_shifts_for_size((std::max)(capa, size()));\n        if (0 == bucket_count() || shifts < m_shifts) {\n            m_shifts = shifts;\n            deallocate_buckets();\n            allocate_buckets_from_shift();\n            clear_and_fill_buckets_from_values();\n        }\n    }\n\n    // observers //////////////////////////////////////////////////////////////\n\n    auto hash_function() const -> hasher {\n        return m_hash;\n    }\n\n    auto key_eq() const -> key_equal {\n        return m_equal;\n    }\n\n    // nonstandard API: expose the underlying values container\n    [[nodiscard]] auto values() const noexcept -> value_container_type const& {\n        return m_values;\n    }\n\n    // non-member functions ///////////////////////////////////////////////////\n\n    friend auto operator==(table const& a, table const& b) -> bool {\n        if (&a == &b) {\n            return true;\n        }\n        if (a.size() != b.size()) {\n            return false;\n        }\n        for (auto const& b_entry : b) {\n            auto it = a.find(get_key(b_entry));\n            if constexpr (is_map_v<T>) {\n                // map: check that key is here, then also check that value is the same\n                if (a.end() == it || !(b_entry.second == it->second)) {\n                    return false;\n                }\n            } else {\n                // set: only check that the key is here\n                if (a.end() == it) {\n                    return false;\n                }\n            }\n        }\n        return true;\n    }\n\n    friend auto operator!=(table const& a, table const& b) -> bool {\n        return !(a == b);\n    }\n};\n\n} // namespace detail\n\nANKERL_UNORDERED_DENSE_EXPORT template <class Key,\n                                        class T,\n                                        class Hash = hash<Key>,\n                      
                  class KeyEqual = std::equal_to<Key>,\n                                        class AllocatorOrContainer = std::allocator<std::pair<Key, T>>,\n                                        class Bucket = bucket_type::standard,\n                                        class BucketContainer = detail::default_container_t>\nusing map = detail::table<Key, T, Hash, KeyEqual, AllocatorOrContainer, Bucket, BucketContainer, false>;\n\nANKERL_UNORDERED_DENSE_EXPORT template <class Key,\n                                        class T,\n                                        class Hash = hash<Key>,\n                                        class KeyEqual = std::equal_to<Key>,\n                                        class AllocatorOrContainer = std::allocator<std::pair<Key, T>>,\n                                        class Bucket = bucket_type::standard,\n                                        class BucketContainer = detail::default_container_t>\nusing segmented_map = detail::table<Key, T, Hash, KeyEqual, AllocatorOrContainer, Bucket, BucketContainer, true>;\n\nANKERL_UNORDERED_DENSE_EXPORT template <class Key,\n                                        class Hash = hash<Key>,\n                                        class KeyEqual = std::equal_to<Key>,\n                                        class AllocatorOrContainer = std::allocator<Key>,\n                                        class Bucket = bucket_type::standard,\n                                        class BucketContainer = detail::default_container_t>\nusing set = detail::table<Key, void, Hash, KeyEqual, AllocatorOrContainer, Bucket, BucketContainer, false>;\n\nANKERL_UNORDERED_DENSE_EXPORT template <class Key,\n                                        class Hash = hash<Key>,\n                                        class KeyEqual = std::equal_to<Key>,\n                                        class AllocatorOrContainer = std::allocator<Key>,\n                                        class Bucket = 
bucket_type::standard,\n                                        class BucketContainer = detail::default_container_t>\nusing segmented_set = detail::table<Key, void, Hash, KeyEqual, AllocatorOrContainer, Bucket, BucketContainer, true>;\n\n#    if defined(ANKERL_UNORDERED_DENSE_PMR)\n\nnamespace pmr {\n\nANKERL_UNORDERED_DENSE_EXPORT template <class Key,\n                                        class T,\n                                        class Hash = hash<Key>,\n                                        class KeyEqual = std::equal_to<Key>,\n                                        class Bucket = bucket_type::standard>\nusing map = detail::table<Key,\n                          T,\n                          Hash,\n                          KeyEqual,\n                          ANKERL_UNORDERED_DENSE_PMR::polymorphic_allocator<std::pair<Key, T>>,\n                          Bucket,\n                          detail::default_container_t,\n                          false>;\n\nANKERL_UNORDERED_DENSE_EXPORT template <class Key,\n                                        class T,\n                                        class Hash = hash<Key>,\n                                        class KeyEqual = std::equal_to<Key>,\n                                        class Bucket = bucket_type::standard>\nusing segmented_map = detail::table<Key,\n                                    T,\n                                    Hash,\n                                    KeyEqual,\n                                    ANKERL_UNORDERED_DENSE_PMR::polymorphic_allocator<std::pair<Key, T>>,\n                                    Bucket,\n                                    detail::default_container_t,\n                                    true>;\n\nANKERL_UNORDERED_DENSE_EXPORT template <class Key,\n                                        class Hash = hash<Key>,\n                                        class KeyEqual = std::equal_to<Key>,\n                                        class Bucket = 
bucket_type::standard>\nusing set = detail::table<Key,\n                          void,\n                          Hash,\n                          KeyEqual,\n                          ANKERL_UNORDERED_DENSE_PMR::polymorphic_allocator<Key>,\n                          Bucket,\n                          detail::default_container_t,\n                          false>;\n\nANKERL_UNORDERED_DENSE_EXPORT template <class Key,\n                                        class Hash = hash<Key>,\n                                        class KeyEqual = std::equal_to<Key>,\n                                        class Bucket = bucket_type::standard>\nusing segmented_set = detail::table<Key,\n                                    void,\n                                    Hash,\n                                    KeyEqual,\n                                    ANKERL_UNORDERED_DENSE_PMR::polymorphic_allocator<Key>,\n                                    Bucket,\n                                    detail::default_container_t,\n                                    true>;\n\n} // namespace pmr\n\n#    endif\n\n// deduction guides ///////////////////////////////////////////////////////////\n\n// deduction guides for alias templates are only possible since C++20\n// see https://en.cppreference.com/w/cpp/language/class_template_argument_deduction\n\n} // namespace ANKERL_UNORDERED_DENSE_NAMESPACE\n} // namespace ankerl::unordered_dense\n\n// std extensions /////////////////////////////////////////////////////////////\n\nnamespace std { // NOLINT(cert-dcl58-cpp)\n\nANKERL_UNORDERED_DENSE_EXPORT template <class Key,\n                                        class T,\n                                        class Hash,\n                                        class KeyEqual,\n                                        class AllocatorOrContainer,\n                                        class Bucket,\n                                        class Pred,\n                                        class 
BucketContainer,\n                                        bool IsSegmented>\n// NOLINTNEXTLINE(cert-dcl58-cpp)\nauto erase_if(\n    ankerl::unordered_dense::detail::table<Key, T, Hash, KeyEqual, AllocatorOrContainer, Bucket, BucketContainer, IsSegmented>&\n        map,\n    Pred pred) -> size_t {\n    using map_t = ankerl::unordered_dense::detail::\n        table<Key, T, Hash, KeyEqual, AllocatorOrContainer, Bucket, BucketContainer, IsSegmented>;\n\n    // going back to front because erase() invalidates the end iterator\n    auto const old_size = map.size();\n    auto idx = old_size;\n    while (idx) {\n        --idx;\n        auto it = map.begin() + static_cast<typename map_t::difference_type>(idx);\n        if (pred(*it)) {\n            map.erase(it);\n        }\n    }\n\n    return old_size - map.size();\n}\n\n} // namespace std\n\n#endif\n#endif\n"
  },
  {
    "path": "kimimaro/__init__.py",
    "content": "\"\"\"\nKimimaro: TEASAR derived skeletonization for 3D densely labeled images.\n\nKimimaro is free software: you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation, either version 3 of the License, or\n(at your option) any later version.\n\nKimimaro is distributed in the hope that it will be useful,\nbut WITHOUT ANY WARRANTY; without even the implied warranty of\nMERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\nGNU General Public License for more details.\n\nYou should have received a copy of the GNU General Public License\nalong with Kimimaro.  If not, see <https://www.gnu.org/licenses/>.\n\"\"\"\n\nfrom .intake import skeletonize, DimensionError, synapses_to_targets, connect_points\nfrom .post import postprocess, join_close_components\nfrom .utility import (\n\textract_skeleton_from_binary_image,\n\tcross_sectional_area, \n\tcross_sectional_area_single,\n\toversegment,\n)\n"
  },
  {
    "path": "kimimaro/intake.py",
    "content": "\"\"\"\nThis file is part of Kimimaro.\n\nKimimaro is free software: you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation, either version 3 of the License, or\n(at your option) any later version.\n\nKimimaro is distributed in the hope that it will be useful,\nbut WITHOUT ANY WARRANTY; without even the implied warranty of\nMERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\nGNU General Public License for more details.\n\nYou should have received a copy of the GNU General Public License\nalong with Kimimaro.  If not, see <https://www.gnu.org/licenses/>.\n\"\"\"\n\nfrom collections import defaultdict\nfrom functools import partial\nimport gc\nimport multiprocessing as mp\nimport signal\nimport uuid\n\nimport numpy as np\nimport pathos.pools\nimport scipy.spatial\nfrom tqdm import tqdm\n\nfrom osteoid import Skeleton, Bbox\n\nimport cc3d # connected components\nfrom crackle import CrackleArray\nimport edt # euclidean distance transform\nimport fastremap\nimport fill_voids\n\nimport kimimaro.skeletontricks\nimport kimimaro.trace\n\nfrom . 
import sharedmemory as shm\nfrom .utility import compute_cc_labels, find_objects\n\nclass DimensionError(Exception):\n  pass\n\nDEFAULT_TEASAR_PARAMS = {\n  \"scale\": 1.5, \n  \"const\": 300,\n  \"pdrf_scale\": 100000,\n  \"pdrf_exponent\": 4,\n  \"soma_acceptance_threshold\": 3500,\n  \"soma_detection_threshold\": 750,\n  \"soma_invalidation_const\": 300,\n  \"soma_invalidation_scale\": 2\n}\n\ndef skeletonize(\n  all_labels, teasar_params=DEFAULT_TEASAR_PARAMS, anisotropy=(1,1,1),\n  object_ids=None, dust_threshold=1000, \n  progress=True, fix_branching=True, in_place=False, \n  fix_borders=True, parallel=1, parallel_chunk_size=100,\n  extra_targets_before=[], extra_targets_after=[],\n  fill_holes=False, fix_avocados=False,\n  voxel_graph=None\n):\n  \"\"\"\n  Skeletonize all non-zero labels in a given 2D or 3D image.\n\n  Required:\n    all_labels: a 2D or 3D numpy array of integer type (signed or unsigned) \n\n  Optional:\n    anisotropy: the physical dimensions of each axis (e.g. 4nm x 4nm x 40nm)\n    object_ids: If not none, zero out all labels other than those specified here.\n    teasar_params: {\n      scale: during the \"rolling ball\" invalidation phase, multiply \n          the DBF value by this.\n      const: during the \"rolling ball\" invalidation phase, this \n          is the minimum radius in chosen physical units (i.e. nm).\n      soma_detection_threshold: if object has a DBF value larger than this, \n          root will be placed at largest DBF value and special one time invalidation\n          will be run over that root location (see soma_invalidation scale)\n          expressed in chosen physical units (i.e. 
nm) \n      pdrf_scale: scale factor in front of dbf, used to weight dbf over euclidean distance (higher to pay more attention to dbf) (default 5000)\n      pdrf_exponent: exponent in dbf formula on distance from edge, faster if factor of 2 (default 16)\n      soma_invalidation_scale: the 'scale' factor used in the one time soma root invalidation (default .5)\n      soma_invalidation_const: the 'const' factor used in the one time soma root invalidation (default 0)\n                             (units in chosen physical units (i.e. nm))\n      max_paths: max paths to trace on a single object. Moves onto the next object after this point.\n    }\n    dust_threshold: don't bother skeletonizing connected components smaller than\n      this many voxels.\n    fill_holes: preemptively run a void filling algorithm on all connected\n      components and delete labels that get filled in. This can improve the\n      quality of the reconstruction if holes in the shapes are artifacts introduced\n      by the segmentation pipeline. This option incurs moderate overhead.\n\n      WARNING: THIS WILL REMOVE INPUT LABELS THAT ARE DEEMED TO BE HOLES.\n\n    extra_targets_before: List of x,y,z voxel coordinates that will all \n      be traced to from the root regardless of whether those points have \n      been invalidated. These targets will be applied BEFORE the regular\n      target selection algorithm is run.      \n\n      e.g. [ (x,y,z), (x,y,z) ]\n\n    extra_targets_after: Same as extra_targets_before but the additional\n      targets will be applied AFTER the usual algorithm runs.\n\n    progress: if true, display a progress bar\n    fix_branching: When enabled, zero the edge weights of previously \n      traced paths. This causes branch points to occur closer to \n      the actual path divergence. 
However, there is a performance penalty\n      associated with this as dijkstra's algorithm is computed once per path\n      rather than once per skeleton.\n    in_place: if true, allow input labels to be modified to reduce\n      memory usage and possibly improve performance.\n    fix_borders: ensure that segments touching the border place a \n      skeleton endpoint in a predictable place to make merging \n      adjacent chunks easier.\n    fix_avocados: If nuclei are segmented separately from somata\n      then we can try to detect and fix this issue.\n    voxel_graph: a connection graph that defines permissible \n      directions of motion between voxels. This is useful for\n      dealing with self-touches. The graph is defined by the\n      conventions used in cc3d.voxel_connectivity_graph \n      (https://github.com/seung-lab/connected-components-3d/blob/3.2.0/cc3d_graphs.hpp#L73-L92)\n    parallel: number of subprocesses to use.\n      <= 0: Use multiprocessing.cpu_count() \n         1: Only use the main process.\n      >= 2: Use this number of subprocesses.\n    parallel_chunk_size: default number of skeletons to \n      submit to each parallel process before returning results,\n      updating the progress bar, and submitting a new task set. \n      Setting this number too low results in excess IPC overhead,\n      and setting it too high can result in task starvation towards\n      the end of a job and infrequent progress bar updates. If the\n      chunk size is set higher than num tasks // parallel, that number\n      is used instead.\n\n  Returns: { $segid: osteoid.Skeleton, ... 
}\n  \"\"\"\n\n  anisotropy = np.array(anisotropy, dtype=np.float32)\n\n  all_labels = format_labels(all_labels, in_place=in_place)\n  all_labels = apply_object_mask(all_labels, object_ids)\n\n  if all_labels.size <= dust_threshold:\n    return {}\n  \n  if isinstance(all_labels, CrackleArray):\n    minlabel = all_labels.min()\n    maxlabel = all_labels.max()\n  else:\n    minlabel, maxlabel = fastremap.minmax(all_labels)\n\n  if minlabel == 0 and maxlabel == 0:\n    return {}\n\n  cc_labels, remapping = compute_cc_labels(all_labels, voxel_graph)\n  del all_labels\n\n  if isinstance(cc_labels, CrackleArray) and (fill_holes or voxel_graph or fix_avocados):\n    cc_labels = cc_labels.numpy()\n\n  if fill_holes:\n    cc_labels = fill_all_holes(cc_labels, progress)\n\n  extra_targets_before = points_to_labels(extra_targets_before, cc_labels)\n  extra_targets_after = points_to_labels(extra_targets_after, cc_labels)\n\n  def edtfn(labels):\n    if isinstance(labels, CrackleArray):\n      labels = labels[:]\n\n    return edt.edt(labels, \n      anisotropy=anisotropy,\n      black_border=(minlabel == maxlabel),\n      parallel=parallel,\n      voxel_graph=voxel_graph,\n    )\n\n  all_dbf = edtfn(cc_labels)\n  \n  if fix_avocados:\n    cc_labels, all_dbf, remapping = engage_avocado_protection(\n      cc_labels, all_dbf, remapping,\n      soma_detection_threshold=teasar_params.get('soma_detection_threshold', 0),\n      edtfn=edtfn,\n      progress=progress,\n    )\n\n  if isinstance(cc_labels, CrackleArray):\n    cc_ct_iterator = cc_labels.voxel_counts().items()\n  else:\n    cc_segids, pxct = fastremap.unique(cc_labels, return_counts=True)\n    cc_ct_iterator = zip(cc_segids, pxct)\n  \n  cc_segids = [ sid for sid, ct in cc_ct_iterator if ct > dust_threshold and sid != 0 ]\n\n  all_slices = find_objects(cc_labels)\n\n  border_targets = defaultdict(list)\n  if fix_borders:\n    border_targets = compute_border_targets(cc_labels, anisotropy)\n\n  print_quotes(parallel) # 
easter egg\n\n  if parallel <= 0:\n    parallel = mp.cpu_count()\n\n  if parallel == 1:\n    return skeletonize_subset(\n      all_dbf, cc_labels, voxel_graph, remapping, \n      teasar_params, anisotropy, all_slices, \n      border_targets, extra_targets_before, extra_targets_after,\n      progress, fix_borders, fix_branching, \n      cc_segids\n    )\n  else:\n    # The following section can't be moved into \n    # skeletonize parallel because then all_dbf \n    # and cc_labels can't be deleted to save memory.\n    suffix = uuid.uuid1().hex\n\n    dbf_shm_location = 'kimimaro-shm-dbf-' + suffix\n    cc_shm_location = 'kimimaro-shm-cc-labels-' + suffix\n    vg_shm_location = 'kimimaro-shm-voxel-graph-' + suffix\n\n    try:\n      dbf_mmap, all_dbf_shm = shm.ndarray( all_dbf.shape, all_dbf.dtype, dbf_shm_location, order='F')\n      all_dbf_shm[:] = all_dbf \n      del all_dbf \n\n      cc_mmap, cc_labels_shm = shm.ndarray( cc_labels.shape, cc_labels.dtype, cc_shm_location, order='F')    \n      cc_labels_shm[:] = cc_labels[:]\n      del cc_labels\n\n      voxel_graph_shm = None\n      vg_mmap = None\n      if voxel_graph is not None:\n        vg_mmap, voxel_graph_shm = shm.ndarray( voxel_graph.shape, voxel_graph.dtype, vg_shm_location, order='F')    \n        voxel_graph_shm[:] = voxel_graph\n        del voxel_graph\n\n      skeletons = skeletonize_parallel(      \n        all_dbf_shm, dbf_shm_location, \n        cc_labels_shm, cc_shm_location, remapping, \n        voxel_graph_shm, vg_shm_location,\n        teasar_params, anisotropy, all_slices, \n        border_targets, extra_targets_before, extra_targets_after,\n        progress, fix_borders, fix_branching, \n        cc_segids, parallel, parallel_chunk_size\n      )\n    finally:\n      dbf_mmap.close()\n      cc_mmap.close()\n      shm.unlink(dbf_shm_location)\n      shm.unlink(cc_shm_location)\n      if voxel_graph is not None:\n        vg_mmap.close()\n        shm.unlink(vg_shm_location)\n\n    return 
skeletons\n\ndef connect_points(\n  labels, start, end,\n  anisotropy=(1,1,1), \n  fill_holes=False, \n  in_place=False,\n  pdrf_scale=100000, \n  pdrf_exponent=4,\n):\n  \"\"\"\n  Extract a single centerline skeleton between\n  two preselected points from a binary image.\n\n  labels: a 2D or 3D binary image\n  start: an (x,y,z) tuple\n  end: an (x,y,z) tuple\n\n  anisotropy: the physical dimensions of each axis (e.g. 4nm x 4nm x 40nm)\n  fill_holes: preemptively run a void filling algorithm on all connected\n    components and delete labels that get filled in. This can improve the\n    quality of the reconstruction if holes in the shapes are artifacts introduced\n    by the segmentation pipeline.\n\n  pdrf_scale: scale factor in front of dbf, used to weight dbf over euclidean distance (higher to pay more attention to dbf)\n  pdrf_exponent: exponent in dbf formula on distance from edge, faster if factor of 2\n  \"\"\"\n  anisotropy = np.array(anisotropy, dtype=np.float32)\n  start = tuple(start)\n  end = tuple(end)\n\n  labels = labels.astype(np.bool)\n  labels = format_labels(labels, in_place=in_place)\n\n  cc_labels, remapping = compute_cc_labels(labels)\n  if cc_labels[start] == 0 or cc_labels[start] != cc_labels[end]:\n    raise ValueError(\"Cannot extract centerline from disconnected components.\")\n  del cc_labels\n  del remapping\n\n  skel = kimimaro.trace.point_to_point(\n    labels, start, end,\n    anisotropy=anisotropy, \n    pdrf_scale=pdrf_scale, pdrf_exponent=pdrf_exponent,\n  )\n  skel.vertices *= anisotropy\n  skel.space = 'physical'\n  return skel\n\ndef format_labels(labels, in_place):\n  if isinstance(labels, CrackleArray):\n    return labels\n\n  if in_place:\n    labels = fastremap.asfortranarray(labels)\n  else:\n    labels = np.copy(labels, order='F')\n\n  if labels.dtype == bool:\n    labels = labels.view(np.uint8)\n\n  original_shape = labels.shape\n\n  while labels.ndim < 3:\n    labels = labels[..., np.newaxis ]\n\n  while labels.ndim > 
3:\n    if labels.shape[-1] == 1:\n      labels = labels[..., 0]\n    else:\n      raise DimensionError(\n        \"Input labels may be no more than three non-trivial dimensions. Got: {}\".format(\n          original_shape\n        )\n      )\n\n  return labels\n\ndef skeletonize_parallel(\n    all_dbf_shm, dbf_shm_location, \n    cc_labels_shm, cc_shm_location, remapping, \n    voxel_graph_shm, vg_shm_location,\n    teasar_params, anisotropy, all_slices, \n    border_targets, extra_targets_before, extra_targets_after,\n    progress, fix_borders, fix_branching, \n    cc_segids, parallel, chunk_size\n  ):\n    prevsigint = signal.getsignal(signal.SIGINT)\n    prevsigterm = signal.getsignal(signal.SIGTERM)\n    \n    # Don't fork, spawn entirely new processes. This\n    # avoids accidental deadlocks.\n    mp.set_start_method(\"spawn\", force=True)\n    \n    executor = pathos.pools.ProcessPool(parallel)\n\n    def cleanup(signum, frame):\n      shm.unlink(dbf_shm_location)\n      shm.unlink(cc_shm_location)\n      executor.terminate()\n\n    signal.signal(signal.SIGINT, cleanup)\n    signal.signal(signal.SIGTERM, cleanup)   \n\n    vg_shape = voxel_graph_shm.shape if voxel_graph_shm else None\n    vg_dtype = voxel_graph_shm.dtype if voxel_graph_shm else None\n\n    skeletonizefn = partial(parallel_skeletonize_subset, \n      dbf_shm_location, all_dbf_shm.shape, all_dbf_shm.dtype, \n      cc_shm_location, cc_labels_shm.shape, cc_labels_shm.dtype,\n      vg_shm_location, vg_shape, vg_dtype,\n      remapping, teasar_params, anisotropy, all_slices, \n      border_targets, extra_targets_before, extra_targets_after, \n      False, # progress, use our own progress bar below\n      fix_borders, fix_branching, \n    )\n\n    ccids = []\n    if chunk_size < len(cc_segids) // parallel:\n      for i in range(0, len(cc_segids), chunk_size):\n        ccids.append(cc_segids[i:i+chunk_size])\n    else:\n      for i in range(parallel):\n        
ccids.append(cc_segids[i::parallel])\n\n    skeletons = defaultdict(list)\n    with tqdm(total=len(cc_segids), disable=(not progress), desc=\"Skeletonizing Labels\") as pbar:\n      for skels in executor.uimap(skeletonizefn, ccids):\n        for segid, skel in skels.items():\n          skeletons[segid].append(skel)\n        pbar.update(len(skels))\n    executor.close()\n    executor.join()\n    executor.clear()\n\n    signal.signal(signal.SIGINT, prevsigint)\n    signal.signal(signal.SIGTERM, prevsigterm)\n    \n    shm.unlink(dbf_shm_location)\n    shm.unlink(cc_shm_location)\n    shm.unlink(vg_shm_location)\n\n    return merge(skeletons)\n\ndef parallel_skeletonize_subset(    \n    dbf_shm_location, dbf_shape, dbf_dtype, \n    cc_shm_location, cc_shape, cc_dtype, \n    vg_shm_location, vg_shape, vg_dtype,\n    *args, **kwargs\n  ):\n  \n  dbf_mmap, all_dbf = shm.ndarray( dbf_shape, dtype=dbf_dtype, location=dbf_shm_location, order='F')\n  cc_mmap, cc_labels = shm.ndarray( cc_shape, dtype=cc_dtype, location=cc_shm_location, order='F')\n\n  if vg_shape is None:\n    vg_mmap, voxel_graph = None, None\n  else:\n    vg_mmap, voxel_graph = shm.ndarray( vg_shape, dtype=vg_dtype, location=vg_shm_location, order='F')\n\n  skels = skeletonize_subset(all_dbf, cc_labels, voxel_graph, *args, **kwargs)\n\n  dbf_mmap.close()\n  cc_mmap.close()\n  if vg_mmap:\n    vg_mmap.close()\n\n  return skels\n\ndef skeletonize_subset(\n    all_dbf, cc_labels, voxel_graph, remapping, \n    teasar_params, anisotropy, all_slices, \n    border_targets, extra_targets_before, extra_targets_after,\n    progress, fix_borders, fix_branching, \n    cc_segids\n  ):\n\n  skeletons = defaultdict(list)\n\n  with tqdm(cc_segids, disable=(not progress), desc=\"Skeletonizing Labels\") as pbar:\n    for segid in pbar:\n\n      pbar.set_postfix(label=str(remapping[segid]))\n\n      # Crop DBF to ROI\n      slices = all_slices[segid - 1]\n      if slices is None:\n        continue\n\n      roi = 
Bbox.from_slices(slices)\n      if roi.volume() <= 1:\n        continue\n\n      if isinstance(cc_labels, CrackleArray):\n        labels = cc_labels.decompress(label=segid, crop=True)\n        label_slcs = (slices[0], slices[1], slice(None))\n        labels = np.asfortranarray(labels[label_slcs])\n      else:\n        labels = cc_labels[slices]\n        labels = (labels == segid)\n\n      dbf = np.where(labels, all_dbf[slices], 0.0)\n      cropped_voxel_graph = (voxel_graph[slices] if voxel_graph is not None else None)\n\n      manual_targets_before = []\n      manual_targets_after = []\n      root = None \n\n      def translate_to_roi(targets):\n        targets = np.array(targets)\n        targets -= roi.minpt.astype(np.uint32)\n        return targets.tolist()      \n\n      # We only source a predetermined root from \n      # border_targets because we understand that it's\n      # located at a reasonable place at the edge of the\n      # shape. In theory, extra targets can be positioned\n      # anywhere within the shape or off the shape, making it \n      # a dicey proposition. 
\n      if len(border_targets[segid]) > 0:\n        manual_targets_before = translate_to_roi(border_targets[segid])\n        root = manual_targets_before.pop()\n\n      if segid in extra_targets_before and len(extra_targets_before[segid]) > 0:\n        manual_targets_before.extend( translate_to_roi(extra_targets_before[segid]) )\n\n      if segid in extra_targets_after and len(extra_targets_after[segid]) > 0:\n        manual_targets_after.extend( translate_to_roi(extra_targets_after[segid]) )\n\n      skeleton = kimimaro.trace.trace(\n        labels, \n        dbf, \n        anisotropy=anisotropy, \n        fix_branching=fix_branching, \n        manual_targets_before=manual_targets_before,\n        manual_targets_after=manual_targets_after,\n        root=root,\n        voxel_graph=cropped_voxel_graph,\n        **teasar_params\n      )\n\n      if skeleton.empty():\n        continue\n\n      skeleton.vertices += roi.minpt.astype(skeleton.vertices.dtype, copy=False)\n\n      orig_segid = remapping[segid]\n      skeleton.id = orig_segid\n      skeleton.vertices = np.multiply(skeleton.vertices, anisotropy, dtype=np.float32)\n      skeleton.space = 'physical'\n      skeletons[orig_segid].append(skeleton)\n\n  return merge(skeletons)\n\ndef apply_object_mask(all_labels, object_ids):\n  if object_ids is None:\n    return all_labels\n\n  if isinstance(all_labels, CrackleArray):\n    mask = all_labels.labels()\n    mask = { u: 0 for u in mask }\n    for segid in object_ids:\n      mask[segid] = segid\n    return all_labels.remap(mask).condense()\n\n  if len(object_ids) == 1:\n    all_labels = kimimaro.skeletontricks.zero_out_all_except(all_labels, object_ids[0]) # faster\n  else:\n    all_labels = fastremap.mask_except(all_labels, object_ids, in_place=True)\n\n  return all_labels\n\ndef points_to_labels(pts, cc_labels):\n  mapping = defaultdict(list)\n  for pt in pts:\n    pt = tuple(pt)\n    mapping[ cc_labels[pt] ].append(pt)\n  return mapping\n\ndef 
compute_border_targets(cc_labels, anisotropy):\n  sx, sy, sz = cc_labels.shape\n\n  if isinstance(cc_labels, CrackleArray):\n    cc_labels = cc_labels.numpy()\n\n  planes = (\n    ( cc_labels[:,:,0], (0, 1), lambda x,y: (x, y, 0) ),     # top xy\n    ( cc_labels[:,:,-1], (0, 1), lambda x,y: (x, y, sz-1) ), # bottom xy\n    ( cc_labels[:,0,:], (0, 2), lambda x,z: (x, 0, z) ),     # left xz\n    ( cc_labels[:,-1,:], (0, 2), lambda x,z: (x, sy-1, z) ), # right xz\n    ( cc_labels[0,:,:], (1, 2), lambda y,z: (0, y, z) ),     # front yz\n    ( cc_labels[-1,:,:], (1, 2), lambda y,z: (sx-1, y, z) )  # back yz\n  )\n\n  target_list = defaultdict(set)\n\n  for plane, dims, rotatefn in planes:\n    wx, wy = anisotropy[dims[0]], anisotropy[dims[1]]\n    plane = np.copy(plane, order='F')\n    cc_plane = cc3d.connected_components(np.ascontiguousarray(plane))\n    dt_plane = edt.edt(cc_plane, black_border=True, anisotropy=(wx, wy))\n\n    plane_targets = kimimaro.skeletontricks.find_border_targets(\n      dt_plane, cc_plane, wx, wy\n    )\n\n    plane = plane[..., np.newaxis]\n    cc_plane = cc_plane[..., np.newaxis]\n    remapping = kimimaro.skeletontricks.get_mapping(plane, cc_plane)\n\n    for label, pt in plane_targets.items():\n      label = remapping[label]\n      target_list[label].add(\n        rotatefn( int(pt[0]), int(pt[1]) )\n      )\n\n  target_list.default_factory = lambda: np.array([], np.uint32)\n  for label, pts in target_list.items():\n    target_list[label] = np.array(list(pts), dtype=np.uint32)\n\n  return target_list\n\ndef merge(skeletons):\n  merged_skels = {}\n  for segid, skels in skeletons.items():\n    skel = Skeleton.simple_merge(skels)\n    merged_skels[segid] = skel.consolidate()\n\n  return merged_skels\n\ndef argmax(arr):\n  if arr.flags['C_CONTIGUOUS']:\n    return np.unravel_index(np.argmax(arr), arr.shape, order='C')\n  return np.unravel_index(np.argmax(arr.T), arr.shape, order='F')\n\ndef engage_avocado_protection(\n  cc_labels, all_dbf, 
remapping,\n  soma_detection_threshold, edtfn, \n  progress\n):\n  orig_cc_labels = np.copy(cc_labels, order='F')\n\n  unchanged = set()\n\n  # This loop handles nested avocados\n  # Unless there are deeply nested double avocados,\n  # this should complete in 2-3 passes. We limit it\n  # to 20 just to make sure this loop terminates no matter what.\n  # Avocados aren't the end of the world.\n  for _ in tqdm(range(20), disable=(not progress), desc=\"Avocado Pass\"): \n    # Note: Divide soma_detection_threshold by a bit more than 2 because the nuclei are going to be\n    # about a factor of 2 or less smaller than what we'd expect from a cell. For example,\n    # in an avocado I saw, the DBF of the nucleus was 499 when the detection threshold was \n    # set to 1100.\n    candidates = set(fastremap.unique(cc_labels * (all_dbf > soma_detection_threshold / 2.5)))\n    candidates -= unchanged\n    candidates.discard(0)\n\n    cc_labels, unchanged_this_cycle, changes = engage_avocado_protection_single_pass(\n      cc_labels, all_dbf,\n      candidates=candidates,\n      progress=progress,\n    )\n    unchanged |= unchanged_this_cycle\n\n    if len(changes) == 0:\n      break \n    \n    all_dbf = edtfn(cc_labels)\n\n  # Downstream logic assumes cc_labels is contiguously numbered\n  cc_labels, _ = fastremap.renumber(cc_labels, in_place=True)\n  cc_remapping = kimimaro.skeletontricks.get_mapping(orig_cc_labels, cc_labels)\n\n  adjusted_remapping = {}\n  for new_cc, cc in cc_remapping.items():\n    if cc in remapping:\n      adjusted_remapping[new_cc] = remapping[cc]\n\n  return cc_labels, all_dbf, adjusted_remapping\n\ndef engage_avocado_protection_single_pass(\n  cc_labels, all_dbf, \n  candidates=None, progress=False\n):\n  \"\"\"\n  For each candidate, check if there's a fruit around the\n  avocado pit roughly from the center (the max EDT).\n  \"\"\"\n\n  if candidates is None:\n    candidates = fastremap.unique(cc_labels)\n\n  candidates = [ label for label in 
candidates if label != 0 ]\n\n  unchanged = set()\n  changed = set()\n\n  if len(candidates) == 0:\n    return cc_labels, unchanged, changed\n\n  def paint_walls(binimg):\n    \"\"\"\n    Ensure that inclusions that touch the wall are handled\n    by performing a 2D fill on each wall.\n    \"\"\"\n    binimg[:,:,0 ] = fill_voids.fill(binimg[:,:,0 ])\n    binimg[:,:,-1] = fill_voids.fill(binimg[:,:,-1])\n    binimg[:,0,: ] = fill_voids.fill(binimg[:,0,: ])\n    binimg[:,-1,:] = fill_voids.fill(binimg[:,-1,:])\n    binimg[0,:,: ] = fill_voids.fill(binimg[0,:,: ])\n    binimg[-1,:,:] = fill_voids.fill(binimg[-1,:,:])\n    return binimg\n\n  slcs = find_objects(cc_labels)\n\n  for label in tqdm(candidates, disable=(not progress), desc=\"Fixing Avocados\"):\n    slc = slcs[label - 1]\n    offset = Bbox.from_slices(slc).minpt\n    binimg = paint_walls(cc_labels[slc] == label) # image of the pit\n    coord = argmax(binimg * all_dbf[slc]) + offset\n\n    (pit, fruit) = kimimaro.skeletontricks.find_avocado_fruit(\n      cc_labels, coord[0], coord[1], coord[2]\n    )\n    if pit == fruit and pit not in changed:\n      unchanged.add(pit)\n    else:\n      unchanged.discard(pit)\n      unchanged.discard(fruit)\n      changed.add(pit)\n      changed.add(fruit)\n      binimg |= (cc_labels[slc] == fruit)\n\n    fruit = np.asarray(fruit, dtype=cc_labels.dtype)\n    binimg, N = fill_voids.fill(binimg, in_place=True, return_fill_count=True)\n    cc_labels[slc] *= ~binimg\n    cc_labels[slc] += fruit * binimg\n\n  return cc_labels, unchanged, changed\n\ndef synapses_to_targets(labels, synapses, progress=False):\n  \"\"\"\n  Turn the output of synapse detection and assignment, usually \n  centroid + pre/post into actionable targets. 
For a given \n  labeled volume, take the centroid and a pre or post label\n  and find the nearest voxel for that label and add the coordinates\n  of that voxel to a list of targets.\n\n  labels: a 3d array containing labels\n  synapses: { label: [ (centroid, swc_label), (centroid, swc_label), ... ] }\n    where centroid is an (x,y,z) float triple in voxel coordinate space\n      where the origin is the same as for labels\n    where swc_label is the label to be added to the vertex attributes for\n      the resulting target.\n    where label is a presynaptic OR a postsynaptic label\n      (submit two items to cover both)\n\n  Returns: { (x,y,z): swc_label, ... } targets for skeletonization\n  \"\"\"\n  while labels.ndim > 3:\n    labels = labels[...,0]\n\n  targets = {}\n\n  for label, pairs in tqdm(synapses.items(), disable=(not progress), desc='Converting Synapses to Targets'):\n    point_cloud = np.vstack((labels == label).nonzero()).T # [ [x,y,z], ... ]\n    if len(point_cloud) == 0:\n      continue\n\n    swc_labels = defaultdict(list) \n    for centroid, swc_label in pairs:\n      swc_labels[swc_label].append(centroid)\n\n    for swc_label, centroids in swc_labels.items():\n      distances = scipy.spatial.distance.cdist(point_cloud, centroids)\n      minima = np.unique(np.argmin(distances, axis=0))\n      tmp_targets = [ tuple(point_cloud[idx]) for idx in minima ]\n      targets.update({ target: swc_label for target in tmp_targets })\n\n  return targets\n\ndef fill_all_holes(cc_labels, progress=False, return_fill_count=False):\n  \"\"\"\n  Fills the holes in each connected component and removes components that\n  get filled in. The idea is that holes (entirely contained labels or background) \n  are artifacts in cell segmentations. 
A common example is a nucleus segmented \n  separately from the rest of the cell or errors in a manual segmentation leaving\n  a void in a dendrite.\n\n  cc_labels: an image containing connected components with labels smaller than\n    the number of voxels in the image.\n  progress: Display a progress bar or not.\n  return_fill_count: if specified, return a tuple (filled_image, N) where N is\n    the number of voxels that were filled in.\n\n  Returns: filled_in_labels\n  \"\"\"\n  labels = fastremap.unique(cc_labels)\n  labels_set = set(labels)\n  labels_set.discard(0)\n\n  all_slices = find_objects(cc_labels)\n  pixels_filled = 0\n\n  for label in tqdm(labels, disable=(not progress), desc=\"Filling Holes\"):\n    if label not in labels_set:\n      continue\n\n    slices = all_slices[label - 1]\n    if slices is None:\n      continue\n\n    binary_image = (cc_labels[slices] == label)\n    binary_image, N = fill_voids.fill(\n      binary_image, in_place=True, \n      return_fill_count=True\n    )\n    pixels_filled += N\n    if N == 0:\n      continue \n\n    sub_labels = set(fastremap.unique(cc_labels[slices] * binary_image))\n    sub_labels.remove(label)\n    labels_set -= sub_labels\n    cc_labels[slices] = cc_labels[slices] * ~binary_image + label * binary_image\n\n  if return_fill_count:\n    return cc_labels, pixels_filled\n  return cc_labels\n\ndef print_quotes(parallel):\n  if parallel == -1:\n    print(\"Against the power of will I possess... The capability of my body is nothing.\")\n  elif parallel == -2:\n    print(\"I will see the truth of this world... OROCHIMARU-SAMA WILL SHOW ME!!!\")\n\n  if -2 <= parallel < 0:\n    print(\"CURSED SEAL OF THE EARTH!!!\")  \n"
  },
  {
    "path": "kimimaro/post.py",
    "content": "\"\"\"\nPostprocessing for joining skeletons chunks generated by\nskeletonizing adjacent image chunks. \n\nAuthors: Alex Bae and Will Silversmith\nAffiliation: Seung Lab, Princeton Neuroscience Institue\nDate: June 2018 - June 2019\n\nThis file is part of Kimimaro.\n\nKimimaro is free software: you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation, either version 3 of the License, or\n(at your option) any later version.\n\nKimimaro is distributed in the hope that it will be useful,\nbut WITHOUT ANY WARRANTY; without even the implied warranty of\nMERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\nGNU General Public License for more details.\n\nYou should have received a copy of the GNU General Public License\nalong with Kimimaro.  If not, see <https://www.gnu.org/licenses/>.\n\"\"\"\nfrom typing import Sequence\n\nfrom collections import defaultdict\n\nimport fastremap\nimport networkx as nx\nimport numpy as np\n\nfrom scipy import spatial\nfrom scipy.sparse import lil_matrix\nfrom scipy.sparse.csgraph import dijkstra\nimport scipy.sparse.csgraph as csgraph\nimport scipy.spatial.distance\n\ntry:\n  from pykdtree.kdtree import KDTree\nexcept ImportError:\n  from scipy.spatial import cKDTree as KDTree\n\nfrom osteoid import Skeleton, Bbox\n\nimport kimimaro.skeletontricks\n\n## Public API of Module\n\ndef postprocess(\n  skeleton:Skeleton, \n  dust_threshold:float = 1500.0, \n  tick_threshold:float = 3000.0,\n) -> Skeleton:\n  \"\"\"\n  Postprocessing of a skeleton enables aggregation of adjacent\n  or overlapping skeletonized image chunks to be fused into a\n  single coherent skeleton.  \n\n  The following steps are applied:\n  1) Remove disconnected components smaller than the \n      dust threshold (measured in physical distance).\n  2) Skeletons are supposed to be trees, so we remove\n    any loops that were introduced by joining chunks \n    together. 
Loops that occur inside the lumen of a \n    neuron might be collapsed into their centroid. Loops\n    that occur due to, e.g. mergers are broken arbitrarily.\n  3) Disconnected components that are closer than the sum\n     of their boundary distance are connected.\n  4) Small \"ticks\", or branches from the main skeleton, are\n     removed one at a time, from smallest to largest. Branches\n     larger than the physical tick_threshold are preserved. \n\n  Returns: Skeleton\n  \"\"\"\n  label = skeleton.id\n\n  # necessary for removing trivial loops etc\n  # remove_loops and remove_ticks assume a \n  # clean representation\n  skeleton = skeleton.consolidate() \n\n  skeleton = remove_dust(skeleton, dust_threshold) \n  skeleton = remove_loops(skeleton)\n  skeleton = join_close_components(skeleton, restrict_by_radius=True)\n  skeleton = remove_ticks(skeleton, tick_threshold)\n  skeleton.id = label\n  return skeleton.consolidate()\n\ndef join_close_components(\n  skeletons:Sequence[Skeleton], \n  radius:float = np.inf,\n  restrict_by_radius:bool = False,\n) -> Skeleton:\n  \"\"\"\n  Given a set of skeletons which may contain multiple connected components,\n  attempt to connect each component to the nearest other component via the\n  nearest two vertices. 
Repeat until no components remain or no points closer\n  than `radius` are available.\n\n  radius: in same units as skeletons, don't join pieces \n    further away than this.\n  restrict_by_radius: If the skeletons have a radius property,\n    don't join pieces if the neighboring nodes are further away\n    than r1 + r2.\n\n  Returns: Skeleton\n  \"\"\"\n  if radius is None:\n    radius = np.inf\n\n  if radius is not None and radius <= 0:\n    raise ValueError(\"radius must be greater than zero: \" + str(radius))\n\n  try:\n    iter(skeletons)\n  except TypeError:\n    skeletons = [ skeletons ]\n\n  skels = []\n  for skeleton in skeletons:\n    skels += skeleton.components()\n\n  skels = [ skl.consolidate() for skl in skels if not skl.empty() ]\n\n  if len(skels) == 1:\n    return skels[0]\n  elif len(skels) == 0:\n    return Skeleton()\n\n  N = len(skels)\n  radii_matrix = np.full( (N, N), np.inf, dtype=np.float32 )\n  index_matrix = np.full( (N, N, 2), np.iinfo(np.uint32).max, dtype=np.uint32 )\n\n  if restrict_by_radius:\n    radius = 2 * np.max([ np.max(s.radii) for s in skels ])\n    radius = max(radius, 0)\n\n  def compute_nearest(tree, i, j):\n    s1, s2 = skels[i], skels[j]\n    r, idx = tree.query(\n      s2.vertices, \n      k=1, \n      distance_upper_bound=(radius + 0.000001), # < bound, so +epsilon\n    )\n    idx_s2 = np.argmin(r)\n    idx_s1 = idx[idx_s2]\n\n    local_radius = r[idx_s2]\n\n    if (\n      restrict_by_radius\n      and not np.isinf(local_radius)\n      and hasattr(s1, \"radii\")\n      and hasattr(s2, \"radii\")\n      and local_radius > (s1.radii[idx_s1] + s2.radii[idx_s2])\n    ):\n      local_radius = np.inf\n\n    radii_matrix[i,j] = local_radius\n    radii_matrix[j,i] = local_radius\n\n    index_matrix[i,j] = ( idx_s1, idx_s2 )\n    index_matrix[j,i] = index_matrix[j,i]\n\n  def symmetric_delete(matrix, k):\n    matrix = np.delete(matrix, k, axis=0)\n    return np.delete(matrix, k, axis=1)\n\n  for i in range(N):\n    tree = 
KDTree(skels[i].vertices)\n    for j in range(i + 1, N):  # compute upper triangle only\n      compute_nearest(tree, i, j)\n    del tree\n\n  while len(skels) > 1:\n    \n    if np.all(radii_matrix) == np.inf:\n      break\n\n    min_radius = np.min(radii_matrix)\n    if np.isinf(min_radius) or min_radius > radius:\n      break\n\n    i, j = np.unravel_index( np.argmin(radii_matrix), radii_matrix.shape )\n    s1, s2 = skels[i], skels[j]\n    fused = Skeleton.simple_merge([s1, s2])\n\n    fused.edges = np.concatenate([\n      fused.edges,\n      [[ index_matrix[i,j,0], index_matrix[i,j,1] + s1.vertices.shape[0] ]]\n    ])\n    skels[i] = None\n    skels[j] = None\n    skels = [ fused ] + [ _ for _ in skels if _ is not None ]\n\n    radii_matrix = symmetric_delete(radii_matrix, i)\n    radii_matrix = symmetric_delete(radii_matrix, j - 1)\n    \n    N = len(skels)\n    radii_matrix2 = np.full((N,N), np.inf, dtype=np.float32)\n    radii_matrix2[1:,1:] = radii_matrix\n    radii_matrix = radii_matrix2\n    del radii_matrix2\n\n    index_matrix = symmetric_delete(index_matrix, i)\n    index_matrix = symmetric_delete(index_matrix, j - 1)\n    \n    index_matrix2 = np.full((N,N,2), np.iinfo(np.uint32).max, dtype=np.uint32 )\n    index_matrix2[1:,1:] = index_matrix\n    index_matrix = index_matrix2\n    del index_matrix2\n\n    tree = KDTree(skels[0].vertices)\n    for j in range(1,N):\n      compute_nearest(tree, 0, j)\n    del tree\n\n  return Skeleton.simple_merge(skels).consolidate()\n\n## Implementation Details Below\n\ndef remove_dust(skeleton, dust_threshold):\n  \"\"\"Dust threshold in physical cable length.\"\"\"\n  \n  if skeleton.empty() or dust_threshold == 0:\n    return skeleton\n\n  skels = [] \n  for skel in skeleton.components():\n    if skel.cable_length() > dust_threshold:\n      skels.append(skel)\n\n  return Skeleton.simple_merge(skels)\n\ndef remove_ticks(skeleton, threshold):\n  \"\"\"\n  Simple merging of individual TESAR cubes results in lots of 
little \n  ticks due to the edge effect. We can remove them by thresholding\n  the path length from a given branch to the \"main body\" of the neurite. \n  We successively remove paths from shortest to longest until no branches\n  below threshold remain.\n\n  If TEASAR parameters were chosen such that they allowed for spines to\n  be traced, this is also an opportunity to correct for that.\n\n  This algorithm is O(N^2) in the number of terminal nodes.\n\n  Parameters:\n    threshold: The maximum length in nanometers that may be culled.\n\n  Returns: tick free skeleton\n  \"\"\"\n  if skeleton.empty() or threshold == 0:\n    return skeleton\n\n  skels = []\n  for component in skeleton.components():\n    skels.append(_remove_ticks(component, threshold))\n\n  return Skeleton.simple_merge(skels).consolidate(remove_disconnected_vertices=False)\n\ndef _remove_ticks(skeleton, threshold):\n  \"\"\"\n  For a single connected component, remove \"ticks\" below a threshold. \n  Ticks are a path connecting a terminal node to a branch point that\n  are physically shorter than the specified threshold. \n\n  Every time a tick is removed, it potentially changes the topology\n  of the components. Once a branch point's number of edges drops to\n  two, the two paths connecting to it can be unified into one. Sometimes\n  a single object exists that has no branches but is below threshold. We\n  do not delete these objects as there would be nothing left.\n\n  Each time the minimum length tick is removed, it can change which \n  tick is the new minimum tick and requires reevaluation of the whole \n  skeleton. Previously, we did not perform this reevaluation and it \n  resulted in the ends of neurites being clipped. \n\n  This makes the algorithm quadratic in the number of terminal branches.\n  As high resolution skeletons can have tens of thousands of nodes and \n  dozens of branches, a full topological reevaluation becomes relatively \n  expensive. 
However, we only need to know the graph of distances between\n  critical points, defined as the set of branch points and terminal points, \n  in the skeleton in order to evaluate the topology. \n\n  Therefore, we first compute this distance graph before proceeding with\n  tick removal. The algorithm remains quadratic in the number of terminal\n  points, but the constant speed up is very large as we move from a regime\n  of tens of thousands to hundreds of thousands of points needing reevaluation\n  to at most hundreds and often only a handful in typical cases. In the \n  pathological case of a skeleton with numerous single point extrusions,\n  the performance of the algorithm collapses approximately to the previous\n  regime (though without the assistance of the constant factor of numpy speed).\n\n  Requires:\n    skeleton: a Skeleton that is guaranteed to be a single \n      connected component.\n    threshold: distance in nanometers below which a branch is considered\n      a \"tick\" eligible to be removed.\n\n  Returns: a \"tick\" free Skeleton\n  \"\"\"\n  if skeleton.empty():\n    return skeleton\n\n  dgraph = kimimaro.skeletontricks.create_distance_graph(skeleton)\n  vertices = skeleton.vertices\n  edges = skeleton.edges\n\n  unique_nodes, unique_counts = fastremap.unique(edges, return_counts=True)\n  terminal_nodes = set(unique_nodes[ unique_counts == 1 ])\n\n  branch_idx = np.where(unique_counts >= 3)[0]\n\n  branch_counts = defaultdict(int)\n  for i in branch_idx:\n    branch_counts[unique_nodes[i]] = unique_counts[i]\n\n  G = nx.Graph()\n  G.add_edges_from(edges)\n\n  terminal_superedges = set([ edg for edg in dgraph.keys() if (edg[0] in terminal_nodes or edg[1] in terminal_nodes) ])\n\n  def fuse_edge(edg1):\n    unify = [ edg for edg in dgraph.keys() if edg1 in edg ]\n    new_dist = 0.0\n    for edg in unify:\n      terminal_superedges.discard(edg)\n      new_dist += dgraph[edg]\n      del dgraph[edg]\n    unify = set([ item for sublist in unify for 
item in sublist ])\n    unify.remove(edg1)\n    dgraph[tuple(unify)] = new_dist\n    terminal_superedges.add(tuple(unify))\n    branch_counts[edg1] = 0\n\n  while len(dgraph) > 1:\n    min_edge = min(terminal_superedges, key=dgraph.get)\n    e1, e2 = min_edge\n\n    if branch_counts[e1] == 1 and branch_counts[e2] == 1:\n      break\n    elif dgraph[min_edge] >= threshold:\n      break\n\n    path = nx.shortest_path(G, e1, e2)\n    path = [ (path[i], path[i+1]) for i in range(len(path) - 1) ]\n    G.remove_edges_from(path)\n\n    del dgraph[min_edge]\n    terminal_superedges.remove(min_edge)\n    branch_counts[e1] -= 1\n    branch_counts[e2] -= 1\n\n    if branch_counts[e1] == 2:\n      fuse_edge(e1)\n    if branch_counts[e2] == 2:\n      fuse_edge(e2)\n\n  skel = skeleton.clone()\n  skel.edges = np.array(list(G.edges), dtype=np.uint32)\n  return skel\n\ndef _create_distance_graph(skeleton):\n  \"\"\"\n  Creates the distance \"supergraph\" from a single connected component \n  skeleton as described in _remove_ticks.\n\n  Returns: a distance \"supergraph\" describing the physical distance\n    between the critical points in the skeleton's structure.\n\n  Example skeleton with output:\n\n      60nm   60nm   60nm     \n    1------2------3------4\n      30nm |  70nm \\\n           5        ----6\n\n  { \n    (1,2): 60,  \n    (2,3): 60,\n    (2,5): 30,\n    (3,4): 60,\n    (3,6): 70,\n  }\n  \"\"\"\n  vertices = skeleton.vertices\n  edges = skeleton.edges\n\n  unique_nodes, unique_counts = fastremap.unique(edges, return_counts=True)\n  terminal_nodes = unique_nodes[ unique_counts == 1 ]\n  branch_nodes = set(unique_nodes[ unique_counts >= 3 ])\n  \n  critical_points = set(terminal_nodes)\n  critical_points.update(branch_nodes)\n\n  tree = defaultdict(set)\n\n  for e1, e2 in edges:\n    tree[e1].add(e2)\n    tree[e2].add(e1)\n\n  # The below depth first search would be\n  # more elegantly implemented as recursion,\n  # but it quickly blows the stack, mandating\n  # an 
iterative implementation.\n\n  stack = [ terminal_nodes[0] ]\n  parents = [ -1 ]\n  dist_stack = [ 0.0 ]\n  root_stack = [ terminal_nodes[0] ]\n  distgraph = defaultdict(float) # the distance \"supergraph\"\n\n  while stack:\n    node = stack.pop()\n    dist = dist_stack.pop()\n    root = root_stack.pop()\n    parent = parents.pop()\n\n    if node in critical_points and node != root:\n      distgraph[ (root, node) ] = dist\n      dist = 0.0\n      root = node\n\n    for child in tree[node]:\n      if child != parent:\n        stack.append(child)\n        parents.append(node)\n        dist_stack.append(\n          dist + np.linalg.norm(vertices[node,:] - vertices[child,:])\n        )\n        root_stack.append(root)\n\n  return distgraph\n\ndef remove_loops(skeleton):\n  if skeleton.empty():\n    return skeleton\n\n  skels = []\n  for component in skeleton.components():\n    skels.append(_remove_loops(component))\n\n  return Skeleton.simple_merge(skels).consolidate(remove_disconnected_vertices=False)\n\ndef _remove_loops(skeleton):\n  nodes = skeleton.vertices\n  edges = np.copy(skeleton.edges).astype(np.int32)\n\n  while True: # Loop until all cycles are removed\n    edges = edges.astype(np.int32)\n    cycle_path = kimimaro.skeletontricks.find_cycle(edges)\n    # cycle_path = kimimaro.skeletontricks.find_cycle_cython(edges)\n\n    if len(cycle_path) == 0:\n      break\n\n    edges_cycle = path2edge(cycle_path)\n\n    edges_cycle = np.array(edges_cycle, dtype=np.uint32)\n    edges_cycle.sort(axis=1, kind='quicksort')\n\n    nodes_cycle = fastremap.unique(edges_cycle)\n    nodes_cycle = nodes_cycle.astype(np.int32)\n    \n    unique_nodes, unique_counts = fastremap.unique(edges, return_counts=True)\n    branch_nodes = unique_nodes[ unique_counts >= 3 ]\n\n    # branch cycles are cycle nodes that coincide with a branch point\n    branch_cycle = nodes_cycle[np.isin(nodes_cycle,branch_nodes)]\n    branch_cycle = branch_cycle.astype(np.int32)\n\n    # Summary:\n    # 0 
external branches: isolated loop, just remove it\n    # 1 external branch  : remove the loop but draw a line\n    #   from the branch point to the farthest node in the loop.\n    # 2 external branches: remove the shortest path between\n    #   the two entry/exit points. \n    # 3+ external branches: collapse the cycle into its centroid\n    #   if the radius of the centroid is less than the EDT radius\n    #   of the pixel located at the centroid. Otherwise, arbitrarily\n    #   cut an edge from the cycle to break it. This radius rule prevents\n    #   issues where we collapse to a point outside of the neurite.\n\n    # Loop with a tail\n    if branch_cycle.shape[0] == 1:\n      branch_cycle_point = nodes[branch_cycle, :]\n      cycle_points = nodes[nodes_cycle, :]\n\n      dist = np.sum((cycle_points - branch_cycle_point) ** 2, 1)\n      end_node = nodes_cycle[np.argmax(dist)]\n\n      edges = remove_row(edges, edges_cycle)        \n      new_edge = np.array([[branch_cycle[0], end_node]], dtype=np.int32) \n      edges = np.concatenate((edges, new_edge), 0)\n\n    # Loop with an entrance and an exit\n    elif branch_cycle.shape[0] == 2:\n\n      # compute the shortest path between the two branch points\n      path = np.array(cycle_path[1:])\n      pos = np.where(np.isin(path, branch_cycle))[0]\n      if (pos[1] - pos[0]) < len(path) / 2:\n        path = path[pos[0]:pos[1]+1]\n      else:\n        path = np.concatenate((path[pos[1]:], path[:pos[0]+1]), 0)\n\n      edge_path = path2edge(path)\n      edge_path.sort(axis=1, kind='quicksort')\n\n      row_valid = np.ones(edges_cycle.shape[0])\n      for i in range(edge_path.shape[0]):\n        row_valid -= (edges_cycle[:,0] == edge_path[i,0]) * (edges_cycle[:,1] == edge_path[i,1])\n\n      row_valid = row_valid.astype(bool)\n      edge_path = edges_cycle[row_valid,:]\n\n      edges = remove_row(edges, edge_path)\n\n    # Totally isolated loop\n    elif branch_cycle.shape[0] == 0:\n      edges = remove_row(edges, 
edges_cycle)\n\n    # Loops with many ways in and out\n    # looks like here we unify them into their\n    # centroid. This doesn't work well if the loop\n    # is large.\n    else:\n      branch_cycle_points = nodes[branch_cycle,:]\n\n      centroid = np.mean(branch_cycle_points, axis=0)\n      dist = (nodes - centroid)\n      dist *= dist\n      dist = np.sum(dist, axis=1)\n      intersect_node = np.argmin(dist)\n      intersect_point = nodes[intersect_node,:]\n\n      dist = np.sum((branch_cycle_points - intersect_point) ** 2, 1)\n      dist = np.sqrt(np.max(dist))\n\n      # Fix the \"stargate\" issue where a large loop\n      # can join lots of things to the near center\n      # by just making a tiny snip if the distance\n      # is greater than the radius of the connected node.\n      if dist > skeleton.radii[ intersect_node ]:\n        edges = remove_row(edges, edges_cycle[:1,:])\n        continue\n\n      edges = remove_row(edges, edges_cycle)      \n\n      new_edges = np.zeros((branch_cycle.shape[0], 2))\n      new_edges[:,0] = branch_cycle\n      new_edges[:,1] = intersect_node\n\n      if np.isin(intersect_node, branch_cycle):\n        idx = np.where(branch_cycle == intersect_node)\n        new_edges = np.delete(new_edges, idx, 0)\n\n      edges = np.concatenate((edges,new_edges), 0)\n\n  skeleton.vertices = nodes\n  skeleton.edges = edges.astype(np.uint32)\n  return skeleton\n\ndef path2edge(path):\n  \"\"\"\n  path: sequence of nodes\n\n  Returns: sequence separated into edges\n  \"\"\"\n  edges = np.zeros([len(path) - 1, 2], dtype=np.uint32)\n  edges[:,0] = path[0:-1]\n  edges[:,1] = path[1:]\n  return edges\n\ndef remove_row(array, rows2remove): \n  array.sort(axis=1, kind='quicksort')\n  if array.size == 0:\n    return array.astype(np.int32, copy=False)\n\n  rows2remove.sort(axis=1, kind='quicksort')\n\n  for i in range(rows2remove.shape[0]):  \n    idx = find_row(array,rows2remove[i,:])  \n    if np.sum(idx == -1) == 0: \n      array = 
np.delete(array, idx, axis=0) \n  \n  return array.astype(np.int32, copy=False)\n\ndef find_row(array, row): \n  \"\"\" \n  array: array to search for  \n  row: row to find  \n   Returns: row indices \n  \"\"\" \n  matches = (array[:,0] == row[0])\n  matches &= (array[:,1] == row[1])\n  idx = np.where(matches)\n  if len(idx) == 0:\n    return -1\n  return idx[0]\n"
  },
  {
    "path": "kimimaro/sharedmemory.py",
    "content": "from collections import defaultdict\nimport errno\nimport mmap\nimport os\nimport sys\nimport time\n\nimport multiprocessing as mp\n\nimport numpy as np\n\nfrom osteoid import Bbox, Vec\n\nfrom .utility import mkdir\n\nSHM_DIRECTORY = '/dev/shm/'\nEMULATED_SHM_DIRECTORY = '/tmp/kimimaro-shm'\n\nEMULATE_SHM = not os.path.isdir(SHM_DIRECTORY)\nPLATFORM_SHM_DIRECTORY = SHM_DIRECTORY if not EMULATE_SHM else EMULATED_SHM_DIRECTORY\n\nclass SharedMemoryReadError(Exception):\n  pass\n\nclass SharedMemoryAllocationError(Exception):\n  pass\n\ndef ndarray(shape, dtype, location, order='F', readonly=False, lock=None, **kwargs):\n  \"\"\"\n  Create a shared memory numpy array. \n  Lock is only necessary while doing multiprocessing on \n  platforms without /dev/shm type  shared memory as \n  filesystem emulation will be used instead.\n\n  Allocating the shared array requires cleanup on your part.\n  A shared memory file will be located at sharedmemory.PLATFORM_SHM_DIRECTORY + location\n  and must be unlinked when you're done. It will outlive the program.\n\n  You should also call .close() on the mmap file handle when done. 
However,\n  this is less of a problem because the operating system will close the\n  file handle on process termination.\n\n  Parameters:\n  shape: same as numpy.ndarray\n  dtype: same as numpy.ndarray\n  location: the shared memory filename \n  lock: (optional) multiprocessing.Lock\n\n  Returns: (mmap filehandle, shared ndarray)\n  \"\"\"\n  if EMULATE_SHM:\n    return ndarray_fs(\n      shape, dtype, location, lock, \n      readonly, order, emulate_shm=True, **kwargs\n    )\n  return ndarray_shm(shape, dtype, location, readonly, order, **kwargs)\n\ndef ndarray_fs(\n    shape, dtype, location, lock, \n    readonly=False, order='F', emulate_shm=False,\n    **kwargs\n  ):\n  \"\"\"Emulate shared memory using the filesystem.\"\"\"\n  dbytes = np.dtype(dtype).itemsize\n  nbytes = Vec(*shape).rectVolume() * dbytes\n\n  if emulate_shm:\n    directory = mkdir(EMULATED_SHM_DIRECTORY)\n    filename = os.path.join(directory, location)\n  else:\n    filename = location\n\n  if lock:\n    lock.acquire()\n\n  try:\n    allocate_shm_file(filename, nbytes, dbytes, readonly)\n  finally:\n    if lock:\n      lock.release()\n\n  with open(filename, 'r+b') as f:\n    array_like = mmap.mmap(f.fileno(), 0) # map entire file\n  \n  renderbuffer = np.ndarray(buffer=array_like, dtype=dtype, shape=shape, order=order, **kwargs)\n  renderbuffer.setflags(write=(not readonly))\n  return array_like, renderbuffer\n\ndef allocate_shm_file(filename, nbytes, dbytes, readonly):\n  try:\n    size = os.path.getsize(filename)\n    exists = True\n  except FileNotFoundError:\n    size = 0\n    exists = False\n\n  if readonly and not exists:\n    raise SharedMemoryReadError(filename + \" has not been allocated. 
Requested \" + str(nbytes) + \" bytes.\")\n  elif readonly and size != nbytes:\n    raise SharedMemoryReadError(\"{} exists, but the allocation size ({} bytes) does not match the request ({} bytes).\".format(\n      filename, size, nbytes\n    ))\n\n  if exists: \n    if size > nbytes:\n      with open(filename, 'wb') as f:\n        os.ftruncate(f.fileno(), nbytes)\n    elif size < nbytes:\n      # too small? just remake it below\n      os.unlink(filename) \n\n  exists = os.path.exists(filename)\n\n  if not exists:\n    # Previously we were writing out real files full of zeros, \n    # but a) that takes forever and b) modern OSes support sparse\n    # files (i.e. gigabytes of zeros that take up only a few real bytes).\n    #\n    # The following should take advantage of this functionality and be faster.\n    # It should work on Python 2.7 Unix, and Python 3.5+ on Unix and Windows.\n    #\n    # References:\n    #   https://stackoverflow.com/questions/8816059/create-file-of-particular-size-in-python\n    #   https://docs.python.org/3/library/os.html#os.ftruncate\n    #   https://docs.python.org/2/library/os.html#os.ftruncate\n    #\n    with open(filename, 'wb') as f:\n      os.ftruncate(f.fileno(), nbytes)\n\ndef ndarray_shm(shape, dtype, location, readonly=False, order='F', **kwargs):\n  \"\"\"Create a shared memory numpy array. Requires /dev/shm to exist.\"\"\"\n  import posix_ipc\n  from posix_ipc import O_CREAT\n  import psutil\n\n  nbytes = Vec(*shape).rectVolume() * np.dtype(dtype).itemsize\n  available = psutil.virtual_memory().available\n\n  preexisting = 0\n  # This might only work on Ubuntu\n  shmloc = os.path.join(SHM_DIRECTORY, location)\n  if os.path.exists(shmloc):\n    preexisting = os.path.getsize(shmloc)\n  elif readonly:\n    raise SharedMemoryReadError(shmloc + \" has not been allocated. 
Requested \" + str(nbytes) + \" bytes.\")\n\n  if readonly and preexisting != nbytes:\n    raise SharedMemoryReadError(\"{} exists, but the allocation size ({} bytes) does not match the request ({} bytes).\".format(\n      shmloc, preexisting, nbytes\n    ))\n\n  if (nbytes - preexisting) > available:\n    overallocated = nbytes - preexisting - available\n    overpercent = (100 * overallocated / (preexisting + available))\n    raise SharedMemoryAllocationError(\"\"\"\n      Requested more memory than is available. \n\n      Shared Memory Location:  {}\n\n      Shape:                   {}\n      Requested Bytes:         {} \n      \n      Available Bytes:         {} \n      Preexisting Bytes*:      {} \n\n      Overallocated Bytes*:    {} (+{:.2f}%)\n\n      * Preexisting is only correct on linux systems that support /dev/shm/\"\"\" \\\n        .format(location, shape, nbytes, available, preexisting, overallocated, overpercent))\n\n  # This might seem like we're being \"extra safe\" but consider\n  # a threading condition where the condition of the shared memory\n  # was adjusted between the check above and now. 
Better to make sure\n  # that we don't accidentally change anything if readonly is set.\n  flags = 0 if readonly else O_CREAT \n  size = 0 if readonly else int(nbytes) \n\n  try:\n    shared = posix_ipc.SharedMemory(location, flags=flags, size=size)\n    array_like = mmap.mmap(shared.fd, shared.size)\n    os.close(shared.fd)\n    renderbuffer = np.ndarray(buffer=array_like, dtype=dtype, shape=shape, order=order, **kwargs)\n  except OSError as err:\n    if err.errno == errno.ENOMEM: # Out of Memory\n      posix_ipc.unlink_shared_memory(location)      \n    raise\n\n  renderbuffer.setflags(write=(not readonly))\n  return array_like, renderbuffer\n\ndef unlink(location):\n  if EMULATE_SHM:\n    return unlink_fs(location)\n  return unlink_shm(location)\n\ndef unlink_shm(location):\n  import posix_ipc\n  try:\n    posix_ipc.unlink_shared_memory(location)\n  except posix_ipc.ExistentialError:\n    return False\n  return True\n\ndef unlink_fs(location):\n  directory = mkdir(EMULATED_SHM_DIRECTORY)\n  try:\n    filename = os.path.join(directory, location)\n    os.unlink(filename)\n    return True\n  except OSError:\n    return False\n"
  },
  {
    "path": "kimimaro/trace.py",
    "content": "\"\"\"\nSkeletonization algorithm based on TEASAR (Sato et al. 2000).\n\nAuthors: Alex Bae and Will Silversmith\nAffiliation: Seung Lab, Princeton Neuroscience Institue\nDate: June 2018 - Februrary 2025\n\nThis file is part of Kimimaro.\n\nKimimaro is free software: you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation, either version 3 of the License, or\n(at your option) any later version.\n\nKimimaro is distributed in the hope that it will be useful,\nbut WITHOUT ANY WARRANTY; without even the implied warranty of\nMERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\nGNU General Public License for more details.\n\nYou should have received a copy of the GNU General Public License\nalong with Kimimaro.  If not, see <https://www.gnu.org/licenses/>.\n\"\"\"\nfrom collections import defaultdict\nfrom math import log\n\nimport dijkstra3d\nimport edt\nimport fill_voids\nimport numpy as np\nfrom scipy import ndimage\n\nimport kimimaro.skeletontricks\n\nfrom osteoid import Skeleton\n\ndef trace(\n    labels, DBF, \n    scale=10, const=10, anisotropy=(1,1,1), \n    soma_detection_threshold=1100, \n    soma_acceptance_threshold=4000, \n    pdrf_scale=5000, pdrf_exponent=16,\n    soma_invalidation_scale=0.5,\n    soma_invalidation_const=0,\n    fix_branching=True,\n    manual_targets_before=[],\n    manual_targets_after=[],\n    root=None,\n    max_paths=None,\n    voxel_graph=None,\n  ):\n  \"\"\"\n  Given the euclidean distance transform of a label (\"Distance to Boundary Function\"), \n  convert it into a skeleton using an algorithm based on TEASAR. \n\n  DBF: Result of the euclidean distance transform. Must represent a single label,\n       assumed to be expressed in chosen physical units (i.e. 
nm)\n  scale: during the \"rolling ball\" invalidation phase, multiply the DBF value by this.\n  const: during the \"rolling ball\" invalidation phase, this is the minimum radius in chosen physical units (i.e. nm).\n  anisotropy: (x,y,z) conversion factor for voxels to chosen physical units (i.e. nm)\n  soma_detection_threshold: if object has a DBF value larger than this, \n    root will be placed at largest DBF value and special one time invalidation\n    will be run over that root location (see soma_invalidation scale)\n    expressed in chosen physical units (i.e. nm) \n  pdrf_scale: scale factor in front of dbf, used to weight dbf over euclidean distance (higher to pay more attention to dbf) (default 5000)\n  pdrf_exponent: exponent in dbf formula on distance from edge, faster if factor of 2 (default 16)\n  soma_invalidation_scale: the 'scale' factor used in the one time soma root invalidation (default .5)\n  soma_invalidation_const: the 'const' factor used in the one time soma root invalidation (default 0)\n                           (units in chosen physical units (i.e. nm))\n  fix_branching: When enabled, zero out the graph edge weights traversed by \n    of previously found paths. This causes branch points to occur closer to \n    the actual path divergence. However, there is a large performance penalty\n    associated with this as dijkstra's algorithm is computed once per a path\n    rather than once per a skeleton.\n  manual_targets_before: list of (x,y,z) that correspond to locations that must \n    have paths drawn to. Used for specifying root and border targets for\n    merging adjacent chunks out-of-core. Targets are applied before ordinary\n    target selection.\n  manual_targets_after: Same as manual_targets_before but the additional \n    targets are applied after the usual algorithm runs. 
The current \n    invalidation status of the shape makes no difference.\n  max_paths: If a label requires drawing this number of paths or more,\n    abort and move onto the next label.\n  root: If you want to force the root to be a particular voxel, you can\n    specify it here.\n  voxel_graph: a connection graph that defines permissible \n    directions of motion between voxels. This is useful for\n    dealing with self-touches. The graph is defined by the\n    conventions used in cc3d.voxel_connectivity_graph \n    (https://github.com/seung-lab/connected-components-3d/blob/3.2.0/cc3d_graphs.hpp#L73-L92)\n\n  Based on the algorithm by:\n\n  M. Sato, I. Bitter, M. Bender, A. Kaufman, and M. Nakajima. \n  \"TEASAR: tree-structure extraction algorithm for accurate and robust skeletons\"  \n    Proc. the Eighth Pacific Conference on Computer Graphics and Applications. Oct. 2000.\n    doi:10.1109/PCCGA.2000.883951 (https://ieeexplore.ieee.org/document/883951/)\n\n  Returns: Skeleton object\n  \"\"\"\n  dbf_max = np.max(DBF)\n  labels = np.asfortranarray(labels)\n  DBF = np.asfortranarray(DBF)\n\n  soma_mode = False\n  # > 5000 nm, gonna be a soma or blood vessel\n  # For somata: specially handle the root by \n  # placing it at the approximate center of the soma\n  if dbf_max > soma_detection_threshold:\n    labels, num_voxels_filled = fill_voids.fill(labels, in_place=True, return_fill_count=True)\n    if num_voxels_filled > 0:\n      del DBF\n      DBF = edt.edt(\n        labels, \n        anisotropy=anisotropy, \n        black_border=np.all(labels),\n        voxel_graph=voxel_graph,\n      )\n    dbf_max = np.max(DBF) \n    soma_mode = dbf_max > soma_acceptance_threshold\n\n  soma_radius = 0.0\n\n  if soma_mode:\n    if root is not None:\n      manual_targets_before.insert(0, root)\n    root = find_soma_root(DBF, dbf_max)    \n    soma_radius = dbf_max * soma_invalidation_scale + soma_invalidation_const\n  elif root is None:\n    root = find_root(labels, anisotropy, 
voxel_graph)\n  \n  if root is None:\n    return Skeleton()\n \n  free_space_radius = 0 if not soma_mode else DBF[root]\n  # DBF: Distance to Boundary Field\n  # DAF: Distance from any voxel Field (distance from root field)\n  # PDRF: Penalized Distance from Root Field\n  DBF = kimimaro.skeletontricks.zero2inf(DBF) # DBF[ DBF == 0 ] = np.inf\n  DAF, target = dijkstra3d.euclidean_distance_field(\n    labels, root, \n    anisotropy=anisotropy, \n    free_space_radius=free_space_radius,\n    voxel_graph=voxel_graph,\n    return_max_location=True,\n  )\n  DAF = kimimaro.skeletontricks.inf2zero(DAF) # DAF[ DAF == np.inf ] = 0\n  target_finder = kimimaro.skeletontricks.CachedTargetFinder(labels, DAF)\n  PDRF = compute_pdrf(dbf_max, pdrf_scale, pdrf_exponent, DBF, DAF, DAF[target])\n  del DAF\n\n  # Use dijkstra propagation w/o a target to generate a field of\n  # pointers from each voxel to its parent. Then we can rapidly\n  # compute multiple paths by simply hopping pointers using path_from_parents\n  if not fix_branching:\n    parents = dijkstra3d.parental_field(PDRF, root, voxel_graph=voxel_graph)\n    del PDRF\n  else:\n    parents = PDRF\n\n  if soma_mode:\n    invalidated, labels = kimimaro.skeletontricks.roll_invalidation_ball_inside_component(\n      labels, DBF, \n      soma_invalidation_scale,\n      soma_invalidation_const, \n      anisotropy,\n      [root],\n      voxel_connectivity_graph=voxel_graph,\n    )\n  # This target is only valid if no \n  # invalidations have occurred yet.\n  elif len(manual_targets_before) == 0:\n    manual_targets_before.append(target)\n  \n  paths = compute_paths(\n    root, labels, DBF, target_finder, \n    parents, scale, const, anisotropy, \n    soma_mode, soma_radius, fix_branching,\n    manual_targets_before, manual_targets_after, \n    max_paths, voxel_graph\n  )\n\n  skel = Skeleton.simple_merge(\n    [ Skeleton.from_path(path) for path in paths if len(path) > 0 ]\n  ).consolidate()\n\n  verts = 
skel.vertices.flatten().astype(np.uint32)\n  skel.radii = DBF[verts[::3], verts[1::3], verts[2::3]]\n  skel.transform = np.array([\n    [anisotropy[0], 0, 0, 0],\n    [0, anisotropy[1], 0, 0],\n    [0, 0, anisotropy[2], 0],\n  ], dtype=np.float32)\n\n  return skel\n\ndef compute_paths(\n    root, labels, DBF, target_finder, \n    parents, scale, const, anisotropy, \n    soma_mode, soma_radius, fix_branching,\n    manual_targets_before, manual_targets_after,\n    max_paths, voxel_graph\n  ):\n  \"\"\"\n  Given the labels, DBF, DAF, dijkstra parents,\n  and associated invalidation knobs, find the set of paths \n  that cover the object. Somas are given special treatment\n  in that we attempt to cull vertices within a radius of the\n  root vertex.\n  \"\"\"\n  paths = []\n  valid_labels = np.count_nonzero(labels)\n  root = tuple(root)\n\n  if max_paths is None:\n    max_paths = valid_labels\n\n  if len(manual_targets_before) + len(manual_targets_after) >= max_paths:\n    return []\n\n  parents[tuple(root)] = 0 # provide initial rail for dijkstra.railroad\n\n  while (valid_labels > 0 or manual_targets_before or manual_targets_after) \\\n    and len(paths) < max_paths:\n\n    if manual_targets_before:\n      target = manual_targets_before.pop()\n    elif valid_labels == 0:\n      target = manual_targets_after.pop()\n    else:\n      target = target_finder.find_target(labels)\n\n    if fix_branching:\n      # Draw a path (a \"road\") from the target to the nearest zero weighted\n      # path (a \"rail\"). This has some minor efficiencies vs drawing\n      # from a target all the way to the source. 
Also, target -> source\n      # is much more efficient than source -> target for three reasons.\n      # (a) target -> catches a rail instead of exploring all rails\n      # (b) target has a natural edge effect that restrict exploration\n      # (c) in soma, target -> source follows gradients vs fights them\n      path = dijkstra3d.railroad(\n        parents, target, voxel_graph=voxel_graph\n      )\n    else:\n      path = dijkstra3d.path_from_parents(parents, target)\n    \n    if soma_mode:\n      dist_to_soma_root = np.linalg.norm(anisotropy * (path - root), axis=1)\n      # remove all path points which are within soma_radius of root\n      path = np.concatenate(\n        (path[:1,:], path[dist_to_soma_root > soma_radius, :])\n      )\n\n    if valid_labels > 0:\n      invalidated, labels = kimimaro.skeletontricks.roll_invalidation_ball_inside_component(\n        labels, DBF, scale, const, \n        anisotropy, path,\n        voxel_connectivity_graph=voxel_graph,\n      )      \n      valid_labels -= invalidated\n\n    for vertex in path:\n      if fix_branching:\n        parents[tuple(vertex)] = 0.0\n\n    paths.append(path)\n\n  return paths\n\ndef find_soma_root(DBF, dbf_max):\n  \"\"\"\n  This perhaps overcomplicates things, but it's possible,\n  for example in a rectangular cuboid, for there to be\n  many multiple maxima at the center of a shape. 
We pick\n  the one closest to the centroid of the shape to ensure\n  the choice is sensible.\n\n  Returns: (x,y,z) as integers\n  \"\"\"\n  maxima = (DBF == dbf_max)\n  com = ndimage.measurements.center_of_mass(maxima)\n  com = np.asarray(com, dtype=np.float32)\n  \n  coords = np.where(maxima)\n  coords = np.vstack( coords ).T\n  root = np.argmin(\n    np.sum((coords - com) ** 2, axis=1)\n  )\n\n  return tuple(coords[root].astype(np.uint32))\n\ndef find_root(labels, anisotropy, voxel_graph):\n  \"\"\"\n  \"4.4 DAF:  Compute distance from any voxel field\"\n  Compute DAF, but we immediately convert to the PDRF\n  The extremal point of the PDRF is a valid root node\n  even if the DAF is computed from an arbitrary pixel.\n  \"\"\"\n  any_voxel = kimimaro.skeletontricks.first_label(labels)   \n  if any_voxel is None: \n    return None\n\n  DAF, target = dijkstra3d.euclidean_distance_field(\n    labels, any_voxel, \n    anisotropy=anisotropy,\n    return_max_location=True,\n    voxel_graph=voxel_graph,\n  )\n  return target\n\ndef is_power_of_two(num):\n  if int(num) != num:\n    return False\n  return num != 0 and ((num & (num - 1)) == 0)\n\ndef compute_pdrf(\n  dbf_max, pdrf_scale, \n  pdrf_exponent, DBF, DAF,\n  max_daf\n):\n  \"\"\"\n  Add p(v) to the DAF (pp. 
4, section 4.5)\n  \"4.5 PDRF: Compute penalized distance from root voxel field\"\n  Let M > max(DBF)\n  p(v) = 5000 * (1 - DBF(v) / M)^16\n  5000 is chosen to allow skeleton segments to be up to 3000 voxels\n  long without exceeding floating point precision.\n\n  IMPLEMENTATION NOTE: \n  Apparently repeated *= is much faster than \"** f(16)\" \n  12,740.0 microseconds vs 4 x 560 = 2,240 microseconds (5.69x)\n\n  More clearly written:\n  PDRF = DAF + 5000 * ((1 - DBF * M) ** 16)\n  \"\"\"\n  f = lambda x: np.float32(x)\n  M = f( 1 / (dbf_max ** 1.01) )\n\n  # First branch is much faster than ** which presumably\n  # uses logarithms to do the exponentiation.\n  PDRF = np.empty(DBF.shape, dtype=np.float32, order=\"F\")\n  np.multiply(DBF, M, out=PDRF)\n  np.subtract(f(1), PDRF, out=PDRF)\n  if is_power_of_two(pdrf_exponent) and (pdrf_exponent < (2 ** 16)):\n    for _ in range(int(np.log2(pdrf_exponent))):\n      PDRF *= PDRF # ^pdrf_exponent\n  else: \n    np.power(PDRF, pdrf_exponent, out=PDRF)\n\n  PDRF *= f(pdrf_scale)\n\n  # provide trickle of gradient so open spaces don't collapse\n  if max_daf != 0:\n    DAF *= (1 / max_daf)\n    PDRF += DAF\n\n  return np.asfortranarray(PDRF)\n\ndef point_to_point(\n  binary_img, start, end,\n  anisotropy=(1,1,1), \n  pdrf_scale=100000, \n  pdrf_exponent=4,\n):\n  \"\"\"\n  Trace a single centerline path from \n  start to end.\n  \"\"\"\n  DBF = edt.edt(\n    binary_img, \n    anisotropy=anisotropy,\n    black_border=True,\n  )\n  dbf_max = np.max(DBF)\n\n  DBF = kimimaro.skeletontricks.zero2inf(DBF) # DBF[ DBF == 0 ] = np.inf\n  DAF, target = dijkstra3d.euclidean_distance_field(\n    binary_img, start, \n    anisotropy=anisotropy,\n    return_max_location=True,\n  )\n  DAF = kimimaro.skeletontricks.inf2zero(DAF) # DAF[ DAF == np.inf ] = 0\n  PDRF = compute_pdrf(dbf_max, pdrf_scale, pdrf_exponent, DBF, DAF, DAF[target])\n  del DAF\n\n  path = dijkstra3d.dijkstra(PDRF, end, start)\n  skel = Skeleton.from_path(path)\n\n  verts 
= skel.vertices.flatten().astype(np.uint32)\n  skel.radii = DBF[verts[::3], verts[1::3], verts[2::3]]\n  return skel\n"
  },
  {
    "path": "kimimaro/utility.py",
    "content": "from typing import Dict, Union, List, Tuple, Optional\n\nfrom collections import defaultdict\nimport copy\nimport os\n\nimport numpy as np\nimport numpy.typing as npt\nimport scipy.ndimage\nfrom tqdm import tqdm\n\nfrom osteoid import Skeleton, Bbox, Vec\n\nimport kimimaro.skeletontricks\n\nimport cc3d\nfrom crackle import CrackleArray\nimport dijkstra3d\nimport fastremap\nimport fill_voids\nimport xs3d\n\nXS_PROP = {\n  \"id\": \"cross_sectional_area\",\n  \"data_type\": \"float32\",\n  \"num_components\": 1,\n}\n\nXS_CONTACT_PROP = {\n  \"id\": \"cross_sectional_area_contacts\",\n  \"data_type\": \"uint8\",\n  \"num_components\": 1,  \n}\n\ndef toabs(path):\n  path = os.path.expanduser(path)\n  return os.path.abspath(path)\n\ndef mkdir(path):\n  path = toabs(path)\n\n  try:\n    if path != '' and not os.path.exists(path):\n      os.makedirs(path)\n  except OSError as e:\n    if e.errno == 17: # File Exists\n      time.sleep(0.1)\n      return mkdir(path)\n    else:\n      raise\n\n  return path\n\ndef extract_skeleton_from_binary_image(image):\n  verts, edges = kimimaro.skeletontricks.extract_edges_from_binary_image(image)\n  return Skeleton(verts, edges)\n\ndef compute_cc_labels(all_labels, voxel_graph = None):\n  if isinstance(all_labels, CrackleArray):\n    if voxel_graph is not None:\n      all_labels = all_labels[:]\n    else:\n      return all_labels.connected_components(\n        connectivity=26,\n        memory_target=int(500e6), \n        return_mapping=True,\n      )\n\n  tmp_labels = all_labels\n  if np.dtype(all_labels.dtype).itemsize > 1:\n    tmp_labels, remapping = fastremap.renumber(all_labels, in_place=False)\n\n  if voxel_graph is not None:\n    cc_labels = cc3d.color_connectivity_graph(voxel_graph, connectivity=26)\n    cc_labels *= all_labels > 0\n  else:\n    cc_labels = cc3d.connected_components(tmp_labels)\n  \n  cc_labels = fastremap.refit(cc_labels)\n\n  del tmp_labels\n  remapping = 
kimimaro.skeletontricks.get_mapping(all_labels, cc_labels) \n  return cc_labels, remapping\n\ndef find_objects(labels):\n  \"\"\"  \n  scipy.ndimage.find_objects performs about 7-8x faster on C \n  ordered arrays, so we just do it that way and convert\n  the results if it's in F order.\n  \"\"\"\n  if isinstance(labels, CrackleArray):\n    bbxes = labels.bounding_boxes()\n    bbxes.pop(0)\n    result = list(bbxes.items())\n    result.sort(key=lambda x: x[0])\n    return [ x[1] for x in result ]\n\n  if labels.flags['C_CONTIGUOUS']:\n    return scipy.ndimage.find_objects(labels)\n  else:\n    all_slices = scipy.ndimage.find_objects(labels.T)\n    return [ (slcs and slcs[::-1]) for slcs in all_slices ]    \n\ndef add_property(skel, prop):\n  needs_prop = True\n  for skel_prop in skel.extra_attributes:\n    if skel_prop[\"id\"] == prop[\"id\"]:\n      needs_prop = False\n      break\n\n  if needs_prop:\n    skel.extra_attributes.append(prop)\n\ndef shape_iterator(all_labels, skeletons, fill_holes, in_place, progress, fn):\n  iterator = skeletons\n  if type(skeletons) == dict:\n    iterator = skeletons.values()\n    total = len(skeletons)\n  elif hasattr(skeletons, \"vertices\"):\n    iterator = [ skeletons ]\n    total = 1\n  else:\n    total = len(skeletons)\n\n  if all_labels.dtype == bool:\n    remapping = { True: 1, False: 0, 1:1, 0:0 }\n  else:\n    all_labels, remapping = fastremap.renumber(all_labels, in_place=in_place)\n\n  all_slices = find_objects(all_labels)\n\n  with tqdm(iterator, desc=\"Labels\", disable=(not progress), total=total) as pbar:\n    for skel in pbar:\n      if all_labels.dtype == bool:\n        label = 1\n      else:\n        label = skel.id\n\n      pbar.set_postfix(label=str(label))\n\n      if label == 0:\n        continue\n\n      if label not in remapping:\n        continue\n\n      label = remapping[label]\n      slices = all_slices[label - 1]\n      if slices is None:\n        continue\n\n      roi = Bbox.from_slices(slices)\n      
if roi.volume() <= 1:\n        continue\n\n      roi.grow(1)\n      roi.minpt = Vec.clamp(roi.minpt, Vec(0,0,0), roi.maxpt)\n      slices = roi.to_slices()\n\n      binimg = np.asfortranarray(all_labels[slices] == label)\n      if fill_holes:\n        binimg = fill_voids.fill(binimg, in_place=True)\n\n      fn(skel, binimg, roi)\n\n  return iterator\n\ndef cross_sectional_area_single(\n  binimg:npt.NDArray[np.bool_], \n  skel:Skeleton, \n  roi:Optional[Bbox] = None,\n  anisotropy:npt.NDArray[np.float32] = np.array([1,1,1], dtype=np.float32),\n  smoothing_window:int = 1,\n  progress:bool = False,\n  in_place:bool = False,\n  multipass:bool = False,\n  repair_contacts:bool = False,\n  visualize_section_planes:bool = False,\n  step:int = 1,\n) -> Skeleton:\n  \"\"\"\n  Analyze the cross sectional area for a single skeleton given \n  an overlapping binary image. For many skeletons at once, \n  use cross_sectional_area which may be faster.\n\n  When the smoothing_window is >1, these plane normal \n  vectors will be smoothed with a rolling average. This\n  is useful since there can be high frequency\n  oscillations in the skeleton.\n\n  This function will add the following attributes to\n  each skeleton provided.\n\n  skel.cross_sectional_area: float32 array of cross \n    sectional area per a vertex.\n\n  skel.cross_sectional_area_contacts: uint8 array\n    where non-zero entries indicate that the image\n    border was contacted during the cross section\n    computation, indicating a possible underestimate.\n\n    The first six bits are a bitfield xxyyzz that\n    tell you which image faces were touched and\n    alternate from low (0) to high (size-1).\n\n  multipass: When True, preserve existing cross_sectional_area\n    and contact values and allow values with zero to be recalculated.\n    This is useful for example, when using a large skeleton with\n    different sections of an image. 
Very similar to repair_contacts,\n    except that any vertex can be considered, not just contacts.\n\n  repair_contacts: When True, only examine vertices\n    that have a nonzero value for \n    skel.cross_sectional_area_contacts. This is intended\n    to be used as a second pass after widening the image.\n\n  visualize_section_planes: For debugging, paint section planes\n    and display them using microviewer.\n\n  step: when > 1, skip (step-1) vertices. This can be used to\n    go faster. These days, evaluating a single vertex takes \n    between a few hundred microseconds to a few thousand microseconds.\n      example calculation: \n      1 msec x 100,000 vertices = 100 sec\n      A neuron I recently examined had over 300,000 vertices across \n      the entire dataset.\n      Kimimaro's benchmark task produced 622,293 vertices over 1667 objects \n      using reasonable parameters and took a little over 4 minutes on an M3 \n      processor (or about 2.5 msec/vertex). The most expensive shape was the soma.\n  \"\"\"\n  assert step > 0\n  assert smoothing_window > 0\n\n  cross_sections = None\n  if visualize_section_planes:\n    cross_sections = np.zeros(binimg.shape, dtype=np.uint32, order=\"F\")\n\n  if skel.space == \"physical\":\n    all_verts = (skel.vertices / anisotropy).round().astype(int)\n  else:\n    all_verts = np.copy(skel.vertices)\n\n  if roi is not None:\n    all_verts -= roi.minpt\n\n  mapping = { tuple(v): i for i, v in enumerate(all_verts) }\n\n  visited = np.zeros([ all_verts.shape[0] ], dtype=bool)\n\n  if repair_contacts or (multipass and hasattr(skel, \"cross_sectional_area\")):\n    areas = skel.cross_sectional_area\n    contacts = skel.cross_sectional_area_contacts\n  else:\n    areas = np.zeros([all_verts.shape[0]], dtype=np.float32)\n    contacts = np.zeros([all_verts.shape[0]], dtype=np.uint8)\n\n  branch_pts = set(skel.branches())\n  branch_pt_vals = defaultdict(list)\n\n  paths = skel.paths()\n\n  normal = np.array([1,0,0], 
dtype=np.float32)\n\n  shape = np.array(binimg.shape)\n\n  try:\n    xs3d.set_shape(binimg)\n    \n    for path in tqdm(paths, disable=(not progress), desc=\"Cross Section Analysis Paths\"):\n      if skel.space == \"physical\":\n        path = (path / anisotropy).round().astype(int)\n      if roi is not None:\n        path -= roi.minpt\n\n      normals = (path[1:] - path[:-1]).astype(np.float32)\n      normals = np.concatenate([ normals, [normals[-1]] ])\n\n      # Running the filter in the forward and then backwards\n      # direction eliminates phase shift.\n      normals = moving_average(normals, smoothing_window)\n      normals = moving_average(normals[::-1], smoothing_window)[::-1]\n\n      normals /= np.linalg.norm(normals, axis=1, keepdims=True)   \n\n      end_i = len(path) - 1\n      ct = 0\n\n      for i, vert in enumerate(path):\n        ct += 1\n\n        if ct < step and not (i == 0 or i == end_i):\n          continue\n        elif ct == step:\n          ct = 0\n\n        if ( \n             (vert[0] < 0) \n          or (vert[0] >= shape[0])\n          or (vert[1] < 0) \n          or (vert[1] >= shape[1])\n          or (vert[2] < 0) \n          or (vert[2] >= shape[2])\n        ):\n          continue\n\n        idx = mapping[tuple(vert)]\n        normal = normals[i]\n\n        if (\n          areas[idx] == 0 \n          or (idx in branch_pts) \n          or (repair_contacts and contacts[idx] > 0 and not visited[idx])\n        ):\n          visited[idx] = True\n          areas[idx], contact = xs3d.cross_sectional_area(\n            binimg, vert, \n            normal, anisotropy,\n            return_contact=True,\n            use_persistent_data=True,\n          )\n          if repair_contacts:\n            contacts[idx] = contact\n          else:\n            contacts[idx] |= contact # accumulate for branch points\n          if idx in branch_pts:\n            branch_pt_vals[idx].append(areas[idx])\n          if visualize_section_planes:\n            
img = xs3d.cross_section(\n              binimg, vert, \n              normal, anisotropy,\n            )\n            cross_sections[img > 0] = idx\n  finally:\n    xs3d.clear_shape()\n\n  if visualize_section_planes:\n    import microviewer\n    microviewer.view(cross_sections, seg=True)\n\n  for idx, vals in branch_pt_vals.items():\n    areas[idx] = sum(vals) / len(vals)\n\n  skel.cross_sectional_area = areas\n  skel.cross_sectional_area_contacts = contacts\n\n  add_property(skel, XS_PROP)\n  add_property(skel, XS_CONTACT_PROP)\n\n  return skel\n\ndef cross_sectional_area(\n  all_labels:np.ndarray, \n  skeletons:Union[Dict[int,Skeleton],List[Skeleton],Skeleton],\n  anisotropy:np.ndarray = np.array([1,1,1], dtype=np.float32),\n  smoothing_window:int = 1,\n  progress:bool = False,\n  in_place:bool = False,\n  fill_holes:bool = False,\n  multipass:bool = False,\n  repair_contacts:bool = False,\n  visualize_section_planes:bool = False,\n  step:int = 1,\n) -> Union[Dict[int,Skeleton],List[Skeleton],Skeleton]:\n  \"\"\"\n  Given a set of skeletons, find the cross sectional area\n  for each vertex indicated by the sectioning plane\n  defined by the vector pointing to the next vertex.\n\n  When the smoothing_window is >1, these plane normal \n  vectors will be smoothed with a rolling average. 
This\n  is useful since there can be high frequency\n  oscillations in the skeleton.\n\n  This function will add the following attributes to\n  each skeleton provided.\n\n  skel.cross_sectional_area: float32 array of cross \n    sectional area per a vertex.\n\n  skel.cross_sectional_area_contacts: uint8 array\n    where non-zero entries indicate that the image\n    border was contacted during the cross section\n    computation, indicating a possible underestimate.\n\n    The first six bits are a bitfield xxyyzz that\n    tell you which image faces were touched and\n    alternate from low (0) to high (size-1).\n\n  multipass: When True, preserve existing cross_sectional_area\n    and contact values and allow values with zero to be recalculated.\n    This is useful for example, when using a large skeleton with\n    different sections of an image. Very similar to repair_contacts,\n    except that any vertex can be considered, not just contacts.\n\n  repair_contacts: When True, only examine vertices\n    that have a nonzero value for \n    skel.cross_sectional_area_contacts. This is intended\n    to be used as a second pass after widening the image.\n\n  visualize_section_planes: For debugging, paint section planes\n    and display them using microviewer.\n\n  step: when > 1, skip (step-1) vertices. This can be used to\n    go faster. These days, evaluating a single vertex takes \n    between a few hundred microseconds to a few thousand microseconds.\n      example calculation: \n      1 msec x 100,000 vertices = 100 sec\n      A neuron I recently examined had over 300,000 vertices across \n      the entire dataset.\n      Kimimaro's benchmark task produced 622,293 vertices over 1667 objects \n      using reasonable parameters and took a little over 4 minutes on an M3 \n      processor (or about 2.5 msec/vertex). 
The most expensive shape was the soma.\n  \"\"\"\n  assert step > 0\n  assert smoothing_window > 0\n\n  def cross_sectional_area_helper(skel, binimg, roi):\n    cross_sections = None\n    if visualize_section_planes:\n      cross_sections = np.zeros(binimg.shape, dtype=np.uint32, order=\"F\")\n\n    if skel.space == \"physical\":\n      all_verts = (skel.vertices / anisotropy).round().astype(int)\n    else:\n      all_verts = np.copy(skel.vertices)\n\n    all_verts -= roi.minpt\n\n    mapping = { tuple(v): i for i, v in enumerate(all_verts) }\n\n    visited = np.zeros([ all_verts.shape[0] ], dtype=bool)\n\n    if repair_contacts or (multipass and hasattr(skel, \"cross_sectional_area\")):\n      areas = skel.cross_sectional_area\n      contacts = skel.cross_sectional_area_contacts\n    else:\n      areas = np.zeros([all_verts.shape[0]], dtype=np.float32)\n      contacts = np.zeros([all_verts.shape[0]], dtype=np.uint8)\n\n    branch_pts = set(skel.branches())\n    branch_pt_vals = defaultdict(list)\n\n    paths = skel.paths()\n\n    normal = np.array([1,0,0], dtype=np.float32)\n\n    shape = np.array(binimg.shape)\n\n    for path in paths:\n      if skel.space == \"physical\":\n        path = (path / anisotropy).round().astype(int)\n      path -= roi.minpt\n\n      normals = (path[1:] - path[:-1]).astype(np.float32)\n      normals = np.concatenate([ normals, [normals[-1]] ])\n\n      # Running the filter in the forward and then backwards\n      # direction eliminates phase shift.\n      normals = moving_average(normals, smoothing_window)\n      normals = moving_average(normals[::-1], smoothing_window)[::-1]\n\n      normals /= np.linalg.norm(normals, axis=1, keepdims=True)   \n\n      end_i = len(path) - 1\n      ct = 0\n\n      for i, vert in enumerate(path):\n        ct += 1\n\n        if ct < step and not (i == 0 or i == end_i):\n          continue\n        elif ct == step:\n          ct = 0\n\n        if ( \n             (vert[0] < 0) \n          or (vert[0] >= 
shape[0])\n          or (vert[1] < 0) \n          or (vert[1] >= shape[1])\n          or (vert[2] < 0) \n          or (vert[2] >= shape[2])\n        ):\n          continue\n\n        idx = mapping[tuple(vert)]\n        normal = normals[i]\n\n        if (\n          areas[idx] == 0 \n          or (idx in branch_pts) \n          or (repair_contacts and contacts[idx] > 0 and not visited[idx])\n        ):\n          visited[idx] = True\n          areas[idx], contact = xs3d.cross_sectional_area(\n            binimg, vert, \n            normal, anisotropy,\n            return_contact=True,\n            use_persistent_data=True,\n          )\n          if repair_contacts:\n            contacts[idx] = contact\n          else:\n            contacts[idx] |= contact # accumulate for branch points\n          if idx in branch_pts:\n            branch_pt_vals[idx].append(areas[idx])\n          if visualize_section_planes:\n            img = xs3d.cross_section(\n              binimg, vert, \n              normal, anisotropy,\n            )\n            cross_sections[img > 0] = idx\n\n    if visualize_section_planes:\n      import microviewer\n      microviewer.view(cross_sections, seg=True)\n\n    for idx, vals in branch_pt_vals.items():\n      areas[idx] = sum(vals) / len(vals)\n\n    skel.cross_sectional_area = areas\n    skel.cross_sectional_area_contacts = contacts\n\n  try:\n    xs3d.set_shape(all_labels)\n    if isinstance(all_labels, CrackleArray):\n      bboxes = all_labels.bounding_boxes()\n      iterator = tqdm(\n        all_labels.each(crop=True, labels=list(skeletons.keys())),\n        disable=(not progress),\n        desc=\"Cross Section Analysis Paths\"\n      )\n      for label, binimg in iterator:\n        slc = Bbox.from_slices(bboxes[label])\n        cross_sectional_area_helper(skeletons[label], binimg, slc)\n    else:\n      shape_iterator(\n        all_labels, skeletons, \n        fill_holes, in_place, progress, \n        cross_sectional_area_helper\n      
)\n  finally:\n    xs3d.clear_shape()\n\n  if hasattr(skeletons, \"vertices\"):\n    skelitr = [ skeletons ]\n  elif isinstance(skeletons, dict):\n    skelitr = skeletons.values()\n  else:\n    skelitr = iter(skeletons)\n\n  for skel in skelitr:\n    add_property(skel, XS_PROP)\n    add_property(skel, XS_CONTACT_PROP)\n\n    if not hasattr(skel, \"cross_sectional_area\"):\n      skel.cross_sectional_area = np.full(len(skel.vertices), -1, dtype=np.float32, order=\"F\")\n    if not hasattr(skel, \"cross_sectional_area_contacts\"):\n      skel.cross_sectional_area_contacts = np.zeros(len(skel.vertices), dtype=np.uint8, order=\"F\")\n\n  return skeletons\n\ndef oversegment(\n  all_labels:np.ndarray, \n  skeletons:Union[Dict[int,Skeleton],List[Skeleton],Skeleton],\n  anisotropy:np.ndarray = np.array([1,1,1], dtype=np.float32),\n  progress:bool = False,\n  fill_holes:bool = False,\n  in_place:bool = False,\n  downsample:int = 0,\n) -> Tuple[np.ndarray, Union[Dict[int,Skeleton],List[Skeleton],Skeleton]]:\n  \"\"\"\n  Use skeletons to create an oversegmentation of a pre-existing set\n  of labels. This is useful for proofreading systems that work by merging\n  labels.\n\n  For each skeleton, get the feature map from its euclidean distance\n  field. 
The final image is the composite of all these feature maps\n  numbered from 1.\n\n  Each skeleton will have a new property skel.segments that associates\n  a label to each vertex.\n  \"\"\"\n  prop = {\n    \"id\": \"segments\",\n    \"data_type\": \"uint64\",\n    \"num_components\": 1,\n  }\n\n  skeletons = copy.deepcopy(skeletons)\n\n  # Initialize segments attribute for all skeletons\n  if hasattr(skeletons, \"vertices\"):\n    skeleton_list = [skeletons]\n  elif isinstance(skeletons, dict):\n    skeleton_list = list(skeletons.values())\n  else:\n    skeleton_list = skeletons\n    \n  all_features = np.zeros(all_labels.shape, dtype=np.uint64, order=\"F\")\n  next_label = 0\n\n  def oversegment_helper(skel, binimg, roi):\n    nonlocal next_label\n    nonlocal all_features\n\n    segment_skel = skel\n    if downsample > 0:\n      segment_skel = skel.downsample(downsample)\n\n    vertices = (segment_skel.vertices / anisotropy).round().astype(int)\n    vertices -= roi.minpt\n\n    field, feature_map = dijkstra3d.euclidean_distance_field(\n      binimg, vertices, \n      anisotropy=anisotropy, \n      return_feature_map=True\n    )\n    del field\n\n    add_property(skel, prop)\n\n    # Fortran order efficient version of:\n    # feature_map[binimg] += next_label\n\n    flat_binary_image = binimg.ravel('F')\n    flat_feature_map = feature_map.ravel('F')\n    flat_feature_map[flat_binary_image] += next_label\n    \n    next_label += vertices.shape[0]\n    all_features[roi.to_slices()] += feature_map\n\n  # iterator is an iterable list of skeletons, not the shape iterator\n  iterator = shape_iterator(\n    all_labels, skeletons, fill_holes, in_place, progress, \n    oversegment_helper\n  )\n\n  all_features, mapping = fastremap.renumber(all_features)\n  \n  for skel in skeleton_list:\n    vertices = (skel.vertices / anisotropy).round().astype(int)\n    skel.segments = all_features[vertices[:,0], vertices[:,1], vertices[:,2]]\n\n  return all_features, skeletons\n\n# 
From SO: https://stackoverflow.com/questions/14313510/how-to-calculate-rolling-moving-average-using-python-numpy-scipy\ndef moving_average(a:np.ndarray, n:int, mode:str = \"symmetric\") -> np.ndarray:\n  if n <= 0:\n    raise ValueError(f\"Window size ({n}), must be >= 1.\")\n  elif n == 1:\n    return a\n\n  if len(a) == 0:\n    return a\n\n  if a.ndim == 2:\n    a = np.pad(a, [[n, n],[0,0]], mode=mode)\n  else:\n    a = np.pad(a, [n, n], mode=mode)\n\n  ret = np.cumsum(a, dtype=float, axis=0)\n  ret = (ret[n:] - ret[:-n])[:-n]\n  ret /= float(n)\n  return ret\n\n"
  },
  {
    "path": "kimimaro_cli/LICENSE",
    "content": "                    GNU GENERAL PUBLIC LICENSE\n                       Version 3, 29 June 2007\n\n Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>\n Everyone is permitted to copy and distribute verbatim copies\n of this license document, but changing it is not allowed.\n\n                            Preamble\n\n  The GNU General Public License is a free, copyleft license for\nsoftware and other kinds of works.\n\n  The licenses for most software and other practical works are designed\nto take away your freedom to share and change the works.  By contrast,\nthe GNU General Public License is intended to guarantee your freedom to\nshare and change all versions of a program--to make sure it remains free\nsoftware for all its users.  We, the Free Software Foundation, use the\nGNU General Public License for most of our software; it applies also to\nany other work released this way by its authors.  You can apply it to\nyour programs, too.\n\n  When we speak of free software, we are referring to freedom, not\nprice.  Our General Public Licenses are designed to make sure that you\nhave the freedom to distribute copies of free software (and charge for\nthem if you wish), that you receive source code or can get it if you\nwant it, that you can change the software or use pieces of it in new\nfree programs, and that you know you can do these things.\n\n  To protect your rights, we need to prevent others from denying you\nthese rights or asking you to surrender the rights.  Therefore, you have\ncertain responsibilities if you distribute copies of the software, or if\nyou modify it: responsibilities to respect the freedom of others.\n\n  For example, if you distribute copies of such a program, whether\ngratis or for a fee, you must pass on to the recipients the same\nfreedoms that you received.  You must make sure that they, too, receive\nor can get the source code.  
And you must show them these terms so they\nknow their rights.\n\n  Developers that use the GNU GPL protect your rights with two steps:\n(1) assert copyright on the software, and (2) offer you this License\ngiving you legal permission to copy, distribute and/or modify it.\n\n  For the developers' and authors' protection, the GPL clearly explains\nthat there is no warranty for this free software.  For both users' and\nauthors' sake, the GPL requires that modified versions be marked as\nchanged, so that their problems will not be attributed erroneously to\nauthors of previous versions.\n\n  Some devices are designed to deny users access to install or run\nmodified versions of the software inside them, although the manufacturer\ncan do so.  This is fundamentally incompatible with the aim of\nprotecting users' freedom to change the software.  The systematic\npattern of such abuse occurs in the area of products for individuals to\nuse, which is precisely where it is most unacceptable.  Therefore, we\nhave designed this version of the GPL to prohibit the practice for those\nproducts.  If such problems arise substantially in other domains, we\nstand ready to extend this provision to those domains in future versions\nof the GPL, as needed to protect the freedom of users.\n\n  Finally, every program is threatened constantly by software patents.\nStates should not allow patents to restrict development and use of\nsoftware on general-purpose computers, but in those that do, we wish to\navoid the special danger that patents applied to a free program could\nmake it effectively proprietary.  To prevent this, the GPL assures that\npatents cannot be used to render the program non-free.\n\n  The precise terms and conditions for copying, distribution and\nmodification follow.\n\n                       TERMS AND CONDITIONS\n\n  0. 
Definitions.\n\n  \"This License\" refers to version 3 of the GNU General Public License.\n\n  \"Copyright\" also means copyright-like laws that apply to other kinds of\nworks, such as semiconductor masks.\n\n  \"The Program\" refers to any copyrightable work licensed under this\nLicense.  Each licensee is addressed as \"you\".  \"Licensees\" and\n\"recipients\" may be individuals or organizations.\n\n  To \"modify\" a work means to copy from or adapt all or part of the work\nin a fashion requiring copyright permission, other than the making of an\nexact copy.  The resulting work is called a \"modified version\" of the\nearlier work or a work \"based on\" the earlier work.\n\n  A \"covered work\" means either the unmodified Program or a work based\non the Program.\n\n  To \"propagate\" a work means to do anything with it that, without\npermission, would make you directly or secondarily liable for\ninfringement under applicable copyright law, except executing it on a\ncomputer or modifying a private copy.  Propagation includes copying,\ndistribution (with or without modification), making available to the\npublic, and in some countries other activities as well.\n\n  To \"convey\" a work means any kind of propagation that enables other\nparties to make or receive copies.  Mere interaction with a user through\na computer network, with no transfer of a copy, is not conveying.\n\n  An interactive user interface displays \"Appropriate Legal Notices\"\nto the extent that it includes a convenient and prominently visible\nfeature that (1) displays an appropriate copyright notice, and (2)\ntells the user that there is no warranty for the work (except to the\nextent that warranties are provided), that licensees may convey the\nwork under this License, and how to view a copy of this License.  If\nthe interface presents a list of user commands or options, such as a\nmenu, a prominent item in the list meets this criterion.\n\n  1. 
Source Code.\n\n  The \"source code\" for a work means the preferred form of the work\nfor making modifications to it.  \"Object code\" means any non-source\nform of a work.\n\n  A \"Standard Interface\" means an interface that either is an official\nstandard defined by a recognized standards body, or, in the case of\ninterfaces specified for a particular programming language, one that\nis widely used among developers working in that language.\n\n  The \"System Libraries\" of an executable work include anything, other\nthan the work as a whole, that (a) is included in the normal form of\npackaging a Major Component, but which is not part of that Major\nComponent, and (b) serves only to enable use of the work with that\nMajor Component, or to implement a Standard Interface for which an\nimplementation is available to the public in source code form.  A\n\"Major Component\", in this context, means a major essential component\n(kernel, window system, and so on) of the specific operating system\n(if any) on which the executable work runs, or a compiler used to\nproduce the work, or an object code interpreter used to run it.\n\n  The \"Corresponding Source\" for a work in object code form means all\nthe source code needed to generate, install, and (for an executable\nwork) run the object code and to modify the work, including scripts to\ncontrol those activities.  However, it does not include the work's\nSystem Libraries, or general-purpose tools or generally available free\nprograms which are used unmodified in performing those activities but\nwhich are not part of the work.  
For example, Corresponding Source\nincludes interface definition files associated with source files for\nthe work, and the source code for shared libraries and dynamically\nlinked subprograms that the work is specifically designed to require,\nsuch as by intimate data communication or control flow between those\nsubprograms and other parts of the work.\n\n  The Corresponding Source need not include anything that users\ncan regenerate automatically from other parts of the Corresponding\nSource.\n\n  The Corresponding Source for a work in source code form is that\nsame work.\n\n  2. Basic Permissions.\n\n  All rights granted under this License are granted for the term of\ncopyright on the Program, and are irrevocable provided the stated\nconditions are met.  This License explicitly affirms your unlimited\npermission to run the unmodified Program.  The output from running a\ncovered work is covered by this License only if the output, given its\ncontent, constitutes a covered work.  This License acknowledges your\nrights of fair use or other equivalent, as provided by copyright law.\n\n  You may make, run and propagate covered works that you do not\nconvey, without conditions so long as your license otherwise remains\nin force.  You may convey covered works to others for the sole purpose\nof having them make modifications exclusively for you, or provide you\nwith facilities for running those works, provided that you comply with\nthe terms of this License in conveying all material for which you do\nnot control copyright.  Those thus making or running the covered works\nfor you must do so exclusively on your behalf, under your direction\nand control, on terms that prohibit them from making any copies of\nyour copyrighted material outside their relationship with you.\n\n  Conveying under any other circumstances is permitted solely under\nthe conditions stated below.  Sublicensing is not allowed; section 10\nmakes it unnecessary.\n\n  3. 
Protecting Users' Legal Rights From Anti-Circumvention Law.\n\n  No covered work shall be deemed part of an effective technological\nmeasure under any applicable law fulfilling obligations under article\n11 of the WIPO copyright treaty adopted on 20 December 1996, or\nsimilar laws prohibiting or restricting circumvention of such\nmeasures.\n\n  When you convey a covered work, you waive any legal power to forbid\ncircumvention of technological measures to the extent such circumvention\nis effected by exercising rights under this License with respect to\nthe covered work, and you disclaim any intention to limit operation or\nmodification of the work as a means of enforcing, against the work's\nusers, your or third parties' legal rights to forbid circumvention of\ntechnological measures.\n\n  4. Conveying Verbatim Copies.\n\n  You may convey verbatim copies of the Program's source code as you\nreceive it, in any medium, provided that you conspicuously and\nappropriately publish on each copy an appropriate copyright notice;\nkeep intact all notices stating that this License and any\nnon-permissive terms added in accord with section 7 apply to the code;\nkeep intact all notices of the absence of any warranty; and give all\nrecipients a copy of this License along with the Program.\n\n  You may charge any price or no price for each copy that you convey,\nand you may offer support or warranty protection for a fee.\n\n  5. Conveying Modified Source Versions.\n\n  You may convey a work based on the Program, or the modifications to\nproduce it from the Program, in the form of source code under the\nterms of section 4, provided that you also meet all of these conditions:\n\n    a) The work must carry prominent notices stating that you modified\n    it, and giving a relevant date.\n\n    b) The work must carry prominent notices stating that it is\n    released under this License and any conditions added under section\n    7.  
This requirement modifies the requirement in section 4 to\n    \"keep intact all notices\".\n\n    c) You must license the entire work, as a whole, under this\n    License to anyone who comes into possession of a copy.  This\n    License will therefore apply, along with any applicable section 7\n    additional terms, to the whole of the work, and all its parts,\n    regardless of how they are packaged.  This License gives no\n    permission to license the work in any other way, but it does not\n    invalidate such permission if you have separately received it.\n\n    d) If the work has interactive user interfaces, each must display\n    Appropriate Legal Notices; however, if the Program has interactive\n    interfaces that do not display Appropriate Legal Notices, your\n    work need not make them do so.\n\n  A compilation of a covered work with other separate and independent\nworks, which are not by their nature extensions of the covered work,\nand which are not combined with it such as to form a larger program,\nin or on a volume of a storage or distribution medium, is called an\n\"aggregate\" if the compilation and its resulting copyright are not\nused to limit the access or legal rights of the compilation's users\nbeyond what the individual works permit.  Inclusion of a covered work\nin an aggregate does not cause this License to apply to the other\nparts of the aggregate.\n\n  6. 
Conveying Non-Source Forms.\n\n  You may convey a covered work in object code form under the terms\nof sections 4 and 5, provided that you also convey the\nmachine-readable Corresponding Source under the terms of this License,\nin one of these ways:\n\n    a) Convey the object code in, or embodied in, a physical product\n    (including a physical distribution medium), accompanied by the\n    Corresponding Source fixed on a durable physical medium\n    customarily used for software interchange.\n\n    b) Convey the object code in, or embodied in, a physical product\n    (including a physical distribution medium), accompanied by a\n    written offer, valid for at least three years and valid for as\n    long as you offer spare parts or customer support for that product\n    model, to give anyone who possesses the object code either (1) a\n    copy of the Corresponding Source for all the software in the\n    product that is covered by this License, on a durable physical\n    medium customarily used for software interchange, for a price no\n    more than your reasonable cost of physically performing this\n    conveying of source, or (2) access to copy the\n    Corresponding Source from a network server at no charge.\n\n    c) Convey individual copies of the object code with a copy of the\n    written offer to provide the Corresponding Source.  This\n    alternative is allowed only occasionally and noncommercially, and\n    only if you received the object code with such an offer, in accord\n    with subsection 6b.\n\n    d) Convey the object code by offering access from a designated\n    place (gratis or for a charge), and offer equivalent access to the\n    Corresponding Source in the same way through the same place at no\n    further charge.  You need not require recipients to copy the\n    Corresponding Source along with the object code.  
If the place to\n    copy the object code is a network server, the Corresponding Source\n    may be on a different server (operated by you or a third party)\n    that supports equivalent copying facilities, provided you maintain\n    clear directions next to the object code saying where to find the\n    Corresponding Source.  Regardless of what server hosts the\n    Corresponding Source, you remain obligated to ensure that it is\n    available for as long as needed to satisfy these requirements.\n\n    e) Convey the object code using peer-to-peer transmission, provided\n    you inform other peers where the object code and Corresponding\n    Source of the work are being offered to the general public at no\n    charge under subsection 6d.\n\n  A separable portion of the object code, whose source code is excluded\nfrom the Corresponding Source as a System Library, need not be\nincluded in conveying the object code work.\n\n  A \"User Product\" is either (1) a \"consumer product\", which means any\ntangible personal property which is normally used for personal, family,\nor household purposes, or (2) anything designed or sold for incorporation\ninto a dwelling.  In determining whether a product is a consumer product,\ndoubtful cases shall be resolved in favor of coverage.  For a particular\nproduct received by a particular user, \"normally used\" refers to a\ntypical or common use of that class of product, regardless of the status\nof the particular user or of the way in which the particular user\nactually uses, or expects or is expected to use, the product.  
A product\nis a consumer product regardless of whether the product has substantial\ncommercial, industrial or non-consumer uses, unless such uses represent\nthe only significant mode of use of the product.\n\n  \"Installation Information\" for a User Product means any methods,\nprocedures, authorization keys, or other information required to install\nand execute modified versions of a covered work in that User Product from\na modified version of its Corresponding Source.  The information must\nsuffice to ensure that the continued functioning of the modified object\ncode is in no case prevented or interfered with solely because\nmodification has been made.\n\n  If you convey an object code work under this section in, or with, or\nspecifically for use in, a User Product, and the conveying occurs as\npart of a transaction in which the right of possession and use of the\nUser Product is transferred to the recipient in perpetuity or for a\nfixed term (regardless of how the transaction is characterized), the\nCorresponding Source conveyed under this section must be accompanied\nby the Installation Information.  But this requirement does not apply\nif neither you nor any third party retains the ability to install\nmodified object code on the User Product (for example, the work has\nbeen installed in ROM).\n\n  The requirement to provide Installation Information does not include a\nrequirement to continue to provide support service, warranty, or updates\nfor a work that has been modified or installed by the recipient, or for\nthe User Product in which it has been modified or installed.  
Access to a\nnetwork may be denied when the modification itself materially and\nadversely affects the operation of the network or violates the rules and\nprotocols for communication across the network.\n\n  Corresponding Source conveyed, and Installation Information provided,\nin accord with this section must be in a format that is publicly\ndocumented (and with an implementation available to the public in\nsource code form), and must require no special password or key for\nunpacking, reading or copying.\n\n  7. Additional Terms.\n\n  \"Additional permissions\" are terms that supplement the terms of this\nLicense by making exceptions from one or more of its conditions.\nAdditional permissions that are applicable to the entire Program shall\nbe treated as though they were included in this License, to the extent\nthat they are valid under applicable law.  If additional permissions\napply only to part of the Program, that part may be used separately\nunder those permissions, but the entire Program remains governed by\nthis License without regard to the additional permissions.\n\n  When you convey a copy of a covered work, you may at your option\nremove any additional permissions from that copy, or from any part of\nit.  (Additional permissions may be written to require their own\nremoval in certain cases when you modify the work.)  
You may place\nadditional permissions on material, added by you to a covered work,\nfor which you have or can give appropriate copyright permission.\n\n  Notwithstanding any other provision of this License, for material you\nadd to a covered work, you may (if authorized by the copyright holders of\nthat material) supplement the terms of this License with terms:\n\n    a) Disclaiming warranty or limiting liability differently from the\n    terms of sections 15 and 16 of this License; or\n\n    b) Requiring preservation of specified reasonable legal notices or\n    author attributions in that material or in the Appropriate Legal\n    Notices displayed by works containing it; or\n\n    c) Prohibiting misrepresentation of the origin of that material, or\n    requiring that modified versions of such material be marked in\n    reasonable ways as different from the original version; or\n\n    d) Limiting the use for publicity purposes of names of licensors or\n    authors of the material; or\n\n    e) Declining to grant rights under trademark law for use of some\n    trade names, trademarks, or service marks; or\n\n    f) Requiring indemnification of licensors and authors of that\n    material by anyone who conveys the material (or modified versions of\n    it) with contractual assumptions of liability to the recipient, for\n    any liability that these contractual assumptions directly impose on\n    those licensors and authors.\n\n  All other non-permissive additional terms are considered \"further\nrestrictions\" within the meaning of section 10.  If the Program as you\nreceived it, or any part of it, contains a notice stating that it is\ngoverned by this License along with a term that is a further\nrestriction, you may remove that term.  
If a license document contains\na further restriction but permits relicensing or conveying under this\nLicense, you may add to a covered work material governed by the terms\nof that license document, provided that the further restriction does\nnot survive such relicensing or conveying.\n\n  If you add terms to a covered work in accord with this section, you\nmust place, in the relevant source files, a statement of the\nadditional terms that apply to those files, or a notice indicating\nwhere to find the applicable terms.\n\n  Additional terms, permissive or non-permissive, may be stated in the\nform of a separately written license, or stated as exceptions;\nthe above requirements apply either way.\n\n  8. Termination.\n\n  You may not propagate or modify a covered work except as expressly\nprovided under this License.  Any attempt otherwise to propagate or\nmodify it is void, and will automatically terminate your rights under\nthis License (including any patent licenses granted under the third\nparagraph of section 11).\n\n  However, if you cease all violation of this License, then your\nlicense from a particular copyright holder is reinstated (a)\nprovisionally, unless and until the copyright holder explicitly and\nfinally terminates your license, and (b) permanently, if the copyright\nholder fails to notify you of the violation by some reasonable means\nprior to 60 days after the cessation.\n\n  Moreover, your license from a particular copyright holder is\nreinstated permanently if the copyright holder notifies you of the\nviolation by some reasonable means, this is the first time you have\nreceived notice of violation of this License (for any work) from that\ncopyright holder, and you cure the violation prior to 30 days after\nyour receipt of the notice.\n\n  Termination of your rights under this section does not terminate the\nlicenses of parties who have received copies or rights from you under\nthis License.  
If your rights have been terminated and not permanently\nreinstated, you do not qualify to receive new licenses for the same\nmaterial under section 10.\n\n  9. Acceptance Not Required for Having Copies.\n\n  You are not required to accept this License in order to receive or\nrun a copy of the Program.  Ancillary propagation of a covered work\noccurring solely as a consequence of using peer-to-peer transmission\nto receive a copy likewise does not require acceptance.  However,\nnothing other than this License grants you permission to propagate or\nmodify any covered work.  These actions infringe copyright if you do\nnot accept this License.  Therefore, by modifying or propagating a\ncovered work, you indicate your acceptance of this License to do so.\n\n  10. Automatic Licensing of Downstream Recipients.\n\n  Each time you convey a covered work, the recipient automatically\nreceives a license from the original licensors, to run, modify and\npropagate that work, subject to this License.  You are not responsible\nfor enforcing compliance by third parties with this License.\n\n  An \"entity transaction\" is a transaction transferring control of an\norganization, or substantially all assets of one, or subdividing an\norganization, or merging organizations.  If propagation of a covered\nwork results from an entity transaction, each party to that\ntransaction who receives a copy of the work also receives whatever\nlicenses to the work the party's predecessor in interest had or could\ngive under the previous paragraph, plus a right to possession of the\nCorresponding Source of the work from the predecessor in interest, if\nthe predecessor has it or can get it with reasonable efforts.\n\n  You may not impose any further restrictions on the exercise of the\nrights granted or affirmed under this License.  
For example, you may\nnot impose a license fee, royalty, or other charge for exercise of\nrights granted under this License, and you may not initiate litigation\n(including a cross-claim or counterclaim in a lawsuit) alleging that\nany patent claim is infringed by making, using, selling, offering for\nsale, or importing the Program or any portion of it.\n\n  11. Patents.\n\n  A \"contributor\" is a copyright holder who authorizes use under this\nLicense of the Program or a work on which the Program is based.  The\nwork thus licensed is called the contributor's \"contributor version\".\n\n  A contributor's \"essential patent claims\" are all patent claims\nowned or controlled by the contributor, whether already acquired or\nhereafter acquired, that would be infringed by some manner, permitted\nby this License, of making, using, or selling its contributor version,\nbut do not include claims that would be infringed only as a\nconsequence of further modification of the contributor version.  For\npurposes of this definition, \"control\" includes the right to grant\npatent sublicenses in a manner consistent with the requirements of\nthis License.\n\n  Each contributor grants you a non-exclusive, worldwide, royalty-free\npatent license under the contributor's essential patent claims, to\nmake, use, sell, offer for sale, import and otherwise run, modify and\npropagate the contents of its contributor version.\n\n  In the following three paragraphs, a \"patent license\" is any express\nagreement or commitment, however denominated, not to enforce a patent\n(such as an express permission to practice a patent or covenant not to\nsue for patent infringement).  
To \"grant\" such a patent license to a\nparty means to make such an agreement or commitment not to enforce a\npatent against the party.\n\n  If you convey a covered work, knowingly relying on a patent license,\nand the Corresponding Source of the work is not available for anyone\nto copy, free of charge and under the terms of this License, through a\npublicly available network server or other readily accessible means,\nthen you must either (1) cause the Corresponding Source to be so\navailable, or (2) arrange to deprive yourself of the benefit of the\npatent license for this particular work, or (3) arrange, in a manner\nconsistent with the requirements of this License, to extend the patent\nlicense to downstream recipients.  \"Knowingly relying\" means you have\nactual knowledge that, but for the patent license, your conveying the\ncovered work in a country, or your recipient's use of the covered work\nin a country, would infringe one or more identifiable patents in that\ncountry that you have reason to believe are valid.\n\n  If, pursuant to or in connection with a single transaction or\narrangement, you convey, or propagate by procuring conveyance of, a\ncovered work, and grant a patent license to some of the parties\nreceiving the covered work authorizing them to use, propagate, modify\nor convey a specific copy of the covered work, then the patent license\nyou grant is automatically extended to all recipients of the covered\nwork and works based on it.\n\n  A patent license is \"discriminatory\" if it does not include within\nthe scope of its coverage, prohibits the exercise of, or is\nconditioned on the non-exercise of one or more of the rights that are\nspecifically granted under this License.  
You may not convey a covered\nwork if you are a party to an arrangement with a third party that is\nin the business of distributing software, under which you make payment\nto the third party based on the extent of your activity of conveying\nthe work, and under which the third party grants, to any of the\nparties who would receive the covered work from you, a discriminatory\npatent license (a) in connection with copies of the covered work\nconveyed by you (or copies made from those copies), or (b) primarily\nfor and in connection with specific products or compilations that\ncontain the covered work, unless you entered into that arrangement,\nor that patent license was granted, prior to 28 March 2007.\n\n  Nothing in this License shall be construed as excluding or limiting\nany implied license or other defenses to infringement that may\notherwise be available to you under applicable patent law.\n\n  12. No Surrender of Others' Freedom.\n\n  If conditions are imposed on you (whether by court order, agreement or\notherwise) that contradict the conditions of this License, they do not\nexcuse you from the conditions of this License.  If you cannot convey a\ncovered work so as to satisfy simultaneously your obligations under this\nLicense and any other pertinent obligations, then as a consequence you may\nnot convey it at all.  For example, if you agree to terms that obligate you\nto collect a royalty for further conveying from those to whom you convey\nthe Program, the only way you could satisfy both those terms and this\nLicense would be to refrain entirely from conveying the Program.\n\n  13. Use with the GNU Affero General Public License.\n\n  Notwithstanding any other provision of this License, you have\npermission to link or combine any covered work with a work licensed\nunder version 3 of the GNU Affero General Public License into a single\ncombined work, and to convey the resulting work.  
The terms of this\nLicense will continue to apply to the part which is the covered work,\nbut the special requirements of the GNU Affero General Public License,\nsection 13, concerning interaction through a network will apply to the\ncombination as such.\n\n  14. Revised Versions of this License.\n\n  The Free Software Foundation may publish revised and/or new versions of\nthe GNU General Public License from time to time.  Such new versions will\nbe similar in spirit to the present version, but may differ in detail to\naddress new problems or concerns.\n\n  Each version is given a distinguishing version number.  If the\nProgram specifies that a certain numbered version of the GNU General\nPublic License \"or any later version\" applies to it, you have the\noption of following the terms and conditions either of that numbered\nversion or of any later version published by the Free Software\nFoundation.  If the Program does not specify a version number of the\nGNU General Public License, you may choose any version ever published\nby the Free Software Foundation.\n\n  If the Program specifies that a proxy can decide which future\nversions of the GNU General Public License can be used, that proxy's\npublic statement of acceptance of a version permanently authorizes you\nto choose that version for the Program.\n\n  Later license versions may give you additional or different\npermissions.  However, no additional obligations are imposed on any\nauthor or copyright holder as a result of your choosing to follow a\nlater version.\n\n  15. Disclaimer of Warranty.\n\n  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY\nAPPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT\nHOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM \"AS IS\" WITHOUT WARRANTY\nOF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,\nTHE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\nPURPOSE.  
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM\nIS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF\nALL NECESSARY SERVICING, REPAIR OR CORRECTION.\n\n  16. Limitation of Liability.\n\n  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING\nWILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS\nTHE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY\nGENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE\nUSE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF\nDATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD\nPARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),\nEVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF\nSUCH DAMAGES.\n\n  17. Interpretation of Sections 15 and 16.\n\n  If the disclaimer of warranty and limitation of liability provided\nabove cannot be given local legal effect according to their terms,\nreviewing courts shall apply local law that most closely approximates\nan absolute waiver of all civil liability in connection with the\nProgram, unless a warranty or assumption of liability accompanies a\ncopy of the Program in return for a fee.\n\n                     END OF TERMS AND CONDITIONS\n\n            How to Apply These Terms to Your New Programs\n\n  If you develop a new program, and you want it to be of the greatest\npossible use to the public, the best way to achieve this is to make it\nfree software which everyone can redistribute and change under these terms.\n\n  To do so, attach the following notices to the program.  
It is safest\nto attach them to the start of each source file to most effectively\nstate the exclusion of warranty; and each file should have at least\nthe \"copyright\" line and a pointer to where the full notice is found.\n\n    <one line to give the program's name and a brief idea of what it does.>\n    Copyright (C) <year>  <name of author>\n\n    This program is free software: you can redistribute it and/or modify\n    it under the terms of the GNU General Public License as published by\n    the Free Software Foundation, either version 3 of the License, or\n    (at your option) any later version.\n\n    This program is distributed in the hope that it will be useful,\n    but WITHOUT ANY WARRANTY; without even the implied warranty of\n    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n    GNU General Public License for more details.\n\n    You should have received a copy of the GNU General Public License\n    along with this program.  If not, see <https://www.gnu.org/licenses/>.\n\nAlso add information on how to contact you by electronic and paper mail.\n\n  If the program does terminal interaction, make it output a short\nnotice like this when it starts in an interactive mode:\n\n    <program>  Copyright (C) <year>  <name of author>\n    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.\n    This is free software, and you are welcome to redistribute it\n    under certain conditions; type `show c' for details.\n\nThe hypothetical commands `show w' and `show c' should show the appropriate\nparts of the General Public License.  
Of course, your program's commands\nmight be different; for a GUI interface, you would use an \"about box\".\n\n  You should also get your employer (if you work as a programmer) or school,\nif any, to sign a \"copyright disclaimer\" for the program, if necessary.\nFor more information on this, and how to apply and follow the GNU GPL, see\n<https://www.gnu.org/licenses/>.\n\n  The GNU General Public License does not permit incorporating your program\ninto proprietary programs.  If your program is a subroutine library, you\nmay consider it more useful to permit linking proprietary applications with\nthe library.  If this is what you want to do, use the GNU Lesser General\nPublic License instead of this License.  But first, please read\n<https://www.gnu.org/licenses/why-not-lgpl.html>.\n"
  },
  {
    "path": "kimimaro_cli/__init__.py",
    "content": "import os\n\nimport click\nimport numpy as np\nfrom osteoid import Skeleton\n\nimport kimimaro\nfrom kimimaro.utility import mkdir\nimport fastremap\nfrom tqdm import tqdm\n\nfrom . import codecs\n\nclass Tuple3(click.ParamType):\n  \"\"\"A command line option type consisting of 3 comma-separated integers.\"\"\"\n  name = 'tuple3'\n  def convert(self, value, param, ctx):\n    if isinstance(value, str):\n      try:\n        value = tuple(map(int, value.split(',')))\n      except ValueError:\n        self.fail(f\"'{value}' does not contain a comma delimited list of 3 integers.\")\n      if len(value) != 3:\n        self.fail(f\"'{value}' does not contain a comma delimited list of 3 integers.\")\n    return value\n\n\n@click.group()\ndef main():\n  \"\"\"\n  Skeletonize all labels in a segmented volumetric image\n  by applying a TEASAR based algorithm and outputs them\n  as SWC.\n\n  Does not accept continuously valued images such as raw\n  microscopy images.\n\n  Input File Formats Supported: npy\n  \n  This program is free software: you can redistribute it and/or modify\n  it under the terms of the GNU General Public License as published by\n  the Free Software Foundation, either version 3 of the License, or\n  (at your option) any later version. Run \"igneous license\" for details.  \n  \"\"\"\n  pass\n\n@main.command()\n@click.argument(\"src\")\n@click.option('--scale', type=float, default=4, help=\"Adds multiple of boundary distance to invalidation zone. (You should set this!)\", show_default=True)\n@click.option('--const', type=float, default=10, help=\"Adds constant physical distance to invalidation zone. (You should set this!)\", show_default=True)\n@click.option('--pdrf-scale', type=int, default=1e5, help=\"Constant multiplier of penalty field.\", show_default=True)\n@click.option('--pdrf-exponent', type=int, default=4, help=\"Exponent of penalty field. Powers of two are faster. 
Too big can cause floating point errors.\", show_default=True)\n@click.option('--soma-detect', type=float, default=750, help=\"Perform more expensive check for somas for distance to boundary values above this threshold. e.g. 750 nm\", show_default=True)\n@click.option('--soma-accept', type=float, default=1100, help=\"Distance to boundary values above this threshold trigger special soma processing. e.g. 750 nm\", show_default=True)\n@click.option('--soma-scale', type=float, default=2, help=\"Adds multiple of boundary distance to invalidation zone around a soma. (You should set this!)\", show_default=True)\n@click.option('--soma-const', type=float, default=300, help=\"Adds constant physical distance to invalidation zone around a soma. (You should set this!)\", show_default=True)\n@click.option('--anisotropy', type=Tuple3(), default=\"1,1,1\", help=\"Physical size of voxel in x,y,z axes.\", show_default=True)\n@click.option('--dust', type=int, default=1000, help=\"Skip connected components with fewer voxels than this.\", show_default=True)\n@click.option('--progress', is_flag=True, default=False, help=\"Show progress bar.\", show_default=True)\n@click.option('--fill-holes/--no-fill-holes', is_flag=True, default=True, help=\"Fill holes in each connected component. (slower)\", show_default=True)\n@click.option('--fix-avocados', is_flag=True, default=False, help=\"Use heuristics to combine nucleii with cell bodies. (slower)\", show_default=True)\n@click.option('--fix-borders', is_flag=True, default=False, help=\"Center the skeleton where the shape contacts the border.\", show_default=True)\n@click.option('--fix-branches', is_flag=True, default=True, help=\"Improves quality of forked shapes. 
(slower for highly branched shapes)\", show_default=True)\n@click.option('--max-paths', type=int, default=None, help=\"Maximum number of paths to trace per object.\", show_default=True)\n@click.option('-p', '--parallel', type=int, default=1, help=\"Number of processes to use.\", show_default=True)\n@click.option('-o', '--outdir', type=str, default=\"kimimaro_out\", help=\"Where to write the SWC files.\", show_default=True)\n@click.option(\"--cross-section\", type=int, default=0, help=\"Turn on cross section analysis. The integer value gives the normal smoothing window, 0=off.\", show_default=True)\ndef forge(\n  src,\n  scale, const, pdrf_scale, pdrf_exponent,\n  soma_detect, soma_accept, soma_scale, soma_const,\n  anisotropy, dust, progress, fill_holes, \n  fix_avocados, fix_branches, fix_borders,\n  parallel, max_paths, outdir, cross_section,\n):\n  \"\"\"Skeletonize an input image and write out SWCs.\"\"\"\n  labels = codecs.load(src)\n\n  skels = kimimaro.skeletonize(\n    labels,\n    teasar_params={\n      \"scale\": scale,\n      \"const\": const,\n      \"pdrf_scale\": pdrf_scale,\n      \"pdrf_exponent\": pdrf_exponent,\n      \"soma_detection_threshold\": soma_detect,\n      \"soma_acceptance_threshold\": soma_accept,\n      \"soma_invalidation_scale\": soma_scale,\n      \"soma_invalidation_const\": soma_const,\n      \"max_paths\": max_paths,\n    },\n    anisotropy=anisotropy,\n    dust_threshold=dust,\n    progress=progress,\n    fill_holes=fill_holes,\n    fix_avocados=fix_avocados,\n    fix_branching=fix_branches,\n    fix_borders=fix_borders,\n    parallel=parallel,\n  )\n\n  directory = mkdir(outdir)\n\n  for label, skel in skels.items():\n    fname = os.path.join(directory, f\"{label}.swc\")\n    with open(fname, \"wt\") as f:\n      f.write(skel.to_swc())\n\n  if progress:\n    print(f\"kimimaro: wrote {len(skels)} skeletons to {directory}\")\n\n  if cross_section > 0:\n    skels = kimimaro.cross_sectional_area(\n      labels, \n      skels,\n   
   anisotropy=anisotropy,\n      progress=progress,\n      smoothing_window=cross_section,\n      fill_holes=fill_holes,\n    )\n\n    for label, skel in skels.items():\n      fname = os.path.join(directory, f\"{label}_xs_area.npy\")\n      np.save(fname, skel.cross_sectional_area)\n      fname = os.path.join(directory, f\"{label}_xs_area_contacts.npy\")\n      np.save(fname, skel.cross_sectional_area_contacts)\n\n    if progress:\n      print(f\"Wrote cross sectional area and border contacts to {directory}\")  \n\n@main.group()\ndef swc():\n  \"\"\"Utilities for managing SWC files. Use forge to create new skeletons.\"\"\"\n  pass\n\n@swc.command(\"from\")\n@click.argument(\"src\", nargs=-1)\ndef from_image(src):\n  \"\"\"Convert a binary image that has already been skeletonized by a thinning algorithm into an swc.\"\"\"\n\n  for srcpath in tqdm(src):\n    try:\n      image = codecs.load(srcpath)\n    except ImportError:\n      print(f\"kimimaro: {srcpath} format not installed.\")\n      return\n\n    skel = kimimaro.extract_skeleton_from_binary_image(image)\n\n    with open(f\"{basename}.swc\", \"wt\") as f:\n      f.write(skel.to_swc())\n\n@swc.command(\"to\")\n@click.argument(\"src\", nargs=-1)\n@click.option('--format', type=str, default=\"npy\", help=\"Which format to use. Options: npy, tiff\", show_default=True)\ndef to_image(src, format):\n  \"\"\"Convert an swc into a binary image.\"\"\"\n  if format not in (\"npy\", \"tiff\"):\n    print(f\"kimimaro: invalid format {format}. 
npy or tiff allowed.\")\n\n  for srcpath in tqdm(src):\n    with open(srcpath, 'rt') as f:\n      skel = Skeleton.from_swc(f.read())\n\n    xmin, xmax = fastremap.minmax(skel.vertices[:,0])\n    ymin, ymax = fastremap.minmax(skel.vertices[:,1])\n    zmin, zmax = fastremap.minmax(skel.vertices[:,2])\n\n    image = np.zeros((int(zmax-zmin), int(ymax-ymin), int(xmax-xmin)), dtype=bool, order='F')\n    \n    minpt = np.array([int(xmin),int(ymin),int(zmin)])\n    drawpts = skel.vertices - minpt\n    drawpts = np.asfortranarray(drawpts, dtype=np.int32)\n    \n    image[np.where((drawpts[:, 0] >= xmin) & (drawpts[:, 0] < xmax) & \n                   (drawpts[:, 1] >= ymin) & (drawpts[:, 1] < ymax) & \n                   (drawpts[:, 2] >= zmin) & (drawpts[:, 2] < zmax))] = True\n\n    basename, ext = os.path.splitext(srcpath)\n\n    if format == \"npy\":\n      np.save(f\"{basename}.npy\", image)\n    elif format == \"tiff\":\n      try:\n        import tifffile\n        tifffile.imwrite(f\"{basename}.tiff\", \n                         image.astype(np.float32, copy=False), \n                         photometric='minisblack',\n                         metadata={'axes': 'ZYX'},\n                         imagej=True)\n      except ImportError:\n        print(\"kimimaro: tifffile not installed. Run pip install tifffile.\")\n        return\n    else:\n      raise ValueError(\"should never happen\")\n\n@main.command()\n@click.argument(\"filename\")\n@click.option('--port', type=int, default=8080, help=\"Which port to run the microviewer on for npy files.\", show_default=True)\n@click.option('--color-by', type=str, default='r', help=\"For skeleton visualization. 
r = radius, c = components, x = cross sectional area (if available).\", show_default=True)\ndef view(filename, port, color_by):\n  \"\"\"Visualize a .swc or .npy file.\"\"\"\n  import microviewer\n\n  basename, ext = os.path.splitext(filename)\n\n  if ext == \".swc\":\n    with open(filename, \"rt\") as swc:\n      skel = Skeleton.from_swc(swc.read())\n    microviewer.objects([ skel ], skeleton_color_by=color_by)\n  elif ext == \".npy\":\n    labels = np.load(filename)\n    microviewer.view(labels, seg=True, port=port)\n  elif ext == \".ckl\":\n    import crackle\n    labels = crackle.load(filename)\n    microviewer.view(labels, seg=True, port=port)\n  else:\n    print(\"kimimaro: {filename} was not a .swc, .npy, or .ckl file.\")\n\n@main.command()\ndef license():\n  \"\"\"Prints the license for this library and cli tool.\"\"\"\n  path = os.path.join(os.path.dirname(__file__), 'LICENSE')\n  with open(path, 'rt') as f:\n    print(f.read())\n\n"
  },
  {
    "path": "kimimaro_cli/codecs.py",
    "content": "import numpy as np\nimport os\nimport gzip\n\ndef normalize_file_ext(filename):\n  filename, ext = os.path.splitext(filename)\n\n  two_pass = ('.ckl', '.cpso')\n\n  if ext in two_pass:\n    return ext\n\n  while True:\n    filename, ext2 = os.path.splitext(filename)\n    if ext2 in two_pass:\n      return ext2\n    elif ext2 == '':\n      return ext\n    ext = ext2\n\ndef load(filename):\n  ext = normalize_file_ext(filename)\n\n  if ext == \".ckl\":\n    import crackle\n    return crackle.aload(filename)\n  elif ext == \".npy\":\n    if filename.endswith(\".gz\"):\n      with gzip.GzipFile(filename, \"rb\") as f:\n        image = np.load(f)\n    else:\n      image = np.load(filename)\n  elif ext == \".nrrd\":\n    import nrrd\n    image, header = nrrd.read(filename)\n    if image.shape[0] == 3 and image.ndim == 3:\n      image = image[...,np.newaxis]\n      image = np.transpose(image, axes=[1,2,3,0])\n    return image\n  elif ext == \".nii\":\n    import nibabel as nib\n    image = nib.load(filename)\n    image = np.array(image.dataobj)\n  elif ext in (\".tif\", \".tiff\"):\n    import tifffile\n    image = tifffile.imread(filename)\n  else:\n    raise ValueError(\"Data type not supported: \" + ext)\n\n  return np.asfortranarray(image)\n"
  },
  {
    "path": "manual_testing/manual_test.py",
    "content": "import kimimaro\nimport numpy as np\n\nfrom PIL import Image \n\nimg = np.asarray(Image.open('./crossstreet.png'))\nprint(img)"
  },
  {
    "path": "manylinux2010.Dockerfile",
    "content": "FROM quay.io/pypa/manylinux2010_x86_64 \nMAINTAINER William Silversmith\n\nADD . /kimimaro\n\nWORKDIR \"/kimimaro\"\n\nENV CC \"gcc\"\nENV CXX \"g++\"\n\nRUN rm -rf *.so build __pycache__ dist \n\nRUN /opt/python/cp36-cp36m/bin/pip3.6 install pip --upgrade\nRUN /opt/python/cp36-cp36m/bin/pip3.6 install numpy\nRUN /opt/python/cp36-cp36m/bin/pip3.6 install -r requirements.txt\nRUN /opt/python/cp36-cp36m/bin/python3.6 setup.py develop\nRUN /opt/python/cp36-cp36m/bin/python3.6 -m pytest -v -x automated_test.py\n\nRUN /opt/python/cp37-cp37m/bin/pip3.7 install pip --upgrade\nRUN /opt/python/cp37-cp37m/bin/pip3.7 install numpy\nRUN /opt/python/cp37-cp37m/bin/pip3.7 install -r requirements.txt\nRUN /opt/python/cp37-cp37m/bin/python3.7 setup.py develop\nRUN /opt/python/cp37-cp37m/bin/python3.7 -m pytest -v -x automated_test.py\n\nRUN /opt/python/cp38-cp38/bin/pip3.8 install pip --upgrade\nRUN /opt/python/cp38-cp38/bin/pip3.8 install numpy\nRUN /opt/python/cp38-cp38/bin/pip3.8 install -r requirements.txt\nRUN /opt/python/cp38-cp38/bin/python3.8 setup.py develop\nRUN /opt/python/cp38-cp38/bin/python3.8 -m pytest -v -x automated_test.py\n\nRUN /opt/python/cp36-cp36m/bin/python3.6 setup.py bdist_wheel\nRUN /opt/python/cp37-cp37m/bin/python3.7 setup.py bdist_wheel\nRUN /opt/python/cp38-cp38/bin/python3.8 setup.py bdist_wheel\n\nRUN for whl in `ls dist/*.whl`; do auditwheel repair $whl --plat manylinux2010_x86_64; done"
  },
  {
    "path": "manylinux2014.Dockerfile",
    "content": "FROM quay.io/pypa/manylinux2014_x86_64 \nMAINTAINER William Silversmith\n\nADD . /kimimaro\n\nWORKDIR \"/kimimaro\"\n\nENV CC \"gcc\"\nENV CXX \"g++\"\n\nRUN rm -rf *.so build __pycache__ dist \n\nRUN /opt/python/cp36-cp36m/bin/pip3.6 install pip --upgrade\nRUN /opt/python/cp36-cp36m/bin/pip3.6 install numpy\nRUN /opt/python/cp36-cp36m/bin/pip3.6 install -r requirements.txt\nRUN /opt/python/cp36-cp36m/bin/python3.6 setup.py develop\nRUN /opt/python/cp36-cp36m/bin/python3.6 -m pytest -v -x automated_test.py\n\nRUN /opt/python/cp37-cp37m/bin/pip3.7 install pip --upgrade\nRUN /opt/python/cp37-cp37m/bin/pip3.7 install numpy\nRUN /opt/python/cp37-cp37m/bin/pip3.7 install -r requirements.txt\nRUN /opt/python/cp37-cp37m/bin/python3.7 setup.py develop\nRUN /opt/python/cp37-cp37m/bin/python3.7 -m pytest -v -x automated_test.py\n\nRUN /opt/python/cp38-cp38/bin/pip3.8 install pip --upgrade\nRUN /opt/python/cp38-cp38/bin/pip3.8 install numpy\nRUN /opt/python/cp38-cp38/bin/pip3.8 install -r requirements.txt\nRUN /opt/python/cp38-cp38/bin/python3.8 setup.py develop\nRUN /opt/python/cp38-cp38/bin/python3.8 -m pytest -v -x automated_test.py\n\nRUN /opt/python/cp36-cp36m/bin/python3.6 setup.py bdist_wheel\nRUN /opt/python/cp37-cp37m/bin/python3.7 setup.py bdist_wheel\nRUN /opt/python/cp38-cp38/bin/python3.8 setup.py bdist_wheel\n\nRUN /opt/python/cp39-cp39/bin/pip3.9 install pip --upgrade\nRUN /opt/python/cp39-cp39/bin/pip3.9 install numpy\nRUN /opt/python/cp39-cp39/bin/pip3.9 install -r requirements.txt\nRUN /opt/python/cp39-cp39/bin/python3.9 setup.py develop\nRUN /opt/python/cp39-cp39/bin/python3.9 -m pytest -v -x automated_test.py\n\nRUN /opt/python/cp36-cp36m/bin/python3.6 setup.py bdist_wheel\nRUN /opt/python/cp37-cp37m/bin/python3.7 setup.py bdist_wheel\nRUN /opt/python/cp38-cp38/bin/python3.8 setup.py bdist_wheel\nRUN /opt/python/cp39-cp39/bin/python3.9 setup.py bdist_wheel\n\nRUN for whl in `ls dist/*.whl`; do auditwheel repair $whl --plat 
manylinux2014_x86_64; done\n"
  },
  {
    "path": "pyproject.toml",
    "content": "[build-system]\nrequires = [\n    \"setuptools>=61.0.0\",\n    \"wheel\",\n    \"cython\",\n    \"numpy>=1.16.1\"\n]\nbuild-backend = \"setuptools.build_meta\"\n\n[project]\nname = \"kimimaro\"\nversion = \"5.8.1\"\nauthors = [\n    {name = \"William Silversmith\", email = \"ws9@princeton.edu\"},\n    {name = \"Alex Bae\"},\n    {name = \"Forrest Collman\"},\n    {name = \"Peter Li\"},\n    {name = \"Nina Shamsi\"}\n]\ndescription = \"Skeletonize densely labeled image volumes.\"\nreadme = \"README.md\"\nrequires-python = \">=3.9.0,<4.0.0\"\nlicense = \"GPL-3.0-or-later\"\nkeywords = [\n    \"volumetric-data\",\n    \"numpy\",\n    \"teasar\",\n    \"skeletonization\",\n    \"centerline\",\n    \"medial-axis-transform\",\n    \"centerline-extraction\",\n    \"computer-vision-algorithms\",\n    \"connectomics\",\n    \"image-processing\",\n    \"biomedical-image-processing\",\n    \"voxel\"\n]\nclassifiers = [\n    \"Intended Audience :: Developers\",\n    \"Development Status :: 5 - Production/Stable\",\n    \"Programming Language :: Python\",\n    \"Programming Language :: Python :: 3\",\n    \"Programming Language :: Python :: 3.9\",\n    \"Programming Language :: Python :: 3.10\",\n    \"Programming Language :: Python :: 3.11\",\n    \"Programming Language :: Python :: 3.12\",\n    \"Programming Language :: Python :: 3.13\",\n    \"Topic :: Scientific/Engineering\",\n    \"Intended Audience :: Science/Research\",\n    \"Operating System :: POSIX\",\n    \"Operating System :: MacOS\",\n    \"Operating System :: Microsoft :: Windows :: Windows 10\"\n]\n\ndependencies = [\n    \"click\",\n    \"connected-components-3d>=3.16.0\",\n    \"dijkstra3d>=1.15.0\",\n    \"fill-voids>=2.0.0\",\n    \"edt>=2.1.0\",\n    \"fastremap>=1.10.2\",\n    \"networkx\",\n    \"numpy>=1.16.1\",\n    \"osteoid\",\n    \"pathos\",\n    \"pytest\",\n    \"scipy>=1.1.0\",\n    \"tqdm\",\n    \"xs3d>=1.2.0,<2\"\n]\n\n[project.optional-dependencies]\naccel = [\n    
\"pykdtree\",\n]\nview = [ \n    \"microviewer\",\n    \"crackle-codec\",\n    \"vtk\",\n]\ntif = [ \n    \"tifffile\",\n]\nnii = [\n    \"nibabel\",\n]\nnrrd = [\n    \"pynrrd\",\n]\nall_formats = [\n    \"tifffile\",\n    \"nibabel\",\n    \"pynrrd\",\n]\nall = [ \n    \"tifffile\",\n    \"nibabel\",\n    \"pynrrd\",\n    \"microviewer\",\n    \"vtk\",\n    \"pykdtree\",\n]\n\n[project.urls]\nHomepage = \"https://github.com/seung-lab/kimimaro/\"\n\n[project.scripts]\nkimimaro = \"kimimaro_cli:main\"\n\n[tool.setuptools]\npackages = [\"kimimaro\", \"kimimaro_cli\"]\ninclude-package-data = true\n\n[tool.setuptools.package-dir]\nkimimaro = \"kimimaro\"\nkimimaro_cli = \"kimimaro_cli\"\n\n"
  },
  {
    "path": "requirements-dev.txt",
    "content": "pytest\ncrackle-codec"
  },
  {
    "path": "requirements.txt",
    "content": "click\nconnected-components-3d>=3.16.0\ncrackle-codec>=0.33.0\ndijkstra3d>=1.15.0\nfill-voids>=2.0.0\nedt>=3.0.0\nfastremap>=1.10.2\nmicroviewer\nnetworkx\nnumpy>=1.16.1\nosteoid\npathos\nposix_ipc\npsutil\nscipy>=1.1.0\ntqdm\nxs3d>=1.11.0,<2\n"
  },
  {
    "path": "setup.py",
    "content": "#!/usr/bin/env python\nimport os\nimport setuptools\nimport sys\n\nclass NumpyImport:\n  def __repr__(self):\n    import numpy as np\n\n    return np.get_include()\n\n  __fspath__ = __repr__\n\nextra_compile_args = []\nif sys.platform == 'win32':\n  extra_compile_args += [\n    '/std:c++17', '/O2'\n  ]\nelse:\n  extra_compile_args += [\n    '-std=c++17', '-O3'\n  ]\n\nif sys.platform == 'darwin':\n  extra_compile_args += [ '-stdlib=libc++', '-mmacosx-version-min=10.9' ]\n\nsetuptools.setup(\n    ext_modules=[\n      setuptools.Extension(\n        'kimimaro.skeletontricks',\n        sources=[ './ext/skeletontricks/skeletontricks.pyx' ],\n        language='c++',\n        include_dirs=[ str(NumpyImport()) ],\n        extra_compile_args=extra_compile_args,\n      ),\n    ],\n)"
  },
  {
    "path": "tox.ini",
    "content": "[tox]\nenvlist = py39,py310,py311,py312\n\n[testenv]\nplatform = darwin\ndeps = \n\tsetuptools\n\twheel\n\tcython\n\t-rrequirements.txt\n\toldest-supported-numpy\n\ncommands = \n\tpython setup.py develop\n\tpython setup.py bdist_wheel"
  }
]