Repository: seung-lab/kimimaro Branch: master Commit: 66f488e8ff06 Files: 37 Total size: 504.3 KB Directory structure: gitextract_n_5c9fyh/ ├── .dockerignore ├── .github/ │ └── workflows/ │ ├── build_wheel.yml │ └── test.yml ├── .gitignore ├── AUTHORS ├── CITATION.cff ├── ChangeLog ├── LICENSE ├── MANIFEST.in ├── README.md ├── automated_test.py ├── benchmarks/ │ ├── README.md │ ├── benchmark.py │ └── kimimaro.numbers ├── build_linux.sh ├── ext/ │ └── skeletontricks/ │ ├── dijkstra_invalidation.hpp │ ├── libdivide.h │ ├── skeletontricks.hpp │ ├── skeletontricks.pyx │ └── unordered_dense.hpp ├── kimimaro/ │ ├── __init__.py │ ├── intake.py │ ├── post.py │ ├── sharedmemory.py │ ├── trace.py │ └── utility.py ├── kimimaro_cli/ │ ├── LICENSE │ ├── __init__.py │ └── codecs.py ├── manual_testing/ │ └── manual_test.py ├── manylinux2010.Dockerfile ├── manylinux2014.Dockerfile ├── pyproject.toml ├── requirements-dev.txt ├── requirements.txt ├── setup.py └── tox.ini ================================================ FILE CONTENTS ================================================ ================================================ FILE: .dockerignore ================================================ build *.egg-info benchmarks __pycache__ manual_testing .eggs .git .tox .pytest_cache ================================================ FILE: .github/workflows/build_wheel.yml ================================================ name: Build Wheels on: workflow_dispatch: push: tags: - '*' env: CIBW_SKIP: pp* *-musllinux* cp36* cp37* cp38* jobs: build_wheels: name: Build wheels on ${{matrix.arch}} for ${{ matrix.os }} runs-on: ${{ matrix.os }} strategy: matrix: os: [ubuntu-latest, windows-latest, macos-latest] arch: [auto] include: - os: ubuntu-latest arch: aarch64 steps: - uses: actions/checkout@v4 - name: Set up QEMU if: ${{ matrix.arch == 'aarch64' }} uses: docker/setup-qemu-action@v1 - name: Build wheels uses: pypa/cibuildwheel@v3.2.0 # to supply options, put them in 'env', like: env: 
CIBW_ARCHS_LINUX: ${{matrix.arch}} CIBW_BEFORE_BUILD: pip install numpy setuptools wheel cython CIBW_ARCHS_MACOS: "x86_64 arm64" - name: Upload built wheels uses: actions/upload-artifact@v4 with: name: built-wheels-${{ matrix.os }}-${{ matrix.arch }} path: ./wheelhouse/*.whl if-no-files-found: warn ================================================ FILE: .github/workflows/test.yml ================================================ # This workflow will install Python dependencies, run tests and lint with a variety of Python versions # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions name: Test Suite on: push: branches: [ master ] pull_request: branches: [ master ] jobs: build: runs-on: ubuntu-latest strategy: matrix: python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip pip install cython numpy setuptools wheel build pip install -r requirements.txt -r requirements-dev.txt python -m build --wheel pip install dist/*.whl - name: Test with pytest run: | python setup.py develop python -m pytest -v -x automated_test.py ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ test.py .DS_Store # IntelliJ .idea/ ext/skeletontricks/skeletontricks.cpp ================================================ FILE: AUTHORS ================================================ Jingpeng Wu William Silversmith ================================================ FILE: CITATION.cff ================================================ cff-version: 1.1.0 message: "If you use this software, please cite it as below." authors: - family-names: "Silversmith" given-names: "William" orcid: "https://orcid.org/0000-0002-5485-5341" - family-names: "Bae" given-names: "J. Alexander" orcid: "https://orcid.org/0000-0002-4681-6342" - family-names: "Li" given-names: "Peter H." orcid: "https://orcid.org/0000-0001-6193-4454" - family-names: "Wilson" given-names: "A.M." 
orcid: "https://orcid.org/0000-0002-3822-5200" title: "Kimimaro: Skeletonize densely labeled 3D image segmentations" version: 3.0.0 date-released: 2021-09-29 doi: 10.5281/zenodo.5539913 ================================================ FILE: ChangeLog ================================================ CHANGES ======= 2.0.2 ----- * test: faster execution for cube and solid color tests * fix(trace): skip adding DAF if max is 0 * test: check extremely sparse images (one or two voxels with no dust threshold) * chore: drop py35 testing add .dockerignore 2.0.1 ----- * fix(windows): use np.uintp before casting to size\_t * fix: appveyor needs numpy installed first * chore: new build system for binary distribution 2.0.0 ----- * fix(intake): solid color blocks were causing errors (#56) * perf: faster somas (#55) * fix: python3.8 compiles cpp code (#52) * chore: update travis to use python 3.7 and 3.8 * add python3.8 test 1.6.0 ----- * feat: avocado protection (🥑) (#43) * chore: update ChangeLog 1.5.0 ----- * chore: add skeleton for manual testing * feat: add fill\_holes argument (#50) 1.4.2 ----- * chore: loosen networkx requirement (#49) * Update README.md * docs: update memory usage diagram for version 1.4.0 1.4.1 ----- * perf: switch source and target for dijkstra 1.3.3 ----- * refactor: make type of 0L clear to std::max on Windows * Revert "fix: don't assume vertices are uint32" * fix: don't assume vertices are uint32 * chore: update ChangeLog 1.3.2 ----- * fix: several additional algorithms required 64-bit addressable changes 1.3.1 ----- * chore: bump dijkstra requirement * fix: 64-bit addressable \_roll\_invalidation\_cube (#42) * docs: shout out to fill\_voids * fix: remove unnecessary PIL import 1.3.0 ----- * docs: describe max\_paths in the function docstring * fix: soma center was being overridden by fix\_borders * perf: only recompute EDT for soma if some voxels were filled * perf: use bidirectional dijkstra on somata (increases peak memory usage) 1.2.1 ----- * docs: 
remove non-ascii character from README.md * docs: link back to papers using Kimimaro 1.2.0 ----- * docs: show how to use synapses\_to\_targets * feat: facility for converting synapse centroids into targets (#37) * refactor+perf: use new fill-voids package 1.1.0 ----- * perf: implemented flood fill based binary\_fill\_holes (#38) 1.0.4 ----- * perf: increase postprocess speed (#35) * perf: more judicious use of consolidate in postprocess 1.0.3 ----- * docs: update ChangeLog * fix: preserve skeleton id during postprocessing 1.0.2 ----- * fix: allow multiple invocations of a pathos process pool * perf: skip processing if dust\_threshold larger than image 1.0.1 ----- * fix: accept any root convertible to a tuple * fix: progress bars were disrupted in parallel feature * docs: upload changelog 1.0.0 ----- * feat: specify extra\_targets\_before and after (#33) * docs: fix spelling & grammar 0.7.0 ----- * docs: add parallel\_chunk\_size to README * perf+feat: Reduce Parallel Task Starvation + Better Parallel Progress Bar (#32) * docs: add example of join\_close\_components 0.6.0 ----- * feat: adds join\_close\_components to postprocess (#27) * docs: link to tutorial wiki articles * docs: add advice on tweaking parameters 0.5.4 ----- * fix: sometimes get\_mapping doesn't get everything 0.5.3 ----- * fix: object\_ids were being masked instead of mask\_excepted * docs: show performance chart for v0.5.2 0.5.2 ----- * perf: improve performance of find\_objects 7x 0.5.1 ----- * perf: ~20x faster unique(label, return\_counts=True) (#26) * docs: changelog update + small formatting adjustment to example 0.5.0 ----- * docs: example of how to use postprocess * feat: import out-of-core postprocessing logic from Igneous * docs: add object\_ids to example * perf: improve speed of skeletontricks.get\_mapping * fix: accept binary images of type bool * perf: take advantage of faster segid finding if dust\_threshold == 0 * fix: compilation warning for \_roll\_invalidation\_cube * test: add 
some manual visualization tests * chore: update ChangeLog 0.4.2 ----- * release: 0.4.2 * chore: tell PyPI we're using markdown * fix: ensure we pick max dbf close to centroid of detected somata * chore: update ChangeLog * docs: various corrections to the README 0.4.1 ----- * fix: add defense against setting the dust threshold lower than 1 * chore: formatting around all\_labels * test: x and y joinability * test: show that two 1px overlapping volumes join properly * Update README.md * feat: accept N-dimensional arrays with trivial dimensions above 3 * docs: add Google TEASAR run to bolster case for popularity * fix: prevent duplicate border targets * feat: parallel edt implementation * fix: add support for anisotropy to distance calculations * test: add distortion to border test * wip: propagate anisotropy to fix\_borders calls * fix: cuboid soma processing * fix: bump edt to 1.2.4 to correct part of large anisotropy issue * perf: faster masking operations with newer fastremap * docs: encouraged the use of parallel processing in README.md * chore: add GPLv3 classifier to setup.cfg * chore: add ChangeLog 0.4.0 ----- * feat: parallel implementation (Cursed Seal Mode) (#10) 0.3.1 ----- * fix: INTEGER type did not include all integers 0.3.0 ----- * docs: updated credits with fix\_borders * feat: add fix\_borders parameter & max\_paths parameter (#9) * test+fix: remove "cd python" * docs: add Travis CI badge * chore: add Travis CI * test: add basic test for skeletonizing diagonal of square and cube * perf: improve memory consumption of object masking * perf: introduce in\_place flag to make it safe to modify input data * perf: use fastremap's new in\_place flag for lower memory and perf * docs: updated credits 0.2.2 ----- * fix: accept C order arrays (#7) * docs: reduce redundancy in example vs performance * docs: add benchmark description * docs: added benchmark photo * docs: add link to citation 4 * docs: use citations 3 and 4 * docs: described "roll invalidation cube" 
* docs: described algorithm in steps * docs: describing the algorithm 0.2.1 ----- * fix: black volumes should return dict not None 0.2.0 ----- * docs: add PyPI badge * feat: fix branching (#1) * docs: adding sections to README 0.1.0 ----- * chore: clean up dockerfile and metadata * docs: draft discussion of motivation and usage * feat: export DimensionError exception (so it can be caught) * refactor: remove path\_downsample from trace function * docs: described parameters of skeletonize function * chore: files required for building distributions * wip: importing skeletonization procedure * Initial commit ================================================ FILE: LICENSE ================================================ GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The GNU General Public License is a free, copyleft license for software and other kinds of works. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. 
To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. Finally, every program is threatened constantly by software patents. 
States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. To prevent this, the GPL assures that patents cannot be used to render the program non-free. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. "This License" refers to version 3 of the GNU General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. 
An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. 
However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. 
Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. 
b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. 
b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. 
A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. 
But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 
Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. 
If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. 
Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. 
The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. 
"Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. 
If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Use with the GNU Affero General Public License. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. 
If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. 
If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . Also add information on how to contact you by electronic and paper mail. If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode: Copyright (C) This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. 
The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an "about box". You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see . The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read . ================================================ FILE: MANIFEST.in ================================================ recursive-include ext * include LICENSE ================================================ FILE: README.md ================================================ [![PyPI version](https://badge.fury.io/py/kimimaro.svg)](https://badge.fury.io/py/kimimaro) # Kimimaro: Skeletonize Densely Labeled Images ```bash # Produce SWC files from volumetric images. kimimaro forge labels.npy --progress # writes to ./kimimaro_out/ kimimaro view kimimaro_out/10.swc ``` Rapidly skeletonize all non-zero labels in 2D and 3D numpy arrays using a TEASAR derived method. The returned list of skeletons is in the format used by [cloud-volume](https://github.com/seung-lab/cloud-volume/wiki/Advanced-Topic:-Skeletons). A skeleton is a stick figure 1D representation of a 2D or 3D object that consists of a graph of vertices linked by edges. A skeleton where the vertices also carry a distance to the nearest boundary they were extracted from is called a "Medial Axis Transform", which Kimimaro provides. 
Skeletons are a compact representation that can be used to visualize objects, trace the connectivity of an object, or otherwise analyze the object's geometry. Kimimaro was designed for use with high resolution neurons extracted from electron microscopy data via AI segmentation, but it can be applied to many different fields. On an Apple Silicon M1 arm64 chip (Firestorm cores 3.2 GHz max frequency), this package processed a 512x512x100 volume with 333 labels in 20 seconds. It processed a 512x512x512 volume (`connectomics.npy`) with 2124 labels in 187 seconds.

A Densely Labeled Volume Skeletonized with Kimimaro
Fig. 1: A Densely Labeled Volume Skeletonized with Kimimaro

## `pip` Installation If a binary is available for your platform: ```bash pip install kimimaro # installs additional libraries to accelerate some # operations like join_close_components pip install "kimimaro[accel]" # Makes the kimimaro view command work pip install "kimimaro[view]" # Enables TIFF generation on the CLI pip install "kimimaro[tif]" # Enables reading NIBABEL, NRRD, TIFF, CRACKLE on the CLI pip install "kimimaro[all_formats]" # Install all optional dependencies pip install "kimimaro[all]" ``` Otherwise, you'll also need a C++ compiler: ```bash sudo apt-get install python3-dev g++ # ubuntu linux ``` ## Example

A Densely Labeled Volume Skeletonized with Kimimaro
Fig. 2: Memory Usage on a 512x512x512 Densely Labeled Volume (`connectomics.npy`)

Figure 2 shows the memory usage and processing time (~390 seconds, about 6.5 minutes) required when Kimimaro 1.4.0 was applied to a 512x512x512 cutout, *labels*, from a connectomics dataset, `connectomics.npy`, containing 2124 connected components. The different sections of the algorithm are depicted. Grossly, the preamble runs for about half a minute, skeletonization for about six minutes, and finalization within seconds. The peak memory usage was about 4.5 GB. The code below was used to process *labels*. The processing of the glia was truncated due to a combination of *fix_borders* and max_paths. Kimimaro has come a long way. Version 0.2.1 took over 15 minutes and had a Preamble run time twice as long on the same dataset. On a Macbook Pro M3, the same settings now complete in 94 seconds (1.6 minutes) on version 5.4.0. With xs3d 1.11.0, cross section analysis takes 215 seconds (3.6 minutes). ### Python Interface ```python # LISTING 1: Producing Skeletons from a labeled image. import kimimaro # To obtain this 512 MB segmentation sample volume: # pip install crackle-codec import crackle labels = crackle.load("benchmarks/connectomics.npy.ckl.gz") skels = kimimaro.skeletonize( labels, teasar_params={ "scale": 1.5, "const": 300, # physical units "pdrf_scale": 100000, "pdrf_exponent": 4, "soma_acceptance_threshold": 3500, # physical units "soma_detection_threshold": 750, # physical units "soma_invalidation_const": 300, # physical units "soma_invalidation_scale": 2, "max_paths": 300, # default None }, # object_ids=[ ... 
], # process only the specified labels # extra_targets_before=[ (27,33,100), (44,45,46) ], # target points in voxels # extra_targets_after=[ (27,33,100), (44,45,46) ], # target points in voxels dust_threshold=1000, # skip connected components with fewer than this many voxels anisotropy=(16,16,40), # default (1,1,1) fix_branching=True, # default True fix_borders=True, # default True fill_holes=False, # default False fix_avocados=False, # default False progress=True, # default False, show progress bar parallel=1, # <= 0 all cpu, 1 single process, 2+ multiprocess parallel_chunk_size=100, # how many skeletons to process before updating progress bar ) # LISTING 2: Combining skeletons produced from # adjacent or overlapping images. import kimimaro from osteoid import Skeleton skels = ... # a set of skeletons produced from the same label id skel = Skeleton.simple_merge(skels).consolidate() skel = kimimaro.postprocess( skel, dust_threshold=1000, # physical units tick_threshold=3500 # physical units ) # LISTING 3: Adding cross sectional area to skeletons # Cross section planes are defined by normal vectors. Those # vectors come from the difference between adjacent vertices. skels = ... # one or more skeletons produced from a single image skels = kimimaro.cross_sectional_area( labels, skels, anisotropy=(16,16,40), smoothing_window=5, # rolling average window of plane normals progress=True, ) skel = skels[0] skel.cross_sectional_area # array of cross sectional areas skel.cross_sectional_area_contacts # non-zero contacted the image border # Split input skeletons into connected components and # then join the two nearest vertices within `radius` distance # of each other until there is only a single connected component # or no pairs of points nearer than `radius` exist. # Fuse all remaining components into a single skeleton. 
skel = kimimaro.join_close_components([skel1, skel2], radius=1500) # 1500 units threshold skel = kimimaro.join_close_components([skel1, skel2], radius=None) # no threshold # Given synapse centroids (in voxels) and the SWC integer label you'd # like to assign (e.g. for pre-synaptic and post-synaptic) this finds the # nearest voxel to the centroid for that label. # Input: { label: [ ((x,y,z), swc_label), ... ] } # Returns: { (x,y,z): swc_label, ... } extra_targets = kimimaro.synapses_to_targets(labels, synapses) # LISTING 4: Drawing a centerline between # preselected points on a binary image. # This is a much simpler option for when # you know exactly what you want, but may # be less efficient for large scale processing. skel = kimimaro.connect_points( labels == 67301298, start=(3, 215, 202), end=(121, 426, 227), anisotropy=(32,32,40), ) # LISTING 5: Using skeletons to oversegment existing # segmentations for integration into proofreading systems # that rely on merging atomic labels. oversegmented_labels # is returned numbered from 1. skels is a copy returned # with the property skel.segments that associates a label # to each vertex (labels will not be unique if downsampling # is used) oversegmented_labels, skels = kimimaro.oversegment( labels, skels, anisotropy=(32,32,40), downsample=10, ) ``` `connectomics.npy` is multilabel connectomics data derived from pinky40, a 2018 experimental automated segmentation of ~1.5 million cubic micrometers of mouse visual cortex. It is an early predecessor to the now public pinky100_v185 segmentation that can be found at https://microns-explorer.org/phase1 You will need to run `lzma -d connectomics.npy.lzma` to obtain the 512x512x512 uint32 volume at 32x32x40 nm<sup>3</sup> resolution. ### CLI Interface The CLI supports producing skeletons from a single image as SWCs and viewing the resulting SWC files one at a time. By default, the SWC files are written to `./kimimaro_out/$LABEL.swc`. Here's an equivalent example to the code above. 
```bash kimimaro forge labels.npy --scale 4 --const 10 --soma-detect 1100 --soma-accept 3500 --soma-scale 1 --soma-const 300 --anisotropy 16,16,40 --fix-borders --progress ``` Visualize your data: ```bash kimimaro view 1241241.swc # visualize skeleton kimimaro view labels.npy # visualize segmentation ``` It can also convert binary image skeletons produced by thinning algorithms into SWC files and back. This can be helpful for comparing different skeletonization algorithms or even just using their results. ```bash kimimaro swc from binary_image.tiff # -> binary_image.swc kimimaro swc to --format tiff binary_image.swc # -> binary_image.tiff or npy ``` ## Tweaking `kimimaro.skeletonize` Parameters This algorithm works by finding a root point on a 3D object and then serially tracing paths via Dijkstra's shortest path algorithm through a penalty field to the most distant unvisited point. After each pass, there is a sphere (really a circumscribing cube) that expands around each vertex in the current path that marks part of the object as visited. For a visual tutorial on the basics of the skeletonization procedure, check out this wiki article: [A Pictorial Guide to TEASAR Skeletonization](https://github.com/seung-lab/kimimaro/wiki/A-Pictorial-Guide-to-TEASAR-Skeletonization) For more detailed information, [read below](https://github.com/seung-lab/kimimaro#ii-skeletonization) or the [TEASAR paper](https://ieeexplore.ieee.org/abstract/document/883951/) (though we [deviate from TEASAR](https://github.com/seung-lab/kimimaro#teasar-derived-algorithm) in a few places). [1] ### `scale` and `const` Usually, the most important parameters to tweak are `scale` and `const` which control the radius of this invalidation sphere according to the equation `r(x,y,z) = scale * DBF(x,y,z) + const` where the dimensions are physical (e.g. nanometers, i.e. corrected for anisotropy). `DBF(x,y,z)` is the physical distance from the shape boundary at that point. 
Check out this [wiki article](https://github.com/seung-lab/kimimaro/wiki/Intuition-for-Setting-Parameters-const-and-scale) to help refine your intuition. ### `anisotropy` Represents the physical dimension of each voxel. For example, a connectomics dataset might be scanned with an electron microscope at 4nm x 4nm per pixel and stacked in slices 40nm thick, i.e. `anisotropy=(4,4,40)`. You can use any units so long as you are consistent. ### `dust_threshold` This threshold culls connected components that are smaller than this many voxels. ### `extra_targets_after` and `extra_targets_before` `extra_targets_after` provides additional voxel targets to trace to after the morphological tracing algorithm completes. For example, you might add known synapse locations to the skeleton. `extra_targets_before` is the same as `extra_targets_after` except that the additional targets are front-loaded and the paths that they cover are invalidated. This may affect the results of subsequent morphological tracing. ### `max_paths` Limits the number of paths that can be drawn for the given label. Certain cells, such as glia, that may not be important for the current analysis may be expensive to process and can be aborted early. ### `pdrf_scale` and `pdrf_exponent` The `pdrf_scale` and `pdrf_exponent` represent parameters to the penalty equation that takes the euclidean distance field (**D**) and augments it so that cutting closer to the border is heavily penalized to make Dijkstra take paths that are more centered. P<sub>r</sub> = `pdrf_scale` * (1 - **D** / max(**D**))<sup>`pdrf_exponent`</sup> + (directional gradient < 1.0). The default settings should work fairly well, but under large anisotropies or with cavernous morphologies, it's possible that you might need to tweak them. If you see the skeleton go haywire inside a large area, it could be a collapse of floating point precision. 
### `soma_acceptance_threshold` and `soma_detection_threshold` We process somas specially because they do not have a tubular geometry and instead should be represented in a hub and spoke manner. `soma_acceptance_threshold` is the physical radius (e.g. in nanometers) beyond which we classify a connected component of the image as containing a soma. The distance transform's output is depressed by holes in the label, which are frequently produced by segmentation algorithms on somata. We can fill them, but the hole filling algorithm we use is slow so we would like to only apply it occasionally. Therefore, we set a lower threshold, the `soma_detection_threshold`, beyond which we fill the holes and retest the soma. ### `soma_invalidation_scale` and `soma_invalidation_const` Once we have classified a region as a soma, we fix the root of the skeletonization algorithm at one of the points of maximum distance from the boundary (usually there is only one). We then mark as visited all voxels around that point in a spherical radius described by `r(x,y,z) = soma_invalidation_scale * DBF(x,y,z) + soma_invalidation_const` where DBF(x,y,z) is the physical distance from the shape boundary at that point. If done correctly, this can prevent skeletons from being drawn to the boundaries of the soma, and instead pulls the skeletons mainly into the processes extending from the cell body. ### `fix_borders` This feature makes it easier to connect the skeletons of adjacent image volumes that do not fit in RAM. If enabled, skeletons will be deterministically drawn to the approximate center of the 2D contact area of each place where the shape contacts the border. This can affect the performance of the operation positively or negatively depending on the shape and number of contacts. ### `fix_branching` You'll probably never want to disable this, but base TEASAR is infamous for forking the skeleton at branch points way too early. 
This option makes it preferential to fork at a more reasonable place at a significant performance penalty. ### `fill_holes` _Warning: This will remove input labels that are deemed to be holes._ If your segmentation contains artifacts that cause holes to appear in labels, you can preprocess the entire image to eliminate background holes and holes caused by entirely contained inclusions. This option adds a moderate amount of additional processing time at the beginning (perhaps ~30%). ### `fix_avocados` Avocados are segmentations of cell somata that classify the nucleus separately from the cytoplasm. This is a common problem in automatic segmentations due to the visual similarity of a cell membrane and a nuclear membrane combined with insufficient context. Skeletonizing an avocado results in a poor skeletonization of the cell soma that will disconnect the nucleus and usually results in too many paths traced around the nucleus. Setting `fix_avocados=True` attempts to detect and fix these problems. Currently we handle non-avocados, avocados, cells with inclusions, and nested avocados. You can see examples [here](https://github.com/seung-lab/kimimaro/pull/43). ### `progress` Show a progress bar once the skeletonization phase begins. ### `parallel` Use a pool of processors to skeletonize faster. Each process allocatable task is the skeletonization of one connected component (so it won't help with a single label that takes a long time to skeletonize). This option also affects the speed of the initial euclidean distance transform, which is parallel enabled and is the most expensive part of the Preamble (described below). ### `parallel_chunk_size` This only applies when using parallel. This sets the number of skeletons a subprocess will extract before returning control to the main thread, updating the progress bar, and acquiring a new task. If this value is set too low (e.g. < 10-20) the cost of interprocess communication can become significant and even dominant. 
If it is set too high, task starvation may occur for the other subprocesses if a subprocess gets a particularly hard skeleton and they complete quickly. Progress bar updates will be infrequent if the value is too high as well. The actual chunk size used will be `min(parallel_chunk_size, len(cc_labels) // parallel)`. `len(cc_labels)` is the number of connected components in the sample. ### Performance Tips - If you only need a few labels skeletonized, pass in `object_ids` to bypass processing all the others. If `object_ids` contains only a single label, the masking operation will run faster. - Larger TEASAR parameters scale and const require processing larger invalidation regions per path. - Set `pdrf_exponent` to a small power of two (e.g. 1, 2, 4, 8, 16) for a small speedup. - If you are willing to sacrifice the improved branching behavior, you can set `fix_branching=False` for a moderate 1.1x to 1.5x speedup (assuming your TEASAR parameters and data allow branching). - If your dataset contains important cells (that may in fact be the seat of consciousness) but they take significant processing power to analyze, you can save them to savor for later by setting `max_paths` to some reasonable level which will abort and proceed to the next label after the algorithm detects that at least that many paths will be needed. - Parallel distributes work across connected components and is generally a good idea if you have the cores and memory. Not only does it make single runs proceed faster, but you can also practically use a much larger context; that improves soma processing as they are less likely to be cut off. The Preamble of the algorithm (detailed below) is still single threaded at the moment, so task latency increases with size. - If `parallel_chunk_size` is set very low (e.g. < 10) during parallel operation, interprocess communication can become a significant overhead. Try raising this value. 
## Motivation The connectomics field commonly generates very large densely labeled volumes of neural tissue. Skeletons are one dimensional representations of two or three dimensional objects. They have many uses, a few of which are visualization of neurons, calculating global topological features, rapidly measuring electrical distances between objects, and imposing tree structures on neurons (useful for computation and user interfaces). There are several ways to compute skeletons and a few ways to define them [4]. After some experimentation, we found that the TEASAR [1] approach gave fairly good results. Other approaches include topological thinning ("onion peeling") and finding the centerline described by maximally inscribed spheres. Ignacio Arganda-Carreras, an alumnus of the Seung Lab, wrote a topological thinning plugin for Fiji called [Skeletonize3d](https://imagej.net/Skeletonize3D). There are several implementations of TEASAR used in the connectomics field [3][5], however it is commonly understood that implementations of TEASAR are slow and can use tens of gigabytes of memory. Our goal to skeletonize all labels in a petavoxel scale image quickly made clear that existing sparse implementations are impractical. While adapting a sparse approach to a cloud pipeline, we noticed that there are inefficiencies in repeated evaluation of the Euclidean Distance Transform (EDT), the repeated evaluation of the connected components algorithm, in the construction of the graph used by Dijkstra's algorithm where the edges are implied by the spatial relationships between voxels, in the memory cost, quadratic in the number of voxels, of representing a graph that is implicit in the image, in the unnecessarily large data type used to represent relatively small cutouts, and in the repeated downloading of overlapping regions. 
We also found that the naive implementation of TEASAR's "rolling invalidation ball" unnecessarily reevaluated large numbers of voxels in a way that could be loosely characterized as quadratic in the skeleton path length. We further found that commodity implementations of the EDT supported only binary images. We were unable to find any available Python or C++ libraries for performing Dijkstra's shortest path on an image. Commodity implementations of connected components algorithms for images supported only binary images. Therefore, several libraries were devised to remedy these deficits (see Related Projects). ## Why TEASAR? TEASAR: Tree-structure Extraction Algorithm for Accurate and Robust skeletons, a 2000 paper by M. Sato and others [1], is a member of a family of algorithms that transform two and three dimensional structures into a one dimensional "skeleton" embedded in that higher dimension. One might conceive of a skeleton as extracting a stick figure drawing from a binary image. This problem is more difficult than it might seem. There are different situations one must consider when making such a drawing. For example, a stick drawing of a banana might merely be a curved centerline and a drawing of a doughnut might be a closed loop. In our case of analyzing neurons, sometimes we want the skeleton to include spines, short protrusions from dendrites that usually have synapses attached, and sometimes we want only to characterize the run length of the main trunk of a neurite. Additionally, data quality issues can be challenging as well. If one is skeletonizing a 2D image of a doughnut, but the angle were sufficiently declinated from the ring's orthogonal axis, would it even be possible to perform this task accurately? In a 3D case, if there are breaks or mergers in the labeling of a neuron, will the algorithm function sensibly? These issues are common in both manual and automatic image segmentations.
In our problem domain of skeletonizing neurons from anisotropic voxel labels, our chosen algorithm should produce tree structures, handle fine or coarse detail extraction depending on the circumstances, handle voxel anisotropy, and be reasonably efficient in CPU and memory usage. TEASAR fulfills these criteria. Notably, TEASAR doesn't guarantee the centeredness of the skeleton within the shape, but it makes an effort. The basic TEASAR algorithm is known to cut corners around turns and branch too early. A 2001 paper by members of the original TEASAR team describes a method for reducing the early branching issue on page 204, section 4.2.2. [2] ## TEASAR Derived Algorithm We implemented TEASAR but made several deviations from the published algorithm in order to improve path centeredness, increase performance, handle bulging cell somas, and enable efficient chunked evaluation of large images. We opted not to implement the gradient vector field step from [2] as our implementation is already quite fast. The paper claims a reduction of 70-85% in input voxels, so it might be worth investigating. In order to work with images that contain many labels, our general strategy is to perform as many actions as possible in such a way that all labels are treated in a single pass. Several of the component algorithms (e.g. connected components, euclidean distance transform) in our implementation can take several seconds per pass, so it is important that they not be run hundreds or thousands of times. A large part of the engineering contribution of this package lies in the efficiency of these operations which reduce the runtime from the scale of hours to minutes. Given a 3D labeled voxel array, *I*, with N >= 0 labels, and an ordered triple describing voxel anisotropy *A*, our algorithm can be divided into three phases, the preamble, skeletonization, and finalization in that order. ### I.
Preamble The Preamble takes a 3D image containing *N* labels and efficiently generates the connected components, distance transform, and bounding boxes needed by the skeletonization phase. 1. To enhance performance, if *N* is 0 return an empty set of skeletons. 2. Label the M connected components, *Icc*, of *I*. 3. To save memory, renumber the connected components in order from 1 to *M*. Adjust the data type of the new image to the smallest uint type that will contain *M* and overwrite *Icc*. 4. Generate a mapping of the renumbered *Icc* to *I* to assign meaningful labels to skeletons later on and delete *I* to save memory. 5. Compute *E*, the multi-label anisotropic Euclidean Distance Transform of *Icc* given *A*. *E* treats all interlabel edges as transform edges, but not the boundaries of the image. Black pixels are considered background. 6. Gather a list, *Lcc* of unique labels from *Icc* and threshold which ones to process based on the number of voxels they represent to remove "dust". 7. In one pass, compute the list of bounding boxes, *B*, corresponding to each label in *Lcc*. ### II. Skeletonization In this phase, we extract the tree structured skeleton from each connected component label. Below, we reference variables defined in the Preamble. For clarity, we omit the soma specific processing and hold `fix_branching=True`. For each label *l* in *Lcc* and *B*... 1. Extract *Il*, the cropped binary image tightly enclosing *l* from *Icc* using *Bl* 2. Using *Il* and *Bl*, extract *El* from *E*. *El* is the cropped tightly enclosed EDT of *l*. This is much faster than recomputing the EDT for each binary image. 3. Find an arbitrary foreground voxel and using that point as a source, compute the anisotropic euclidean distance field for *Il*. The coordinate of the maximum value is now "the root" *r*. 4. From *r*, compute the euclidean distance field and save it as the distance from root field *Dr*. 5. 
Compute the penalized distance from root field *Pr* = `pdrf_scale` * ((1 - *El* / max(*El*)) ^ `pdrf_exponent`) + *Dr* / max(*Dr*). 6. While *Il* contains foreground voxels: 1. Identify a target coordinate, *t*, as the foreground voxel with maximum distance in *Dr* from *r*. 2. Draw the shortest path *p* from *r* to *t* considering the voxel values in *Pr* as edge weights. 3. For each vertex *v* in *p*, extend an invalidation cube of physical side length computed as `scale` * *El*(*v*) + `const` and convert any foreground pixels in *Il* that overlap with these cubes to background pixels. 4. (Only if `fix_branching=True`) For each vertex coordinate *v* in *p*, set *Pr*(*v*) = 0. 5. Append *p* to a list of paths for this label. 7. Using *El*, extract the distance to the nearest boundary each vertex in the skeleton represents. 8. For each raw skeleton extracted from *Il*, translate the vertices by *Bl* to correct for the translation the cropping operation induced. 9. Multiply the vertices by the anisotropy *A* to place them in physical space. If soma processing is considered, we modify the root (*r*) search process as follows: 1. If max(*El*) > `soma_detection_threshold`... 1. Fill topological holes in *Il*. Soma are large regions that often have dust from imperfect automatic labeling methods. 2. Recompute *El* from this cleaned up image. 3. If max(*El*) > `soma_acceptance_threshold`, divert to soma processing mode. 2. If in soma processing mode, continue, else go to step 3 in the algorithm above. 3. Set *r* to the coordinate corresponding to max(*El*) 4. Create an invalidation sphere of physical radius `soma_invalidation_scale` * max(*El*) + `soma_invalidation_const` and erase foreground voxels from *Il* contained within it. This helps prevent errant paths from being drawn all over the soma. 5. Continue from step 4 in the above algorithm. ### III.
Finalization In the final phase, we agglomerate the disparate connected component skeletons into single skeletons and assign their labels corresponding to the input image. This step is artificially broken out compared to how intermingled its implementation is with skeletonization, but it's conceptually separate. ## Deviations from TEASAR There were several places where we took a different approach than called for by the TEASAR authors. ### Using DAF for Targets, PDRF for Pathfinding The original TEASAR algorithm defines the Penalized Distance from Root voxel Field (PDRF, *Pr* above) as: ``` PDRF = 5000 * (1 - DBF / max(DBF))^16 + DAF ``` DBF is the Distance from Boundary Field (*El* above) and DAF is the Distance from Any voxel Field (*Dr* above). We found the addition of the DAF tended to perturb the skeleton path from the centerline better described by the inverted DBF alone. We also found it helpful to modify the constant and exponent to tune cornering behavior. Initially, we completely stripped out the addition of the DAF from the PDRF, but this introduced a different kind of problem. The exponentiation of the PDRF caused floating point values to collapse in wide open spaces. This made the skeletons go crazy as they traced out a path described by floating point errors. The DAF provides a very helpful gradient to follow between the root and the target voxel, we just don't want that gradient to knock the path off the centerline. Therefore, in light of the fact that the PDRF base field is very large, we add the normalized DAF which is just enough to overwhelm floating point errors and provide direction in wide tubes and bulges. The original paper also called for selecting targets using the max(PDRF) foreground values. However, this is a bit strange since the PDRF values are dominated by boundary effects rather than a pure distance metric. Therefore, we select targets from the max(DAF) foreground value.
### Zero Weighting Previous Paths (`fix_branching=True`) The 2001 skeletonization paper [2] called for correcting early forking by computing a DAF using already computed path vertices as field sources. This allows Dijkstra's algorithm to trace the existing path cost free and diverge from it at a closer point to the target. As we have strongly deemphasized the role of the DAF in dijkstra path finding, computing this field is unnecessary and we only need to set the PDRF to zero along the path of existing skeletons to achieve this effect. This saves us an expensive repeated DAF calculation per path. However, we still incur a substantial cost for taking this approach because we had been computing a dijkstra "parental field" that recorded the shortest path to the root from every foreground voxel. We then used this saved result to rapidly compute all paths. However, as this zero weighting modification makes successive calculations dependent upon previous ones, we need to compute Dijkstra's algorithm anew for each path. ### Non-Overlapped Chunked Processing (`fix_borders=True`) When processing large volumes, a sensible approach for mass producing skeletons is to chunk the volume, process the chunks independently, and merge the resulting skeleton fragments at the end. However, this is complicated by the "edge effect" induced by a loss of context which makes it impossible to expect the endpoints of skeleton fragments produced by adjacent chunks to align. In contrast, it is easy to join mesh fragments because the vertices of the edge of mesh fragments lie at predictable identical locations given one pixel of overlap. Previously, we had used 50% overlap to join adjacent skeleton fragments which increased the compute cost of skeletonizing a large volume by eight times. However, if we could force skeletons to lie at predictable locations on the border, we could use single pixel overlap and copy the simple mesh joining approach. 
As an (incorrect but useful) intuition for how one might go about this, consider computing the centroid of each connected component on each border plane and adding that as a required path target. This would guarantee that both sides of the plane connect at the same pixel. However, the centroid may not lie inside of non-convex hulls so we have to be more sophisticated and select some real point inside of the shape. To this end, we again repurpose the euclidean distance transform and apply it to each of the six planes of connected components and select the maximum value as a mandatory target. This works well for many types of objects that contact a single plane and have a single maximum. However, we must treat the corners of the box and shapes that have multiple maxima. To handle shapes that contact multiple sides of the box, we simply assign targets to all connected components. If this introduces a cycle in post-processing, we already have cycle removing code to handle it in Igneous. If it introduces tiny useless appendages, we also have code to handle this. If a shape has multiple distance transform maxima, it is important to choose the same pixel without needing to communicate between spatially adjacent tasks which may run at different times on different machines. Additionally, the same plane on adjacent tasks has the coordinate system flipped. One simple approach might be to pick the coordinate with minimum x and y (or some other coordinate based criterion) in one of the coordinate frames, but this requires tracking the flips on all six planes and is annoying. Instead, we use a series of coordinate-free topology based filters which is both more fun, effort efficient, and picks something reasonable looking. A valid criticism of this approach is that it will fail on a perfectly symmetrical object, but these objects are rare in biological data. We apply a series of filters and pick the point based on the first filter it passes: 1. 
The voxel closest to the centroid of the current label. 2. The voxel closest to the centroid of the image plane. 3. Closest to a corner of the plane. 4. Closest to an edge of the plane. 5. The previously found maxima. It is important that filter #1 be based on the shape of the label so that kinks are minimized for convex hulls. For example, originally we used only filters two thru five, but this caused skeletons for neurites located away from the center of a chunk to suddenly jink towards the center of the chunk at chunk boundaries. ## Related Projects Several classic algorithms had to be specially tuned to make this module possible. 1. [edt](https://github.com/seung-lab/euclidean-distance-transform-3d): A single pass, multi-label anisotropy supporting euclidean distance transform implementation. 2. [dijkstra3d](https://github.com/seung-lab/dijkstra3d): Dijkstra's shortest-path algorithm defined on 26-connected 3D images. This avoids the time cost of edge generation and wasted memory of a graph representation. 3. [connected-components-3d](https://github.com/seung-lab/connected-components-3d): A connected components implementation defined on 26-connected 3D images with multiple labels. 4. [fastremap](https://github.com/seung-lab/fastremap): Allows high speed renumbering of labels from 1 in a 3D array in order to reduce memory consumption caused by unnecessarily large 32 and 64-bit labels. 5. [fill_voids](https://github.com/seung-lab/fill_voids): High speed binary_fill_holes. 6. [xs3d](https://github.com/seung-lab/cross-section): Cross section analysis of 3D images. This module was originally designed to be used with CloudVolume and Igneous. 1. [CloudVolume](https://github.com/seung-lab/cloud-volume): Serverless client for reading and writing petascale chunked images of neural tissue, meshes, and skeletons. 2. [Igneous](https://github.com/seung-lab/igneous/tree/master/igneous): Distributed computation for visualizing connectomics datasets.
Some of the TEASAR modifications used in this package were first demonstrated by Alex Bae. 1. [skeletonization](https://github.com/seung-lab/skeletonization): Python implementation of modified TEASAR for sparse labels. ## Credits Alex Bae developed the precursor skeletonization package and several modifications to TEASAR that we use in this package. Alex also developed the postprocessing approach used for stitching skeletons using 50% overlap. Will Silversmith adapted these techniques for mass production, refined several basic algorithms for handling thousands of labels at once, and rewrote them into the Kimimaro package. Will added trickle DAF, zero weighted previously explored paths, and fixing borders to the algorithm. A.M. Wilson and Will designed the nucleus/soma "avocado" fuser. Forrest Collman added parameter flexibility and helped tune DAF computation performance. Sven Dorkenwald and Forrest both provided helpful discussions and feedback. Peter Li redesigned the target selection algorithm to avoid bilinear performance on complex cells. ## Acknowledgments We are grateful to our partners in the Seung Lab, the Allen Institute for Brain Science, and the Baylor College of Medicine for providing the data and problems necessitating this library. This research was supported by the Intelligence Advanced Research Projects Activity (IARPA) via Department of Interior/ Interior Business Center (DoI/IBC) contract number D16PC0005, NIH/NIMH (U01MH114824, U01MH117072, RF1MH117815), NIH/NINDS (U19NS104648, R01NS104926), NIH/NEI (R01EY027036), and ARO (W911NF-12-1-0594). The U.S. Government is authorized to reproduce and distribute reprints for Governmental purposes notwithstanding any copyright annotation thereon. Disclaimer: The views and conclusions contained herein are those of the authors and should not be interpreted as necessarily representing the official policies or endorsements, either expressed or implied, of IARPA, DoI/IBC, or the U.S. Government. 
We are grateful for assistance from Google, Amazon, and Intel. ## Papers Using Kimimaro Please cite Kimimaro using the CITATION.cff file located in this repository. The below list is not comprehensive and is sourced from collaborators or found using internet searches and does not constitute an endorsement except to the extent that they used it for their work. 1. A.M. Wilson, R. Schalek, A. Suissa-Peleg, T.R. Jones, S. Knowles-Barley, H. Pfister, J.M. Lichtman. "Developmental Rewiring between Cerebellar Climbing Fibers and Purkinje Cells Begins with Positive Feedback Synapse Addition". Cell Reports. Vol. 29, Iss. 9, November 2019. Pgs. 2849-2861.e6 doi: 10.1016/j.celrep.2019.10.081 ([link](https://www.cell.com/cell-reports/fulltext/S2211-1247(19)31403-2)) 2. S. Dorkenwald, N.L. Turner, T. Macrina, K. Lee, R. Lu, J. Wu, A.L. Bodor, A.A. Bleckert, D. Brittain, N. Kemnitz, W.M. Silversmith, D. Ih, J. Zung, A. Zlateski, I. Tartavull, S. Yu, S. Popovych, W. Wong, M. Castro, C. S. Jordan, A.M. Wilson, E. Froudarakis, J. Buchanan, M. Takeno, R. Torres, G. Mahalingam, F. Collman, C. Schneider-Mizell, D.J. Bumbarger, Y. Li, L. Becker, S. Suckow, J. Reimer, A.S. Tolias, N. Maçarico da Costa, R. C. Reid, H.S. Seung. "Binary and analog variation of synapses between cortical pyramidal neurons". bioRXiv. December 2019. doi: 10.1101/2019.12.29.890319 ([link](https://www.biorxiv.org/content/10.1101/2019.12.29.890319v1.full)) 3. N.L. Turner, T. Macrina, J.A. Bae, R. Yang, A.M. Wilson, C. Schneider-Mizell, K. Lee, R. Lu, J. Wu, A.L. Bodor, A.A. Bleckert, D. Brittain, E. Froudarakis, S. Dorkenwald, F. Collman, N. Kemnitz, D. Ih, W.M. Silversmith, J. Zung, A. Zlateski, I. Tartavull, S. Yu, S. Popovych, S. Mu, W. Wong, C.S. Jordan, M. Castro, J. Buchanan, D.J. Bumbarger, M. Takeno, R. Torres, G. Mahalingam, L. Elabbady, Y. Li, E. Cobos, P. Zhou, S. Suckow, L. Becker, L. Paninski, F. Polleux, J. Reimer, A.S. Tolias, R.C. Reid, N. Maçarico da Costa, H.S. Seung. 
"Multiscale and multimodal reconstruction of cortical structure and function". bioRxiv. October 2020; doi: 10.1101/2020.10.14.338681 ([link](https://www.biorxiv.org/content/10.1101/2020.10.14.338681v3)) 4. P.H. Li, L.F. Lindsey, M. Januszewski, Z. Zheng, A.S. Bates, I. Taisz, M. Tyka, M. Nichols, F. Li, E. Perlman, J. Maitin-Shepard, T. Blakely, L. Leavitt, G. S.X.E. Jefferis, D. Bock, V. Jain. "Automated Reconstruction of a Serial-Section EM Drosophila Brain with Flood-Filling Networks and Local Realignment". bioRxiv. October 2020. doi: 10.1101/605634 ([link](https://www.biorxiv.org/content/10.1101/605634v3)) ## References 1. M. Sato, I. Bitter, M.A. Bender, A.E. Kaufman, and M. Nakajima. "TEASAR: Tree-structure Extraction Algorithm for Accurate and Robust Skeletons". Proc. 8th Pacific Conf. on Computer Graphics and Applications. Oct. 2000. doi: 10.1109/PCCGA.2000.883951 ([link](https://ieeexplore.ieee.org/abstract/document/883951/)) 2. I. Bitter, A.E. Kaufman, and M. Sato. "Penalized-distance volumetric skeleton algorithm". IEEE Transactions on Visualization and Computer Graphics Vol. 7, Iss. 3, Jul-Sep 2001. doi: 10.1109/2945.942688 ([link](https://ieeexplore.ieee.org/abstract/document/942688/)) 3. T. Zhao, S. Plaza. "Automatic Neuron Type Identification by Neurite Localization in the Drosophila Medulla". Sept. 2014. arXiv:1409.1892 \[q-bio.NC\] ([link](https://arxiv.org/abs/1409.1892)) 4. A. Tagliasacchi, T. Delame, M. Spagnuolo, N. Amenta, A. Telea. "3D Skeletons: A State-of-the-Art Report". May 2016. Computer Graphics Forum. Vol. 35, Iss. 2. doi: 10.1111/cgf.12865 ([link](https://onlinelibrary.wiley.com/doi/full/10.1111/cgf.12865)) 5. P. Li, L. Lindsey, M. Januszewski, Z. Zheng, A. Bates, I. Taisz, M. Tyka, M. Nichols, F. Li, E. Perlman, J. Maitin-Shepard, T. Blakely, L. Leavitt, G. Jefferis, D. Bock, V. Jain. "Automated Reconstruction of a Serial-Section EM Drosophila Brain with Flood-Filling Networks and Local Realignment". April 2019. bioRXiv. 
doi: 10.1101/605634 ([link](https://www.biorxiv.org/content/10.1101/605634v1)) 6. M.M. McKerns, L. Strand, T. Sullivan, A. Fang, M.A.G. Aivazis, "Building a framework for predictive science", Proceedings of the 10th Python in Science Conference, 2011; http://arxiv.org/pdf/1202.1056 7. Michael McKerns and Michael Aivazis, "pathos: a framework for heterogeneous computing", 2010- ; http://trac.mystic.cacr.caltech.edu/project/pathos ================================================ FILE: automated_test.py ================================================ import pytest import edt import numpy as np from osteoid import Skeleton import kimimaro.intake import kimimaro.post import kimimaro.skeletontricks from kimimaro.utility import moving_average, cross_sectional_area @pytest.fixture def connectomics_data(): import crackle return crackle.load("benchmarks/connectomics.npy.ckl.gz") def test_empty_image(): labels = np.zeros( (256, 256, 256), dtype=bool) skels = kimimaro.skeletonize(labels, fix_borders=True) assert len(skels) == 0 def test_very_sparse_image(): labels = np.zeros( (64, 64, 64), dtype=bool) labels[5,5,5] = True labels[6,5,5] = True labels[20,20,20] = True skels = kimimaro.skeletonize(labels, dust_threshold=0) # single voxels don't get skeletonized assert len(skels) == 1 def test_solid_image(): labels = np.ones( (128, 128, 128), dtype=bool) skels = kimimaro.skeletonize(labels, fix_borders=True) assert len(skels) == 1 def test_binary_image(): labels = np.ones( (256, 256, 3), dtype=bool) labels[-1,0] = 0 labels[0,-1] = 0 skels = kimimaro.skeletonize(labels, fix_borders=False) assert len(skels) == 1 @pytest.mark.parametrize('fill_holes', (True, False)) def test_square(fill_holes): labels = np.ones( (1000, 1000), dtype=np.uint8) labels[-1,0] = 0 labels[0,-1] = 0 teasar_params = { "scale": 1.5, "const": 300, "pdrf_scale": 100000, "pdrf_exponent": 4, "soma_acceptance_threshold": 3500, "soma_detection_threshold": 750, "soma_invalidation_const": 300, 
"soma_invalidation_scale": 2 } skels = kimimaro.skeletonize(labels, teasar_params=teasar_params, fix_borders=False, fill_holes=fill_holes) assert len(skels) == 1 skel = skels[1] assert skel.vertices.shape[0] == 1000 assert skel.edges.shape[0] == 999 assert abs(skel.cable_length() - 999 * np.sqrt(2)) < 0.001 assert skel.space == 'physical' labels = np.ones( (1000, 1000), dtype=np.uint8) labels[0,0] = 0 labels[-1,-1] = 0 skels = kimimaro.skeletonize(labels, teasar_params=teasar_params, fix_borders=False, fill_holes=fill_holes) assert len(skels) == 1 skel = skels[1] assert skel.vertices.shape[0] == 1000 assert skel.edges.shape[0] == 999 assert abs(skel.cable_length() - 999 * np.sqrt(2)) < 0.001 assert skel.space == 'physical' def test_cube(): labels = np.ones( (128, 128, 128), dtype=np.uint8) labels[0, 0, 0] = 0 labels[-1, -1, -1] = 0 skels = kimimaro.skeletonize(labels, fix_borders=False) assert len(skels) == 1 skel = skels[1] assert skel.vertices.shape[0] == 128 assert skel.edges.shape[0] == 127 assert abs(skel.cable_length() - 127 * np.sqrt(3)) < 0.001 assert skel.space == 'physical' def test_find_border_targets(): labels = np.zeros( (257, 257), dtype=np.uint8) labels[1:-1,1:-1] = 1 dt = edt.edt(labels) targets = kimimaro.skeletontricks.find_border_targets( dt, labels.astype(np.uint32), wx=100, wy=100 ) assert len(targets) == 1 assert targets[1] == (128, 128) def test_fix_borders_z(): labels = np.zeros((256, 256, 256), dtype=np.uint8) labels[ 64:196, 64:196, : ] = 128 skels = kimimaro.skeletonize( labels, teasar_params={ 'const': 250, 'scale': 10, 'pdrf_exponent': 4, 'pdrf_scale': 100000, }, anisotropy=(40,32,20), object_ids=None, dust_threshold=1000, progress=True, fix_branching=True, in_place=False, fix_borders=True ) skel = skels[128] assert skel.space == 'physical' skel = skel.voxel_space() assert np.all(skel.vertices[:,0] == 129) assert np.all(skel.vertices[:,1] == 129) assert np.all(skel.vertices[:,2] == np.arange(256)) assert skel.space == 'voxel' def 
test_fix_borders_x(): labels = np.zeros((256, 256, 256), dtype=np.uint8) labels[ :, 64:196, 64:196 ] = 128 skels = kimimaro.skeletonize( labels, teasar_params={ 'const': 250, 'scale': 10, 'pdrf_exponent': 4, 'pdrf_scale': 100000, }, anisotropy=(1,1,1), object_ids=None, dust_threshold=1000, progress=True, fix_branching=True, in_place=False, fix_borders=True ) skel = skels[128] assert np.all(skel.vertices[:,0] == np.arange(256)) assert np.all(skel.vertices[:,1] == 129) assert np.all(skel.vertices[:,2] == 129) def test_fix_borders_y(): labels = np.zeros((256, 256, 256), dtype=np.uint8) labels[ 64:196, :, 64:196 ] = 128 skels = kimimaro.skeletonize( labels, teasar_params={ 'const': 250, 'scale': 10, 'pdrf_exponent': 4, 'pdrf_scale': 100000, }, anisotropy=(1,1,1), object_ids=None, dust_threshold=1000, progress=True, fix_branching=True, in_place=False, fix_borders=True ) skel = skels[128] assert np.all(skel.vertices[:,0] == 129) assert np.all(skel.vertices[:,1] == np.arange(256)) assert np.all(skel.vertices[:,2] == 129) def test_extra_targets(): labels = np.zeros((256, 256, 1), dtype=np.uint8) labels[ 64:196, 64:196, : ] = 128 def skeletonize(labels, **kwargs): return kimimaro.skeletonize( labels, teasar_params={ 'const': 250, 'scale': 10, 'pdrf_exponent': 4, 'pdrf_scale': 100000, }, anisotropy=(1,1,1), object_ids=None, dust_threshold=1000, progress=True, fix_branching=True, in_place=False, fix_borders=True, **kwargs )[128] skel1 = skeletonize(labels) skel2 = skeletonize(labels, extra_targets_after=[ (65, 65, 0) ]) assert skel1.vertices.size < skel2.vertices.size skel3 = skeletonize(labels, extra_targets_before=[ (65, 65, 0) ]) assert skel3.vertices.size < skel2.vertices.size def test_parallel(): labels = np.zeros((256, 256, 128), dtype=np.uint8) labels[ 0:128, 0:128, : ] = 1 labels[ 0:128, 128:256, : ] = 2 labels[ 128:256, 0:128, : ] = 3 labels[ 128:256, 128:256, : ] = 4 skels = kimimaro.skeletonize( labels, teasar_params={ 'const': 250, 'scale': 10, 'pdrf_exponent': 4, 
'pdrf_scale': 100000, }, anisotropy=(1,1,1), object_ids=None, dust_threshold=1000, progress=True, fix_branching=True, in_place=False, fix_borders=True, parallel=2, ) assert len(skels) == 4 def test_dimensions(): labels = np.zeros((10,), dtype=np.uint8) skel = kimimaro.skeletonize(labels) labels = np.zeros((10,10), dtype=np.uint8) skel = kimimaro.skeletonize(labels) labels = np.zeros((10,10,10), dtype=np.uint8) skel = kimimaro.skeletonize(labels) labels = np.zeros((10,10,10,1), dtype=np.uint8) skel = kimimaro.skeletonize(labels) try: labels = np.zeros((10,10,10,2), dtype=np.uint8) skel = kimimaro.skeletonize(labels) assert False except kimimaro.DimensionError: pass @pytest.mark.parametrize('axis', ('x','y')) def test_joinability(axis): def skeletionize(labels, fix_borders): return kimimaro.skeletonize( labels, teasar_params={ 'const': 10, 'scale': 10, 'pdrf_exponent': 4, 'pdrf_scale': 100000, }, anisotropy=(1,1,1), object_ids=None, dust_threshold=0, progress=True, fix_branching=True, in_place=False, fix_borders=fix_borders, parallel=1, ) labels = np.zeros((256, 256, 20), dtype=np.uint8) if axis == 'x': lslice = np.s_[ 32:160, :, : ] elif axis == 'y': lslice = np.s_[ :, 32:160, : ] labels = np.zeros((256, 256, 20), dtype=np.uint8) labels[lslice] = 1 skels1 = skeletionize(labels[:,:,:10], True) skels1 = skels1[1] skels2 = skeletionize(labels[:,:,9:], True) skels2 = skels2[1] skels2.vertices[:,2] += 9 skels_fb = skels1.merge(skels2) assert len(skels_fb.components()) == 1 skels1 = skeletionize(labels[:,:,:10], False) skels1 = skels1[1] skels2 = skeletionize(labels[:,:,9:], False) skels2 = skels2[1] skels2.vertices[:,2] += 9 skels = skels1.merge(skels2) # Ususally this results in 2 connected components, # but random variation in how fp is handled can # result in a merge near the tails. 
assert not Skeleton.equivalent(skels, skels_fb) def test_find_cycle(): edges = np.array([ [0, 1], [1, 2], [2, 0], [2, 3], [2, 4] ], dtype=np.int32) cycle = kimimaro.skeletontricks.find_cycle(edges) assert np.all(cycle == np.array([0, 2, 1, 0])) edges = np.array([ [0, 1], [1, 2], [2, 3], [3, 4], [4, 10], [10, 11], [11, 12], [12, 2], [4, 5], [5, 6], [6, 7], ], dtype=np.int32) cycle = kimimaro.skeletontricks.find_cycle(edges) assert np.all(cycle == np.array([ 2, 12, 11, 10, 4, 3, 2 ])) # two loops edges = np.array([ [0, 1], [0, 20], [20, 21], [21, 22], [22, 23], [23, 21], [1, 2], [2, 3], [3, 4], [4, 5], [5, 6], [6, 7], [7, 10], [10, 11], [11, 6] ], dtype=np.int32) cycle = kimimaro.skeletontricks.find_cycle(edges) assert np.all(cycle == np.array([ 21, 23, 22, 21 ])) or np.all(cycle == np.array([ 6, 11, 10, 7, 6 ])) def test_join_close_components_simple(): skel = Skeleton([ (0,0,0), (1,0,0), (10,0,0), (11, 0, 0) ], edges=[ (0,1), (2,3) ], radii=[ 0, 1, 2, 3 ], vertex_types=[ 0, 1, 2, 3 ], segid=1337, ) assert len(skel.components()) == 2 res = kimimaro.join_close_components(skel, radius=np.inf) assert len(res.components()) == 1 res = kimimaro.join_close_components(skel, radius=9) assert len(res.components()) == 1 assert np.all(res.edges == [[0,1], [1,2], [2,3]]) res = kimimaro.join_close_components(skel, radius=8.5) assert len(res.components()) == 2 def test_join_close_components_complex(): skel = Skeleton([ (0,0,0), (1,0,0), (4,0,0), (6,0,0), (20,0,0), (21, 0, 0), (0,0,5), (0,0,10), ], edges=[ (0,1), (2,3), (4,5), (6,7) ], ) assert len(skel.components()) == 4 res = kimimaro.join_close_components(skel, radius=np.inf) assert len(res.components()) == 1 assert np.all(res.edges == [[0,1], [0,3], [1,2], [3,4], [4,5], [5,6], [6,7]]) def test_join_close_components_by_radius(): skel = Skeleton([ (0,0,0), (1,0,0), (5,0,0), (11, 0, 0) ], edges=[ (0,1), (2,3) ], radii=[ 100, 100, 100, 100 ], vertex_types=[ 0, 1, 2, 3 ], segid=1337, ) res = kimimaro.join_close_components(skel, 
restrict_by_radius=False) assert len(res.components()) == 1 assert np.all(res.edges == [[0,1], [1,2], [2,3]]) res = kimimaro.join_close_components(skel, restrict_by_radius=True) assert len(res.components()) == 1 assert np.all(res.edges == [[0,1], [1,2], [2,3]]) skel.radii = np.array([1,1,1,1], dtype=np.float32) res = kimimaro.join_close_components(skel, restrict_by_radius=True) assert len(res.components()) == 2 assert np.all(res.edges == [[0,1], [2,3]]) skel.radii = np.array([1,0.9,3,1], dtype=np.float32) res = kimimaro.join_close_components(skel, restrict_by_radius=True) assert len(res.components()) == 2 assert np.all(res.edges == [[0,1], [2,3]]) skel.radii = np.array([1,1,3,1], dtype=np.float32) res = kimimaro.join_close_components(skel, restrict_by_radius=True) assert len(res.components()) == 1 assert np.all(res.edges == [[0,1], [1,2], [2,3]]) def test_fill_all_holes(): labels = np.zeros((64, 32, 32), dtype=np.uint32) labels[0:32,:,:] = 1 labels[32:64,:,:] = 8 noise = np.random.randint(low=1, high=8, size=(30, 30, 30)) labels[1:31,1:31,1:31] = noise noise = np.random.randint(low=8, high=11, size=(30, 30, 30)) labels[33:63,1:31,1:31] = noise noise_labels = np.unique(labels) assert set(noise_labels) == set([1,2,3,4,5,6,7,8,9,10]) result = kimimaro.intake.fill_all_holes(labels) filled_labels = np.unique(result) assert set(filled_labels) == set([1,8]) def test_fix_avocados(): labels = np.zeros((256, 256, 256), dtype=np.uint32) # fake clipped avocado labels[:50, :40, :30] = 1 labels[:25, :20, :25] = 2 # double avocado labels[50:100, 40:100, 30:80] = 3 labels[60:90, 50:90, 40:70] = 4 labels[60:70, 51:89, 41:69] = 5 # not an avocado labels[200:,200:,200:] = 6 # not a pit labels[150:200,200:,200:] = 7 # not a fruit fn = lambda lbls: edt.edt(lbls) dt = fn(labels) labels, dbf, remapping = kimimaro.intake.engage_avocado_protection( labels, dt, { 1:1, 2:2, 3:3, 4:4, 5:5, 6:6, 7:7 }, soma_detection_threshold=1, edtfn=fn, progress=True ) uniq = set(np.unique(labels)) assert 
uniq == set([0, 1, 2, 3, 4]) # 0,2,5 renumbered assert np.all(labels[:50, :40, :30] == 1) assert np.all(labels[50:100, 40:100, 30:80] == 2) assert np.all(labels[150:200,200:,200:] == 3) assert np.all(labels[200:,200:,200:] == 4) def test_cross_sectional_area(): labels = np.ones((100,3,3), dtype=bool, order="F") vertices = np.array([ [x,1,1] for x in range(labels.shape[0]) ]) edges = np.array([ [x,x+1] for x in range(labels.shape[0] - 1) ]) skel = Skeleton(vertices, edges, segid=1) skel = kimimaro.cross_sectional_area(labels, skel, smoothing_window=5) assert len(skel.cross_sectional_area == 100) assert np.all(skel.cross_sectional_area == 9) def test_moving_average(): data = np.array([]) assert np.all(moving_average(data, 1) == data) assert np.all(moving_average(data, 2) == data) data = np.array([1,1,1,1,1,1,1,1,1,1,1]) assert np.all(moving_average(data, 1) == data) data = np.array([1,1,1,1,1,1,1,1,1,1,1,1]) assert np.all(moving_average(data, 1) == data) data = np.array([1,1,1,1,1,10,1,1,1,1,1]) assert np.all(moving_average(data, 1) == data) data = np.array([1,1,1,1,1,1,1,1,1,1,1]) assert np.all(moving_average(data, 2) == data) data = np.array([0,1,1,1,1,1,1,1,1,1,0]) ans = np.array([ 0,0.5,1,1,1,1,1,1,1,1,0.5 ]) assert np.all(moving_average(data, 2) == ans) data = np.array([0,1,1,1,1,1,1,1,1,1,0]) ans = np.array([ 1/3,1/3,2/3,1,1,1,1,1,1,1,2/3 ]) res = moving_average(data, 3) assert np.all(res == ans) assert len(ans) == len(data) def test_no_fix_branching(connectomics_data): kimimaro.skeletonize(connectomics_data[:,:,100], fix_branching=False) def test_remove_row(): arr = np.array([ [0,1], [1,2], [2,1], [2,2], [2,3], [3,4], ]) result = kimimaro.post.remove_row(arr, np.array([[1,2]])) assert np.all(result == np.array([[0,1],[2,2],[2,3],[3,4]])) arr = np.array([ [] ]) result = kimimaro.post.remove_row(arr, np.array([[1,2]])) assert np.all(result == np.array([])) def test_cross_sectional_area(): labels = np.ones([100,100,100], dtype=np.uint8) skel = 
kimimaro.skeletonize(labels, teasar_params={ "pdrf_exponent": 16, })[1] xsa_1 = cross_sectional_area(labels, skel, step=1).cross_sectional_area xsa_10 = cross_sectional_area(labels, skel, step=10).cross_sectional_area assert np.all(xsa_1[xsa_10 == 0] != xsa_10[xsa_10 == 0]) assert np.all(xsa_1[xsa_10 > 0] == xsa_10[xsa_10 > 0]) assert np.any(xsa_1 == 10000) terminals = skel.terminals() assert np.all(xsa_10[terminals] > 0) assert np.all(xsa_10[terminals] > 0) try: cross_sectional_area(labels, skel, step=-1) except AssertionError: pass def test_postprocess(): skel = Skeleton([ (0,0,0), (1,0,0), (4,0,0), (6,0,0), (20,0,0), (21, 0, 0), (0,0,5), (0,0,10), ], edges=[ (0,1), (2,3), (4,5), (6,7), (0,7), (1,6) ], ) res_skel = kimimaro.post.postprocess(skel, dust_threshold=0, tick_threshold=0) ans = Skeleton([ (4,0,0), (6,0,0), (20,0,0), (21, 0, 0), ], edges=[ (0,1), (2,3) ], ) assert Skeleton.equivalent(res_skel, ans) ================================================ FILE: benchmarks/README.md ================================================ Benchmarks ========== To open `connectomics.npy.ckl.gz` you must use [`crackle-codec`](https://github.com/seung-lab/crackle). Except where noted, these benchmarks were executed on an 2.8 GHz Dual-Core Intel Core i7 with 1600 MHz DDR3 RAM. The data source used was `connectomics.npy` which can be found in this repository. `connectomics.npy` is a 32-bit 512x512x512 cutout of mouse visual cortex at 16nm x 16nm x 40nm resolution that contains 2124 connected components including a partial cell body and a large glia fragment. Below, we compared the run time and peak memory usage of Kimimaro across many versions that contained performance significant updates. Due to the annoying length of each run, each value represents a single run, so there is some random perturbation around the true mean that can obscure the value of small improvements. 
Version 0.4.2 can be considered the first "feature complete" version: it includes quality improvements such as `fix_branching`, `fix_borders`, and a reasonable root selected for the cell body.

Kimimaro Execution Time by Version on connectomics.npy
Fig. 1: Kimimaro Execution Time by Version on `connectomics.npy`

Kimimaro Peak Memory Usage by Version on connectomics.npy
Fig. 2: Kimimaro Peak Memory Usage by Version on `connectomics.npy`

Kimimaro Memory Profile Versions 0.3.1 vs. 3.0.0
Fig. 3: Kimimaro memory profile for versions 0.3.1 (blue) and 3.0.0 (black). The first hump on the left is the processing of a soma. The second hump is a glia.

================================================ FILE: benchmarks/benchmark.py ================================================ import time import numpy as np import kimimaro import crackle import pickle labels = crackle.load("connectomics.npy.ckl.gz") s = time.time() skels = kimimaro.skeletonize( labels, teasar_params={ 'scale': 1.5, 'const': 300, # physical units 'pdrf_exponent': 4, 'pdrf_scale': 100000, 'soma_detection_threshold': 1100, # physical units 'soma_acceptance_threshold': 3500, # physical units 'soma_invalidation_scale': 1.0, 'soma_invalidation_const': 300, # physical units # 'max_paths': 50, # default None }, # object_ids=[ ], # process only the specified labels # extra_targets_before=[ (27,33,100), (44,45,46) ], # target points in voxels # extra_targets_after=[ (27,33,100), (44,45,46) ], # target points in voxels # dust_threshold=1000, # skip connected components with fewer than this many voxels anisotropy=(16,16,40), # default True # fix_branching=True, # default True # fix_borders=True, # default True # fill_holes=False, # default False # fix_avocados=False, # default False progress=True, # default False, show progress bar # parallel=1, # <= 0 all cpu, 1 single process, 2+ multiprocess # parallel_chunk_size=100, # how many skeletons to process before updating progress bar ) print(time.time() - s) # with open("skels.pkl", "wb") as f: # pickle.dump(skels, f) # with open("skels.pkl", "rb") as f: # skels = pickle.load(f) s = time.time() skels = kimimaro.cross_sectional_area( labels, skels, anisotropy=(16,16,40), smoothing_window=7, progress=True, step=1, ) print(f"{time.time() - s:.3f}s") ================================================ FILE: build_linux.sh ================================================ #!/bin/bash # Some dependencies don't support manylinux1 docker build . -f manylinux2010.Dockerfile --tag seunglab/kimimaro:manylinux2010 docker build . 
-f manylinux2014.Dockerfile --tag seunglab/kimimaro:manylinux2014 docker run -v $PWD/dist:/output seunglab/kimimaro:manylinux2010 /bin/bash -c "cp -r wheelhouse/* /output" docker run -v $PWD/dist:/output seunglab/kimimaro:manylinux2014 /bin/bash -c "cp -r wheelhouse/* /output" ================================================ FILE: ext/skeletontricks/dijkstra_invalidation.hpp ================================================ /* * This file is part of Kimimaro. * * Kimimaro is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Kimimaro is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Kimimaro. If not, see . 
* * * This algorithm is derived from dijkstra3d: * https://github.com/seung-lab/dijkstra3d * * Author: William Silversmith * Affiliation: Seung Lab, Princeton University * Date: May 2024 */ #ifndef DIJKSTRA_INVALIDATION_HPP #define DIJKSTRA_INVALIDATION_HPP #include #include #include #include #include #include #include #include #include "./libdivide.h" #define NHOOD_SIZE 26 namespace dijkstra_invalidation { // helper function to compute 2D anisotropy ("_s" = "square") inline float _s(const float wa, const float wb) { return std::sqrt(wa * wa + wb * wb); } // helper function to compute 3D anisotropy ("_c" = "cube") inline float _c(const float wa, const float wb, const float wc) { return std::sqrt(wa * wa + wb * wb + wc * wc); } void connectivity_check(int connectivity) { if (connectivity != 6 && connectivity != 18 && connectivity != 26) { throw std::runtime_error("Only 6, 18, and 26 connectivities are supported."); } } void compute_neighborhood_helper_6( int *neighborhood, const int x, const int y, const int z, const uint64_t sx, const uint64_t sy, const uint64_t sz ) { const int sxy = sx * sy; // 6-hood neighborhood[0] = -1 * (x > 0); // -x neighborhood[1] = (x < (static_cast(sx) - 1)); // +x neighborhood[2] = -static_cast(sx) * (y > 0); // -y neighborhood[3] = static_cast(sx) * (y < static_cast(sy) - 1); // +y neighborhood[4] = -sxy * static_cast(z > 0); // -z neighborhood[5] = sxy * (z < static_cast(sz) - 1); // +z } void compute_neighborhood_helper_18( int *neighborhood, const int x, const int y, const int z, const uint64_t sx, const uint64_t sy, const uint64_t sz ) { // 6-hood compute_neighborhood_helper_6(neighborhood, x,y,z, sx,sy,sz); // 18-hood // xy diagonals neighborhood[6] = (neighborhood[0] + neighborhood[2]) * (neighborhood[0] && neighborhood[2]); // up-left neighborhood[7] = (neighborhood[0] + neighborhood[3]) * (neighborhood[0] && neighborhood[3]); // up-right neighborhood[8] = (neighborhood[1] + neighborhood[2]) * (neighborhood[1] && 
neighborhood[2]); // down-left neighborhood[9] = (neighborhood[1] + neighborhood[3]) * (neighborhood[1] && neighborhood[3]); // down-right // yz diagonals neighborhood[10] = (neighborhood[2] + neighborhood[4]) * (neighborhood[2] && neighborhood[4]); // up-left neighborhood[11] = (neighborhood[2] + neighborhood[5]) * (neighborhood[2] && neighborhood[5]); // up-right neighborhood[12] = (neighborhood[3] + neighborhood[4]) * (neighborhood[3] && neighborhood[4]); // down-left neighborhood[13] = (neighborhood[3] + neighborhood[5]) * (neighborhood[3] && neighborhood[5]); // down-right // xz diagonals neighborhood[14] = (neighborhood[0] + neighborhood[4]) * (neighborhood[0] && neighborhood[4]); // up-left neighborhood[15] = (neighborhood[0] + neighborhood[5]) * (neighborhood[0] && neighborhood[5]); // up-right neighborhood[16] = (neighborhood[1] + neighborhood[4]) * (neighborhood[1] && neighborhood[4]); // down-left neighborhood[17] = (neighborhood[1] + neighborhood[5]) * (neighborhood[1] && neighborhood[5]); // down-right } void compute_neighborhood_helper_26( int *neighborhood, const int x, const int y, const int z, const uint64_t sx, const uint64_t sy, const uint64_t sz ) { compute_neighborhood_helper_18(neighborhood, x,y,z, sx,sy,sz); // 26-hood // Now the eight corners of the cube neighborhood[18] = (neighborhood[0] + neighborhood[2] + neighborhood[4]) * (neighborhood[2] && neighborhood[4]); neighborhood[19] = (neighborhood[1] + neighborhood[2] + neighborhood[4]) * (neighborhood[2] && neighborhood[4]); neighborhood[20] = (neighborhood[0] + neighborhood[3] + neighborhood[4]) * (neighborhood[3] && neighborhood[4]); neighborhood[21] = (neighborhood[0] + neighborhood[2] + neighborhood[5]) * (neighborhood[2] && neighborhood[5]); neighborhood[22] = (neighborhood[1] + neighborhood[3] + neighborhood[4]) * (neighborhood[3] && neighborhood[4]); neighborhood[23] = (neighborhood[1] + neighborhood[2] + neighborhood[5]) * (neighborhood[2] && neighborhood[5]); neighborhood[24] = 
(neighborhood[0] + neighborhood[3] + neighborhood[5]) * (neighborhood[3] && neighborhood[5]); neighborhood[25] = (neighborhood[1] + neighborhood[3] + neighborhood[5]) * (neighborhood[3] && neighborhood[5]); } inline void compute_neighborhood( int *neighborhood, const int x, const int y, const int z, const uint64_t sx, const uint64_t sy, const uint64_t sz, const int connectivity = 26, const uint32_t* voxel_connectivity_graph = NULL) { if (connectivity == 26) { compute_neighborhood_helper_26(neighborhood, x, y, z, sx, sy, sz); } else if (connectivity == 18) { compute_neighborhood_helper_18(neighborhood, x, y, z, sx, sy, sz); } else { compute_neighborhood_helper_6(neighborhood, x, y, z, sx, sy, sz); } if (voxel_connectivity_graph == NULL) { return; } uint64_t loc = x + sx * (y + sy * z); uint32_t graph = voxel_connectivity_graph[loc]; // graph conventions are defined here: // https://github.com/seung-lab/connected-components-3d/blob/3.2.0/cc3d_graphs.hpp#L73-L92 // 6-hood neighborhood[0] *= ((graph & 0b000010) > 0); // -x neighborhood[1] *= ((graph & 0b000001) > 0); // +x neighborhood[2] *= ((graph & 0b001000) > 0); // -y neighborhood[3] *= ((graph & 0b000100) > 0); // +y neighborhood[4] *= ((graph & 0b100000) > 0); // -z neighborhood[5] *= ((graph & 0b010000) > 0); // +z // 18-hood // xy diagonals neighborhood[6] *= ((graph & 0b1000000000) > 0); // up-left -x,-y neighborhood[7] *= ((graph & 0b0010000000) > 0); // up-right -x,+y neighborhood[8] *= ((graph & 0b0100000000) > 0); // down-left +x,-y neighborhood[9] *= ((graph & 0b0001000000) > 0); // down-right +x,+y // yz diagonals neighborhood[10] *= ((graph & 0b100000000000000000) > 0); // up-left -y,-z neighborhood[11] *= ((graph & 0b000010000000000000) > 0); // up-right -y,+z neighborhood[12] *= ((graph & 0b010000000000000000) > 0); // down-left +y,-z neighborhood[13] *= ((graph & 0b000001000000000000) > 0); // down-right +y,+z // xz diagonals neighborhood[14] *= ((graph & 0b001000000000000000) > 0); // up-left, 
-x,-z neighborhood[15] *= ((graph & 0b000000100000000000) > 0); // up-right, -x,+z neighborhood[16] *= ((graph & 0b000100000000000000) > 0); // down-left +x,-z neighborhood[17] *= ((graph & 0b000000010000000000) > 0); // down-right +x,+z // 26-hood // Now the eight corners of the cube neighborhood[18] *= ((graph & 0b10000000000000000000000000) > 0); // -x,-y,-z neighborhood[19] *= ((graph & 0b01000000000000000000000000) > 0); // +x,-y,-z neighborhood[20] *= ((graph & 0b00100000000000000000000000) > 0); // -x,+y,-z neighborhood[21] *= ((graph & 0b00001000000000000000000000) > 0); // -x,-y,+z neighborhood[22] *= ((graph & 0b00010000000000000000000000) > 0); // +x,+y,-z neighborhood[23] *= ((graph & 0b00000100000000000000000000) > 0); // +x,-y,+z neighborhood[24] *= ((graph & 0b00000010000000000000000000) > 0); // -x,+y,+z neighborhood[25] *= ((graph & 0b00000001000000000000000000) > 0); // +x,+y,+z } #define DIJKSTRA_3D_PREFETCH_26WAY(field, loc) \ HEDLEYX_PREFETCH(reinterpret_cast(&field[(loc) - 1]), 0, 1); \ HEDLEYX_PREFETCH(reinterpret_cast(&field[(loc) + sxy - 1]), 0, 1); \ HEDLEYX_PREFETCH(reinterpret_cast(&field[(loc) - sxy - 1]), 0, 1); \ HEDLEYX_PREFETCH(reinterpret_cast(&field[(loc) + sxy + sx - 1]), 0, 1); \ HEDLEYX_PREFETCH(reinterpret_cast(&field[(loc) + sxy - sx - 1]), 0, 1); \ HEDLEYX_PREFETCH(reinterpret_cast(&field[(loc) - sxy + sx - 1]), 0, 1); \ HEDLEYX_PREFETCH(reinterpret_cast(&field[(loc) - sxy - sx - 1]), 0, 1); \ HEDLEYX_PREFETCH(reinterpret_cast(&field[(loc) + sx - 1]), 0, 1); \ HEDLEYX_PREFETCH(reinterpret_cast(&field[(loc) - sx - 1]), 0, 1); class HeapDistanceNode { public: float dist; uint64_t original_loc; uint64_t value; float max_dist; HeapDistanceNode() { dist = 0; value = 0; original_loc = 0; max_dist = 0; } HeapDistanceNode (float d, uint64_t o_loc, uint64_t val, float mx_dist) { dist = d; value = val; original_loc = o_loc; max_dist = mx_dist; } HeapDistanceNode (const HeapDistanceNode &h) { dist = h.dist; value = h.value; max_dist = 
h.max_dist; original_loc = h.original_loc; } }; struct HeapDistanceNodeCompare { bool operator()(const HeapDistanceNode &t1, const HeapDistanceNode &t2) const { return t1.dist >= t2.dist; } }; int64_t _roll_invalidation_ball( uint8_t* field, // really a boolean field const uint64_t sx, const uint64_t sy, const uint64_t sz, const float wx, const float wy, const float wz, const std::vector &sources, const std::vector &max_distances, const int connectivity = 26, const uint32_t* voxel_connectivity_graph = NULL ) { const uint64_t sxy = sx * sy; const libdivide::divider fast_sx(sx); const libdivide::divider fast_sxy(sxy); const bool power_of_two = !((sx & (sx - 1)) || (sy & (sy - 1))); const int xshift = std::log2(sx); // must use log2 here, not lg/lg2 to avoid fp errors const int yshift = std::log2(sy); connectivity_check(connectivity); int neighborhood[NHOOD_SIZE] = {}; std::priority_queue< HeapDistanceNode, std::vector, HeapDistanceNodeCompare > queue; for (uint64_t i = 0; i < sources.size(); i++) { queue.emplace(0.0, sources[i], sources[i], max_distances[i]); } uint64_t loc; uint64_t neighboridx; int64_t x, y, z; int64_t orig_x, orig_y, orig_z; int64_t invalidated = 0; auto xyzfn = [=](uint64_t l, int64_t& x, int64_t& y, int64_t& z) { if (power_of_two) { z = l >> (xshift + yshift); y = (l - (z << (xshift + yshift))) >> xshift; x = l - ((y + (z << yshift)) << xshift); } else { z = l / fast_sxy; y = (l - (z * sxy)) / fast_sx; x = l - sx * (y + z * sy); } }; while (!queue.empty()) { const float max_dist = queue.top().max_dist; const uint64_t original_loc = queue.top().original_loc; loc = queue.top().value; queue.pop(); if (!field[loc]) { continue; } field[loc] = 0; invalidated++; xyzfn(loc, x, y, z); xyzfn(original_loc, orig_x, orig_y, orig_z); compute_neighborhood(neighborhood, x, y, z, sx, sy, sz, connectivity, voxel_connectivity_graph); for (int i = 0; i < connectivity; i++) { if (neighborhood[i] == 0) { continue; } neighboridx = loc + neighborhood[i]; if 
(field[neighboridx] == 0) { continue; } xyzfn(neighboridx, x, y, z); float new_dist = _c( wx * static_cast(x - orig_x), wy * static_cast(y - orig_y), wz * static_cast(z - orig_z) ); if (new_dist < max_dist) { queue.emplace(new_dist, original_loc, neighboridx, max_dist); } } } return invalidated; } }; #undef NHOOD_SIZE #undef DIJKSTRA_3D_PREFETCH_26WAY #endif ================================================ FILE: ext/skeletontricks/libdivide.h ================================================ // libdivide.h - Optimized integer division // https://libdivide.com // // Copyright (C) 2010 - 2022 ridiculous_fish, // Copyright (C) 2016 - 2022 Kim Walisch, // // libdivide is dual-licensed under the Boost or zlib licenses. // You may use libdivide under the terms of either of these. // See LICENSE.txt for more details. #ifndef LIBDIVIDE_H #define LIBDIVIDE_H // *** Version numbers are auto generated - do not edit *** #define LIBDIVIDE_VERSION "5.2.0" #define LIBDIVIDE_VERSION_MAJOR 5 #define LIBDIVIDE_VERSION_MINOR 2 #define LIBDIVIDE_VERSION_PATCH 0 #include #if !defined(__AVR__) && __STDC_HOSTED__ != 0 #include #include #endif #if defined(_MSC_VER) && (defined(__cplusplus) && (__cplusplus >= 202002L)) || \ (defined(_MSVC_LANG) && (_MSVC_LANG >= 202002L)) #include #include #define LIBDIVIDE_VC_CXX20 #endif #if defined(LIBDIVIDE_SSE2) #include #endif #if defined(LIBDIVIDE_AVX2) || defined(LIBDIVIDE_AVX512) #include #endif #if defined(LIBDIVIDE_NEON) #include #endif // Clang-cl prior to Visual Studio 2022 doesn't include __umulh/__mulh intrinsics #if defined(_MSC_VER) && (!defined(__clang__) || _MSC_VER > 1930) && \ (defined(_M_X64) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC)) #define LIBDIVIDE_MULH_INTRINSICS #endif #if defined(_MSC_VER) #if defined(LIBDIVIDE_MULH_INTRINSICS) || !defined(__clang__) #include #endif #ifndef __clang__ #pragma warning(push) // 4146: unary minus operator applied to unsigned type, result still unsigned #pragma 
warning(disable : 4146) // 4204: nonstandard extension used : non-constant aggregate initializer #pragma warning(disable : 4204) #endif #define LIBDIVIDE_VC #endif #if !defined(__has_builtin) #define __has_builtin(x) 0 #endif #if defined(__SIZEOF_INT128__) #define HAS_INT128_T // clang-cl on Windows does not yet support 128-bit division #if !(defined(__clang__) && defined(LIBDIVIDE_VC)) #define HAS_INT128_DIV #endif #endif #if defined(__x86_64__) || defined(_M_X64) #define LIBDIVIDE_X86_64 #endif #if defined(__i386__) #define LIBDIVIDE_i386 #endif #if defined(__GNUC__) || defined(__clang__) #define LIBDIVIDE_GCC_STYLE_ASM #endif #if defined(__cplusplus) || defined(LIBDIVIDE_VC) #define LIBDIVIDE_FUNCTION __FUNCTION__ #else #define LIBDIVIDE_FUNCTION __func__ #endif // Set up forced inlining if possible. // We need both the attribute and keyword to avoid "might not be inlineable" warnings. #ifdef __has_attribute #if __has_attribute(always_inline) #define LIBDIVIDE_INLINE __attribute__((always_inline)) inline #endif #endif #ifndef LIBDIVIDE_INLINE #ifdef _MSC_VER #define LIBDIVIDE_INLINE __forceinline #else #define LIBDIVIDE_INLINE inline #endif #endif #if defined(__AVR__) || __STDC_HOSTED__ == 0 #define LIBDIVIDE_ERROR(msg) #else #define LIBDIVIDE_ERROR(msg) \ do { \ fprintf(stderr, "libdivide.h:%d: %s(): Error: %s\n", __LINE__, LIBDIVIDE_FUNCTION, msg); \ abort(); \ } while (0) #endif #if defined(LIBDIVIDE_ASSERTIONS_ON) && !defined(__AVR__) && __STDC_HOSTED__ != 0 #define LIBDIVIDE_ASSERT(x) \ do { \ if (!(x)) { \ fprintf(stderr, "libdivide.h:%d: %s(): Assertion failed: %s\n", __LINE__, \ LIBDIVIDE_FUNCTION, #x); \ abort(); \ } \ } while (0) #else #define LIBDIVIDE_ASSERT(x) #endif #ifdef __cplusplus // For constexpr zero initialization, c++11 might handle things ok, // but just limit to at least c++14 to ensure we don't break anyone's code: // Use https://en.cppreference.com/w/cpp/feature_test#cpp_constexpr #if defined(__cpp_constexpr) && (__cpp_constexpr >= 
201304L) #define LIBDIVIDE_CONSTEXPR constexpr LIBDIVIDE_INLINE // Supposedly, MSVC might not implement feature test macros right: // https://stackoverflow.com/questions/49316752/feature-test-macros-not-working-properly-in-visual-c // so check that _MSVC_LANG corresponds to at least c++14, and _MSC_VER corresponds to at least VS // 2017 15.0 (for extended constexpr support: // https://learn.microsoft.com/en-us/cpp/overview/visual-cpp-language-conformance?view=msvc-170) #elif (defined(_MSC_VER) && _MSC_VER >= 1910) && (defined(_MSVC_LANG) && _MSVC_LANG >= 201402L) #define LIBDIVIDE_CONSTEXPR constexpr LIBDIVIDE_INLINE #else #define LIBDIVIDE_CONSTEXPR LIBDIVIDE_INLINE #endif namespace libdivide { #endif #if defined(_MSC_VER) && !defined(__clang__) #if defined(LIBDIVIDE_VC_CXX20) static LIBDIVIDE_CONSTEXPR int __builtin_clz(unsigned x) { if (std::is_constant_evaluated()) { for (int i = 0; i < sizeof(x) * CHAR_BIT; ++i) { if (x >> (sizeof(x) * CHAR_BIT - 1 - i)) return i; } return sizeof(x) * CHAR_BIT; } #else static LIBDIVIDE_INLINE int __builtin_clz(unsigned x) { #endif #if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) return (int)_CountLeadingZeros(x); #elif defined(__AVX2__) || defined(__LZCNT__) return (int)_lzcnt_u32(x); #else unsigned long r; _BitScanReverse(&r, x); return (int)(r ^ 31); #endif } #if defined(LIBDIVIDE_VC_CXX20) static LIBDIVIDE_CONSTEXPR int __builtin_clzll(unsigned long long x) { if (std::is_constant_evaluated()) { for (int i = 0; i < sizeof(x) * CHAR_BIT; ++i) { if (x >> (sizeof(x) * CHAR_BIT - 1 - i)) return i; } return sizeof(x) * CHAR_BIT; } #else static LIBDIVIDE_INLINE int __builtin_clzll(unsigned long long x) { #endif #if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) return (int)_CountLeadingZeros64(x); #elif defined(_WIN64) #if defined(__AVX2__) || defined(__LZCNT__) return (int)_lzcnt_u64(x); #else unsigned long r; _BitScanReverse64(&r, x); 
return (int)(r ^ 63); #endif #else int l = __builtin_clz((unsigned)x) + 32; int h = __builtin_clz((unsigned)(x >> 32)); return !!((unsigned)(x >> 32)) ? h : l; #endif } #endif // defined(_MSC_VER) && !defined(__clang__) // pack divider structs to prevent compilers from padding. // This reduces memory usage by up to 43% when using a large // array of libdivide dividers and improves performance // by up to 10% because of reduced memory bandwidth. #pragma pack(push, 1) struct libdivide_u16_t { uint16_t magic; uint8_t more; }; struct libdivide_s16_t { int16_t magic; uint8_t more; }; struct libdivide_u32_t { uint32_t magic; uint8_t more; }; struct libdivide_s32_t { int32_t magic; uint8_t more; }; struct libdivide_u64_t { uint64_t magic; uint8_t more; }; struct libdivide_s64_t { int64_t magic; uint8_t more; }; struct libdivide_u16_branchfree_t { uint16_t magic; uint8_t more; }; struct libdivide_s16_branchfree_t { int16_t magic; uint8_t more; }; struct libdivide_u32_branchfree_t { uint32_t magic; uint8_t more; }; struct libdivide_s32_branchfree_t { int32_t magic; uint8_t more; }; struct libdivide_u64_branchfree_t { uint64_t magic; uint8_t more; }; struct libdivide_s64_branchfree_t { int64_t magic; uint8_t more; }; #pragma pack(pop) // Explanation of the "more" field: // // * Bits 0-5 is the shift value (for shift path or mult path). // * Bit 6 is the add indicator for mult path. // * Bit 7 is set if the divisor is negative. We use bit 7 as the negative // divisor indicator so that we can efficiently use sign extension to // create a bitmask with all bits set to 1 (if the divisor is negative) // or 0 (if the divisor is positive). 
// // u32: [0-4] shift value // [5] ignored // [6] add indicator // magic number of 0 indicates shift path // // s32: [0-4] shift value // [5] ignored // [6] add indicator // [7] indicates negative divisor // magic number of 0 indicates shift path // // u64: [0-5] shift value // [6] add indicator // magic number of 0 indicates shift path // // s64: [0-5] shift value // [6] add indicator // [7] indicates negative divisor // magic number of 0 indicates shift path // // In s32 and s64 branchfree modes, the magic number is negated according to // whether the divisor is negated. In branchfree strategy, it is not negated. enum { LIBDIVIDE_16_SHIFT_MASK = 0x1F, LIBDIVIDE_32_SHIFT_MASK = 0x1F, LIBDIVIDE_64_SHIFT_MASK = 0x3F, LIBDIVIDE_ADD_MARKER = 0x40, LIBDIVIDE_NEGATIVE_DIVISOR = 0x80 }; static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_s16_gen(int16_t d); static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_u16_gen(uint16_t d); static LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_s32_gen(int32_t d); static LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_u32_gen(uint32_t d); static LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_s64_gen(int64_t d); static LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_u64_gen(uint64_t d); static LIBDIVIDE_INLINE struct libdivide_s16_branchfree_t libdivide_s16_branchfree_gen(int16_t d); static LIBDIVIDE_INLINE struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d); static LIBDIVIDE_INLINE struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d); static LIBDIVIDE_INLINE struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d); static LIBDIVIDE_INLINE struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d); static LIBDIVIDE_INLINE struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d); static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw( int16_t numer, int16_t magic, uint8_t more); static LIBDIVIDE_INLINE int16_t 
libdivide_s16_do( int16_t numer, const struct libdivide_s16_t *denom); static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw( uint16_t numer, uint16_t magic, uint8_t more); static LIBDIVIDE_INLINE uint16_t libdivide_u16_do( uint16_t numer, const struct libdivide_u16_t *denom); static LIBDIVIDE_INLINE int32_t libdivide_s32_do_raw( int32_t numer, int32_t magic, uint8_t more); static LIBDIVIDE_INLINE int32_t libdivide_s32_do( int32_t numer, const struct libdivide_s32_t *denom); static LIBDIVIDE_INLINE uint32_t libdivide_u32_do_raw( uint32_t numer, uint32_t magic, uint8_t more); static LIBDIVIDE_INLINE uint32_t libdivide_u32_do( uint32_t numer, const struct libdivide_u32_t *denom); static LIBDIVIDE_INLINE int64_t libdivide_s64_do_raw( int64_t numer, int64_t magic, uint8_t more); static LIBDIVIDE_INLINE int64_t libdivide_s64_do( int64_t numer, const struct libdivide_s64_t *denom); static LIBDIVIDE_INLINE uint64_t libdivide_u64_do_raw( uint64_t numer, uint64_t magic, uint8_t more); static LIBDIVIDE_INLINE uint64_t libdivide_u64_do( uint64_t numer, const struct libdivide_u64_t *denom); static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_do( int16_t numer, const struct libdivide_s16_branchfree_t *denom); static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_do( uint16_t numer, const struct libdivide_u16_branchfree_t *denom); static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_do( int32_t numer, const struct libdivide_s32_branchfree_t *denom); static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_do( uint32_t numer, const struct libdivide_u32_branchfree_t *denom); static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_do( int64_t numer, const struct libdivide_s64_branchfree_t *denom); static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_do( uint64_t numer, const struct libdivide_u64_branchfree_t *denom); static LIBDIVIDE_INLINE int16_t libdivide_s16_recover(const struct libdivide_s16_t *denom); static LIBDIVIDE_INLINE uint16_t 
libdivide_u16_recover(const struct libdivide_u16_t *denom); static LIBDIVIDE_INLINE int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom); static LIBDIVIDE_INLINE uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom); static LIBDIVIDE_INLINE int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom); static LIBDIVIDE_INLINE uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom); static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_recover( const struct libdivide_s16_branchfree_t *denom); static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_recover( const struct libdivide_u16_branchfree_t *denom); static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_recover( const struct libdivide_s32_branchfree_t *denom); static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_recover( const struct libdivide_u32_branchfree_t *denom); static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_recover( const struct libdivide_s64_branchfree_t *denom); static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_recover( const struct libdivide_u64_branchfree_t *denom); //////// Internal Utility Functions static LIBDIVIDE_INLINE uint16_t libdivide_mullhi_u16(uint16_t x, uint16_t y) { uint32_t xl = x, yl = y; uint32_t rl = xl * yl; return (uint16_t)(rl >> 16); } static LIBDIVIDE_INLINE int16_t libdivide_mullhi_s16(int16_t x, int16_t y) { int32_t xl = x, yl = y; int32_t rl = xl * yl; // needs to be arithmetic shift return (int16_t)(rl >> 16); } static LIBDIVIDE_INLINE uint32_t libdivide_mullhi_u32(uint32_t x, uint32_t y) { uint64_t xl = x, yl = y; uint64_t rl = xl * yl; return (uint32_t)(rl >> 32); } static LIBDIVIDE_INLINE int32_t libdivide_mullhi_s32(int32_t x, int32_t y) { int64_t xl = x, yl = y; int64_t rl = xl * yl; // needs to be arithmetic shift return (int32_t)(rl >> 32); } static LIBDIVIDE_INLINE uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) { #if defined(LIBDIVIDE_MULH_INTRINSICS) return __umulh(x, y); #elif 
defined(HAS_INT128_T) __uint128_t xl = x, yl = y; __uint128_t rl = xl * yl; return (uint64_t)(rl >> 64); #else // full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64) uint32_t mask = 0xFFFFFFFF; uint32_t x0 = (uint32_t)(x & mask); uint32_t x1 = (uint32_t)(x >> 32); uint32_t y0 = (uint32_t)(y & mask); uint32_t y1 = (uint32_t)(y >> 32); uint32_t x0y0_hi = libdivide_mullhi_u32(x0, y0); uint64_t x0y1 = x0 * (uint64_t)y1; uint64_t x1y0 = x1 * (uint64_t)y0; uint64_t x1y1 = x1 * (uint64_t)y1; uint64_t temp = x1y0 + x0y0_hi; uint64_t temp_lo = temp & mask; uint64_t temp_hi = temp >> 32; return x1y1 + temp_hi + ((temp_lo + x0y1) >> 32); #endif } static LIBDIVIDE_INLINE int64_t libdivide_mullhi_s64(int64_t x, int64_t y) { #if defined(LIBDIVIDE_MULH_INTRINSICS) return __mulh(x, y); #elif defined(HAS_INT128_T) __int128_t xl = x, yl = y; __int128_t rl = xl * yl; return (int64_t)(rl >> 64); #else // full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64) uint32_t mask = 0xFFFFFFFF; uint32_t x0 = (uint32_t)(x & mask); uint32_t y0 = (uint32_t)(y & mask); int32_t x1 = (int32_t)(x >> 32); int32_t y1 = (int32_t)(y >> 32); uint32_t x0y0_hi = libdivide_mullhi_u32(x0, y0); int64_t t = x1 * (int64_t)y0 + x0y0_hi; int64_t w1 = x0 * (int64_t)y1 + (t & mask); return x1 * (int64_t)y1 + (t >> 32) + (w1 >> 32); #endif } static LIBDIVIDE_INLINE int16_t libdivide_count_leading_zeros16(uint16_t val) { #if defined(__AVR__) // Fast way to count leading zeros // On the AVR 8-bit architecture __builtin_clz() works on a int16_t. 
return __builtin_clz(val); #elif defined(__GNUC__) || __has_builtin(__builtin_clz) || defined(_MSC_VER) // Fast way to count leading zeros return (int16_t)(__builtin_clz(val) - 16); #else if (val == 0) return 16; int16_t result = 4; uint16_t hi = 0xFU << 12; while ((val & hi) == 0) { hi >>= 4; result += 4; } while (val & hi) { result -= 1; hi <<= 1; } return result; #endif } static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros32(uint32_t val) { #if defined(__AVR__) // Fast way to count leading zeros return __builtin_clzl(val); #elif defined(__GNUC__) || __has_builtin(__builtin_clz) || defined(_MSC_VER) // Fast way to count leading zeros return __builtin_clz(val); #else if (val == 0) return 32; int32_t result = 8; uint32_t hi = 0xFFU << 24; while ((val & hi) == 0) { hi >>= 8; result += 8; } while (val & hi) { result -= 1; hi <<= 1; } return result; #endif } static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros64(uint64_t val) { #if defined(__GNUC__) || __has_builtin(__builtin_clzll) || defined(_MSC_VER) // Fast way to count leading zeros return __builtin_clzll(val); #else uint32_t hi = val >> 32; uint32_t lo = val & 0xFFFFFFFF; if (hi != 0) return libdivide_count_leading_zeros32(hi); return 32 + libdivide_count_leading_zeros32(lo); #endif } // libdivide_32_div_16_to_16: divides a 32-bit uint {u1, u0} by a 16-bit // uint {v}. The result must fit in 16 bits. // Returns the quotient directly and the remainder in *r static LIBDIVIDE_INLINE uint16_t libdivide_32_div_16_to_16( uint16_t u1, uint16_t u0, uint16_t v, uint16_t *r) { uint32_t n = ((uint32_t)u1 << 16) | u0; uint16_t result = (uint16_t)(n / v); *r = (uint16_t)(n - result * (uint32_t)v); return result; } // libdivide_64_div_32_to_32: divides a 64-bit uint {u1, u0} by a 32-bit // uint {v}. The result must fit in 32 bits. 
// Returns the quotient directly and the remainder in *r
// Neither v == 0 nor a null r is checked here.
static LIBDIVIDE_INLINE uint32_t libdivide_64_div_32_to_32(
    uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) {
#if (defined(LIBDIVIDE_i386) || defined(LIBDIVIDE_X86_64)) && defined(LIBDIVIDE_GCC_STYLE_ASM)
    // Hardware 64/32 divide: the dividend is passed in d:a (u1:u0), the
    // quotient comes back in a and the remainder in d.
    uint32_t result;
    __asm__("divl %[v]" : "=a"(result), "=d"(*r) : [v] "r"(v), "a"(u0), "d"(u1));
    return result;
#else
    uint64_t n = ((uint64_t)u1 << 32) | u0;
    uint32_t result = (uint32_t)(n / v);
    *r = (uint32_t)(n - result * (uint64_t)v);
    return result;
#endif
}

// libdivide_128_div_64_to_64: divides a 128-bit uint {numhi, numlo} by a 64-bit uint {den}. The
// result must fit in 64 bits. Returns the quotient directly and the remainder in *r
// In the portable path, numhi >= den (overflow or den == 0) returns ~0ull and
// stores ~0ull in *r when r is non-null. The inline-asm path performs no such
// check and writes *r unconditionally.
static LIBDIVIDE_INLINE uint64_t libdivide_128_div_64_to_64(
    uint64_t numhi, uint64_t numlo, uint64_t den, uint64_t *r) {
    // N.B. resist the temptation to use __uint128_t here.
    // In LLVM compiler-rt, it performs a 128/128 -> 128 division which is many times slower than
    // necessary. In gcc it's better but still slower than the divlu implementation, perhaps because
    // it's not LIBDIVIDE_INLINEd.
#if defined(LIBDIVIDE_X86_64) && defined(LIBDIVIDE_GCC_STYLE_ASM)
    uint64_t result;
    __asm__("div %[v]" : "=a"(result), "=d"(*r) : [v] "r"(den), "a"(numlo), "d"(numhi));
    return result;
#else
    // We work in base 2**32.
    // A uint32 holds a single digit. A uint64 holds two digits.
    // Our numerator is conceptually [num3, num2, num1, num0].
    // Our denominator is [den1, den0].
    const uint64_t b = ((uint64_t)1 << 32);

    // The high and low digits of our computed quotient.
    uint32_t q1;
    uint32_t q0;

    // The normalization shift factor.
    int shift;

    // The high and low digits of our denominator (after normalizing).
    // Also the low 2 digits of our numerator (after normalizing).
    uint32_t den1;
    uint32_t den0;
    uint32_t num1;
    uint32_t num0;

    // A partial remainder.
    uint64_t rem;

    // The estimated quotient, and its corresponding remainder (unrelated to true remainder).
    uint64_t qhat;
    uint64_t rhat;

    // Variables used to correct the estimated quotient.
    uint64_t c1;
    uint64_t c2;

    // Check for overflow and divide by 0.
    if (numhi >= den) {
        if (r) *r = ~0ull;
        return ~0ull;
    }

    // Determine the normalization factor. We multiply den by this, so that its leading digit is at
    // least half b. In binary this means just shifting left by the number of leading zeros, so that
    // there's a 1 in the MSB.
    // We also shift numer by the same amount. This cannot overflow because numhi < den.
    // The expression (-shift & 63) is the same as (64 - shift), except it avoids the UB of shifting
    // by 64. The funny bitwise 'and' ensures that numlo does not get shifted into numhi if shift is
    // 0. clang 11 has an x86 codegen bug here: see LLVM bug 50118. The sequence below avoids it.
    shift = libdivide_count_leading_zeros64(den);
    den <<= shift;
    numhi <<= shift;
    numhi |= (numlo >> (-shift & 63)) & (uint64_t)(-(int64_t)shift >> 63);
    numlo <<= shift;

    // Extract the low digits of the numerator and both digits of the denominator.
    num1 = (uint32_t)(numlo >> 32);
    num0 = (uint32_t)(numlo & 0xFFFFFFFFu);
    den1 = (uint32_t)(den >> 32);
    den0 = (uint32_t)(den & 0xFFFFFFFFu);

    // We wish to compute q1 = [n3 n2 n1] / [d1 d0].
    // Estimate q1 as [n3 n2] / [d1], and then correct it.
    // Note while qhat may be 2 digits, q1 is always 1 digit.
    qhat = numhi / den1;
    rhat = numhi % den1;
    c1 = qhat * den0;
    c2 = rhat * b + num1;
    if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1;
    q1 = (uint32_t)qhat;

    // Compute the true (partial) remainder.
    rem = numhi * b + num1 - q1 * den;

    // We wish to compute q0 = [rem1 rem0 n0] / [d1 d0].
    // Estimate q0 as [rem1 rem0] / [d1] and correct it.
    qhat = rem / den1;
    rhat = rem % den1;
    c1 = qhat * den0;
    c2 = rhat * b + num0;
    if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1;
    q0 = (uint32_t)qhat;

    // Return remainder if requested.
    // The shift undoes the normalization applied to the numerator above.
    if (r) *r = (rem * b + num0 - q0 * den) >> shift;
    return ((uint64_t)q1 << 32) | q0;
#endif
}

#if !(defined(HAS_INT128_T) && \
      defined(HAS_INT128_DIV))

// Bitshift a u128 in place, left (signed_shift > 0) or right (signed_shift < 0)
// The value is held as {*u1 = high 64 bits, *u0 = low 64 bits}. A shift of 0
// leaves both words untouched. The magnitude is not range-checked; the callers
// in this file pass magnitudes below 64.
static LIBDIVIDE_INLINE void libdivide_u128_shift(
    uint64_t *u1, uint64_t *u0, int32_t signed_shift) {
    if (signed_shift > 0) {
        uint32_t shift = signed_shift;
        *u1 <<= shift;
        *u1 |= *u0 >> (64 - shift);
        *u0 <<= shift;
    } else if (signed_shift < 0) {
        uint32_t shift = -signed_shift;
        *u0 >>= shift;
        *u0 |= *u1 << (64 - shift);
        *u1 >>= shift;
    }
}

#endif

// Computes a 128 / 128 -> 64 bit division, with a 128 bit remainder.
// The dividend is {u_hi, u_lo} and the divisor {v_hi, v_lo}. The quotient is
// returned and the remainder is stored in {*r_hi, *r_lo}; both pointers are
// written unconditionally.
static LIBDIVIDE_INLINE uint64_t libdivide_128_div_128_to_64(
    uint64_t u_hi, uint64_t u_lo, uint64_t v_hi, uint64_t v_lo, uint64_t *r_hi, uint64_t *r_lo) {
#if defined(HAS_INT128_T) && defined(HAS_INT128_DIV)
    __uint128_t ufull = u_hi;
    __uint128_t vfull = v_hi;
    ufull = (ufull << 64) | u_lo;
    vfull = (vfull << 64) | v_lo;
    uint64_t res = (uint64_t)(ufull / vfull);
    __uint128_t remainder = ufull - (vfull * res);
    *r_lo = (uint64_t)remainder;
    *r_hi = (uint64_t)(remainder >> 64);
    return res;
#else
    // Adapted from "Unsigned Doubleword Division" in Hacker's Delight
    // We want to compute u / v
    typedef struct {
        uint64_t hi;
        uint64_t lo;
    } u128_t;
    u128_t u = {u_hi, u_lo};
    u128_t v = {v_hi, v_lo};

    if (v.hi == 0) {
        // divisor v is a 64 bit value, so we just need one 128/64 division
        // Note that we are simpler than Hacker's Delight here, because we know
        // the quotient fits in 64 bits whereas Hacker's Delight demands a full
        // 128 bit quotient
        *r_hi = 0;
        return libdivide_128_div_64_to_64(u.hi, u.lo, v.lo, r_lo);
    }
    // Here v >= 2**64
    // We know that v.hi != 0, so count leading zeros is OK
    // We have 0 <= n <= 63
    uint32_t n = libdivide_count_leading_zeros64(v.hi);

    // Normalize the divisor so its MSB is 1
    u128_t v1t = v;
    libdivide_u128_shift(&v1t.hi, &v1t.lo, n);
    uint64_t v1 = v1t.hi;  // i.e. v1 = v1t >> 64

    // To ensure no overflow
    u128_t u1 = u;
    libdivide_u128_shift(&u1.hi, &u1.lo, -1);

    // Get quotient from divide unsigned insn.
    uint64_t rem_ignored;
    uint64_t q1 = libdivide_128_div_64_to_64(u1.hi, u1.lo, v1, &rem_ignored);

    // Undo normalization and division of u by 2.
    u128_t q0 = {0, q1};
    libdivide_u128_shift(&q0.hi, &q0.lo, n);
    libdivide_u128_shift(&q0.hi, &q0.lo, -63);

    // Make q0 correct or too small by 1
    // Equivalent to `if (q0 != 0) q0 = q0 - 1;`
    if (q0.hi != 0 || q0.lo != 0) {
        q0.hi -= (q0.lo == 0);  // borrow
        q0.lo -= 1;
    }

    // Now q0 is correct.
    // Compute q0 * v as q0v
    // = (q0.hi << 64 + q0.lo) * (v.hi << 64 + v.lo)
    // = (q0.hi * v.hi << 128) + (q0.hi * v.lo << 64) +
    //   (q0.lo * v.hi << 64) + q0.lo * v.lo)
    // Each term is 128 bit
    // High half of full product (upper 128 bits!) are dropped
    u128_t q0v = {0, 0};
    q0v.hi = q0.hi * v.lo + q0.lo * v.hi + libdivide_mullhi_u64(q0.lo, v.lo);
    q0v.lo = q0.lo * v.lo;

    // Compute u - q0v as u_q0v
    // This is the remainder
    u128_t u_q0v = u;
    u_q0v.hi -= q0v.hi + (u.lo < q0v.lo);  // second term is borrow
    u_q0v.lo -= q0v.lo;

    // Check if u_q0v >= v
    // This checks if our remainder is larger than the divisor
    if ((u_q0v.hi > v.hi) || (u_q0v.hi == v.hi && u_q0v.lo >= v.lo)) {
        // Increment q0
        q0.lo += 1;
        q0.hi += (q0.lo == 0);  // carry

        // Subtract v from remainder
        u_q0v.hi -= v.hi + (u_q0v.lo < v.lo);
        u_q0v.lo -= v.lo;
    }

    *r_hi = u_q0v.hi;
    *r_lo = u_q0v.lo;

    LIBDIVIDE_ASSERT(q0.hi == 0);
    return q0.lo;
#endif
}

////////// UINT16

// Builds the (magic, more) pair for dividing 16-bit unsigned values by d.
// d == 0 is reported through LIBDIVIDE_ERROR. A non-zero branchfree selects
// the encoding used by the branchfree divider: the power-of-2 shift is stored
// reduced by one and the "add" algorithm is always chosen for other divisors.
static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_internal_u16_gen(
    uint16_t d, int branchfree) {
    if (d == 0) {
        LIBDIVIDE_ERROR("divider must be != 0");
    }

    struct libdivide_u16_t result;
    uint8_t floor_log_2_d = (uint8_t)(15 - libdivide_count_leading_zeros16(d));

    // Power of 2
    if ((d & (d - 1)) == 0) {
        // We need to subtract 1 from the shift value in case of an unsigned
        // branchfree divider because there is a hardcoded right shift by 1
        // in its division algorithm. Because of this we also need to add back
        // 1 in its recovery algorithm.
        result.magic = 0;
        result.more = (uint8_t)(floor_log_2_d - (branchfree != 0));
    } else {
        uint8_t more;
        uint16_t rem, proposed_m;
        // proposed_m = floor(2**(16 + floor_log_2_d) / d), rem = its remainder
        proposed_m = libdivide_32_div_16_to_16((uint16_t)1 << floor_log_2_d, 0, d, &rem);

        LIBDIVIDE_ASSERT(rem > 0 && rem < d);
        const uint16_t e = d - rem;

        // This power works if e < 2**floor_log_2_d.
        if (!branchfree && (e < ((uint16_t)1 << floor_log_2_d))) {
            // This power works
            more = floor_log_2_d;
        } else {
            // We have to use the general 17-bit algorithm.  We need to compute
            // (2**power) / d. However, we already have (2**(power-1))/d and
            // its remainder.  By doubling both, and then correcting the
            // remainder, we can compute the larger division.
            // don't care about overflow here - in fact, we expect it
            proposed_m += proposed_m;
            const uint16_t twice_rem = rem + rem;
            // "twice_rem < rem" detects wraparound of the 16-bit doubling.
            if (twice_rem >= d || twice_rem < rem) proposed_m += 1;
            more = floor_log_2_d | LIBDIVIDE_ADD_MARKER;
        }
        result.magic = 1 + proposed_m;
        result.more = more;
        // result.more's shift should in general be ceil_log_2_d. But if we
        // used the smaller power, we subtract one from the shift because we're
        // using the smaller power. If we're using the larger power, we
        // subtract one from the shift because it's taken care of by the add
        // indicator. So floor_log_2_d happens to be correct in both cases.
    }
    return result;
}

// Builds a (branching) u16 divider for d.
static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_u16_gen(uint16_t d) {
    return libdivide_internal_u16_gen(d, 0);
}

// Builds a branchfree u16 divider for d. d == 1 is reported through
// LIBDIVIDE_ERROR. Only the shift bits of `more` are kept in the result.
static LIBDIVIDE_INLINE struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d) {
    if (d == 1) {
        LIBDIVIDE_ERROR("branchfree divider must be != 1");
    }
    struct libdivide_u16_t tmp = libdivide_internal_u16_gen(d, 1);
    struct libdivide_u16_branchfree_t ret = {
        tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_16_SHIFT_MASK)};
    return ret;
}

// The original libdivide_u16_do takes a const pointer.
However, this cannot be used // with a compile time constant libdivide_u16_t: it will generate a warning about // taking the address of a temporary. Hence this overload. static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw(uint16_t numer, uint16_t magic, uint8_t more) { if (!magic) { return numer >> more; } else { uint16_t q = libdivide_mullhi_u16(numer, magic); if (more & LIBDIVIDE_ADD_MARKER) { uint16_t t = ((numer - q) >> 1) + q; return t >> (more & LIBDIVIDE_16_SHIFT_MASK); } else { // All upper bits are 0, // don't need to mask them off. return q >> more; } } } static LIBDIVIDE_INLINE uint16_t libdivide_u16_do(uint16_t numer, const struct libdivide_u16_t *denom) { return libdivide_u16_do_raw(numer, denom->magic, denom->more); } static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_do( uint16_t numer, const struct libdivide_u16_branchfree_t *denom) { uint16_t q = libdivide_mullhi_u16(numer, denom->magic); uint16_t t = ((numer - q) >> 1) + q; return t >> denom->more; } static LIBDIVIDE_INLINE uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom) { uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; if (!denom->magic) { return (uint16_t)1 << shift; } else if (!(more & LIBDIVIDE_ADD_MARKER)) { // We compute q = n/d = n*m / 2^(16 + shift) // Therefore we have d = 2^(16 + shift) / m // We need to ceil it. // We know d is not a power of 2, so m is not a power of 2, // so we can just add 1 to the floor uint16_t hi_dividend = (uint16_t)1 << shift; uint16_t rem_ignored; return 1 + libdivide_32_div_16_to_16(hi_dividend, 0, denom->magic, &rem_ignored); } else { // Here we wish to compute d = 2^(16+shift+1)/(m+2^16). // Notice (m + 2^16) is a 17 bit number. Use 32 bit division for now // Also note that shift may be as high as 15, so shift + 1 will // overflow. So we have to compute it as 2^(16+shift)/(m+2^16), and // then double the quotient and remainder. 
uint32_t half_n = (uint32_t)1 << (16 + shift); uint32_t d = ((uint32_t)1 << 16) | denom->magic; // Note that the quotient is guaranteed <= 16 bits, but the remainder // may need 17! uint16_t half_q = (uint16_t)(half_n / d); uint32_t rem = half_n % d; // We computed 2^(16+shift)/(m+2^16) // Need to double it, and then add 1 to the quotient if doubling th // remainder would increase the quotient. // Note that rem<<1 cannot overflow, since rem < d and d is 17 bits uint16_t full_q = half_q + half_q + ((rem << 1) >= d); // We rounded down in gen (hence +1) return full_q + 1; } } static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_recover(const struct libdivide_u16_branchfree_t *denom) { uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; if (!denom->magic) { return (uint16_t)1 << (shift + 1); } else { // Here we wish to compute d = 2^(16+shift+1)/(m+2^16). // Notice (m + 2^16) is a 17 bit number. Use 32 bit division for now // Also note that shift may be as high as 15, so shift + 1 will // overflow. So we have to compute it as 2^(16+shift)/(m+2^16), and // then double the quotient and remainder. uint32_t half_n = (uint32_t)1 << (16 + shift); uint32_t d = ((uint32_t)1 << 16) | denom->magic; // Note that the quotient is guaranteed <= 16 bits, but the remainder // may need 17! uint16_t half_q = (uint16_t)(half_n / d); uint32_t rem = half_n % d; // We computed 2^(16+shift)/(m+2^16) // Need to double it, and then add 1 to the quotient if doubling th // remainder would increase the quotient. 
// Note that rem<<1 cannot overflow, since rem < d and d is 33 bits uint16_t full_q = half_q + half_q + ((rem << 1) >= d); // We rounded down in gen (hence +1) return full_q + 1; } } ////////// UINT32 static LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_internal_u32_gen( uint32_t d, int branchfree) { if (d == 0) { LIBDIVIDE_ERROR("divider must be != 0"); } struct libdivide_u32_t result; uint32_t floor_log_2_d = 31 - libdivide_count_leading_zeros32(d); // Power of 2 if ((d & (d - 1)) == 0) { // We need to subtract 1 from the shift value in case of an unsigned // branchfree divider because there is a hardcoded right shift by 1 // in its division algorithm. Because of this we also need to add back // 1 in its recovery algorithm. result.magic = 0; result.more = (uint8_t)(floor_log_2_d - (branchfree != 0)); } else { uint8_t more; uint32_t rem, proposed_m; proposed_m = libdivide_64_div_32_to_32((uint32_t)1 << floor_log_2_d, 0, d, &rem); LIBDIVIDE_ASSERT(rem > 0 && rem < d); const uint32_t e = d - rem; // This power works if e < 2**floor_log_2_d. if (!branchfree && (e < ((uint32_t)1 << floor_log_2_d))) { // This power works more = (uint8_t)floor_log_2_d; } else { // We have to use the general 33-bit algorithm. We need to compute // (2**power) / d. However, we already have (2**(power-1))/d and // its remainder. By doubling both, and then correcting the // remainder, we can compute the larger division. // don't care about overflow here - in fact, we expect it proposed_m += proposed_m; const uint32_t twice_rem = rem + rem; if (twice_rem >= d || twice_rem < rem) proposed_m += 1; more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); } result.magic = 1 + proposed_m; result.more = more; // result.more's shift should in general be ceil_log_2_d. But if we // used the smaller power, we subtract one from the shift because we're // using the smaller power. If we're using the larger power, we // subtract one from the shift because it's taken care of by the add // indicator. 
So floor_log_2_d happens to be correct in both cases. } return result; } static LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_u32_gen(uint32_t d) { return libdivide_internal_u32_gen(d, 0); } static LIBDIVIDE_INLINE struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d) { if (d == 1) { LIBDIVIDE_ERROR("branchfree divider must be != 1"); } struct libdivide_u32_t tmp = libdivide_internal_u32_gen(d, 1); struct libdivide_u32_branchfree_t ret = { tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_32_SHIFT_MASK)}; return ret; } static LIBDIVIDE_INLINE uint32_t libdivide_u32_do_raw(uint32_t numer, uint32_t magic, uint8_t more) { if (!magic) { return numer >> more; } else { uint32_t q = libdivide_mullhi_u32(numer, magic); if (more & LIBDIVIDE_ADD_MARKER) { uint32_t t = ((numer - q) >> 1) + q; return t >> (more & LIBDIVIDE_32_SHIFT_MASK); } else { // All upper bits are 0, // don't need to mask them off. return q >> more; } } } static LIBDIVIDE_INLINE uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) { return libdivide_u32_do_raw(numer, denom->magic, denom->more); } static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_do( uint32_t numer, const struct libdivide_u32_branchfree_t *denom) { uint32_t q = libdivide_mullhi_u32(numer, denom->magic); uint32_t t = ((numer - q) >> 1) + q; return t >> denom->more; } static LIBDIVIDE_INLINE uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom) { uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; if (!denom->magic) { return (uint32_t)1 << shift; } else if (!(more & LIBDIVIDE_ADD_MARKER)) { // We compute q = n/d = n*m / 2^(32 + shift) // Therefore we have d = 2^(32 + shift) / m // We need to ceil it. 
// We know d is not a power of 2, so m is not a power of 2, // so we can just add 1 to the floor uint32_t hi_dividend = (uint32_t)1 << shift; uint32_t rem_ignored; return 1 + libdivide_64_div_32_to_32(hi_dividend, 0, denom->magic, &rem_ignored); } else { // Here we wish to compute d = 2^(32+shift+1)/(m+2^32). // Notice (m + 2^32) is a 33 bit number. Use 64 bit division for now // Also note that shift may be as high as 31, so shift + 1 will // overflow. So we have to compute it as 2^(32+shift)/(m+2^32), and // then double the quotient and remainder. uint64_t half_n = (uint64_t)1 << (32 + shift); uint64_t d = ((uint64_t)1 << 32) | denom->magic; // Note that the quotient is guaranteed <= 32 bits, but the remainder // may need 33! uint32_t half_q = (uint32_t)(half_n / d); uint64_t rem = half_n % d; // We computed 2^(32+shift)/(m+2^32) // Need to double it, and then add 1 to the quotient if doubling th // remainder would increase the quotient. // Note that rem<<1 cannot overflow, since rem < d and d is 33 bits uint32_t full_q = half_q + half_q + ((rem << 1) >= d); // We rounded down in gen (hence +1) return full_q + 1; } } static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_t *denom) { uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; if (!denom->magic) { return (uint32_t)1 << (shift + 1); } else { // Here we wish to compute d = 2^(32+shift+1)/(m+2^32). // Notice (m + 2^32) is a 33 bit number. Use 64 bit division for now // Also note that shift may be as high as 31, so shift + 1 will // overflow. So we have to compute it as 2^(32+shift)/(m+2^32), and // then double the quotient and remainder. uint64_t half_n = (uint64_t)1 << (32 + shift); uint64_t d = ((uint64_t)1 << 32) | denom->magic; // Note that the quotient is guaranteed <= 32 bits, but the remainder // may need 33! 
uint32_t half_q = (uint32_t)(half_n / d); uint64_t rem = half_n % d; // We computed 2^(32+shift)/(m+2^32) // Need to double it, and then add 1 to the quotient if doubling th // remainder would increase the quotient. // Note that rem<<1 cannot overflow, since rem < d and d is 33 bits uint32_t full_q = half_q + half_q + ((rem << 1) >= d); // We rounded down in gen (hence +1) return full_q + 1; } } ////////// UINT64 static LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_internal_u64_gen( uint64_t d, int branchfree) { if (d == 0) { LIBDIVIDE_ERROR("divider must be != 0"); } struct libdivide_u64_t result; uint32_t floor_log_2_d = 63 - libdivide_count_leading_zeros64(d); // Power of 2 if ((d & (d - 1)) == 0) { // We need to subtract 1 from the shift value in case of an unsigned // branchfree divider because there is a hardcoded right shift by 1 // in its division algorithm. Because of this we also need to add back // 1 in its recovery algorithm. result.magic = 0; result.more = (uint8_t)(floor_log_2_d - (branchfree != 0)); } else { uint64_t proposed_m, rem; uint8_t more; // (1 << (64 + floor_log_2_d)) / d proposed_m = libdivide_128_div_64_to_64((uint64_t)1 << floor_log_2_d, 0, d, &rem); LIBDIVIDE_ASSERT(rem > 0 && rem < d); const uint64_t e = d - rem; // This power works if e < 2**floor_log_2_d. if (!branchfree && e < ((uint64_t)1 << floor_log_2_d)) { // This power works more = (uint8_t)floor_log_2_d; } else { // We have to use the general 65-bit algorithm. We need to compute // (2**power) / d. However, we already have (2**(power-1))/d and // its remainder. By doubling both, and then correcting the // remainder, we can compute the larger division. 
// don't care about overflow here - in fact, we expect it proposed_m += proposed_m; const uint64_t twice_rem = rem + rem; if (twice_rem >= d || twice_rem < rem) proposed_m += 1; more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); } result.magic = 1 + proposed_m; result.more = more; // result.more's shift should in general be ceil_log_2_d. But if we // used the smaller power, we subtract one from the shift because we're // using the smaller power. If we're using the larger power, we // subtract one from the shift because it's taken care of by the add // indicator. So floor_log_2_d happens to be correct in both cases, // which is why we do it outside of the if statement. } return result; } static LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_u64_gen(uint64_t d) { return libdivide_internal_u64_gen(d, 0); } static LIBDIVIDE_INLINE struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d) { if (d == 1) { LIBDIVIDE_ERROR("branchfree divider must be != 1"); } struct libdivide_u64_t tmp = libdivide_internal_u64_gen(d, 1); struct libdivide_u64_branchfree_t ret = { tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_64_SHIFT_MASK)}; return ret; } static LIBDIVIDE_INLINE uint64_t libdivide_u64_do_raw(uint64_t numer, uint64_t magic, uint8_t more) { if (!magic) { return numer >> more; } else { uint64_t q = libdivide_mullhi_u64(numer, magic); if (more & LIBDIVIDE_ADD_MARKER) { uint64_t t = ((numer - q) >> 1) + q; return t >> (more & LIBDIVIDE_64_SHIFT_MASK); } else { // All upper bits are 0, // don't need to mask them off. 
return q >> more; } } } static LIBDIVIDE_INLINE uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) { return libdivide_u64_do_raw(numer, denom->magic, denom->more); } static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_do( uint64_t numer, const struct libdivide_u64_branchfree_t *denom) { uint64_t q = libdivide_mullhi_u64(numer, denom->magic); uint64_t t = ((numer - q) >> 1) + q; return t >> denom->more; } static LIBDIVIDE_INLINE uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom) { uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; if (!denom->magic) { return (uint64_t)1 << shift; } else if (!(more & LIBDIVIDE_ADD_MARKER)) { // We compute q = n/d = n*m / 2^(64 + shift) // Therefore we have d = 2^(64 + shift) / m // We need to ceil it. // We know d is not a power of 2, so m is not a power of 2, // so we can just add 1 to the floor uint64_t hi_dividend = (uint64_t)1 << shift; uint64_t rem_ignored; return 1 + libdivide_128_div_64_to_64(hi_dividend, 0, denom->magic, &rem_ignored); } else { // Here we wish to compute d = 2^(64+shift+1)/(m+2^64). // Notice (m + 2^64) is a 65 bit number. This gets hairy. See // libdivide_u32_recover for more on what we do here. // TODO: do something better than 128 bit math // Full n is a (potentially) 129 bit value // half_n is a 128 bit value // Compute the hi half of half_n. Low half is 0. uint64_t half_n_hi = (uint64_t)1 << shift, half_n_lo = 0; // d is a 65 bit value. The high bit is always set to 1. const uint64_t d_hi = 1, d_lo = denom->magic; // Note that the quotient is guaranteed <= 64 bits, // but the remainder may need 65! 
uint64_t r_hi, r_lo; uint64_t half_q = libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo); // We computed 2^(64+shift)/(m+2^64) // Double the remainder ('dr') and check if that is larger than d // Note that d is a 65 bit value, so r1 is small and so r1 + r1 // cannot overflow uint64_t dr_lo = r_lo + r_lo; uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo); // last term is carry int dr_exceeds_d = (dr_hi > d_hi) || (dr_hi == d_hi && dr_lo >= d_lo); uint64_t full_q = half_q + half_q + (dr_exceeds_d ? 1 : 0); return full_q + 1; } } static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_t *denom) { uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; if (!denom->magic) { return (uint64_t)1 << (shift + 1); } else { // Here we wish to compute d = 2^(64+shift+1)/(m+2^64). // Notice (m + 2^64) is a 65 bit number. This gets hairy. See // libdivide_u32_recover for more on what we do here. // TODO: do something better than 128 bit math // Full n is a (potentially) 129 bit value // half_n is a 128 bit value // Compute the hi half of half_n. Low half is 0. uint64_t half_n_hi = (uint64_t)1 << shift, half_n_lo = 0; // d is a 65 bit value. The high bit is always set to 1. const uint64_t d_hi = 1, d_lo = denom->magic; // Note that the quotient is guaranteed <= 64 bits, // but the remainder may need 65! uint64_t r_hi, r_lo; uint64_t half_q = libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo); // We computed 2^(64+shift)/(m+2^64) // Double the remainder ('dr') and check if that is larger than d // Note that d is a 65 bit value, so r1 is small and so r1 + r1 // cannot overflow uint64_t dr_lo = r_lo + r_lo; uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo); // last term is carry int dr_exceeds_d = (dr_hi > d_hi) || (dr_hi == d_hi && dr_lo >= d_lo); uint64_t full_q = half_q + half_q + (dr_exceeds_d ? 
1 : 0); return full_q + 1; } } ////////// SINT16 static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_internal_s16_gen( int16_t d, int branchfree) { if (d == 0) { LIBDIVIDE_ERROR("divider must be != 0"); } struct libdivide_s16_t result; // If d is a power of 2, or negative a power of 2, we have to use a shift. // This is especially important because the magic algorithm fails for -1. // To check if d is a power of 2 or its inverse, it suffices to check // whether its absolute value has exactly one bit set. This works even for // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set // and is a power of 2. uint16_t ud = (uint16_t)d; uint16_t absD = (d < 0) ? -ud : ud; uint16_t floor_log_2_d = 15 - libdivide_count_leading_zeros16(absD); // check if exactly one bit is set, // don't care if absD is 0 since that's divide by zero if ((absD & (absD - 1)) == 0) { // Branchfree and normal paths are exactly the same result.magic = 0; result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0)); } else { LIBDIVIDE_ASSERT(floor_log_2_d >= 1); uint8_t more; // the dividend here is 2**(floor_log_2_d + 31), so the low 16 bit word // is 0 and the high word is floor_log_2_d - 1 uint16_t rem, proposed_m; proposed_m = libdivide_32_div_16_to_16((uint16_t)1 << (floor_log_2_d - 1), 0, absD, &rem); const uint16_t e = absD - rem; // We are going to start with a power of floor_log_2_d - 1. // This works if works if e < 2**floor_log_2_d. if (!branchfree && e < ((uint16_t)1 << floor_log_2_d)) { // This power works more = (uint8_t)(floor_log_2_d - 1); } else { // We need to go one higher. This should not make proposed_m // overflow, but it will make it negative when interpreted as an // int16_t. proposed_m += proposed_m; const uint16_t twice_rem = rem + rem; if (twice_rem >= absD || twice_rem < rem) proposed_m += 1; more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); } proposed_m += 1; int16_t magic = (int16_t)proposed_m; // Mark if we are negative. 
Note we only negate the magic number in the // branchfull case. if (d < 0) { more |= LIBDIVIDE_NEGATIVE_DIVISOR; if (!branchfree) { magic = -magic; } } result.more = more; result.magic = magic; } return result; } static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_s16_gen(int16_t d) { return libdivide_internal_s16_gen(d, 0); } static LIBDIVIDE_INLINE struct libdivide_s16_branchfree_t libdivide_s16_branchfree_gen(int16_t d) { struct libdivide_s16_t tmp = libdivide_internal_s16_gen(d, 1); struct libdivide_s16_branchfree_t result = {tmp.magic, tmp.more}; return result; } // The original libdivide_s16_do takes a const pointer. However, this cannot be used // with a compile time constant libdivide_s16_t: it will generate a warning about // taking the address of a temporary. Hence this overload. static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw(int16_t numer, int16_t magic, uint8_t more) { uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; if (!magic) { uint16_t sign = (int8_t)more >> 7; uint16_t mask = ((uint16_t)1 << shift) - 1; uint16_t uq = numer + ((numer >> 15) & mask); int16_t q = (int16_t)uq; q >>= shift; q = (q ^ sign) - sign; return q; } else { uint16_t uq = (uint16_t)libdivide_mullhi_s16(numer, magic); if (more & LIBDIVIDE_ADD_MARKER) { // must be arithmetic shift and then sign extend int16_t sign = (int8_t)more >> 7; // q += (more < 0 ? 
-numer : numer) // cast required to avoid UB
            uq += ((uint16_t)numer ^ sign) - sign;
        }
        int16_t q = (int16_t)uq;
        q >>= shift;
        q += (q < 0);
        return q;
    }
}

// Divides numer by the divisor encoded in denom (branchfull variant).
// Thin wrapper: unpacks denom->magic / denom->more and forwards them to
// libdivide_s16_do_raw.
static LIBDIVIDE_INLINE int16_t libdivide_s16_do(
    int16_t numer, const struct libdivide_s16_t *denom) {
    return libdivide_s16_do_raw(numer, denom->magic, denom->more);
}

// Divides numer by the divisor encoded in denom without any data-dependent
// branch: the power-of-two case and the sign of the divisor are both folded
// into mask arithmetic.
static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_do(
    int16_t numer, const struct libdivide_s16_branchfree_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
    // must be arithmetic shift and then sign extend
    // (sign becomes -1 when bit 7 of `more` is set, otherwise 0)
    int16_t sign = (int8_t)more >> 7;
    int16_t magic = denom->magic;
    int16_t q = libdivide_mullhi_s16(numer, magic);
    q += numer;

    // If q is non-negative, we have nothing to do
    // If q is negative, we want to add either (2**shift)-1 if d is a power of
    // 2, or (2**shift) if it is not a power of 2
    uint16_t is_power_of_2 = (magic == 0);
    // q_sign is all ones when q is negative, all zeros otherwise.
    uint16_t q_sign = (uint16_t)(q >> 15);
    q += q_sign & (((uint16_t)1 << shift) - is_power_of_2);

    // Now arithmetic right shift
    q >>= shift;
    // Negate if needed: (q ^ -1) - (-1) == -q, (q ^ 0) - 0 == q
    q = (q ^ sign) - sign;

    return q;
}

// Reconstructs a divisor from a libdivide_s16_t.
// Shift path (magic == 0): returns +/- 2**shift, negated when
// LIBDIVIDE_NEGATIVE_DIVISOR is set in `more`.
// Magic path: returns +/- (floor(2**(16 + shift) / |magic|) + 1).
static LIBDIVIDE_INLINE int16_t libdivide_s16_recover(const struct libdivide_s16_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
    if (!denom->magic) {
        uint16_t absD = (uint16_t)1 << shift;
        if (more & LIBDIVIDE_NEGATIVE_DIVISOR) {
            absD = -absD;
        }
        return (int16_t)absD;
    } else {
        // Unsigned math is much easier
        // We negate the magic number only in the branchfull case, and we don't
        // know which case we're in. However we have enough information to
        // determine the correct sign of the magic number. The divisor was
        // negative if LIBDIVIDE_NEGATIVE_DIVISOR is set. If ADD_MARKER is set,
        // the magic number's sign is opposite that of the divisor.
        // We want to compute the positive magic number.
        int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR);
        int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0;

        // Handle the power of 2 case (including branchfree)
        // NOTE(review): this branch cannot be taken here -- the enclosing
        // `if (!denom->magic)` has already returned for magic == 0.
        if (denom->magic == 0) {
            int16_t result = (uint16_t)1 << shift;
            return negative_divisor ? -result : result;
        }

        uint16_t d = (uint16_t)(magic_was_negated ? -denom->magic : denom->magic);
        uint32_t n = (uint32_t)1 << (16 + shift);  // this shift cannot exceed 30
        uint16_t q = (uint16_t)(n / d);
        int16_t result = (int16_t)q;
        result += 1;
        return negative_divisor ? -result : result;
    }
}

// Branchfree counterpart of libdivide_s16_recover: copies magic/more into a
// libdivide_s16_t and reuses the branchfull recovery routine.
static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_recover(
    const struct libdivide_s16_branchfree_t *denom) {
    const struct libdivide_s16_t den = {denom->magic, denom->more};
    return libdivide_s16_recover(&den);
}

////////// SINT32

// Builds the precomputed divider for a signed 32-bit divisor.
//   d          - the divisor; LIBDIVIDE_ERROR is invoked when it is 0.
//   branchfree - nonzero when generating for the branchfree algorithm: the
//                ADD_MARKER form is then always used for non-powers of two and
//                the magic number is never negated.
// Returns a libdivide_s32_t with magic == 0 for +/- powers of two (pure shift);
// `more` packs the shift amount together with the LIBDIVIDE_ADD_MARKER and
// LIBDIVIDE_NEGATIVE_DIVISOR flags.
static LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_internal_s32_gen(
    int32_t d, int branchfree) {
    if (d == 0) {
        LIBDIVIDE_ERROR("divider must be != 0");
    }

    struct libdivide_s32_t result;

    // If d is a power of 2, or negative a power of 2, we have to use a shift.
    // This is especially important because the magic algorithm fails for -1.
    // To check if d is a power of 2 or its inverse, it suffices to check
    // whether its absolute value has exactly one bit set. This works even for
    // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set
    // and is a power of 2.
    uint32_t ud = (uint32_t)d;
    // Negate in unsigned arithmetic so that INT_MIN does not overflow.
    uint32_t absD = (d < 0) ? -ud : ud;
    uint32_t floor_log_2_d = 31 - libdivide_count_leading_zeros32(absD);
    // check if exactly one bit is set,
    // don't care if absD is 0 since that's divide by zero
    if ((absD & (absD - 1)) == 0) {
        // Branchfree and normal paths are exactly the same
        result.magic = 0;
        result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0));
    } else {
        LIBDIVIDE_ASSERT(floor_log_2_d >= 1);

        uint8_t more;
        // the dividend here is 2**(floor_log_2_d + 31), so the low 32 bit word
        // is 0 and the high word is 2**(floor_log_2_d - 1)
        uint32_t rem, proposed_m;
        proposed_m = libdivide_64_div_32_to_32((uint32_t)1 << (floor_log_2_d - 1), 0, absD, &rem);
        const uint32_t e = absD - rem;

        // We are going to start with a power of floor_log_2_d - 1.
        // This works if e < 2**floor_log_2_d.
        if (!branchfree && e < ((uint32_t)1 << floor_log_2_d)) {
            // This power works
            more = (uint8_t)(floor_log_2_d - 1);
        } else {
            // We need to go one higher. This should not make proposed_m
            // overflow, but it will make it negative when interpreted as an
            // int32_t.
            proposed_m += proposed_m;
            const uint32_t twice_rem = rem + rem;
            // Carry into the doubled quotient when the doubled remainder
            // reaches absD (or wrapped around).
            if (twice_rem >= absD || twice_rem < rem) proposed_m += 1;
            more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);
        }

        proposed_m += 1;
        int32_t magic = (int32_t)proposed_m;

        // Mark if we are negative. Note we only negate the magic number in the
        // branchfull case.
        if (d < 0) {
            more |= LIBDIVIDE_NEGATIVE_DIVISOR;
            if (!branchfree) {
                magic = -magic;
            }
        }

        result.more = more;
        result.magic = magic;
    }
    return result;
}

// Generates the branchfull divider for d (libdivide_internal_s32_gen with
// branchfree == 0).
static LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_s32_gen(int32_t d) {
    return libdivide_internal_s32_gen(d, 0);
}

// Generates the branchfree divider for d (libdivide_internal_s32_gen with
// branchfree == 1) and repackages it as a libdivide_s32_branchfree_t.
static LIBDIVIDE_INLINE struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d) {
    struct libdivide_s32_t tmp = libdivide_internal_s32_gen(d, 1);
    struct libdivide_s32_branchfree_t result = {tmp.magic, tmp.more};
    return result;
}

// Core of the branchfull signed 32-bit division, taking the raw magic/more
// pair instead of a struct.
//   magic == 0 : the divisor is +/- 2**shift. Negative numerators are biased by
//                (2**shift - 1) before the arithmetic shift, then the quotient
//                is negated when bit 7 of `more` is set.
//   otherwise  : high word of numer * magic, plus or minus numer when
//                LIBDIVIDE_ADD_MARKER is set, arithmetic shift, then +1 for
//                negative quotients.
static LIBDIVIDE_INLINE int32_t libdivide_s32_do_raw(int32_t numer, int32_t magic, uint8_t more) {
    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;

    if (!magic) {
        // sign is all ones when bit 7 of `more` is set, otherwise 0
        uint32_t sign = (int8_t)more >> 7;
        uint32_t mask = ((uint32_t)1 << shift) - 1;
        // (numer >> 31) is all ones for negative numer, so the bias is applied
        // only to negative numerators.
        uint32_t uq = numer + ((numer >> 31) & mask);
        int32_t q = (int32_t)uq;
        q >>= shift;
        q = (q ^ sign) - sign;
        return q;
    } else {
        uint32_t uq = (uint32_t)libdivide_mullhi_s32(numer, magic);
        if (more & LIBDIVIDE_ADD_MARKER) {
            // must be arithmetic shift and then sign extend
            int32_t sign = (int8_t)more >> 7;
            // q += (more < 0 ? -numer : numer)
            // cast required to avoid UB
            uq += ((uint32_t)numer ^ sign) - sign;
        }
        int32_t q = (int32_t)uq;
        q >>= shift;
        q += (q < 0);
        return q;
    }
}

// Divides numer by the divisor encoded in denom (branchfull variant).
// Thin wrapper around libdivide_s32_do_raw.
static LIBDIVIDE_INLINE int32_t libdivide_s32_do(
    int32_t numer, const struct libdivide_s32_t *denom) {
    return libdivide_s32_do_raw(numer, denom->magic, denom->more);
}

// Divides numer by the divisor encoded in denom without any data-dependent
// branch; same scheme as the 16-bit branchfree routine, at 32-bit width.
static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_do(
    int32_t numer, const struct libdivide_s32_branchfree_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    // must be arithmetic shift and then sign extend
    int32_t sign = (int8_t)more >> 7;
    int32_t magic = denom->magic;
    int32_t q = libdivide_mullhi_s32(numer, magic);
    q += numer;

    // If q is non-negative, we have nothing to do
    // If q is negative, we want to add either (2**shift)-1 if d is a power of
    // 2, or (2**shift) if it is not a power of 2
    uint32_t is_power_of_2 = (magic == 0);
    uint32_t q_sign = (uint32_t)(q >> 31);
    q += q_sign & (((uint32_t)1 << shift) - is_power_of_2);

    // Now arithmetic right shift
    q >>= shift;
    // Negate if needed
    q = (q ^ sign) - sign;

    return q;
}

// Reconstructs a divisor from a libdivide_s32_t.
// Shift path (magic == 0): returns +/- 2**shift.
// Magic path: returns +/- (floor(2**(32 + shift) / |magic|) + 1).
static LIBDIVIDE_INLINE int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    if (!denom->magic) {
        uint32_t absD = (uint32_t)1 << shift;
        if (more & LIBDIVIDE_NEGATIVE_DIVISOR) {
            absD = -absD;
        }
        return (int32_t)absD;
    } else {
        // Unsigned math is much easier
        // We negate the magic number only in the branchfull case, and we don't
        // know which case we're in. However we have enough information to
        // determine the correct sign of the magic number. The divisor was
        // negative if LIBDIVIDE_NEGATIVE_DIVISOR is set. If ADD_MARKER is set,
        // the magic number's sign is opposite that of the divisor.
        // We want to compute the positive magic number.
        int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR);
        int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0;

        // Handle the power of 2 case (including branchfree)
        // NOTE(review): this branch cannot be taken here -- the enclosing
        // `if (!denom->magic)` has already returned for magic == 0.
        if (denom->magic == 0) {
            int32_t result = (uint32_t)1 << shift;
            return negative_divisor ? -result : result;
        }

        uint32_t d = (uint32_t)(magic_was_negated ? -denom->magic : denom->magic);
        // NOTE(review): the "cannot exceed 30" remark looks copied from the
        // 16-bit variant; the shift amount here is 32 + shift, which is why the
        // operand is widened to uint64_t.
        uint64_t n = (uint64_t)1 << (32 + shift);  // this shift cannot exceed 30
        uint32_t q = (uint32_t)(n / d);
        int32_t result = (int32_t)q;
        result += 1;
        return negative_divisor ? -result : result;
    }
}

// Branchfree counterpart of libdivide_s32_recover: copies magic/more into a
// libdivide_s32_t and reuses the branchfull recovery routine.
static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_recover(
    const struct libdivide_s32_branchfree_t *denom) {
    const struct libdivide_s32_t den = {denom->magic, denom->more};
    return libdivide_s32_recover(&den);
}

////////// SINT64

// Builds the precomputed divider for a signed 64-bit divisor.
//   d          - the divisor; LIBDIVIDE_ERROR is invoked when it is 0.
//   branchfree - nonzero when generating for the branchfree algorithm: the
//                ADD_MARKER form is then always used for non-powers of two and
//                the magic number is never negated.
// Returns a libdivide_s64_t with magic == 0 for +/- powers of two (pure shift);
// `more` packs the shift amount together with the LIBDIVIDE_ADD_MARKER and
// LIBDIVIDE_NEGATIVE_DIVISOR flags.
static LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_internal_s64_gen(
    int64_t d, int branchfree) {
    if (d == 0) {
        LIBDIVIDE_ERROR("divider must be != 0");
    }

    struct libdivide_s64_t result;

    // If d is a power of 2, or negative a power of 2, we have to use a shift.
    // This is especially important because the magic algorithm fails for -1.
    // To check if d is a power of 2 or its inverse, it suffices to check
    // whether its absolute value has exactly one bit set. This works even for
    // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set
    // and is a power of 2.
    uint64_t ud = (uint64_t)d;
    // Negate in unsigned arithmetic so that INT64_MIN does not overflow.
    uint64_t absD = (d < 0) ? -ud : ud;
    uint32_t floor_log_2_d = 63 - libdivide_count_leading_zeros64(absD);
    // check if exactly one bit is set,
    // don't care if absD is 0 since that's divide by zero
    if ((absD & (absD - 1)) == 0) {
        // Branchfree and non-branchfree cases are the same
        result.magic = 0;
        result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0));
    } else {
        // the dividend here is 2**(floor_log_2_d + 63), so the low 64 bit word
        // is 0 and the high word is 2**(floor_log_2_d - 1)
        uint8_t more;
        uint64_t rem, proposed_m;
        proposed_m = libdivide_128_div_64_to_64((uint64_t)1 << (floor_log_2_d - 1), 0, absD, &rem);
        const uint64_t e = absD - rem;

        // We are going to start with a power of floor_log_2_d - 1.
        // This works if e < 2**floor_log_2_d.
        if (!branchfree && e < ((uint64_t)1 << floor_log_2_d)) {
            // This power works
            more = (uint8_t)(floor_log_2_d - 1);
        } else {
            // We need to go one higher. This should not make proposed_m
            // overflow, but it will make it negative when interpreted as an
            // int64_t.
            proposed_m += proposed_m;
            const uint64_t twice_rem = rem + rem;
            if (twice_rem >= absD || twice_rem < rem) proposed_m += 1;
            // note that we only set the LIBDIVIDE_NEGATIVE_DIVISOR bit if we
            // also set ADD_MARKER this is an annoying optimization that
            // enables algorithm #4 to avoid the mask. However we always set it
            // in the branchfree case
            // NOTE(review): the code below sets LIBDIVIDE_NEGATIVE_DIVISOR
            // whenever d < 0, independent of ADD_MARKER, so the note above
            // appears to be out of date.
            more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);
        }
        proposed_m += 1;
        int64_t magic = (int64_t)proposed_m;

        // Mark if we are negative
        if (d < 0) {
            more |= LIBDIVIDE_NEGATIVE_DIVISOR;
            if (!branchfree) {
                magic = -magic;
            }
        }

        result.more = more;
        result.magic = magic;
    }
    return result;
}

// Generates the branchfull divider for d (libdivide_internal_s64_gen with
// branchfree == 0).
static LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_s64_gen(int64_t d) {
    return libdivide_internal_s64_gen(d, 0);
}

// Generates the branchfree divider for d (libdivide_internal_s64_gen with
// branchfree == 1) and repackages it as a libdivide_s64_branchfree_t.
static LIBDIVIDE_INLINE struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d) {
    struct libdivide_s64_t tmp = libdivide_internal_s64_gen(d, 1);
    struct libdivide_s64_branchfree_t ret = {tmp.magic, tmp.more};
    return ret;
}

// Core of the branchfull signed 64-bit division, taking the raw magic/more
// pair instead of a struct. Same two paths as the 32-bit version: a biased
// arithmetic shift when magic == 0, otherwise multiply-high plus correction.
static LIBDIVIDE_INLINE int64_t libdivide_s64_do_raw(int64_t numer, int64_t magic, uint8_t more) {
    uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;

    if (!magic) {  // shift path
        uint64_t mask = ((uint64_t)1 << shift) - 1;
        // Bias negative numerators by (2**shift - 1) before shifting.
        uint64_t uq = numer + ((numer >> 63) & mask);
        int64_t q = (int64_t)uq;
        q >>= shift;
        // must be arithmetic shift and then sign-extend
        int64_t sign = (int8_t)more >> 7;
        q = (q ^ sign) - sign;
        return q;
    } else {
        uint64_t uq = (uint64_t)libdivide_mullhi_s64(numer, magic);
        if (more & LIBDIVIDE_ADD_MARKER) {
            // must be arithmetic shift and then sign extend
            int64_t sign = (int8_t)more >> 7;
            // q += (more < 0 ? -numer : numer)
            // cast required to avoid UB
            uq += ((uint64_t)numer ^ sign) - sign;
        }
        int64_t q = (int64_t)uq;
        q >>= shift;
        q += (q < 0);
        return q;
    }
}

// Divides numer by the divisor encoded in denom (branchfull variant).
// Thin wrapper around libdivide_s64_do_raw.
static LIBDIVIDE_INLINE int64_t libdivide_s64_do(
    int64_t numer, const struct libdivide_s64_t *denom) {
    return libdivide_s64_do_raw(numer, denom->magic, denom->more);
}

// Divides numer by the divisor encoded in denom without any data-dependent
// branch; same scheme as the 16/32-bit branchfree routines, at 64-bit width.
static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_do(
    int64_t numer, const struct libdivide_s64_branchfree_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    // must be arithmetic shift and then sign extend
    int64_t sign = (int8_t)more >> 7;
    int64_t magic = denom->magic;
    int64_t q = libdivide_mullhi_s64(numer, magic);
    q += numer;

    // If q is non-negative, we have nothing to do.
    // If q is negative, we want to add either (2**shift)-1 if d is a power of
    // 2, or (2**shift) if it is not a power of 2.
    uint64_t is_power_of_2 = (magic == 0);
    uint64_t q_sign = (uint64_t)(q >> 63);
    q += q_sign & (((uint64_t)1 << shift) - is_power_of_2);

    // Arithmetic right shift
    q >>= shift;
    // Negate if needed
    q = (q ^ sign) - sign;

    return q;
}

// Reconstructs a divisor from a libdivide_s64_t.
// Shift path (magic == 0): returns +/- 2**shift.
// Magic path: returns +/- (floor(2**(64 + shift) / |magic|) + 1), using the
// 128-by-64 division helper because the dividend does not fit in 64 bits.
static LIBDIVIDE_INLINE int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    if (denom->magic == 0) {  // shift path
        uint64_t absD = (uint64_t)1 << shift;
        if (more & LIBDIVIDE_NEGATIVE_DIVISOR) {
            absD = -absD;
        }
        return (int64_t)absD;
    } else {
        // Unsigned math is much easier
        int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR);
        int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0;

        uint64_t d = (uint64_t)(magic_was_negated ? -denom->magic : denom->magic);
        // Dividend is the 128-bit value (n_hi << 64) | n_lo == 2**(64 + shift).
        uint64_t n_hi = (uint64_t)1 << shift, n_lo = 0;
        uint64_t rem_ignored;
        uint64_t q = libdivide_128_div_64_to_64(n_hi, n_lo, d, &rem_ignored);
        int64_t result = (int64_t)(q + 1);
        if (negative_divisor) {
            result = -result;
        }
        return result;
    }
}

// Branchfree counterpart of libdivide_s64_recover: copies magic/more into a
// libdivide_s64_t and reuses the branchfull recovery routine.
static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_recover(
    const struct libdivide_s64_branchfree_t *denom) {
    const struct libdivide_s64_t den = {denom->magic, denom->more};
    return libdivide_s64_recover(&den);
}

// Simplest possible vector type division: treat the vector type as an array
// of underlying native type.
//
// Use a union to read a vector via pointer-to-integer, without violating strict
// aliasing.
//
// The macro expands to a complete function body (it ends in `return`), and it
// refers to `numers` and `denom` by name, so the enclosing function must use
// exactly those parameter names. Each element is divided with the scalar
// routine libdivide_<Algo>_do.
#define SIMPLE_VECTOR_DIVISION(IntT, VecT, Algo)                          \
    const size_t count = sizeof(VecT) / sizeof(IntT);                     \
    union type_pun_vec {                                                  \
        VecT vec;                                                         \
        IntT arr[sizeof(VecT) / sizeof(IntT)];                            \
    };                                                                    \
    union type_pun_vec result;                                            \
    union type_pun_vec input;                                             \
    input.vec = numers;                                                   \
    for (size_t loop = 0; loop < count; ++loop) {                         \
        result.arr[loop] = libdivide_##Algo##_do(input.arr[loop], denom); \
    }                                                                     \
    return result.vec;

#if defined(LIBDIVIDE_NEON)

// Forward declarations of the NEON (128-bit) vector division entry points:
// one branchfull and one branchfree routine per element type.
static LIBDIVIDE_INLINE uint16x8_t libdivide_u16_do_vec128(
    uint16x8_t numers, const struct libdivide_u16_t *denom);
static LIBDIVIDE_INLINE int16x8_t libdivide_s16_do_vec128(
    int16x8_t numers, const struct libdivide_s16_t *denom);
static LIBDIVIDE_INLINE uint32x4_t libdivide_u32_do_vec128(
    uint32x4_t numers, const struct libdivide_u32_t *denom);
static LIBDIVIDE_INLINE int32x4_t libdivide_s32_do_vec128(
    int32x4_t numers, const struct libdivide_s32_t *denom);
static LIBDIVIDE_INLINE uint64x2_t libdivide_u64_do_vec128(
    uint64x2_t numers, const struct libdivide_u64_t *denom);
static LIBDIVIDE_INLINE int64x2_t libdivide_s64_do_vec128(
    int64x2_t numers, const struct libdivide_s64_t *denom);

static LIBDIVIDE_INLINE uint16x8_t libdivide_u16_branchfree_do_vec128(
    uint16x8_t numers, const struct libdivide_u16_branchfree_t *denom);
static LIBDIVIDE_INLINE int16x8_t libdivide_s16_branchfree_do_vec128(
    int16x8_t numers, const struct libdivide_s16_branchfree_t *denom);
static LIBDIVIDE_INLINE uint32x4_t libdivide_u32_branchfree_do_vec128(
    uint32x4_t numers, const struct libdivide_u32_branchfree_t *denom);
static LIBDIVIDE_INLINE int32x4_t libdivide_s32_branchfree_do_vec128(
    int32x4_t numers, const struct libdivide_s32_branchfree_t *denom);
static LIBDIVIDE_INLINE uint64x2_t libdivide_u64_branchfree_do_vec128(
    uint64x2_t numers, const struct libdivide_u64_branchfree_t *denom);
static LIBDIVIDE_INLINE int64x2_t libdivide_s64_branchfree_do_vec128(
    int64x2_t numers, const struct libdivide_s64_branchfree_t *denom);

//////// Internal Utility Functions

// Logical right shift by runtime value.
// NEON implements right shift as left shifts by negative values.
static LIBDIVIDE_INLINE uint32x4_t libdivide_u32_neon_srl(uint32x4_t v, uint8_t amt) {
    // Widen to a signed type first so that the negation below is well defined.
    int32_t wamt = (int32_t)(amt);
    return vshlq_u32(v, vdupq_n_s32(-wamt));
}

// 64-bit lane version of the logical right shift by a runtime amount.
static LIBDIVIDE_INLINE uint64x2_t libdivide_u64_neon_srl(uint64x2_t v, uint8_t amt) {
    int64_t wamt = (int64_t)(amt);
    return vshlq_u64(v, vdupq_n_s64(-wamt));
}

// Arithmetic right shift by runtime value.
static LIBDIVIDE_INLINE int32x4_t libdivide_s32_neon_sra(int32x4_t v, uint8_t amt) {
    // Shifts every signed 32-bit lane right by amt, expressed as a left shift
    // by -amt on the signed vector type.
    int32_t wamt = (int32_t)(amt);
    return vshlq_s32(v, vdupq_n_s32(-wamt));
}

// Shifts every signed 64-bit lane right by amt, expressed as a left shift by
// -amt on the signed vector type.
static LIBDIVIDE_INLINE int64x2_t libdivide_s64_neon_sra(int64x2_t v, uint8_t amt) {
    int64_t wamt = (int64_t)(amt);
    return vshlq_s64(v, vdupq_n_s64(-wamt));
}

// Replaces each 64-bit lane with its sign mask: all ones for negative lanes,
// zero otherwise (arithmetic shift right by 63).
static LIBDIVIDE_INLINE int64x2_t libdivide_s64_signbits(int64x2_t v) { return vshrq_n_s64(v, 63); }

// Per lane: high 32 bits of the unsigned 64-bit product a[i] * b.
static LIBDIVIDE_INLINE uint32x4_t libdivide_mullhi_u32_vec128(uint32x4_t a, uint32_t b) {
    // Desire is [x0, x1, x2, x3]
    uint32x4_t w1 = vreinterpretq_u32_u64(vmull_n_u32(vget_low_u32(a), b));  // [_, x0, _, x1]
    uint32x4_t w2 = vreinterpretq_u32_u64(vmull_high_n_u32(a, b));           // [_, x2, _, x3]
    return vuzp2q_u32(w1, w2);                                               // [x0, x1, x2, x3]
}

// Per lane: high 32 bits of the signed 64-bit product a[i] * b.
static LIBDIVIDE_INLINE int32x4_t libdivide_mullhi_s32_vec128(int32x4_t a, int32_t b) {
    int32x4_t w1 = vreinterpretq_s32_s64(vmull_n_s32(vget_low_s32(a), b));  // [_, x0, _, x1]
    int32x4_t w2 = vreinterpretq_s32_s64(vmull_high_n_s32(a, b));           // [_, x2, _, x3]
    return vuzp2q_s32(w1, w2);                                              // [x0, x1, x2, x3]
}

// Per lane: high 64 bits of the unsigned 128-bit product x[i] * sy, assembled
// from four 32x32->64 partial products.
static LIBDIVIDE_INLINE uint64x2_t libdivide_mullhi_u64_vec128(uint64x2_t x, uint64_t sy) {
    // full 128 bits product is:
    // x0*y0 + (x0*y1 << 32) + (x1*y0 << 32) + (x1*y1 << 64)
    // Note x0,y0,x1,y1 are all conceptually uint32, products are 32x32->64.

    // Get low and high words. x0 contains low 32 bits, x1 is high 32 bits.
    uint64x2_t y = vdupq_n_u64(sy);
    uint32x2_t x0 = vmovn_u64(x);
    uint32x2_t y0 = vmovn_u64(y);
    uint32x2_t x1 = vshrn_n_u64(x, 32);
    uint32x2_t y1 = vshrn_n_u64(y, 32);

    // Compute x0*y0.
    uint64x2_t x0y0 = vmull_u32(x0, y0);
    uint64x2_t x0y0_hi = vshrq_n_u64(x0y0, 32);

    // Compute other intermediate products.
    uint64x2_t temp = vmlal_u32(x0y0_hi, x1, y0);  // temp = x0y0_hi + x1*y0;
    // We want to split temp into its low 32 bits and high 32 bits, both
    // in the low half of 64 bit registers.
    // Use shifts to avoid needing a reg for the mask.
    uint64x2_t temp_lo = vshrq_n_u64(vshlq_n_u64(temp, 32), 32);  // temp_lo = temp & 0xFFFFFFFF;
    uint64x2_t temp_hi = vshrq_n_u64(temp, 32);                   // temp_hi = temp >> 32;

    temp_lo = vmlal_u32(temp_lo, x0, y1);  // temp_lo += x0*y1
    temp_lo = vshrq_n_u64(temp_lo, 32);    // temp_lo >>= 32
    temp_hi = vmlal_u32(temp_hi, x1, y1);  // temp_hi += x1*y1
    uint64x2_t result = vaddq_u64(temp_hi, temp_lo);
    return result;
}

// Per lane: high 64 bits of the signed 128-bit product x[i] * sy. Computed
// from the unsigned high product, then corrected by subtracting y where x is
// negative and x where y is negative.
static LIBDIVIDE_INLINE int64x2_t libdivide_mullhi_s64_vec128(int64x2_t x, int64_t sy) {
    int64x2_t p = vreinterpretq_s64_u64(
        libdivide_mullhi_u64_vec128(vreinterpretq_u64_s64(x), (uint64_t)(sy)));
    int64x2_t y = vdupq_n_s64(sy);
    int64x2_t t1 = vandq_s64(libdivide_s64_signbits(x), y);  // y where x < 0, else 0
    int64x2_t t2 = vandq_s64(libdivide_s64_signbits(y), x);  // x where y < 0, else 0
    p = vsubq_s64(p, t1);
    p = vsubq_s64(p, t2);
    return p;
}

////////// UINT16

// 16-bit lanes have no dedicated NEON path here: each element is divided with
// the scalar routine via SIMPLE_VECTOR_DIVISION.
uint16x8_t libdivide_u16_do_vec128(uint16x8_t numers, const struct libdivide_u16_t *denom) {
    SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16)
}

// Branchfree variant; also element-by-element through the scalar routine.
uint16x8_t libdivide_u16_branchfree_do_vec128(
    uint16x8_t numers, const struct libdivide_u16_branchfree_t *denom) {
    SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16_branchfree)
}

////////// UINT32

// Divides each unsigned 32-bit lane of numers by the divisor encoded in denom.
// magic == 0 is a plain logical shift; otherwise multiply-high followed by a
// shift, with the extra add step when LIBDIVIDE_ADD_MARKER is set.
uint32x4_t libdivide_u32_do_vec128(uint32x4_t numers, const struct libdivide_u32_t *denom) {
    uint8_t more = denom->more;
    if (!denom->magic) {
        return libdivide_u32_neon_srl(numers, more);
    } else {
        uint32x4_t q = libdivide_mullhi_u32_vec128(numers, denom->magic);
        if (more & LIBDIVIDE_ADD_MARKER) {
            // uint32_t t = ((numer - q) >> 1) + q;
            // return t >> denom->shift;
            // Note we can use halving-subtract to avoid the shift.
            uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
            uint32x4_t t = vaddq_u32(vhsubq_u32(numers, q), q);
            return libdivide_u32_neon_srl(t, shift);
        } else {
            return libdivide_u32_neon_srl(q, more);
        }
    }
}

// Branchfree unsigned 32-bit vector division: always performs the
// multiply-high / halving-subtract / add / shift sequence.
uint32x4_t libdivide_u32_branchfree_do_vec128(
    uint32x4_t numers, const struct libdivide_u32_branchfree_t *denom) {
    uint32x4_t q = libdivide_mullhi_u32_vec128(numers, denom->magic);
    uint32x4_t t = vaddq_u32(vhsubq_u32(numers, q), q);
    return libdivide_u32_neon_srl(t, denom->more);
}

////////// UINT64

// Divides each unsigned 64-bit lane of numers by the divisor encoded in denom.
// Same structure as the 32-bit routine.
uint64x2_t libdivide_u64_do_vec128(uint64x2_t numers, const struct libdivide_u64_t *denom) {
    uint8_t more = denom->more;
    if (!denom->magic) {
        return libdivide_u64_neon_srl(numers, more);
    } else {
        uint64x2_t q = libdivide_mullhi_u64_vec128(numers, denom->magic);
        if (more & LIBDIVIDE_ADD_MARKER) {
            // uint64_t t = ((numer - q) >> 1) + q;
            // return t >> denom->shift;
            // No 64-bit halving subtracts in NEON :(
            uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
            uint64x2_t t = vaddq_u64(vshrq_n_u64(vsubq_u64(numers, q), 1), q);
            return libdivide_u64_neon_srl(t, shift);
        } else {
            return libdivide_u64_neon_srl(q, more);
        }
    }
}

// Branchfree unsigned 64-bit vector division: always performs the
// multiply-high / subtract / halve / add / shift sequence.
uint64x2_t libdivide_u64_branchfree_do_vec128(
    uint64x2_t numers, const struct libdivide_u64_branchfree_t *denom) {
    uint64x2_t q = libdivide_mullhi_u64_vec128(numers, denom->magic);
    uint64x2_t t = vaddq_u64(vshrq_n_u64(vsubq_u64(numers, q), 1), q);
    return libdivide_u64_neon_srl(t, denom->more);
}

////////// SINT16

// Signed 16-bit lanes: element-by-element through the scalar routine.
int16x8_t libdivide_s16_do_vec128(int16x8_t numers, const struct libdivide_s16_t *denom) {
    SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16)
}

// Branchfree variant; also element-by-element through the scalar routine.
int16x8_t libdivide_s16_branchfree_do_vec128(
    int16x8_t numers, const struct libdivide_s16_branchfree_t *denom) {
    SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16_branchfree)
}

////////// SINT32

// Divides each signed 32-bit lane of numers by the divisor encoded in denom.
// Vector transcription of the scalar branchfull algorithm: a biased arithmetic
// shift when magic == 0, otherwise multiply-high plus correction.
int32x4_t libdivide_s32_do_vec128(int32x4_t numers, const struct libdivide_s32_t *denom) {
    uint8_t more = denom->more;
    if (!denom->magic) {
        uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
        uint32_t mask = ((uint32_t)1 << shift) - 1;
        int32x4_t roundToZeroTweak = vdupq_n_s32((int)mask);
        // q = numer + ((numer >> 31) & roundToZeroTweak);
        int32x4_t q = vaddq_s32(numers, vandq_s32(vshrq_n_s32(numers, 31), roundToZeroTweak));
        q = libdivide_s32_neon_sra(q, shift);
        // sign lanes are -1 when bit 7 of `more` is set, otherwise 0
        int32x4_t sign = vdupq_n_s32((int8_t)more >> 7);
        // q = (q ^ sign) - sign;
        q = vsubq_s32(veorq_s32(q, sign), sign);
        return q;
    } else {
        int32x4_t q = libdivide_mullhi_s32_vec128(numers, denom->magic);
        if (more & LIBDIVIDE_ADD_MARKER) {
            // must be arithmetic shift
            int32x4_t sign = vdupq_n_s32((int8_t)more >> 7);
            // q += ((numer ^ sign) - sign);
            q = vaddq_s32(q, vsubq_s32(veorq_s32(numers, sign), sign));
        }
        // q >>= shift
        q = libdivide_s32_neon_sra(q, more & LIBDIVIDE_32_SHIFT_MASK);
        q = vaddq_s32(
            q, vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(q), 31)));  // q += (q < 0)
        return q;
    }
}

// Branchfree signed 32-bit vector division; vector transcription of the scalar
// branchfree algorithm.
int32x4_t libdivide_s32_branchfree_do_vec128(
    int32x4_t numers, const struct libdivide_s32_branchfree_t *denom) {
    int32_t magic = denom->magic;
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    // must be arithmetic shift
    int32x4_t sign = vdupq_n_s32((int8_t)more >> 7);
    int32x4_t q = libdivide_mullhi_s32_vec128(numers, magic);
    q = vaddq_s32(q, numers);  // q += numers

    // If q is non-negative, we have nothing to do
    // If q is negative, we want to add either (2**shift)-1 if d is
    // a power of 2, or (2**shift) if it is not a power of 2
    uint32_t is_power_of_2 = (magic == 0);
    int32x4_t q_sign = vshrq_n_s32(q, 31);  // q_sign = q >> 31
    int32x4_t mask = vdupq_n_s32(((uint32_t)1 << shift) - is_power_of_2);
    q = vaddq_s32(q, vandq_s32(q_sign, mask));  // q = q + (q_sign & mask)
    q = libdivide_s32_neon_sra(q, shift);       // q >>= shift
    q = vsubq_s32(veorq_s32(q, sign), sign);    // q = (q ^ sign) - sign
    return q;
}

////////// SINT64

// Divides each signed 64-bit lane of numers by the divisor encoded in denom.
// Same structure as the 32-bit routine.
int64x2_t libdivide_s64_do_vec128(int64x2_t numers, const struct libdivide_s64_t *denom) {
    uint8_t more = denom->more;
    int64_t magic = denom->magic;
    if (magic == 0) {  // shift path
        uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
        uint64_t mask = ((uint64_t)1 << shift) - 1;
        int64x2_t roundToZeroTweak = vdupq_n_s64(mask);  // TODO: no need to sign extend
        // q = numer + ((numer >> 63) & roundToZeroTweak);
        int64x2_t q =
            vaddq_s64(numers, vandq_s64(libdivide_s64_signbits(numers), roundToZeroTweak));
        q = libdivide_s64_neon_sra(q, shift);
        // q = (q ^ sign) - sign;
        // Every byte is 0x00 or 0xFF, so each 64-bit lane reads as 0 or -1.
        int64x2_t sign = vreinterpretq_s64_s8(vdupq_n_s8((int8_t)more >> 7));
        q = vsubq_s64(veorq_s64(q, sign), sign);
        return q;
    } else {
        int64x2_t q = libdivide_mullhi_s64_vec128(numers, magic);
        if (more & LIBDIVIDE_ADD_MARKER) {
            // must be arithmetic shift
            int64x2_t sign = vdupq_n_s64((int8_t)more >> 7);  // TODO: no need to widen
            // q += ((numer ^ sign) - sign);
            q = vaddq_s64(q, vsubq_s64(veorq_s64(numers, sign), sign));
        }
        // q >>= denom->mult_path.shift
        q = libdivide_s64_neon_sra(q, more & LIBDIVIDE_64_SHIFT_MASK);
        q = vaddq_s64(
            q, vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(q), 63)));  // q += (q < 0)
        return q;
    }
}

// Branchfree signed 64-bit vector division; vector transcription of the scalar
// branchfree algorithm.
int64x2_t libdivide_s64_branchfree_do_vec128(
    int64x2_t numers, const struct libdivide_s64_branchfree_t *denom) {
    int64_t magic = denom->magic;
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    // must be arithmetic shift
    int64x2_t sign = vdupq_n_s64((int8_t)more >> 7);  // TODO: avoid sign extend

    // libdivide_mullhi_s64(numers, magic);
    int64x2_t q = libdivide_mullhi_s64_vec128(numers, magic);
    q = vaddq_s64(q, numers);  // q += numers

    // If q is non-negative, we have nothing to do.
    // If q is negative, we want to add either (2**shift)-1 if d is
    // a power of 2, or (2**shift) if it is not a power of 2.
    uint32_t is_power_of_2 = (magic == 0);
    int64x2_t q_sign = libdivide_s64_signbits(q);  // q_sign = q >> 63
    int64x2_t mask = vdupq_n_s64(((uint64_t)1 << shift) - is_power_of_2);
    q = vaddq_s64(q, vandq_s64(q_sign, mask));  // q = q + (q_sign & mask)
    q = libdivide_s64_neon_sra(q, shift);       // q >>= shift
    q = vsubq_s64(veorq_s64(q, sign), sign);    // q = (q ^ sign) - sign
    return q;
}

#endif

#if defined(LIBDIVIDE_AVX512)

// Forward declarations of the AVX-512 (512-bit) vector division entry points:
// one branchfull and one branchfree routine per element type.
static LIBDIVIDE_INLINE __m512i libdivide_u16_do_vec512(
    __m512i numers, const struct libdivide_u16_t *denom);
static LIBDIVIDE_INLINE __m512i libdivide_s16_do_vec512(
    __m512i numers, const struct libdivide_s16_t *denom);
static LIBDIVIDE_INLINE __m512i libdivide_u32_do_vec512(
    __m512i numers, const struct libdivide_u32_t *denom);
static LIBDIVIDE_INLINE __m512i libdivide_s32_do_vec512(
    __m512i numers, const struct libdivide_s32_t *denom);
static LIBDIVIDE_INLINE __m512i libdivide_u64_do_vec512(
    __m512i numers, const struct libdivide_u64_t *denom);
static LIBDIVIDE_INLINE __m512i libdivide_s64_do_vec512(
    __m512i numers, const struct libdivide_s64_t *denom);

static LIBDIVIDE_INLINE __m512i libdivide_u16_branchfree_do_vec512(
    __m512i numers, const struct libdivide_u16_branchfree_t *denom);
static LIBDIVIDE_INLINE __m512i libdivide_s16_branchfree_do_vec512(
    __m512i numers, const struct libdivide_s16_branchfree_t *denom);
static LIBDIVIDE_INLINE __m512i libdivide_u32_branchfree_do_vec512(
    __m512i numers, const struct libdivide_u32_branchfree_t *denom);
static LIBDIVIDE_INLINE __m512i libdivide_s32_branchfree_do_vec512(
    __m512i numers, const struct libdivide_s32_branchfree_t *denom);
static LIBDIVIDE_INLINE __m512i libdivide_u64_branchfree_do_vec512(
    __m512i numers, const struct libdivide_u64_branchfree_t *denom);
static LIBDIVIDE_INLINE __m512i libdivide_s64_branchfree_do_vec512(
    __m512i numers, const struct libdivide_s64_branchfree_t *denom);

//////// Internal Utility Functions

// Replaces each 64-bit lane with its sign mask (arithmetic shift right by 63).
static LIBDIVIDE_INLINE __m512i libdivide_s64_signbits_vec512(__m512i v) {
    ;
    return _mm512_srai_epi64(v, 63);
}

// Arithmetic right shift of each 64-bit lane by amt.
static LIBDIVIDE_INLINE __m512i libdivide_s64_shift_right_vec512(__m512i v, int amt) {
    return _mm512_srai_epi64(v, amt);
}

// Here, b is assumed to contain one 32-bit value repeated.
// Returns, per 32-bit lane, the high half of the unsigned product a[i] * b[i].
// _mm512_mul_epu32 only multiplies the even 32-bit lanes, so the odd lanes are
// shifted down, multiplied separately, and merged back with a mask.
static LIBDIVIDE_INLINE __m512i libdivide_mullhi_u32_vec512(__m512i a, __m512i b) {
    __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epu32(a, b), 32);
    __m512i a1X3X = _mm512_srli_epi64(a, 32);
    // All ones in the odd 32-bit lanes, zero in the even ones.
    __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0);
    __m512i hi_product_Z1Z3 = _mm512_and_si512(_mm512_mul_epu32(a1X3X, b), mask);
    return _mm512_or_si512(hi_product_0Z2Z, hi_product_Z1Z3);
}

// b is one 32-bit value repeated.
// Signed counterpart of libdivide_mullhi_u32_vec512 (uses _mm512_mul_epi32).
static LIBDIVIDE_INLINE __m512i libdivide_mullhi_s32_vec512(__m512i a, __m512i b) {
    __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epi32(a, b), 32);
    __m512i a1X3X = _mm512_srli_epi64(a, 32);
    __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0);
    __m512i hi_product_Z1Z3 = _mm512_and_si512(_mm512_mul_epi32(a1X3X, b), mask);
    return _mm512_or_si512(hi_product_0Z2Z, hi_product_Z1Z3);
}

// Here, y is assumed to contain one 64-bit value repeated.
// Returns, per 64-bit lane, the high 64 bits of the unsigned 128-bit product,
// assembled from four 32x32->64 partial products.
static LIBDIVIDE_INLINE __m512i libdivide_mullhi_u64_vec512(__m512i x, __m512i y) {
    // see m128i variant for comments.
    __m512i x0y0 = _mm512_mul_epu32(x, y);
    __m512i x0y0_hi = _mm512_srli_epi64(x0y0, 32);

    // Copy the high 32 bits of every 64-bit lane into its low half.
    __m512i x1 = _mm512_shuffle_epi32(x, (_MM_PERM_ENUM)_MM_SHUFFLE(3, 3, 1, 1));
    __m512i y1 = _mm512_shuffle_epi32(y, (_MM_PERM_ENUM)_MM_SHUFFLE(3, 3, 1, 1));

    __m512i x0y1 = _mm512_mul_epu32(x, y1);
    __m512i x1y0 = _mm512_mul_epu32(x1, y);
    __m512i x1y1 = _mm512_mul_epu32(x1, y1);

    __m512i mask = _mm512_set1_epi64(0xFFFFFFFF);
    __m512i temp = _mm512_add_epi64(x1y0, x0y0_hi);
    __m512i temp_lo = _mm512_and_si512(temp, mask);
    __m512i temp_hi = _mm512_srli_epi64(temp, 32);

    temp_lo = _mm512_srli_epi64(_mm512_add_epi64(temp_lo, x0y1), 32);
    temp_hi = _mm512_add_epi64(x1y1, temp_hi);
    return _mm512_add_epi64(temp_lo, temp_hi);
}

// y is one 64-bit value repeated.
// Signed high product: unsigned high product minus y where x is negative and
// minus x where y is negative.
static LIBDIVIDE_INLINE __m512i libdivide_mullhi_s64_vec512(__m512i x, __m512i y) {
    __m512i p = libdivide_mullhi_u64_vec512(x, y);
    __m512i t1 = _mm512_and_si512(libdivide_s64_signbits_vec512(x), y);
    __m512i t2 = _mm512_and_si512(libdivide_s64_signbits_vec512(y), x);
    p = _mm512_sub_epi64(p, t1);
    p = _mm512_sub_epi64(p, t2);
    return p;
}

////////// UINT16

// 16-bit lanes: element-by-element through the scalar routine.
__m512i libdivide_u16_do_vec512(__m512i numers, const struct libdivide_u16_t *denom) {
    SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16)
}

// Branchfree variant; also element-by-element through the scalar routine.
__m512i libdivide_u16_branchfree_do_vec512(
    __m512i numers, const struct libdivide_u16_branchfree_t *denom) {
    SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16_branchfree)
}

////////// UINT32

// Divides each unsigned 32-bit lane of numers by the divisor encoded in denom.
__m512i libdivide_u32_do_vec512(__m512i numers, const struct libdivide_u32_t *denom) {
    uint8_t more = denom->more;
    if (!denom->magic) {
        return _mm512_srli_epi32(numers, more);
    } else {
        __m512i q = libdivide_mullhi_u32_vec512(numers, _mm512_set1_epi32(denom->magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            // uint32_t t = ((numer - q) >> 1) + q;
            // return t >> denom->shift;
            uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
            __m512i t = _mm512_add_epi32(_mm512_srli_epi32(_mm512_sub_epi32(numers, q), 1), q);
            return _mm512_srli_epi32(t, shift);
        } else {
            return _mm512_srli_epi32(q, more);
        }
    }
}

// Branchfree unsigned 32-bit vector division.
__m512i libdivide_u32_branchfree_do_vec512(
    __m512i numers, const struct libdivide_u32_branchfree_t *denom) {
    __m512i q = libdivide_mullhi_u32_vec512(numers, _mm512_set1_epi32(denom->magic));
    __m512i t = _mm512_add_epi32(_mm512_srli_epi32(_mm512_sub_epi32(numers, q), 1), q);
    return _mm512_srli_epi32(t, denom->more);
}

////////// UINT64

// Divides each unsigned 64-bit lane of numers by the divisor encoded in denom.
__m512i libdivide_u64_do_vec512(__m512i numers, const struct libdivide_u64_t *denom) {
    uint8_t more = denom->more;
    if (!denom->magic) {
        return _mm512_srli_epi64(numers, more);
    } else {
        __m512i q = libdivide_mullhi_u64_vec512(numers, _mm512_set1_epi64(denom->magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            // uint64_t t = ((numer - q) >> 1) + q;
            // return t >> denom->shift;
            uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
            __m512i t = _mm512_add_epi64(_mm512_srli_epi64(_mm512_sub_epi64(numers, q), 1), q);
            return _mm512_srli_epi64(t, shift);
        } else {
            return _mm512_srli_epi64(q, more);
        }
    }
}

// Branchfree unsigned 64-bit vector division.
__m512i libdivide_u64_branchfree_do_vec512(
    __m512i numers, const struct libdivide_u64_branchfree_t *denom) {
    __m512i q = libdivide_mullhi_u64_vec512(numers, _mm512_set1_epi64(denom->magic));
    __m512i t = _mm512_add_epi64(_mm512_srli_epi64(_mm512_sub_epi64(numers, q), 1), q);
    return _mm512_srli_epi64(t, denom->more);
}

////////// SINT16

// Signed 16-bit lanes: element-by-element through the scalar routine.
__m512i libdivide_s16_do_vec512(__m512i numers, const struct libdivide_s16_t *denom) {
    SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16)
}

// Branchfree variant; also element-by-element through the scalar routine.
__m512i libdivide_s16_branchfree_do_vec512(
    __m512i numers, const struct libdivide_s16_branchfree_t *denom) {
    SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16_branchfree)
}

////////// SINT32

// Divides each signed 32-bit lane of numers by the divisor encoded in denom.
// Vector transcription of the scalar branchfull algorithm.
__m512i libdivide_s32_do_vec512(__m512i numers, const struct libdivide_s32_t *denom) {
    uint8_t more = denom->more;
    if (!denom->magic) {
        uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
        uint32_t mask = ((uint32_t)1 << shift) - 1;
        __m512i roundToZeroTweak = _mm512_set1_epi32(mask);
        // q = numer + ((numer >> 31) & roundToZeroTweak);
        __m512i q = _mm512_add_epi32(
            numers, _mm512_and_si512(_mm512_srai_epi32(numers, 31), roundToZeroTweak));
        q = _mm512_srai_epi32(q, shift);
        __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
        // q = (q ^ sign) - sign;
        q = _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign);
        return q;
    } else {
        __m512i q = libdivide_mullhi_s32_vec512(numers, _mm512_set1_epi32(denom->magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            // must be arithmetic shift
            __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
            // q += ((numer ^ sign) - sign);
            q = _mm512_add_epi32(q, _mm512_sub_epi32(_mm512_xor_si512(numers, sign), sign));
        }
        // q >>= shift
        q = _mm512_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK);
        q = _mm512_add_epi32(q, _mm512_srli_epi32(q, 31));  // q += (q < 0)
        return q;
    }
}

// Branchfree signed 32-bit vector division; vector transcription of the scalar
// branchfree algorithm.
__m512i libdivide_s32_branchfree_do_vec512(
    __m512i numers, const struct libdivide_s32_branchfree_t *denom) {
    int32_t magic = denom->magic;
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    // must be arithmetic shift
    __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
    __m512i q = libdivide_mullhi_s32_vec512(numers, _mm512_set1_epi32(magic));
    q = _mm512_add_epi32(q, numers);  // q += numers

    // If q is non-negative, we have nothing to do
    // If q is negative, we want to add either (2**shift)-1 if d is
    // a power of 2, or (2**shift) if it is not a power of 2
    uint32_t is_power_of_2 = (magic == 0);
    __m512i q_sign = _mm512_srai_epi32(q, 31);  // q_sign = q >> 31
    __m512i mask = _mm512_set1_epi32(((uint32_t)1 << shift) - is_power_of_2);
    q = _mm512_add_epi32(q, _mm512_and_si512(q_sign, mask));  // q = q + (q_sign & mask)
    q = _mm512_srai_epi32(q, shift);                          // q >>= shift
    q = _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign);    // q = (q ^ sign) - sign
    return q;
}

////////// SINT64

// Divides each signed 64-bit lane of numers by the divisor encoded in denom.
// Vector transcription of the scalar branchfull algorithm.
__m512i libdivide_s64_do_vec512(__m512i numers, const struct libdivide_s64_t *denom) {
    uint8_t more = denom->more;
    int64_t magic = denom->magic;
    if (magic == 0) {  // shift path
        uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
        uint64_t mask = ((uint64_t)1 << shift) - 1;
        __m512i roundToZeroTweak = _mm512_set1_epi64(mask);
        // q = numer + ((numer >> 63) & roundToZeroTweak);
        __m512i q = _mm512_add_epi64(
            numers, _mm512_and_si512(libdivide_s64_signbits_vec512(numers), roundToZeroTweak));
        q = libdivide_s64_shift_right_vec512(q, shift);
        // Every 32-bit lane is 0 or -1, so each 64-bit lane reads as 0 or -1.
        __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
        // q = (q ^ sign) - sign;
        q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign);
        return q;
    } else {
        __m512i q = libdivide_mullhi_s64_vec512(numers, _mm512_set1_epi64(magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            // must be arithmetic shift
            __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
            // q += ((numer ^ sign) - sign);
            q = _mm512_add_epi64(q, _mm512_sub_epi64(_mm512_xor_si512(numers, sign), sign));
        }
        // q >>= denom->mult_path.shift
        q = libdivide_s64_shift_right_vec512(q, more & LIBDIVIDE_64_SHIFT_MASK);
        q = _mm512_add_epi64(q, _mm512_srli_epi64(q, 63));  // q += (q < 0)
        return q;
    }
}

// Branchfree signed 64-bit vector division; vector transcription of the scalar
// branchfree algorithm.
__m512i libdivide_s64_branchfree_do_vec512(
    __m512i numers, const struct libdivide_s64_branchfree_t *denom) {
    int64_t magic = denom->magic;
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    // must be arithmetic shift
    __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);

    // libdivide_mullhi_s64(numers, magic);
    __m512i q = libdivide_mullhi_s64_vec512(numers, _mm512_set1_epi64(magic));
    q = _mm512_add_epi64(q, numers);  // q += numers

    // If q is non-negative, we have nothing to do.
    // If q is negative, we want to add either (2**shift)-1 if d is
    // a power of 2, or (2**shift) if it is not a power of 2.
    uint32_t is_power_of_2 = (magic == 0);
    __m512i q_sign = libdivide_s64_signbits_vec512(q);  // q_sign = q >> 63
    __m512i mask = _mm512_set1_epi64(((uint64_t)1 << shift) - is_power_of_2);
    q = _mm512_add_epi64(q, _mm512_and_si512(q_sign, mask));  // q = q + (q_sign & mask)
    q = libdivide_s64_shift_right_vec512(q, shift);           // q >>= shift
    q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign);    // q = (q ^ sign) - sign
    return q;
}

#endif

#if defined(LIBDIVIDE_AVX2)

// Forward declarations of the AVX2 (256-bit) vector division entry points:
// one branchfull and one branchfree routine per element type.
static LIBDIVIDE_INLINE __m256i libdivide_u16_do_vec256(
    __m256i numers, const struct libdivide_u16_t *denom);
static LIBDIVIDE_INLINE __m256i libdivide_s16_do_vec256(
    __m256i numers, const struct libdivide_s16_t *denom);
static LIBDIVIDE_INLINE __m256i libdivide_u32_do_vec256(
    __m256i numers, const struct libdivide_u32_t *denom);
static LIBDIVIDE_INLINE __m256i libdivide_s32_do_vec256(
    __m256i numers, const struct libdivide_s32_t *denom);
static LIBDIVIDE_INLINE __m256i libdivide_u64_do_vec256(
    __m256i numers, const struct libdivide_u64_t *denom);
static LIBDIVIDE_INLINE __m256i libdivide_s64_do_vec256(
    __m256i numers, const struct libdivide_s64_t *denom);

static LIBDIVIDE_INLINE __m256i libdivide_u16_branchfree_do_vec256(
    __m256i numers, const struct libdivide_u16_branchfree_t *denom);
static LIBDIVIDE_INLINE __m256i libdivide_s16_branchfree_do_vec256(
    __m256i numers, const struct libdivide_s16_branchfree_t *denom);
static LIBDIVIDE_INLINE __m256i libdivide_u32_branchfree_do_vec256(
    __m256i numers, const struct libdivide_u32_branchfree_t *denom);
static LIBDIVIDE_INLINE __m256i libdivide_s32_branchfree_do_vec256(
    __m256i numers, const struct libdivide_s32_branchfree_t *denom);
static LIBDIVIDE_INLINE __m256i libdivide_u64_branchfree_do_vec256(
    __m256i numers, const struct libdivide_u64_branchfree_t *denom);
static LIBDIVIDE_INLINE __m256i libdivide_s64_branchfree_do_vec256(
    __m256i numers, const struct libdivide_s64_branchfree_t *denom);

//////// Internal Utility Functions

// Implementation of _mm256_srai_epi64(v, 
63) (from AVX512). static LIBDIVIDE_INLINE __m256i libdivide_s64_signbits_vec256(__m256i v) { __m256i hiBitsDuped = _mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1)); __m256i signBits = _mm256_srai_epi32(hiBitsDuped, 31); return signBits; } // Implementation of _mm256_srai_epi64 (from AVX512). static LIBDIVIDE_INLINE __m256i libdivide_s64_shift_right_vec256(__m256i v, int amt) { const int b = 64 - amt; __m256i m = _mm256_set1_epi64x((uint64_t)1 << (b - 1)); __m256i x = _mm256_srli_epi64(v, amt); __m256i result = _mm256_sub_epi64(_mm256_xor_si256(x, m), m); return result; } // Here, b is assumed to contain one 32-bit value repeated. static LIBDIVIDE_INLINE __m256i libdivide_mullhi_u32_vec256(__m256i a, __m256i b) { __m256i hi_product_0Z2Z = _mm256_srli_epi64(_mm256_mul_epu32(a, b), 32); __m256i a1X3X = _mm256_srli_epi64(a, 32); __m256i mask = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0); __m256i hi_product_Z1Z3 = _mm256_and_si256(_mm256_mul_epu32(a1X3X, b), mask); return _mm256_or_si256(hi_product_0Z2Z, hi_product_Z1Z3); } // b is one 32-bit value repeated. static LIBDIVIDE_INLINE __m256i libdivide_mullhi_s32_vec256(__m256i a, __m256i b) { __m256i hi_product_0Z2Z = _mm256_srli_epi64(_mm256_mul_epi32(a, b), 32); __m256i a1X3X = _mm256_srli_epi64(a, 32); __m256i mask = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0); __m256i hi_product_Z1Z3 = _mm256_and_si256(_mm256_mul_epi32(a1X3X, b), mask); return _mm256_or_si256(hi_product_0Z2Z, hi_product_Z1Z3); } // Here, y is assumed to contain one 64-bit value repeated. static LIBDIVIDE_INLINE __m256i libdivide_mullhi_u64_vec256(__m256i x, __m256i y) { // see m128i variant for comments. 
__m256i x0y0 = _mm256_mul_epu32(x, y); __m256i x0y0_hi = _mm256_srli_epi64(x0y0, 32); __m256i x1 = _mm256_shuffle_epi32(x, _MM_SHUFFLE(3, 3, 1, 1)); __m256i y1 = _mm256_shuffle_epi32(y, _MM_SHUFFLE(3, 3, 1, 1)); __m256i x0y1 = _mm256_mul_epu32(x, y1); __m256i x1y0 = _mm256_mul_epu32(x1, y); __m256i x1y1 = _mm256_mul_epu32(x1, y1); __m256i mask = _mm256_set1_epi64x(0xFFFFFFFF); __m256i temp = _mm256_add_epi64(x1y0, x0y0_hi); __m256i temp_lo = _mm256_and_si256(temp, mask); __m256i temp_hi = _mm256_srli_epi64(temp, 32); temp_lo = _mm256_srli_epi64(_mm256_add_epi64(temp_lo, x0y1), 32); temp_hi = _mm256_add_epi64(x1y1, temp_hi); return _mm256_add_epi64(temp_lo, temp_hi); } // y is one 64-bit value repeated. static LIBDIVIDE_INLINE __m256i libdivide_mullhi_s64_vec256(__m256i x, __m256i y) { __m256i p = libdivide_mullhi_u64_vec256(x, y); __m256i t1 = _mm256_and_si256(libdivide_s64_signbits_vec256(x), y); __m256i t2 = _mm256_and_si256(libdivide_s64_signbits_vec256(y), x); p = _mm256_sub_epi64(p, t1); p = _mm256_sub_epi64(p, t2); return p; } ////////// UINT16 __m256i libdivide_u16_do_vec256(__m256i numers, const struct libdivide_u16_t *denom) { uint8_t more = denom->more; if (!denom->magic) { return _mm256_srli_epi16(numers, more); } else { __m256i q = _mm256_mulhi_epu16(numers, _mm256_set1_epi16(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { __m256i t = _mm256_adds_epu16(_mm256_srli_epi16(_mm256_subs_epu16(numers, q), 1), q); return _mm256_srli_epi16(t, (more & LIBDIVIDE_16_SHIFT_MASK)); } else { return _mm256_srli_epi16(q, more); } } } __m256i libdivide_u16_branchfree_do_vec256( __m256i numers, const struct libdivide_u16_branchfree_t *denom) { __m256i q = _mm256_mulhi_epu16(numers, _mm256_set1_epi16(denom->magic)); __m256i t = _mm256_adds_epu16(_mm256_srli_epi16(_mm256_subs_epu16(numers, q), 1), q); return _mm256_srli_epi16(t, denom->more); } ////////// UINT32 __m256i libdivide_u32_do_vec256(__m256i numers, const struct libdivide_u32_t *denom) { uint8_t more = 
denom->more; if (!denom->magic) { return _mm256_srli_epi32(numers, more); } else { __m256i q = libdivide_mullhi_u32_vec256(numers, _mm256_set1_epi32(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { // uint32_t t = ((numer - q) >> 1) + q; // return t >> denom->shift; uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; __m256i t = _mm256_add_epi32(_mm256_srli_epi32(_mm256_sub_epi32(numers, q), 1), q); return _mm256_srli_epi32(t, shift); } else { return _mm256_srli_epi32(q, more); } } } __m256i libdivide_u32_branchfree_do_vec256( __m256i numers, const struct libdivide_u32_branchfree_t *denom) { __m256i q = libdivide_mullhi_u32_vec256(numers, _mm256_set1_epi32(denom->magic)); __m256i t = _mm256_add_epi32(_mm256_srli_epi32(_mm256_sub_epi32(numers, q), 1), q); return _mm256_srli_epi32(t, denom->more); } ////////// UINT64 __m256i libdivide_u64_do_vec256(__m256i numers, const struct libdivide_u64_t *denom) { uint8_t more = denom->more; if (!denom->magic) { return _mm256_srli_epi64(numers, more); } else { __m256i q = libdivide_mullhi_u64_vec256(numers, _mm256_set1_epi64x(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { // uint32_t t = ((numer - q) >> 1) + q; // return t >> denom->shift; uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; __m256i t = _mm256_add_epi64(_mm256_srli_epi64(_mm256_sub_epi64(numers, q), 1), q); return _mm256_srli_epi64(t, shift); } else { return _mm256_srli_epi64(q, more); } } } __m256i libdivide_u64_branchfree_do_vec256( __m256i numers, const struct libdivide_u64_branchfree_t *denom) { __m256i q = libdivide_mullhi_u64_vec256(numers, _mm256_set1_epi64x(denom->magic)); __m256i t = _mm256_add_epi64(_mm256_srli_epi64(_mm256_sub_epi64(numers, q), 1), q); return _mm256_srli_epi64(t, denom->more); } ////////// SINT16 __m256i libdivide_s16_do_vec256(__m256i numers, const struct libdivide_s16_t *denom) { uint8_t more = denom->more; if (!denom->magic) { uint16_t shift = more & LIBDIVIDE_16_SHIFT_MASK; uint16_t mask = ((uint16_t)1 << shift) - 1; __m256i 
roundToZeroTweak = _mm256_set1_epi16(mask); // q = numer + ((numer >> 15) & roundToZeroTweak); __m256i q = _mm256_add_epi16( numers, _mm256_and_si256(_mm256_srai_epi16(numers, 15), roundToZeroTweak)); q = _mm256_srai_epi16(q, shift); __m256i sign = _mm256_set1_epi16((int8_t)more >> 7); // q = (q ^ sign) - sign; q = _mm256_sub_epi16(_mm256_xor_si256(q, sign), sign); return q; } else { __m256i q = _mm256_mulhi_epi16(numers, _mm256_set1_epi16(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { // must be arithmetic shift __m256i sign = _mm256_set1_epi16((int8_t)more >> 7); // q += ((numer ^ sign) - sign); q = _mm256_add_epi16(q, _mm256_sub_epi16(_mm256_xor_si256(numers, sign), sign)); } // q >>= shift q = _mm256_srai_epi16(q, more & LIBDIVIDE_16_SHIFT_MASK); q = _mm256_add_epi16(q, _mm256_srli_epi16(q, 15)); // q += (q < 0) return q; } } __m256i libdivide_s16_branchfree_do_vec256( __m256i numers, const struct libdivide_s16_branchfree_t *denom) { int16_t magic = denom->magic; uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; // must be arithmetic shift __m256i sign = _mm256_set1_epi16((int8_t)more >> 7); __m256i q = _mm256_mulhi_epi16(numers, _mm256_set1_epi16(magic)); q = _mm256_add_epi16(q, numers); // q += numers // If q is non-negative, we have nothing to do // If q is negative, we want to add either (2**shift)-1 if d is // a power of 2, or (2**shift) if it is not a power of 2 uint16_t is_power_of_2 = (magic == 0); __m256i q_sign = _mm256_srai_epi16(q, 15); // q_sign = q >> 15 __m256i mask = _mm256_set1_epi16(((uint16_t)1 << shift) - is_power_of_2); q = _mm256_add_epi16(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask) q = _mm256_srai_epi16(q, shift); // q >>= shift q = _mm256_sub_epi16(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign return q; } ////////// SINT32 __m256i libdivide_s32_do_vec256(__m256i numers, const struct libdivide_s32_t *denom) { uint8_t more = denom->more; if (!denom->magic) { uint32_t shift = 
more & LIBDIVIDE_32_SHIFT_MASK; uint32_t mask = ((uint32_t)1 << shift) - 1; __m256i roundToZeroTweak = _mm256_set1_epi32(mask); // q = numer + ((numer >> 31) & roundToZeroTweak); __m256i q = _mm256_add_epi32( numers, _mm256_and_si256(_mm256_srai_epi32(numers, 31), roundToZeroTweak)); q = _mm256_srai_epi32(q, shift); __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); // q = (q ^ sign) - sign; q = _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign); return q; } else { __m256i q = libdivide_mullhi_s32_vec256(numers, _mm256_set1_epi32(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { // must be arithmetic shift __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); // q += ((numer ^ sign) - sign); q = _mm256_add_epi32(q, _mm256_sub_epi32(_mm256_xor_si256(numers, sign), sign)); } // q >>= shift q = _mm256_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK); q = _mm256_add_epi32(q, _mm256_srli_epi32(q, 31)); // q += (q < 0) return q; } } __m256i libdivide_s32_branchfree_do_vec256( __m256i numers, const struct libdivide_s32_branchfree_t *denom) { int32_t magic = denom->magic; uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; // must be arithmetic shift __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); __m256i q = libdivide_mullhi_s32_vec256(numers, _mm256_set1_epi32(magic)); q = _mm256_add_epi32(q, numers); // q += numers // If q is non-negative, we have nothing to do // If q is negative, we want to add either (2**shift)-1 if d is // a power of 2, or (2**shift) if it is not a power of 2 uint32_t is_power_of_2 = (magic == 0); __m256i q_sign = _mm256_srai_epi32(q, 31); // q_sign = q >> 31 __m256i mask = _mm256_set1_epi32(((uint32_t)1 << shift) - is_power_of_2); q = _mm256_add_epi32(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask) q = _mm256_srai_epi32(q, shift); // q >>= shift q = _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign return q; } ////////// SINT64 __m256i libdivide_s64_do_vec256(__m256i numers, const 
struct libdivide_s64_t *denom) { uint8_t more = denom->more; int64_t magic = denom->magic; if (magic == 0) { // shift path uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; uint64_t mask = ((uint64_t)1 << shift) - 1; __m256i roundToZeroTweak = _mm256_set1_epi64x(mask); // q = numer + ((numer >> 63) & roundToZeroTweak); __m256i q = _mm256_add_epi64( numers, _mm256_and_si256(libdivide_s64_signbits_vec256(numers), roundToZeroTweak)); q = libdivide_s64_shift_right_vec256(q, shift); __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); // q = (q ^ sign) - sign; q = _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign); return q; } else { __m256i q = libdivide_mullhi_s64_vec256(numers, _mm256_set1_epi64x(magic)); if (more & LIBDIVIDE_ADD_MARKER) { // must be arithmetic shift __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); // q += ((numer ^ sign) - sign); q = _mm256_add_epi64(q, _mm256_sub_epi64(_mm256_xor_si256(numers, sign), sign)); } // q >>= denom->mult_path.shift q = libdivide_s64_shift_right_vec256(q, more & LIBDIVIDE_64_SHIFT_MASK); q = _mm256_add_epi64(q, _mm256_srli_epi64(q, 63)); // q += (q < 0) return q; } } __m256i libdivide_s64_branchfree_do_vec256( __m256i numers, const struct libdivide_s64_branchfree_t *denom) { int64_t magic = denom->magic; uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; // must be arithmetic shift __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); // libdivide_mullhi_s64(numers, magic); __m256i q = libdivide_mullhi_s64_vec256(numers, _mm256_set1_epi64x(magic)); q = _mm256_add_epi64(q, numers); // q += numers // If q is non-negative, we have nothing to do. // If q is negative, we want to add either (2**shift)-1 if d is // a power of 2, or (2**shift) if it is not a power of 2. 
uint32_t is_power_of_2 = (magic == 0); __m256i q_sign = libdivide_s64_signbits_vec256(q); // q_sign = q >> 63 __m256i mask = _mm256_set1_epi64x(((uint64_t)1 << shift) - is_power_of_2); q = _mm256_add_epi64(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask) q = libdivide_s64_shift_right_vec256(q, shift); // q >>= shift q = _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign return q; } #endif #if defined(LIBDIVIDE_SSE2) static LIBDIVIDE_INLINE __m128i libdivide_u16_do_vec128( __m128i numers, const struct libdivide_u16_t *denom); static LIBDIVIDE_INLINE __m128i libdivide_s16_do_vec128( __m128i numers, const struct libdivide_s16_t *denom); static LIBDIVIDE_INLINE __m128i libdivide_u32_do_vec128( __m128i numers, const struct libdivide_u32_t *denom); static LIBDIVIDE_INLINE __m128i libdivide_s32_do_vec128( __m128i numers, const struct libdivide_s32_t *denom); static LIBDIVIDE_INLINE __m128i libdivide_u64_do_vec128( __m128i numers, const struct libdivide_u64_t *denom); static LIBDIVIDE_INLINE __m128i libdivide_s64_do_vec128( __m128i numers, const struct libdivide_s64_t *denom); static LIBDIVIDE_INLINE __m128i libdivide_u16_branchfree_do_vec128( __m128i numers, const struct libdivide_u16_branchfree_t *denom); static LIBDIVIDE_INLINE __m128i libdivide_s16_branchfree_do_vec128( __m128i numers, const struct libdivide_s16_branchfree_t *denom); static LIBDIVIDE_INLINE __m128i libdivide_u32_branchfree_do_vec128( __m128i numers, const struct libdivide_u32_branchfree_t *denom); static LIBDIVIDE_INLINE __m128i libdivide_s32_branchfree_do_vec128( __m128i numers, const struct libdivide_s32_branchfree_t *denom); static LIBDIVIDE_INLINE __m128i libdivide_u64_branchfree_do_vec128( __m128i numers, const struct libdivide_u64_branchfree_t *denom); static LIBDIVIDE_INLINE __m128i libdivide_s64_branchfree_do_vec128( __m128i numers, const struct libdivide_s64_branchfree_t *denom); //////// Internal Utility Functions // Implementation of _mm_srai_epi64(v, 
63) (from AVX512). static LIBDIVIDE_INLINE __m128i libdivide_s64_signbits_vec128(__m128i v) { __m128i hiBitsDuped = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1)); __m128i signBits = _mm_srai_epi32(hiBitsDuped, 31); return signBits; } // Implementation of _mm_srai_epi64 (from AVX512). static LIBDIVIDE_INLINE __m128i libdivide_s64_shift_right_vec128(__m128i v, int amt) { const int b = 64 - amt; __m128i m = _mm_set1_epi64x((uint64_t)1 << (b - 1)); __m128i x = _mm_srli_epi64(v, amt); __m128i result = _mm_sub_epi64(_mm_xor_si128(x, m), m); return result; } // Here, b is assumed to contain one 32-bit value repeated. static LIBDIVIDE_INLINE __m128i libdivide_mullhi_u32_vec128(__m128i a, __m128i b) { __m128i hi_product_0Z2Z = _mm_srli_epi64(_mm_mul_epu32(a, b), 32); __m128i a1X3X = _mm_srli_epi64(a, 32); __m128i mask = _mm_set_epi32(-1, 0, -1, 0); __m128i hi_product_Z1Z3 = _mm_and_si128(_mm_mul_epu32(a1X3X, b), mask); return _mm_or_si128(hi_product_0Z2Z, hi_product_Z1Z3); } // SSE2 does not have a signed multiplication instruction, but we can convert // unsigned to signed pretty efficiently. Again, b is just a 32 bit value // repeated four times. static LIBDIVIDE_INLINE __m128i libdivide_mullhi_s32_vec128(__m128i a, __m128i b) { __m128i p = libdivide_mullhi_u32_vec128(a, b); // t1 = (a >> 31) & y, arithmetic shift __m128i t1 = _mm_and_si128(_mm_srai_epi32(a, 31), b); __m128i t2 = _mm_and_si128(_mm_srai_epi32(b, 31), a); p = _mm_sub_epi32(p, t1); p = _mm_sub_epi32(p, t2); return p; } // Here, y is assumed to contain one 64-bit value repeated. static LIBDIVIDE_INLINE __m128i libdivide_mullhi_u64_vec128(__m128i x, __m128i y) { // full 128 bits product is: // x0*y0 + (x0*y1 << 32) + (x1*y0 << 32) + (x1*y1 << 64) // Note x0,y0,x1,y1 are all conceptually uint32, products are 32x32->64. // Compute x0*y0. // Note x1, y1 are ignored by mul_epu32. __m128i x0y0 = _mm_mul_epu32(x, y); __m128i x0y0_hi = _mm_srli_epi64(x0y0, 32); // Get x1, y1 in the low bits. 
// We could shuffle or right shift. Shuffles are preferred as they preserve // the source register for the next computation. __m128i x1 = _mm_shuffle_epi32(x, _MM_SHUFFLE(3, 3, 1, 1)); __m128i y1 = _mm_shuffle_epi32(y, _MM_SHUFFLE(3, 3, 1, 1)); // No need to mask off top 32 bits for mul_epu32. __m128i x0y1 = _mm_mul_epu32(x, y1); __m128i x1y0 = _mm_mul_epu32(x1, y); __m128i x1y1 = _mm_mul_epu32(x1, y1); // Mask here selects low bits only. __m128i mask = _mm_set1_epi64x(0xFFFFFFFF); __m128i temp = _mm_add_epi64(x1y0, x0y0_hi); __m128i temp_lo = _mm_and_si128(temp, mask); __m128i temp_hi = _mm_srli_epi64(temp, 32); temp_lo = _mm_srli_epi64(_mm_add_epi64(temp_lo, x0y1), 32); temp_hi = _mm_add_epi64(x1y1, temp_hi); return _mm_add_epi64(temp_lo, temp_hi); } // y is one 64-bit value repeated. static LIBDIVIDE_INLINE __m128i libdivide_mullhi_s64_vec128(__m128i x, __m128i y) { __m128i p = libdivide_mullhi_u64_vec128(x, y); __m128i t1 = _mm_and_si128(libdivide_s64_signbits_vec128(x), y); __m128i t2 = _mm_and_si128(libdivide_s64_signbits_vec128(y), x); p = _mm_sub_epi64(p, t1); p = _mm_sub_epi64(p, t2); return p; } ////////// UINT26 __m128i libdivide_u16_do_vec128(__m128i numers, const struct libdivide_u16_t *denom) { uint8_t more = denom->more; if (!denom->magic) { return _mm_srli_epi16(numers, more); } else { __m128i q = _mm_mulhi_epu16(numers, _mm_set1_epi16(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { __m128i t = _mm_adds_epu16(_mm_srli_epi16(_mm_subs_epu16(numers, q), 1), q); return _mm_srli_epi16(t, (more & LIBDIVIDE_16_SHIFT_MASK)); } else { return _mm_srli_epi16(q, more); } } } __m128i libdivide_u16_branchfree_do_vec128( __m128i numers, const struct libdivide_u16_branchfree_t *denom) { __m128i q = _mm_mulhi_epu16(numers, _mm_set1_epi16(denom->magic)); __m128i t = _mm_adds_epu16(_mm_srli_epi16(_mm_subs_epu16(numers, q), 1), q); return _mm_srli_epi16(t, denom->more); } ////////// UINT32 __m128i libdivide_u32_do_vec128(__m128i numers, const struct libdivide_u32_t 
*denom) { uint8_t more = denom->more; if (!denom->magic) { return _mm_srli_epi32(numers, more); } else { __m128i q = libdivide_mullhi_u32_vec128(numers, _mm_set1_epi32(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { // uint32_t t = ((numer - q) >> 1) + q; // return t >> denom->shift; uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q); return _mm_srli_epi32(t, shift); } else { return _mm_srli_epi32(q, more); } } } __m128i libdivide_u32_branchfree_do_vec128( __m128i numers, const struct libdivide_u32_branchfree_t *denom) { __m128i q = libdivide_mullhi_u32_vec128(numers, _mm_set1_epi32(denom->magic)); __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q); return _mm_srli_epi32(t, denom->more); } ////////// UINT64 __m128i libdivide_u64_do_vec128(__m128i numers, const struct libdivide_u64_t *denom) { uint8_t more = denom->more; if (!denom->magic) { return _mm_srli_epi64(numers, more); } else { __m128i q = libdivide_mullhi_u64_vec128(numers, _mm_set1_epi64x(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { // uint32_t t = ((numer - q) >> 1) + q; // return t >> denom->shift; uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q); return _mm_srli_epi64(t, shift); } else { return _mm_srli_epi64(q, more); } } } __m128i libdivide_u64_branchfree_do_vec128( __m128i numers, const struct libdivide_u64_branchfree_t *denom) { __m128i q = libdivide_mullhi_u64_vec128(numers, _mm_set1_epi64x(denom->magic)); __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q); return _mm_srli_epi64(t, denom->more); } ////////// SINT16 __m128i libdivide_s16_do_vec128(__m128i numers, const struct libdivide_s16_t *denom) { uint8_t more = denom->more; if (!denom->magic) { uint16_t shift = more & LIBDIVIDE_16_SHIFT_MASK; uint16_t mask = ((uint16_t)1 << shift) - 1; __m128i roundToZeroTweak = _mm_set1_epi16(mask); // q = numer 
+ ((numer >> 15) & roundToZeroTweak); __m128i q = _mm_add_epi16(numers, _mm_and_si128(_mm_srai_epi16(numers, 15), roundToZeroTweak)); q = _mm_srai_epi16(q, shift); __m128i sign = _mm_set1_epi16((int8_t)more >> 7); // q = (q ^ sign) - sign; q = _mm_sub_epi16(_mm_xor_si128(q, sign), sign); return q; } else { __m128i q = _mm_mulhi_epi16(numers, _mm_set1_epi16(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { // must be arithmetic shift __m128i sign = _mm_set1_epi16((int8_t)more >> 7); // q += ((numer ^ sign) - sign); q = _mm_add_epi16(q, _mm_sub_epi16(_mm_xor_si128(numers, sign), sign)); } // q >>= shift q = _mm_srai_epi16(q, more & LIBDIVIDE_16_SHIFT_MASK); q = _mm_add_epi16(q, _mm_srli_epi16(q, 15)); // q += (q < 0) return q; } } __m128i libdivide_s16_branchfree_do_vec128( __m128i numers, const struct libdivide_s16_branchfree_t *denom) { int16_t magic = denom->magic; uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; // must be arithmetic shift __m128i sign = _mm_set1_epi16((int8_t)more >> 7); __m128i q = _mm_mulhi_epi16(numers, _mm_set1_epi16(magic)); q = _mm_add_epi16(q, numers); // q += numers // If q is non-negative, we have nothing to do // If q is negative, we want to add either (2**shift)-1 if d is // a power of 2, or (2**shift) if it is not a power of 2 uint16_t is_power_of_2 = (magic == 0); __m128i q_sign = _mm_srai_epi16(q, 15); // q_sign = q >> 15 __m128i mask = _mm_set1_epi16(((uint16_t)1 << shift) - is_power_of_2); q = _mm_add_epi16(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask) q = _mm_srai_epi16(q, shift); // q >>= shift q = _mm_sub_epi16(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign return q; } ////////// SINT32 __m128i libdivide_s32_do_vec128(__m128i numers, const struct libdivide_s32_t *denom) { uint8_t more = denom->more; if (!denom->magic) { uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; uint32_t mask = ((uint32_t)1 << shift) - 1; __m128i roundToZeroTweak = _mm_set1_epi32(mask); // q = numer + 
((numer >> 31) & roundToZeroTweak); __m128i q = _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak)); q = _mm_srai_epi32(q, shift); __m128i sign = _mm_set1_epi32((int8_t)more >> 7); // q = (q ^ sign) - sign; q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign); return q; } else { __m128i q = libdivide_mullhi_s32_vec128(numers, _mm_set1_epi32(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { // must be arithmetic shift __m128i sign = _mm_set1_epi32((int8_t)more >> 7); // q += ((numer ^ sign) - sign); q = _mm_add_epi32(q, _mm_sub_epi32(_mm_xor_si128(numers, sign), sign)); } // q >>= shift q = _mm_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK); q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); // q += (q < 0) return q; } } __m128i libdivide_s32_branchfree_do_vec128( __m128i numers, const struct libdivide_s32_branchfree_t *denom) { int32_t magic = denom->magic; uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; // must be arithmetic shift __m128i sign = _mm_set1_epi32((int8_t)more >> 7); __m128i q = libdivide_mullhi_s32_vec128(numers, _mm_set1_epi32(magic)); q = _mm_add_epi32(q, numers); // q += numers // If q is non-negative, we have nothing to do // If q is negative, we want to add either (2**shift)-1 if d is // a power of 2, or (2**shift) if it is not a power of 2 uint32_t is_power_of_2 = (magic == 0); __m128i q_sign = _mm_srai_epi32(q, 31); // q_sign = q >> 31 __m128i mask = _mm_set1_epi32(((uint32_t)1 << shift) - is_power_of_2); q = _mm_add_epi32(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask) q = _mm_srai_epi32(q, shift); // q >>= shift q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign return q; } ////////// SINT64 __m128i libdivide_s64_do_vec128(__m128i numers, const struct libdivide_s64_t *denom) { uint8_t more = denom->more; int64_t magic = denom->magic; if (magic == 0) { // shift path uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; uint64_t mask = ((uint64_t)1 << shift) - 1; 
__m128i roundToZeroTweak = _mm_set1_epi64x(mask); // q = numer + ((numer >> 63) & roundToZeroTweak); __m128i q = _mm_add_epi64( numers, _mm_and_si128(libdivide_s64_signbits_vec128(numers), roundToZeroTweak)); q = libdivide_s64_shift_right_vec128(q, shift); __m128i sign = _mm_set1_epi32((int8_t)more >> 7); // q = (q ^ sign) - sign; q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign); return q; } else { __m128i q = libdivide_mullhi_s64_vec128(numers, _mm_set1_epi64x(magic)); if (more & LIBDIVIDE_ADD_MARKER) { // must be arithmetic shift __m128i sign = _mm_set1_epi32((int8_t)more >> 7); // q += ((numer ^ sign) - sign); q = _mm_add_epi64(q, _mm_sub_epi64(_mm_xor_si128(numers, sign), sign)); } // q >>= denom->mult_path.shift q = libdivide_s64_shift_right_vec128(q, more & LIBDIVIDE_64_SHIFT_MASK); q = _mm_add_epi64(q, _mm_srli_epi64(q, 63)); // q += (q < 0) return q; } } __m128i libdivide_s64_branchfree_do_vec128( __m128i numers, const struct libdivide_s64_branchfree_t *denom) { int64_t magic = denom->magic; uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; // must be arithmetic shift __m128i sign = _mm_set1_epi32((int8_t)more >> 7); // libdivide_mullhi_s64(numers, magic); __m128i q = libdivide_mullhi_s64_vec128(numers, _mm_set1_epi64x(magic)); q = _mm_add_epi64(q, numers); // q += numers // If q is non-negative, we have nothing to do. // If q is negative, we want to add either (2**shift)-1 if d is // a power of 2, or (2**shift) if it is not a power of 2. 
uint32_t is_power_of_2 = (magic == 0); __m128i q_sign = libdivide_s64_signbits_vec128(q); // q_sign = q >> 63 __m128i mask = _mm_set1_epi64x(((uint64_t)1 << shift) - is_power_of_2); q = _mm_add_epi64(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask) q = libdivide_s64_shift_right_vec128(q, shift); // q >>= shift q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign return q; } #endif ////////// C++ stuff #ifdef __cplusplus enum Branching { BRANCHFULL, // use branching algorithms BRANCHFREE // use branchfree algorithms }; namespace detail { enum Signedness { SIGNED, UNSIGNED, }; #if defined(LIBDIVIDE_NEON) // Helper to deduce NEON vector type for integral type. template struct NeonVec {}; template <> struct NeonVec<16, UNSIGNED> { typedef uint16x8_t type; }; template <> struct NeonVec<16, SIGNED> { typedef int16x8_t type; }; template <> struct NeonVec<32, UNSIGNED> { typedef uint32x4_t type; }; template <> struct NeonVec<32, SIGNED> { typedef int32x4_t type; }; template <> struct NeonVec<64, UNSIGNED> { typedef uint64x2_t type; }; template <> struct NeonVec<64, SIGNED> { typedef int64x2_t type; }; template struct NeonVecFor { // See 'class divider' for an explanation of these template parameters. typedef typename NeonVec> 0) > (T)(-1) ? 
SIGNED : UNSIGNED)>::type type; }; #define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE) \ LIBDIVIDE_INLINE typename NeonVecFor::type divide( \ typename NeonVecFor::type n) const { \ return libdivide_##ALGO##_do_vec128(n, &denom); \ } #else #define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE) #endif #if defined(LIBDIVIDE_SSE2) #define LIBDIVIDE_DIVIDE_SSE2(ALGO) \ LIBDIVIDE_INLINE __m128i divide(__m128i n) const { \ return libdivide_##ALGO##_do_vec128(n, &denom); \ } #else #define LIBDIVIDE_DIVIDE_SSE2(ALGO) #endif #if defined(LIBDIVIDE_AVX2) #define LIBDIVIDE_DIVIDE_AVX2(ALGO) \ LIBDIVIDE_INLINE __m256i divide(__m256i n) const { \ return libdivide_##ALGO##_do_vec256(n, &denom); \ } #else #define LIBDIVIDE_DIVIDE_AVX2(ALGO) #endif #if defined(LIBDIVIDE_AVX512) #define LIBDIVIDE_DIVIDE_AVX512(ALGO) \ LIBDIVIDE_INLINE __m512i divide(__m512i n) const { \ return libdivide_##ALGO##_do_vec512(n, &denom); \ } #else #define LIBDIVIDE_DIVIDE_AVX512(ALGO) #endif // The DISPATCHER_GEN() macro generates C++ methods (for the given integer // and algorithm types) that redirect to libdivide's C API. #define DISPATCHER_GEN(T, ALGO) \ libdivide_##ALGO##_t denom; \ LIBDIVIDE_INLINE dispatcher() {} \ explicit LIBDIVIDE_CONSTEXPR dispatcher(decltype(nullptr)) : denom{} {} \ LIBDIVIDE_INLINE dispatcher(T d) : denom(libdivide_##ALGO##_gen(d)) {} \ LIBDIVIDE_INLINE T divide(T n) const { return libdivide_##ALGO##_do(n, &denom); } \ LIBDIVIDE_INLINE T recover() const { return libdivide_##ALGO##_recover(&denom); } \ LIBDIVIDE_DIVIDE_NEON(ALGO, T) \ LIBDIVIDE_DIVIDE_SSE2(ALGO) \ LIBDIVIDE_DIVIDE_AVX2(ALGO) \ LIBDIVIDE_DIVIDE_AVX512(ALGO) // The dispatcher selects a specific division algorithm for a given // width, signedness, and ALGO using partial template specialization. 
template struct dispatcher {}; template <> struct dispatcher<16, SIGNED, BRANCHFULL> { DISPATCHER_GEN(int16_t, s16) }; template <> struct dispatcher<16, SIGNED, BRANCHFREE> { DISPATCHER_GEN(int16_t, s16_branchfree) }; template <> struct dispatcher<16, UNSIGNED, BRANCHFULL> { DISPATCHER_GEN(uint16_t, u16) }; template <> struct dispatcher<16, UNSIGNED, BRANCHFREE> { DISPATCHER_GEN(uint16_t, u16_branchfree) }; template <> struct dispatcher<32, SIGNED, BRANCHFULL> { DISPATCHER_GEN(int32_t, s32) }; template <> struct dispatcher<32, SIGNED, BRANCHFREE> { DISPATCHER_GEN(int32_t, s32_branchfree) }; template <> struct dispatcher<32, UNSIGNED, BRANCHFULL> { DISPATCHER_GEN(uint32_t, u32) }; template <> struct dispatcher<32, UNSIGNED, BRANCHFREE> { DISPATCHER_GEN(uint32_t, u32_branchfree) }; template <> struct dispatcher<64, SIGNED, BRANCHFULL> { DISPATCHER_GEN(int64_t, s64) }; template <> struct dispatcher<64, SIGNED, BRANCHFREE> { DISPATCHER_GEN(int64_t, s64_branchfree) }; template <> struct dispatcher<64, UNSIGNED, BRANCHFULL> { DISPATCHER_GEN(uint64_t, u64) }; template <> struct dispatcher<64, UNSIGNED, BRANCHFREE> { DISPATCHER_GEN(uint64_t, u64_branchfree) }; } // namespace detail #if defined(LIBDIVIDE_NEON) // Allow NeonVecFor outside of detail namespace. template struct NeonVecFor { typedef typename detail::NeonVecFor::type type; }; #endif // This is the main divider class for use by the user (C++ API). // The actual division algorithm is selected using the dispatcher struct // based on the integer width and algorithm template parameters. template class divider { private: // Dispatch based on the size and signedness. // We avoid using type_traits as it's not available in AVR. // Detect signedness by checking if T(-1) is less than T(0). // Also throw in a shift by 0, which prevents floating point types from being passed. typedef detail::dispatcher> 0) > (T)(-1) ? 
detail::SIGNED : detail::UNSIGNED), ALGO> dispatcher_t; public: // We leave the default constructor empty so that creating // an array of dividers and then initializing them // later doesn't slow us down. divider() {} // constexpr zero-initialization to allow for use w/ static constinit explicit LIBDIVIDE_CONSTEXPR divider(decltype(nullptr)) : div(nullptr) {} // Constructor that takes the divisor as a parameter LIBDIVIDE_INLINE divider(T d) : div(d) {} // Divides n by the divisor LIBDIVIDE_INLINE T divide(T n) const { return div.divide(n); } // Recovers the divisor, returns the value that was // used to initialize this divider object. T recover() const { return div.recover(); } bool operator==(const divider &other) const { return div.denom.magic == other.div.denom.magic && div.denom.more == other.div.denom.more; } bool operator!=(const divider &other) const { return !(*this == other); } // Vector variants treat the input as packed integer values with the same type as the divider // (e.g. s32, u32, s64, u64) and divides each of them by the divider, returning the packed // quotients. #if defined(LIBDIVIDE_SSE2) LIBDIVIDE_INLINE __m128i divide(__m128i n) const { return div.divide(n); } #endif #if defined(LIBDIVIDE_AVX2) LIBDIVIDE_INLINE __m256i divide(__m256i n) const { return div.divide(n); } #endif #if defined(LIBDIVIDE_AVX512) LIBDIVIDE_INLINE __m512i divide(__m512i n) const { return div.divide(n); } #endif #if defined(LIBDIVIDE_NEON) LIBDIVIDE_INLINE typename NeonVecFor::type divide(typename NeonVecFor::type n) const { return div.divide(n); } #endif private: // Storage for the actual divisor dispatcher_t div; }; // Overload of operator / for scalar division template LIBDIVIDE_INLINE T operator/(T n, const divider &div) { return div.divide(n); } // Overload of operator /= for scalar division template LIBDIVIDE_INLINE T &operator/=(T &n, const divider &div) { n = div.divide(n); return n; } // Overloads for vector types. 
#if defined(LIBDIVIDE_SSE2) template LIBDIVIDE_INLINE __m128i operator/(__m128i n, const divider &div) { return div.divide(n); } template LIBDIVIDE_INLINE __m128i operator/=(__m128i &n, const divider &div) { n = div.divide(n); return n; } #endif #if defined(LIBDIVIDE_AVX2) template LIBDIVIDE_INLINE __m256i operator/(__m256i n, const divider &div) { return div.divide(n); } template LIBDIVIDE_INLINE __m256i operator/=(__m256i &n, const divider &div) { n = div.divide(n); return n; } #endif #if defined(LIBDIVIDE_AVX512) template LIBDIVIDE_INLINE __m512i operator/(__m512i n, const divider &div) { return div.divide(n); } template LIBDIVIDE_INLINE __m512i operator/=(__m512i &n, const divider &div) { n = div.divide(n); return n; } #endif #if defined(LIBDIVIDE_NEON) template LIBDIVIDE_INLINE typename NeonVecFor::type operator/( typename NeonVecFor::type n, const divider &div) { return div.divide(n); } template LIBDIVIDE_INLINE typename NeonVecFor::type operator/=( typename NeonVecFor::type &n, const divider &div) { n = div.divide(n); return n; } #endif #if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900) // libdivide::branchfree_divider template using branchfree_divider = divider; #endif } // namespace libdivide #endif // __cplusplus #if defined(_MSC_VER) && !defined(__clang__) #pragma warning(pop) #endif #endif // LIBDIVIDE_H ================================================ FILE: ext/skeletontricks/skeletontricks.hpp ================================================ /* * This file is part of Kimimaro. * * Kimimaro is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Kimimaro is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
// Erase an axis-aligned box of voxels around every vertex of a path.
//
// labels, DBF: arrays of sx * sy * sz elements addressed as
//   x + sx * (y + sy * z).
// sx, sy, sz: volume dimensions in voxels.
// wx, wy, wz: physical size of a voxel along each axis (anisotropy).
// path, path_size: linear voxel indices of the path vertices. Every entry
//   must be smaller than sx * sy * sz; this is not checked.
// scale, constant: the half-width of the box around a vertex is
//   radius = scale * DBF[vertex] + constant, divided by the voxel size of
//   the axis in question to convert it to voxels.
//
// The boxes are accumulated as +1 / -1 markers at their first and last x
// position and resolved with a running sum along x, so the work is one
// marker pass per path vertex plus one sweep over the bounding region.
//
// Returns: the number of voxels whose label was non-zero and got cleared.
size_t _roll_invalidation_cube(
  uint8_t* labels, float* DBF,
  const int64_t sx, const int64_t sy, const int64_t sz,
  const float wx, const float wy, const float wz,
  size_t* path, const size_t path_size,
  const float scale, const float constant
) {
  // An empty path or an empty volume leaves nothing to invalidate. The
  // volume check also keeps std::log2 below from being handed a zero.
  if (path_size == 0 || sx <= 0 || sy <= 0 || sz <= 0) {
    return 0;
  }

  const size_t sxy = sx * sy;
  const size_t voxels = sxy * sz;

  int64_t global_minx = sx;
  int64_t global_maxx = 0;
  int64_t global_miny = sy;
  int64_t global_maxy = 0;
  int64_t global_minz = sz;
  int64_t global_maxz = 0;

  // NOTE(review): the element type was lost in extraction; int16_t is a
  // reconstruction. It must be signed because markers are decremented.
  std::vector<int16_t> topology(voxels);

  const bool power_of_two = !((sx & (sx - 1)) || (sy & (sy - 1)));
  const int xshift = static_cast<int>(std::log2(sx)); // must use log2 here, not lg/lg2 to avoid fp errors
  const int yshift = static_cast<int>(std::log2(sy));

  // Both helpers clamp in the floating point domain *before* converting to
  // an integer, so an infinite or NaN extent can never reach static_cast
  // (which would be undefined). For finite in-range values they produce
  // exactly max(0, trunc(lo)) and min(size - 1, trunc(hi)).
  auto low_voxel = [](const float lo) -> int64_t {
    return (lo > 0.0f) ? static_cast<int64_t>(lo) : 0;
  };
  auto high_voxel = [](const double hi, const int64_t size) -> int64_t {
    return (hi < static_cast<double>(size - 1))
      ? static_cast<int64_t>(hi)
      : (size - 1);
  };

  // First pass: compute topology
  for (size_t i = 0; i < path_size; i++) {
    const size_t loc = path[i];

    float radius = scale * DBF[loc] + constant;
    // A negative (or NaN) radius would put the lower bound past the vertex
    // and index outside the volume; treat it as an empty radius instead.
    if (!(radius > 0.0f)) {
      radius = 0.0f;
    }

    int64_t x, y, z;
    if (power_of_two) {
      z = loc >> (xshift + yshift);
      y = (loc - (z << (xshift + yshift))) >> xshift;
      x = loc - ((y + (z << yshift)) << xshift);
    }
    else {
      z = loc / sxy;
      y = (loc - (z * sxy)) / sx;
      x = loc - sx * (y + z * sy);
    }

    const int64_t minx = low_voxel(x - (radius / wx));
    const int64_t maxx = high_voxel(0.5 + (x + (radius / wx)), sx);
    const int64_t miny = low_voxel(y - (radius / wy));
    const int64_t maxy = high_voxel(0.5 + (y + (radius / wy)), sy);
    const int64_t minz = low_voxel(z - (radius / wz));
    const int64_t maxz = high_voxel(0.5 + (z + (radius / wz)), sz);

    global_minx = std::min(global_minx, minx);
    global_maxx = std::max(global_maxx, maxx);
    global_miny = std::min(global_miny, miny);
    global_maxy = std::max(global_maxy, maxy);
    global_minz = std::min(global_minz, minz);
    global_maxz = std::max(global_maxz, maxz);

    // NOTE: when minx == maxx the +1 and -1 land on the same cell and
    // cancel, so a box one voxel wide along x is only cleared if another
    // box covers it.
    for (int64_t yi = miny; yi <= maxy; yi++) {
      for (int64_t zi = minz; zi <= maxz; zi++) {
        topology[minx + sx * yi + sxy * zi] += 1;
        topology[maxx + sx * yi + sxy * zi] -= 1;
      }
    }
  }

  // Second pass: invalidate labels
  size_t invalidated = 0;

  for (int64_t z = global_minz; z <= global_maxz; z++) {
    for (int64_t y = global_miny; y <= global_maxy; y++) {
      const size_t yzoffset = sx * y + sxy * z;

      int coloring = 0;
      for (int64_t x = global_minx; x <= global_maxx; x++) {
        const size_t idx = x + yzoffset;
        coloring += topology[idx];
        // The closing marker brings the running sum back to zero on the
        // last voxel of a box, hence the extra check on the marker itself.
        if (coloring > 0 || topology[idx]) {
          invalidated += static_cast<size_t>(labels[idx] > 0); // convert non-bool vals
          labels[idx] = 0;
        }
      }
    }
  }

  return invalidated;
}
0; i < 2 * Ne; i += 2) { T e1 = edges[i]; T e2 = edges[i+1]; index[e1].insert(e2); index[e2].insert(e1); } T root = edges[0]; T node = -1; T parent = -1; uint32_t depth = -1; std::stack stack; std::stack parents; std::stack depth_stack; std::stack path; stack.push(root); parents.push(-1); depth_stack.push(0); std::vector visited(Nv, false); while (!stack.empty()) { node = stack.top(); parent = parents.top(); depth = depth_stack.top(); stack.pop(); parents.pop(); depth_stack.pop(); while (path.size() > depth) { path.pop(); } path.push(node); if (visited[node]) { break; } visited[node] = true; for (T child : index[node]) { if (child == parent) { continue; } stack.push(child); parents.push(node); depth_stack.push(depth + 1); } } if (path.size() <= 1) { return std::vector(0); } // cast stack to vector w/ zero copy std::vector vec_path = stack2vec(path); // Find start of loop. Since a cycle was detected, // the last node found started the cycle. We need // to trim the path leading up to that connection. size_t i; for (i = 0; i < vec_path.size() - 1; i++) { if (vec_path[i] == node) { break; } } if (vec_path.size() - i < 3) { return std::vector(0); } return std::vector(vec_path.begin() + i, vec_path.end()); } // Had trouble returning an unordered_map< pair, float> // to python, so I decided to just pack two uint32s into a uint64 // and unpack them on the other side. 
// Collapse a tree-shaped skeleton into a graph over its critical points.
//
// vertices: Nv * 3 floats, (x, y, z) per vertex.
// edges: Ne * 2 vertex indices, one pair per edge.
// start_node: vertex at which the depth first traversal begins.
// critical_points_vec: vertex indices that become the nodes of the result.
//
// Starting from start_node, the traversal accumulates the euclidean length
// of the edges it walks. Whenever it reaches a critical point other than the
// one it last left, it records the accumulated length between the two and
// restarts the accumulation from zero.
//
// Returns: a map from a packed vertex pair to the path length between them.
//   The smaller index sits in the low 32 bits of the key and the larger one
//   in the high 32 bits.
//
// Throws: std::runtime_error if a vertex is reached twice (the input has a
//   cycle), std::out_of_range if start_node, an edge endpoint or a critical
//   point is not a valid vertex index.
std::unordered_map<uint64_t, float> _create_distance_graph(
  float* vertices, size_t Nv,
  uint32_t* edges, size_t Ne, uint32_t start_node,
  std::vector<int32_t> critical_points_vec
) {
  std::unordered_map<uint64_t, float> distgraph;

  if (Nv == 0) {
    return distgraph;
  }
  if (start_node >= Nv) {
    throw std::out_of_range(
      std::string("Start node out of range: ") + std::to_string(start_node)
    );
  }

  std::vector< std::vector<uint32_t> > tree(Nv);
  std::vector<bool> critical_points(Nv, false);

  for (const int32_t pt : critical_points_vec) {
    if (pt < 0 || static_cast<size_t>(pt) >= Nv) {
      throw std::out_of_range(
        std::string("Critical point out of range: ") + std::to_string(pt)
      );
    }
    critical_points[pt] = true;
  }

  for (size_t i = 0; i < Ne; i++) {
    const uint32_t e1 = edges[2*i];
    const uint32_t e2 = edges[2*i + 1];

    if (e1 >= Nv || e2 >= Nv) {
      throw std::out_of_range(
        std::string("Edge references a vertex out of range: ")
        + std::to_string(e1) + std::string(", ") + std::to_string(e2)
      );
    }

    tree[e1].push_back(e2);
    tree[e2].push_back(e1);
  }

  // One traversal frame per pending vertex. parent is wider than the vertex
  // type so that the "no parent" sentinel -1 cannot collide with a vertex.
  struct Frame {
    uint32_t node;
    int64_t parent;
    float dist; // length walked since `root`
    uint32_t root; // critical point the current run started from
  };

  std::stack<Frame> stack;
  stack.push(Frame{ start_node, -1, 0.0f, start_node });

  std::vector<bool> visited(Nv, false);

  while (!stack.empty()) {
    const Frame frame = stack.top();
    const uint32_t node = frame.node;

    if (visited[node]) {
      throw std::runtime_error(std::string("Cycle detected. Node: ") + std::to_string(node));
    }
    visited[node] = true;
    stack.pop();

    float dist = frame.dist;
    uint32_t root = frame.root;

    if (critical_points[node] && node != root) {
      const uint64_t key = (root < node)
        ? static_cast<uint64_t>(root) | (static_cast<uint64_t>(node) << 32)
        : static_cast<uint64_t>(node) | (static_cast<uint64_t>(root) << 32);

      distgraph[key] = dist;
      dist = 0.0f;
      root = node;
    }

    const size_t n3 = 3 * static_cast<size_t>(node);

    for (const uint32_t child : tree[node]) {
      // Do not walk back along the edge we arrived on.
      if (static_cast<int64_t>(child) == frame.parent) {
        continue;
      }

      const size_t c3 = 3 * static_cast<size_t>(child);

      float dx = vertices[n3 + 0] - vertices[c3 + 0];
      float dy = vertices[n3 + 1] - vertices[c3 + 1];
      float dz = vertices[n3 + 2] - vertices[c3 + 2];

      dx *= dx;
      dy *= dy;
      dz *= dz;

      stack.push(Frame{
        child, static_cast<int64_t>(node), dist + std::sqrt(dx + dy + dz), root
      });
    }
  }

  return distgraph;
}
// Fill `neighborhood` with the linear index offsets of the 13 "backward"
// neighbors of voxel (x, y, z) in an sx * sy * sz image addressed as
// x + sx * (y + sy * z). An entry is 0 when that neighbor lies outside the
// image or is not part of the requested connectivity (6, 18 or 26).
//
// Only -z neighbors, -y neighbors within the plane and the -x neighbor are
// produced, so visiting every voxel reaches each neighboring pair once.
//
// sz is accepted for symmetry with the other dimensions but is not needed.
inline void compute_neighborhood(
  int *neighborhood,
  const int x, const int y, const int z,
  const uint64_t sx, const uint64_t sy, const uint64_t sz,
  const int connectivity = 26
) {
  (void)sz;

  const int sxy = sx * sy;

  // Each component is the offset of a single step, or 0 if that step would
  // leave the image.
  const int plus_x = (x < (static_cast<int>(sx) - 1)); // +x
  const int minus_x = -1 * (x > 0); // -x
  const int plus_y = static_cast<int>(sx) * (y < static_cast<int>(sy) - 1); // +y
  const int minus_y = -static_cast<int>(sx) * (y > 0); // -y
  const int minus_z = -sxy * static_cast<int>(z > 0); // -z

  // 6-hood
  neighborhood[0] = minus_x;
  neighborhood[1] = minus_y;
  neighborhood[2] = minus_z;

  // 18-hood

  // xy diagonals
  neighborhood[3] = (connectivity > 6) * (minus_x + minus_y) * (minus_x && minus_y); // up-left
  neighborhood[4] = (connectivity > 6) * (plus_x + minus_y) * (plus_x && minus_y); // up-right

  // xz diagonals
  neighborhood[5] = (connectivity > 6) * (minus_x + minus_z) * (minus_x && minus_z); // down-left
  neighborhood[6] = (connectivity > 6) * (plus_x + minus_z) * (plus_x && minus_z); // down-right

  // yz diagonals
  neighborhood[7] = (connectivity > 6) * (minus_y + minus_z) * (minus_y && minus_z); // down-left
  neighborhood[8] = (connectivity > 6) * (plus_y + minus_z) * (plus_y && minus_z); // down-right

  // 26-hood

  // The four corners of the cube that lie in the -z plane. All three
  // components must exist; without the x check a voxel on an x border would
  // repeat the corresponding yz diagonal instead of yielding 0.
  neighborhood[9] = (connectivity > 18) * (minus_x + minus_y + minus_z) * (minus_x && minus_y && minus_z);
  neighborhood[10] = (connectivity > 18) * (plus_x + minus_y + minus_z) * (plus_x && minus_y && minus_z);
  neighborhood[11] = (connectivity > 18) * (minus_x + plus_y + minus_z) * (minus_x && plus_y && minus_z);
  neighborhood[12] = (connectivity > 18) * (plus_x + plus_y + minus_z) * (plus_x && plus_y && minus_z);
}

// Hash for a pair of voxel indices so it can be stored in an unordered_set.
struct pair_hash {
  inline std::size_t operator()(const std::pair<uint64_t, uint64_t> & v) const {
    return v.first * 31 + v.second; // arbitrary hash fn
  }
};

// Build the edge list of a binary image: one edge for every pair of
// non-zero voxels that are neighbors under the given connectivity.
//
// image: sx * sy * sz voxels addressed as x + sx * (y + sy * z).
// connectivity: 6, 18 or 26.
//
// Returns: a set of (smaller index, larger index) pairs of linear voxel
//   indices.
std::unordered_set<std::pair<uint64_t, uint64_t>, pair_hash>
_extract_edges_from_binary_image(
  const uint8_t* image,
  const uint64_t sx, const uint64_t sy, const uint64_t sz,
  const int connectivity = 26
) {
  const uint64_t sxy = sx * sy;

  std::unordered_set<std::pair<uint64_t, uint64_t>, pair_hash> edges;
  edges.reserve(sx * sy * sz / 100);

  int neighborhood[13];

  for (uint64_t z = 0; z < sz; z++) {
    for (uint64_t y = 0; y < sy; y++) {
      for (uint64_t x = 0; x < sx; x++) {
        const uint64_t loc = x + sx * y + sxy * z;

        if (image[loc] == 0) {
          continue;
        }

        compute_neighborhood(
          neighborhood,
          static_cast<int>(x), static_cast<int>(y), static_cast<int>(z),
          sx, sy, sz, connectivity
        );

        for (int i = 0; i < 13; i++) {
          if (neighborhood[i] == 0) {
            continue;
          }

          // Offsets are negative; unsigned wrap-around yields loc - |offset|.
          const uint64_t neighboridx = loc + neighborhood[i];
          if (image[neighboridx] == 0) {
            continue;
          }

          edges.emplace(std::min(loc, neighboridx), std::max(loc, neighboridx));
        }
      }
    }
  }

  return edges;
}
Kimimaro is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Kimimaro is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Kimimaro. If not, see . ***************************************************************** """ cimport cython from libc.stdlib cimport calloc, free from libc.stdint cimport ( int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t ) from libcpp cimport bool from cpython cimport array import array import sys from libcpp.vector cimport vector from libcpp.unordered_map cimport unordered_map from libcpp.unordered_set cimport unordered_set from libcpp.utility cimport pair as cpp_pair cimport numpy as cnp import numpy as np cnp.import_array() from collections import defaultdict cdef float INFINITY = float('inf') ctypedef fused UINT: uint8_t uint16_t uint32_t uint64_t unsigned char ctypedef fused INTEGER: int8_t int16_t int32_t int64_t UINT cdef extern from "dijkstra_invalidation.hpp" namespace "dijkstra_invalidation": cdef int64_t _roll_invalidation_ball( uint8_t* field, uint64_t sx, uint64_t sy, uint64_t sz, float wx, float wy, float wz, vector[uint64_t] sources, vector[float] max_distances, int connectivity, uint32_t* voxel_connectivity_graph ) cdef extern from "skeletontricks.hpp" namespace "skeletontricks": cdef size_t _roll_invalidation_cube( uint8_t* labels, float* DBF, int64_t sx, int64_t sy, int64_t sz, float wx, float wy, float wz, size_t* path, size_t path_size, float scale, float constant ) cdef vector[T] _find_cycle[T](T* edges, size_t Ne) cdef unordered_map[ uint64_t, float ] _create_distance_graph( float* 
def find_cycle(cnp.ndarray[int32_t, ndim=2] edges):
  """
  Given a graph of edges that are a single connected component,
  find a cycle via depth first search.

  edges: int32 array holding one pair of vertex ids per row. The ids
    must be non-negative because they are used as array indices by
    the C++ implementation.

  Returns: 1D int32 array of the vertex ids produced by _find_cycle
    (empty array if no cycle is found)
  """
  if edges.size == 0:
    # Same dtype as the non-empty result so callers see a single type.
    return np.zeros((0,), dtype=np.int32)

  if edges.min() < 0:
    raise ValueError("Vertex ids must be non-negative.")

  # The C++ routine reads the pairs from a flat, row-major buffer.
  edges = np.ascontiguousarray(edges)

  cdef cnp.ndarray[int32_t, ndim=1] elist = np.array(
    _find_cycle[int32_t](
      <int32_t*>&edges[0,0], <size_t>(edges.size // 2)
    ),
    dtype=np.int32
  )
  return elist
Example skeleton with output: 60nm 60nm 60nm 1------2------3------4 30nm | 70nm \ 5 ----6 { (1,2): 60, (2,3): 60, (2,5): 30, (3,4): 60, (3,6): 70, } """ cdef cnp.ndarray[float, ndim=2] vertices = skeleton.vertices cdef cnp.ndarray[uint32_t, ndim=2] edges = skeleton.edges unique_nodes, unique_counts = np.unique(edges, return_counts=True) terminal_nodes = unique_nodes[ unique_counts == 1 ] branch_nodes = set(unique_nodes[ unique_counts >= 3 ]) critical_points = set(terminal_nodes) critical_points.update(branch_nodes) res = _create_distance_graph( &vertices[0,0], vertices.shape[0], &edges[0,0], edges.shape[0], terminal_nodes[0], list(critical_points) ) cdef dict supergraph = res cdef dict real_supergraph = {} cdef uint64_t key = 0 cdef int32_t e1, e2 for key in supergraph.keys(): e2 = (key & 0xffffffff) e1 = (key >> 32) real_supergraph[ (e1, e2) ] = supergraph[key] return real_supergraph @cython.boundscheck(False) @cython.wraparound(False) # turn off negative index wrapping for entire function @cython.nonecheck(False) def inf2zero(cnp.ndarray[float, cast=True, ndim=3] field): """ inf2zero(cnp.ndarray[float, cast=True, ndim=3] field) Convert infinities to zeros. Returns: field """ cdef size_t sx, sy, sz cdef size_t x, y, z sx = field.shape[0] sy = field.shape[1] sz = field.shape[2] for z in range(0, sz): for y in range(0, sy): for x in range(0, sx): if field[x,y,z] == INFINITY: field[x,y,z] = 0 return field @cython.boundscheck(False) @cython.wraparound(False) # turn off negative index wrapping for entire function @cython.nonecheck(False) def zero2inf(cnp.ndarray[float, cast=True, ndim=3] field): """ zero2inf(cnp.ndarray[float, cast=True, ndim=3] field) Convert zeros to positive infinities. 
@cython.boundscheck(False)
@cython.wraparound(False)  # turn off negative index wrapping for entire function
@cython.nonecheck(False)
def finite_min(cnp.ndarray[float, cast=True, ndim=3] field):
  """
  float finite_min(cnp.ndarray[float, cast=True, ndim=3] field)

  Given a field of floats that may include infinities, find
  the minimum finite value.

  Returns: the smallest value that is neither -inf nor +inf,
    or +inf if the field contains no such value.
  """
  cdef size_t sx, sy, sz
  cdef size_t x, y, z

  sx = field.shape[0]
  sy = field.shape[1]
  sz = field.shape[2]

  # Start above every finite value so that the first finite
  # voxel encountered replaces it.
  cdef float minimum = +INFINITY
  for z in range(0, sz):
    for y in range(0, sy):
      for x in range(0, sx):
        if (field[x,y,z] < minimum) and (field[x,y,z] > -INFINITY):
          minimum = field[x,y,z]

  return minimum
@cython.boundscheck(False)
@cython.wraparound(False)  # turn off negative index wrapping for entire function
@cython.nonecheck(False)
@cython.binding(True)
def roll_invalidation_ball_inside_component(
  cnp.ndarray[uint8_t, cast=True, ndim=3] labels,
  cnp.ndarray[float, ndim=3] DBF,
  float scale, float constant,
  anisotropy, path,
  voxel_connectivity_graph = None,
  connectivity = 26,
):
  """
  Hand a path and a per-vertex distance limit to the C++ routine
  _roll_invalidation_ball, which modifies labels in place.

  labels: binary image; its buffer is passed to C++ directly.
  DBF: per-voxel values used to size the limit of each vertex.
  scale, constant: the limit for a vertex at (x,y,z) is
    scale * DBF[x,y,z] + constant
  anisotropy: (wx, wy, wz) voxel weights.
  path: sized sequence of (x,y,z) coordinates inside labels. It is
    iterated twice, so it must not be a one-shot iterator.
  voxel_connectivity_graph: optional uint32 3D array; anything that
    is not a numpy array is passed on as a null pointer.
  connectivity: forwarded unchanged.

  Coordinates are converted to linear indices as x + sx * (y + sy * z).
  NOTE(review): this presumably requires labels (and the connectivity
  graph) to be Fortran ordered - confirm against the callers.

  Returns: (value returned by _roll_invalidation_ball, labels)
  """
  cdef int64_t sx, sy, sz
  sx = labels.shape[0]
  sy = labels.shape[1]
  sz = labels.shape[2]

  cdef size_t sxy = sx * sy

  cdef float wx, wy, wz
  (wx, wy, wz) = anisotropy

  # Taking &labels[0,0,0] of an empty array is invalid, and an
  # empty path leaves nothing to invalidate.
  if labels.size == 0 or len(path) == 0:
    return (0, labels)

  # Both lists are built from every vertex, in the same order, so
  # entry i of one always belongs to entry i of the other.
  max_distances = [
    (scale * DBF[x,y,z] + constant) for (x,y,z) in path
  ]
  path = [
    coord[0] + sx * coord[1] + sxy * coord[2]
    for coord in path
  ]

  cdef uint32_t* vcg = NULL
  cdef cnp.ndarray[uint32_t, ndim=3] vcg_arr
  if isinstance(voxel_connectivity_graph, np.ndarray):
    vcg_arr = voxel_connectivity_graph
    vcg = <uint32_t*>&vcg_arr[0,0,0]

  invalidated = _roll_invalidation_ball(
    <uint8_t*>&labels[0,0,0],
    sx, sy, sz,
    wx, wy, wz,
    path, max_distances,
    connectivity,
    vcg
  )

  return (invalidated, labels)
Returns: modified labels """ cdef int64_t sx, sy, sz sx = labels.shape[0] sy = labels.shape[1] sz = labels.shape[2] cdef float wx, wy, wz (wx, wy, wz) = anisotropy cdef float radius, dist cdef int64_t minx, maxx, miny, maxy, minz, maxz cdef int64_t x,y,z cdef int64_t x0, y0, z0 cdef size_t invalidated = 0 for coord in path: if tuple(coord) in invalid_vertices: continue (x0, y0, z0) = coord radius = DBF[x0,y0,z0] * scale + const # physical units (e.g. nm) minx = max(0, (0.5 + (x0 - (radius / wx)))) maxx = min(sx, (0.5 + (x0 + (radius / wx)))) miny = max(0, (0.5 + (y0 - (radius / wy)))) maxy = min(sy, (0.5 + (y0 + (radius / wy)))) minz = max(0, (0.5 + (z0 - (radius / wz)))) maxz = min(sz, (0.5 + (z0 + (radius / wz)))) radius *= radius for x in range(minx, maxx): for y in range(miny, maxy): for z in range(minz, maxz): if not labels[x,y,z]: continue dist = (wx * (x - x0)) ** 2 + (wy * (y - y0)) ** 2 + (wz * (z - z0)) ** 2 if dist <= radius: invalidated += 1 labels[x,y,z] = 0 return invalidated, labels @cython.boundscheck(False) @cython.wraparound(False) # turn off negative index wrapping for entire function @cython.nonecheck(False) @cython.binding(True) def get_mapping( cnp.ndarray[INTEGER, ndim=3] orig_labels, cnp.ndarray[UINT, ndim=3] cc_labels ): """ Given a set of possibly not connected labels and an image containing their labeled connected components, produce a dictionary containing the inverse of this mapping. 
Returns: { $CC_LABEL: $ORIGINAL_LABEL } """ cdef size_t sx, sy, sz sx = orig_labels.shape[0] sy = orig_labels.shape[1] sz = orig_labels.shape[2] cdef size_t x,y,z remap = {} if orig_labels.size == 0: return remap cdef UINT last_label = cc_labels[0,0,0] remap[cc_labels[0,0,0]] = orig_labels[0,0,0] for z in range(sz): for y in range(sy): for x in range(sx): if last_label == cc_labels[x,y,z]: continue remap[cc_labels[x,y,z]] = orig_labels[x,y,z] last_label = cc_labels[x,y,z] return remap @cython.binding(True) def compute_centroids( cnp.ndarray[UINT, ndim=2] labels, float wx, float wy ): """ Compute the centroid for every label on a 2D image at once. Returns: { $segid: (x, y), ... } """ cdef float[:] xsum = np.zeros( (labels.size,), dtype=np.float32) cdef float[:] ysum = np.zeros( (labels.size,), dtype=np.float32) cdef uint32_t[:] labelct = np.zeros( (labels.size,), dtype=np.uint32) cdef size_t sx, sy sx = labels.shape[0] sy = labels.shape[1] cdef size_t x, y cdef uint32_t label = 0 for x in range(sx): for y in range(sy): label = labels[x,y] if label == 0: continue xsum[label] += x ysum[label] += y labelct[label] += 1 result = {} cdef float cx = wx * sx / 2 cdef float cy = wy * sy / 2 cdef float px, py for label in range(labels.size): if labelct[label] == 0: continue px = wx * xsum[label] / labelct[label] py = wy * ysum[label] / labelct[label] # Since we don't know which coordinate frame we # are using, round toward the center of the image # to ensure we get the same pixel every time. 
if px - cx >= 0: px = px # will be truncated towards center else: px = px + wx if py - cy >= 0: py = py # will be truncated towards center else: py = py + wy result[label] = ((px / wx), (py / wy)) return result @cython.binding(True) def find_border_targets( cnp.ndarray[float, ndim=2] dt, cnp.ndarray[UINT, ndim=2] cc_labels, float wx, float wy ): """ Given a set of connected components that line within a plane and their distance transform, return a map of label ID to the coordinate of its maximum distance transform value. If there are multiple maxima, we disambiguate based on topological criteria that are coordinate frame independent in order to avoid dealing with issues that come from the six rotated frames and their mirrored partners. The purpose of this function is to fix the edge effect the standard TEASAR algorithm generates and ensure that we can trivially join skeletons from adjacent chunks. Rotating the (x,y) pairs into their appropriate frame is performed in the function that calls this one. Returns: { $SEGID: (x, y), ... } """ cdef size_t sx, sy sx = dt.shape[0] sy = dt.shape[1] cdef size_t x, y mx = defaultdict(float) pts = {} cdef UINT label = 0 cdef dict centroids = compute_centroids(cc_labels, wx, wy) cdef float px, py cdef float centx, centy for y in range(sy): for x in range(sx): label = cc_labels[x,y] if label == 0: continue elif dt[x,y] == 0: continue elif dt[x,y] > mx[label]: mx[label] = dt[x,y] pts[label] = (x,y) elif mx[label] == dt[x,y]: px, py = pts[label] centx, centy = centroids[label] pts[label] = compute_tiebreaker_maxima( px, py, x, y, centx, centy, sx, sy, wx, wy ) return pts def compute_tiebreaker_maxima( float px, float py, float x, float y, float centx, float centy, float sx, float sy, float wx, float wy ): """ compute_tiebreaker_maxima( float px, float py, float x, float y, float centx, float centy, float sx, float sy, float wx, float wy ) This function breaks ties for `compute_border_targets`. 
cdef float cornerness(
    float x, float y,
    float sx, float sy,
    float wx, float wy
  ):
  """
  float cornerness(
    float x, float y,
    float sx, float sy,
    float wx, float wy
  )

  Nearness of (x,y) to a corner of an image of size (sx,sy).

  The corners are taken at the outer edge of the border pixels,
  i.e. at -0.5 and (size - 0.5) along each axis.

  Returns: the smallest anisotropy weighted squared distance
    from (x,y) to any of the four corners.
  """
  return min(
    distsq(x,y,-0.5,-0.5, wx, wy),
    distsq(x,y,sx-0.5,-0.5, wx, wy),
    distsq(x,y,sx-0.5,sy-0.5, wx, wy),
    # The y coordinate of this corner is bounded by sy, not sx.
    distsq(x,y,-0.5,sy-0.5, wx, wy)
  )
def find_avocado_fruit(
  cnp.ndarray[INTEGER, ndim=3] labels,
  size_t cx, size_t cy, size_t cz,
  INTEGER background = 0
):
  """
  Tests to see if the current coordinate is inside
  the nucleus of a somata that has been assigned
  to a separate label from the rest of the cell.

  Starting at (cx,cy,cz), the image is scanned along +x, -x, +y,
  -y, +z and -z. Each scan stops at the first voxel that is either
  background or carries a label different from the one at the
  starting voxel; in the latter case that label is recorded.

  If fewer than three scans record a label there is too little
  information and no fruit is reported. Otherwise the most common
  recorded label is the fruit, provided that the remaining records
  do not disagree with it more than allowed: no disagreement with
  exactly three records, one disagreement with more than three.

  Raises: ValueError if (cx,cy,cz) lies outside of labels.

  Returns: (pit, fruit)
    pit is the label at (cx,cy,cz). fruit is the surrounding label,
    or pit again if no fruit was identified.
  """
  cdef size_t sx, sy, sz
  sx, sy, sz = labels.shape[:3]

  if cx >= sx or cy >= sy or cz >= sz:
    raise ValueError(
      "<{},{},{}> must be be contained within shape <{},{},{}>".format(
        cx,cy,cz,sx,sy,sz
    ))

  cdef size_t x, y, z
  cdef size_t i
  cdef INTEGER label = labels[cx, cy, cz]

  # One slot per scan direction: +x, -x, +y, -y, +z, -z
  cdef list changes = [ None ] * 6

  for x in range(cx, sx):
    if labels[x,cy,cz] == background:
      break
    elif labels[x,cy,cz] != label:
      changes[0] = labels[x,cy,cz]
      break

  # The descending scans must include index 0. They count an offset
  # upwards instead of counting an unsigned index down past zero.
  for i in range(cx + 1):
    x = cx - i
    if labels[x,cy,cz] == background:
      break
    elif labels[x,cy,cz] != label:
      changes[1] = labels[x,cy,cz]
      break

  for y in range(cy, sy):
    if labels[cx,y,cz] == background:
      break
    if labels[cx,y,cz] != label:
      changes[2] = labels[cx,y,cz]
      break

  for i in range(cy + 1):
    y = cy - i
    if labels[cx,y,cz] == background:
      break
    if labels[cx,y,cz] != label:
      changes[3] = labels[cx,y,cz]
      break

  for z in range(cz, sz):
    if labels[cx,cy,z] == background:
      break
    if labels[cx,cy,z] != label:
      changes[4] = labels[cx,cy,z]
      break

  for i in range(cz + 1):
    z = cz - i
    if labels[cx,cy,z] == background:
      break
    if labels[cx,cy,z] != label:
      changes[5] = labels[cx,cy,z]
      break

  changes = [ _ for _ in changes if _ is not None ]

  # Too little info to make a decision
  if len(changes) < 3:
    return (label, label)

  if len(changes) > 3: # if more than 3, allow one non-match
    allowed_differences = 1
  else: # allow no non-matches (we're in a corner)
    allowed_differences = 0

  uniq, cts = np.unique(changes, return_counts=True)
  candidate_fruit_index = np.argmax(cts)
  differences = len(changes) - cts[candidate_fruit_index]

  # it's not an avocado if there's lots of
  # labels surrounding the candidate "pit"
  if differences > allowed_differences:
    return (label, label)

  return (label, uniq[candidate_fruit_index])
""" mask_indices = np.flatnonzero(mask.ravel(order='F')) if mask.size < np.iinfo(np.uint32).max: mask_indices = mask_indices.astype(np.uint32, copy=False) daf_sort = np.argsort(daf.ravel(order='F')[mask_indices]) daf_sort = np.flip(daf_sort) self.daf_indices = mask_indices[daf_sort] def find_target(self, mask: np.ndarray): """ Find the coordinate of a voxel corresponding the maximum map value. Returns: (x, y, z) """ first_positive_index = self.first_label_indexed( mask.ravel(order='F'), self.daf_indices ) if first_positive_index is None: self.daf_indices = self.daf_indices[self.daf_indices.size:] # Clear it. return None # This tells us mask positions daf_indices[0:first_positive_index] are now # zeroed out. We assume that this is permanent, so we don't need to search # those positions again next time. self.daf_indices = self.daf_indices[first_positive_index:] return np.unravel_index(self.daf_indices[0], mask.shape, order='F') @cython.boundscheck(False) @cython.wraparound(False) # turn off negative index wrapping for entire function @cython.nonecheck(False) def first_label_indexed(self, uint8_t[:] labels not None, INTEGER[:] indices not None): """ Returns: first i for which labels[indices[i]] is non-zero. 
""" cdef size_t length = indices.size cdef size_t i = 0 cdef INTEGER label_index for i in range(length): label_index = indices[i] if labels[label_index]: return i return None def extract_edges_from_binary_image(uint8_t[:,:,:] binimg, int connectivity = 26): cdef uint64_t sx, sy, sz sx, sy, sz = tuple(binimg.shape)[:3] cdef uint64_t sxy = sx * sy binimg = np.asfortranarray(binimg) cdef unordered_set[cpp_pair[uint64_t,uint64_t], pair_hash] edges = _extract_edges_from_binary_image( &binimg[0,0,0], sx, sy, sz, connectivity ) numbering = {} cdef int64_t i = 0 for edge in edges: for v in (edge.first, edge.second): if v not in numbering: numbering[v] = i i += 1 inumbering = { v:k for k,v in numbering.items() } vertices = [] cdef uint64_t loc, x, y, z for i in range(len(inumbering)): loc = inumbering[i] z = loc // sxy y = (loc - z * sxy) // sx x = loc - z * sxy - y * sx vertices.append((x,y,z)) int_edges = [] for v1,v2 in edges: int_edges.append((numbering[v1], numbering[v2])) vertices = np.array(vertices, dtype=np.uint32) int_edges = np.array(int_edges, dtype=np.uint32) return (vertices, int_edges) ================================================ FILE: ext/skeletontricks/unordered_dense.hpp ================================================ ///////////////////////// ankerl::unordered_dense::{map, set} ///////////////////////// // A fast & densely stored hashmap and hashset based on robin-hood backward shift deletion. // Version 4.5.0 // https://github.com/martinus/unordered_dense // // Licensed under the MIT License . 
// SPDX-License-Identifier: MIT // Copyright (c) 2022-2024 Martin Leitner-Ankerl // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ANKERL_UNORDERED_DENSE_H #define ANKERL_UNORDERED_DENSE_H // see https://semver.org/spec/v2.0.0.html #define ANKERL_UNORDERED_DENSE_VERSION_MAJOR 4 // NOLINT(cppcoreguidelines-macro-usage) incompatible API changes #define ANKERL_UNORDERED_DENSE_VERSION_MINOR 5 // NOLINT(cppcoreguidelines-macro-usage) backwards compatible functionality #define ANKERL_UNORDERED_DENSE_VERSION_PATCH 0 // NOLINT(cppcoreguidelines-macro-usage) backwards compatible bug fixes // API versioning with inline namespace, see https://www.foonathan.net/2018/11/inline-namespaces/ // NOLINTNEXTLINE(cppcoreguidelines-macro-usage) #define ANKERL_UNORDERED_DENSE_VERSION_CONCAT1(major, minor, patch) v##major##_##minor##_##patch // NOLINTNEXTLINE(cppcoreguidelines-macro-usage) #define ANKERL_UNORDERED_DENSE_VERSION_CONCAT(major, minor, patch) ANKERL_UNORDERED_DENSE_VERSION_CONCAT1(major, minor, patch) #define ANKERL_UNORDERED_DENSE_NAMESPACE \ ANKERL_UNORDERED_DENSE_VERSION_CONCAT( \ ANKERL_UNORDERED_DENSE_VERSION_MAJOR, ANKERL_UNORDERED_DENSE_VERSION_MINOR, ANKERL_UNORDERED_DENSE_VERSION_PATCH) #if defined(_MSVC_LANG) # define ANKERL_UNORDERED_DENSE_CPP_VERSION _MSVC_LANG #else # define ANKERL_UNORDERED_DENSE_CPP_VERSION __cplusplus #endif #if defined(__GNUC__) // NOLINTNEXTLINE(cppcoreguidelines-macro-usage) # define ANKERL_UNORDERED_DENSE_PACK(decl) decl __attribute__((__packed__)) #elif defined(_MSC_VER) // NOLINTNEXTLINE(cppcoreguidelines-macro-usage) # define ANKERL_UNORDERED_DENSE_PACK(decl) __pragma(pack(push, 1)) decl __pragma(pack(pop)) #endif // exceptions #if defined(__cpp_exceptions) || defined(__EXCEPTIONS) || defined(_CPPUNWIND) # define ANKERL_UNORDERED_DENSE_HAS_EXCEPTIONS() 1 // NOLINT(cppcoreguidelines-macro-usage) #else # define ANKERL_UNORDERED_DENSE_HAS_EXCEPTIONS() 0 // NOLINT(cppcoreguidelines-macro-usage) #endif #ifdef _MSC_VER # define ANKERL_UNORDERED_DENSE_NOINLINE __declspec(noinline) #else # define ANKERL_UNORDERED_DENSE_NOINLINE __attribute__((noinline)) #endif // 
defined in unordered_dense.cpp #if !defined(ANKERL_UNORDERED_DENSE_EXPORT) # define ANKERL_UNORDERED_DENSE_EXPORT #endif #if ANKERL_UNORDERED_DENSE_CPP_VERSION < 201703L # error ankerl::unordered_dense requires C++17 or higher #else # include // for array # include // for uint64_t, uint32_t, uint8_t, UINT64_C # include // for size_t, memcpy, memset # include // for equal_to, hash # include // for initializer_list # include // for pair, distance # include // for numeric_limits # include // for allocator, allocator_traits, shared_ptr # include // for optional # include // for out_of_range # include // for basic_string # include // for basic_string_view, hash # include // for forward_as_tuple # include // for enable_if_t, declval, conditional_t, ena... # include // for forward, exchange, pair, as_const, piece... # include // for vector # if ANKERL_UNORDERED_DENSE_HAS_EXCEPTIONS() == 0 # include // for abort # endif # if defined(__has_include) && !defined(ANKERL_UNORDERED_DENSE_DISABLE_PMR) # if __has_include() # define ANKERL_UNORDERED_DENSE_PMR std::pmr // NOLINT(cppcoreguidelines-macro-usage) # include // for polymorphic_allocator # elif __has_include() # define ANKERL_UNORDERED_DENSE_PMR std::experimental::pmr // NOLINT(cppcoreguidelines-macro-usage) # include // for polymorphic_allocator # endif # endif # if defined(_MSC_VER) && defined(_M_X64) # include # pragma intrinsic(_umul128) # endif # if defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__) # define ANKERL_UNORDERED_DENSE_LIKELY(x) __builtin_expect(x, 1) // NOLINT(cppcoreguidelines-macro-usage) # define ANKERL_UNORDERED_DENSE_UNLIKELY(x) __builtin_expect(x, 0) // NOLINT(cppcoreguidelines-macro-usage) # else # define ANKERL_UNORDERED_DENSE_LIKELY(x) (x) // NOLINT(cppcoreguidelines-macro-usage) # define ANKERL_UNORDERED_DENSE_UNLIKELY(x) (x) // NOLINT(cppcoreguidelines-macro-usage) # endif namespace ankerl::unordered_dense { inline namespace ANKERL_UNORDERED_DENSE_NAMESPACE { namespace detail 
{ # if ANKERL_UNORDERED_DENSE_HAS_EXCEPTIONS() // make sure this is not inlined as it is slow and dramatically enlarges code, thus making other // inlinings more difficult. Throws are also generally the slow path. [[noreturn]] inline ANKERL_UNORDERED_DENSE_NOINLINE void on_error_key_not_found() { throw std::out_of_range("ankerl::unordered_dense::map::at(): key not found"); } [[noreturn]] inline ANKERL_UNORDERED_DENSE_NOINLINE void on_error_bucket_overflow() { throw std::overflow_error("ankerl::unordered_dense: reached max bucket size, cannot increase size"); } [[noreturn]] inline ANKERL_UNORDERED_DENSE_NOINLINE void on_error_too_many_elements() { throw std::out_of_range("ankerl::unordered_dense::map::replace(): too many elements"); } # else [[noreturn]] inline void on_error_key_not_found() { abort(); } [[noreturn]] inline void on_error_bucket_overflow() { abort(); } [[noreturn]] inline void on_error_too_many_elements() { abort(); } # endif } // namespace detail // hash /////////////////////////////////////////////////////////////////////// // This is a stripped-down implementation of wyhash: https://github.com/wangyi-fudan/wyhash // No big-endian support (because different values on different machines don't matter), // hardcodes seed and the secret, reformats the code, and clang-tidy fixes. 
namespace detail::wyhash { inline void mum(uint64_t* a, uint64_t* b) { # if defined(__SIZEOF_INT128__) __uint128_t r = *a; r *= *b; *a = static_cast(r); *b = static_cast(r >> 64U); # elif defined(_MSC_VER) && defined(_M_X64) *a = _umul128(*a, *b, b); # else uint64_t ha = *a >> 32U; uint64_t hb = *b >> 32U; uint64_t la = static_cast(*a); uint64_t lb = static_cast(*b); uint64_t hi{}; uint64_t lo{}; uint64_t rh = ha * hb; uint64_t rm0 = ha * lb; uint64_t rm1 = hb * la; uint64_t rl = la * lb; uint64_t t = rl + (rm0 << 32U); auto c = static_cast(t < rl); lo = t + (rm1 << 32U); c += static_cast(lo < t); hi = rh + (rm0 >> 32U) + (rm1 >> 32U) + c; *a = lo; *b = hi; # endif } // multiply and xor mix function, aka MUM [[nodiscard]] inline auto mix(uint64_t a, uint64_t b) -> uint64_t { mum(&a, &b); return a ^ b; } // read functions. WARNING: we don't care about endianness, so results are different on big endian! [[nodiscard]] inline auto r8(const uint8_t* p) -> uint64_t { uint64_t v{}; std::memcpy(&v, p, 8U); return v; } [[nodiscard]] inline auto r4(const uint8_t* p) -> uint64_t { uint32_t v{}; std::memcpy(&v, p, 4); return v; } // reads 1, 2, or 3 bytes [[nodiscard]] inline auto r3(const uint8_t* p, size_t k) -> uint64_t { return (static_cast(p[0]) << 16U) | (static_cast(p[k >> 1U]) << 8U) | p[k - 1]; } [[maybe_unused]] [[nodiscard]] inline auto hash(void const* key, size_t len) -> uint64_t { static constexpr auto secret = std::array{UINT64_C(0xa0761d6478bd642f), UINT64_C(0xe7037ed1a0b428db), UINT64_C(0x8ebc6af09c88c6e3), UINT64_C(0x589965cc75374cc3)}; auto const* p = static_cast(key); uint64_t seed = secret[0]; uint64_t a{}; uint64_t b{}; if (ANKERL_UNORDERED_DENSE_LIKELY(len <= 16)) { if (ANKERL_UNORDERED_DENSE_LIKELY(len >= 4)) { a = (r4(p) << 32U) | r4(p + ((len >> 3U) << 2U)); b = (r4(p + len - 4) << 32U) | r4(p + len - 4 - ((len >> 3U) << 2U)); } else if (ANKERL_UNORDERED_DENSE_LIKELY(len > 0)) { a = r3(p, len); b = 0; } else { a = 0; b = 0; } } else { size_t i = len; 
if (ANKERL_UNORDERED_DENSE_UNLIKELY(i > 48)) { uint64_t see1 = seed; uint64_t see2 = seed; do { seed = mix(r8(p) ^ secret[1], r8(p + 8) ^ seed); see1 = mix(r8(p + 16) ^ secret[2], r8(p + 24) ^ see1); see2 = mix(r8(p + 32) ^ secret[3], r8(p + 40) ^ see2); p += 48; i -= 48; } while (ANKERL_UNORDERED_DENSE_LIKELY(i > 48)); seed ^= see1 ^ see2; } while (ANKERL_UNORDERED_DENSE_UNLIKELY(i > 16)) { seed = mix(r8(p) ^ secret[1], r8(p + 8) ^ seed); i -= 16; p += 16; } a = r8(p + i - 16); b = r8(p + i - 8); } return mix(secret[1] ^ len, mix(a ^ secret[1], b ^ seed)); } [[nodiscard]] inline auto hash(uint64_t x) -> uint64_t { return detail::wyhash::mix(x, UINT64_C(0x9E3779B97F4A7C15)); } } // namespace detail::wyhash ANKERL_UNORDERED_DENSE_EXPORT template struct hash { auto operator()(T const& obj) const noexcept(noexcept(std::declval>().operator()(std::declval()))) -> uint64_t { return std::hash{}(obj); } }; template struct hash::is_avalanching> { using is_avalanching = void; auto operator()(T const& obj) const noexcept(noexcept(std::declval>().operator()(std::declval()))) -> uint64_t { return std::hash{}(obj); } }; template struct hash> { using is_avalanching = void; auto operator()(std::basic_string const& str) const noexcept -> uint64_t { return detail::wyhash::hash(str.data(), sizeof(CharT) * str.size()); } }; template struct hash> { using is_avalanching = void; auto operator()(std::basic_string_view const& sv) const noexcept -> uint64_t { return detail::wyhash::hash(sv.data(), sizeof(CharT) * sv.size()); } }; template struct hash { using is_avalanching = void; auto operator()(T* ptr) const noexcept -> uint64_t { // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) return detail::wyhash::hash(reinterpret_cast(ptr)); } }; template struct hash> { using is_avalanching = void; auto operator()(std::unique_ptr const& ptr) const noexcept -> uint64_t { // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) return 
detail::wyhash::hash(reinterpret_cast(ptr.get())); } }; template struct hash> { using is_avalanching = void; auto operator()(std::shared_ptr const& ptr) const noexcept -> uint64_t { // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) return detail::wyhash::hash(reinterpret_cast(ptr.get())); } }; template struct hash::value>::type> { using is_avalanching = void; auto operator()(Enum e) const noexcept -> uint64_t { using underlying = typename std::underlying_type_t; return detail::wyhash::hash(static_cast(e)); } }; template struct tuple_hash_helper { // Converts the value into 64bit. If it is an integral type, just cast it. Mixing is doing the rest. // If it isn't an integral we need to hash it. template [[nodiscard]] constexpr static auto to64(Arg const& arg) -> uint64_t { if constexpr (std::is_integral_v || std::is_enum_v) { return static_cast(arg); } else { return hash{}(arg); } } [[nodiscard]] static auto mix64(uint64_t state, uint64_t v) -> uint64_t { return detail::wyhash::mix(state + v, uint64_t{0x9ddfea08eb382d69}); } // Creates a buffer that holds all the data from each element of the tuple. If possible we memcpy the data directly. If // not, we hash the object and use this for the array. Size of the array is known at compile time, and memcpy is optimized // away, so filling the buffer is highly efficient. Finally, call wyhash with this buffer. 
template [[nodiscard]] static auto calc_hash(T const& t, std::index_sequence) noexcept -> uint64_t { auto h = uint64_t{}; ((h = mix64(h, to64(std::get(t)))), ...); return h; } }; template struct hash> : tuple_hash_helper { using is_avalanching = void; auto operator()(std::tuple const& t) const noexcept -> uint64_t { return tuple_hash_helper::calc_hash(t, std::index_sequence_for{}); } }; template struct hash> : tuple_hash_helper { using is_avalanching = void; auto operator()(std::pair const& t) const noexcept -> uint64_t { return tuple_hash_helper::calc_hash(t, std::index_sequence_for{}); } }; // NOLINTNEXTLINE(cppcoreguidelines-macro-usage) # define ANKERL_UNORDERED_DENSE_HASH_STATICCAST(T) \ template <> \ struct hash { \ using is_avalanching = void; \ auto operator()(T const& obj) const noexcept -> uint64_t { \ return detail::wyhash::hash(static_cast(obj)); \ } \ } # if defined(__GNUC__) && !defined(__clang__) # pragma GCC diagnostic push # pragma GCC diagnostic ignored "-Wuseless-cast" # endif // see https://en.cppreference.com/w/cpp/utility/hash ANKERL_UNORDERED_DENSE_HASH_STATICCAST(bool); ANKERL_UNORDERED_DENSE_HASH_STATICCAST(char); ANKERL_UNORDERED_DENSE_HASH_STATICCAST(signed char); ANKERL_UNORDERED_DENSE_HASH_STATICCAST(unsigned char); # if ANKERL_UNORDERED_DENSE_CPP_VERSION >= 202002L && defined(__cpp_char8_t) ANKERL_UNORDERED_DENSE_HASH_STATICCAST(char8_t); # endif ANKERL_UNORDERED_DENSE_HASH_STATICCAST(char16_t); ANKERL_UNORDERED_DENSE_HASH_STATICCAST(char32_t); ANKERL_UNORDERED_DENSE_HASH_STATICCAST(wchar_t); ANKERL_UNORDERED_DENSE_HASH_STATICCAST(short); ANKERL_UNORDERED_DENSE_HASH_STATICCAST(unsigned short); ANKERL_UNORDERED_DENSE_HASH_STATICCAST(int); ANKERL_UNORDERED_DENSE_HASH_STATICCAST(unsigned int); ANKERL_UNORDERED_DENSE_HASH_STATICCAST(long); ANKERL_UNORDERED_DENSE_HASH_STATICCAST(long long); ANKERL_UNORDERED_DENSE_HASH_STATICCAST(unsigned long); ANKERL_UNORDERED_DENSE_HASH_STATICCAST(unsigned long long); # if defined(__GNUC__) && 
!defined(__clang__) # pragma GCC diagnostic pop # endif // bucket_type ////////////////////////////////////////////////////////// namespace bucket_type { struct standard { static constexpr uint32_t dist_inc = 1U << 8U; // skip 1 byte fingerprint static constexpr uint32_t fingerprint_mask = dist_inc - 1; // mask for 1 byte of fingerprint uint32_t m_dist_and_fingerprint; // upper 3 byte: distance to original bucket. lower byte: fingerprint from hash uint32_t m_value_idx; // index into the m_values vector. }; ANKERL_UNORDERED_DENSE_PACK(struct big { static constexpr uint32_t dist_inc = 1U << 8U; // skip 1 byte fingerprint static constexpr uint32_t fingerprint_mask = dist_inc - 1; // mask for 1 byte of fingerprint uint32_t m_dist_and_fingerprint; // upper 3 byte: distance to original bucket. lower byte: fingerprint from hash size_t m_value_idx; // index into the m_values vector. }); } // namespace bucket_type namespace detail { struct nonesuch {}; struct default_container_t {}; template class Op, class... Args> struct detector { using value_t = std::false_type; using type = Default; }; template class Op, class... Args> struct detector>, Op, Args...> { using value_t = std::true_type; using type = Op; }; template