Repository: hajimes/mmh3 Branch: master Commit: fd45e5ef5078 Files: 49 Total size: 283.8 KB Directory structure: gitextract_gkc_80ex/ ├── .clang-format ├── .github/ │ ├── actionlint.yml │ ├── dependabot.yml │ └── workflows/ │ ├── benchmark-base-hash.yml │ ├── benchmark.yml │ ├── build.yml │ ├── draft-pdf.yml │ ├── superlinter.yml │ └── wheels.yml ├── .gitignore ├── .gitmodules ├── .markdown-lint.yml ├── .readthedocs.yml ├── CHANGELOG.md ├── LICENSE ├── README.md ├── benchmark/ │ ├── benchmark.py │ ├── generate_table.py │ ├── plot_graph.py │ └── plot_graph_base_hash.py ├── docs/ │ ├── CODE_OF_CONDUCT.md │ ├── CONTRIBUTING.md │ ├── CONTRIBUTORS.md │ ├── Makefile │ ├── api.md │ ├── benchmark.md │ ├── changelog.md │ ├── conf.py │ ├── index.rst │ ├── make.bat │ └── quickstart.md ├── paper/ │ ├── paper.bib │ └── paper.md ├── pyproject.toml ├── src/ │ └── mmh3/ │ ├── __init__.pyi │ ├── hashlib.h │ ├── mmh3module.c │ ├── murmurhash3.c │ ├── murmurhash3.h │ └── py.typed ├── tests/ │ ├── helper.py │ ├── test_doctrings.py │ ├── test_free_threading.py │ ├── test_invalid_inputs.py │ ├── test_mmh3.py │ └── test_mmh3_hasher.py ├── tox.ini └── util/ ├── FILE_HEADER └── refresh.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .clang-format ================================================ # This .clang-format was originally written by Paul Ganssle # Its exact license is unknown but compatible with MIT # https://gist.github.com/pganssle/0e3a5f828b4d07d79447f6ced8e7e4db # A clang-format style that approximates Python's PEP 7 # Useful for IDE integration BasedOnStyle: Google AlwaysBreakAfterReturnType: All AllowShortIfStatementsOnASingleLine: false AlignAfterOpenBracket: Align BreakBeforeBraces: Stroustrup ColumnLimit: 79 DerivePointerAlignment: false IndentWidth: 4 Language: Cpp PointerAlignment: Right ReflowComments: true SpaceBeforeParens: ControlStatements 
SpacesInParentheses: false TabWidth: 4 UseTab: Never ================================================ FILE: .github/actionlint.yml ================================================ # As of March 5, 2026, actionlint via super-linter 8.5.0 does not support macOS 26, so we ignore the runner-label warning for now. paths: .github/workflows/**/*.{yml,yaml}: ignore: - 'label "macos-26" is unknown.+' - 'label "macos-26-intel" is unknown.+' ================================================ FILE: .github/dependabot.yml ================================================ version: 2 updates: - package-ecosystem: "github-actions" directory: "/" schedule: interval: "weekly" day: "monday" open-pull-requests-limit: 5 groups: dependencies: patterns: - "*" cooldown: default-days: 7 commit-message: prefix: "chore(ci)" - package-ecosystem: "pip" directory: "/" schedule: interval: "weekly" day: "monday" open-pull-requests-limit: 5 groups: dependencies: patterns: - "*" cooldown: default-days: 7 commit-message: prefix: "chore" include: "scope" ================================================ FILE: .github/workflows/benchmark-base-hash.yml ================================================ --- name: Benchmark Base Hash on: workflow_dispatch: permissions: {} jobs: benchmark: permissions: contents: read packages: read runs-on: ubuntu-24.04 env: BENCHMARK_MAX_SIZE: 65536 steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - name: Set up Python uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: python-version: "3.14" - name: Install dependencies run: | pip install --upgrade pip pip install . pip install ".[benchmark]" - name: Tune the system for benchmarking run: | echo "Running \"lscpu -a -e\"..." lscpu -a -e echo -n "Checking randomize_va_space: " cat /proc/sys/kernel/randomize_va_space echo "randomize_va_space should be 2, meaning ASLR is fully enabled." 
systemctl status irqbalance echo "Stopping irqbalance..." sudo systemctl stop irqbalance echo -n "Checking default_smp_affinity: " cat /proc/irq/default_smp_affinity echo 3 | sudo tee /proc/irq/default_smp_affinity > /dev/null echo -n "Updated default_smp_affinity to: " cat /proc/irq/default_smp_affinity echo -n "Checking perf_event_max_sample_rate: " cat /proc/sys/kernel/perf_event_max_sample_rate echo 1 | sudo tee /proc/sys/kernel/perf_event_max_sample_rate > /dev/null echo -n "Updated perf_event_max_sample_rate to: " cat /proc/sys/kernel/perf_event_max_sample_rate - name: Benchmark hash functions run: | mkdir var taskset -c 2,3 python benchmark/benchmark.py \ -o var/mmh3_base_hash_500.json \ --test-hash mmh3_base_hash \ --test-buffer-size-max "$BENCHMARK_MAX_SIZE" taskset -c 2,3 python benchmark/benchmark.py \ -o var/mmh3_32_500.json \ --test-hash mmh3_32 \ --test-buffer-size-max "$BENCHMARK_MAX_SIZE" pip uninstall -y mmh3 pip install mmh3==4.1.0 taskset -c 2,3 python benchmark/benchmark.py \ -o var/mmh3_base_hash_410.json \ --test-hash mmh3_base_hash \ --test-buffer-size-max "$BENCHMARK_MAX_SIZE" - name: Reset the system from benchmarking run: | echo -n "Checking perf_event_max_sample_rate: " cat /proc/sys/kernel/perf_event_max_sample_rate echo 100000 | sudo tee /proc/sys/kernel/perf_event_max_sample_rate > /dev/null echo -n "Updated perf_event_max_sample_rate to: " cat /proc/sys/kernel/perf_event_max_sample_rate echo -n "Checking default_smp_affinity: " cat /proc/irq/default_smp_affinity echo f | sudo tee /proc/irq/default_smp_affinity > /dev/null echo -n "Updated default_smp_affinity to: " cat /proc/irq/default_smp_affinity echo "Restarting irqbalance..." 
sudo systemctl restart irqbalance systemctl status irqbalance - name: Upload artifacts uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 with: name: benchmark-results path: var ================================================ FILE: .github/workflows/benchmark.yml ================================================ --- name: Benchmark on: workflow_dispatch: permissions: {} jobs: benchmark: permissions: contents: read packages: read runs-on: ubuntu-24.04 env: BENCHMARK_MAX_SIZE: 262144 steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - name: Set up Python uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: python-version: "3.14" - name: Install dependencies run: | pip install --upgrade pip pip install . pip install ".[benchmark]" - name: Tune the system for benchmarking run: | echo "Running \"lscpu -a -e\"..." lscpu -a -e echo -n "Checking randomize_va_space: " cat /proc/sys/kernel/randomize_va_space echo "randomize_va_space should be 2, meaning ASLR is fully enabled." systemctl status irqbalance echo "Stopping irqbalance..." 
sudo systemctl stop irqbalance echo -n "Checking default_smp_affinity: " cat /proc/irq/default_smp_affinity echo 3 | sudo tee /proc/irq/default_smp_affinity > /dev/null echo -n "Updated default_smp_affinity to: " cat /proc/irq/default_smp_affinity echo -n "Checking perf_event_max_sample_rate: " cat /proc/sys/kernel/perf_event_max_sample_rate echo 1 | sudo tee /proc/sys/kernel/perf_event_max_sample_rate > /dev/null echo -n "Updated perf_event_max_sample_rate to: " cat /proc/sys/kernel/perf_event_max_sample_rate - name: Benchmark hash functions run: | mkdir var declare -a hash_list=("mmh3_32" "mmh3_128" "xxh_32" "xxh_64" \ "xxh3_64" "xxh3_128" "md5" "sha1") for hash_name in "${hash_list[@]}"; do echo "${hash_name}" taskset -c 2,3 python benchmark/benchmark.py \ -o var/"${hash_name}".json \ --test-hash "${hash_name}" \ --test-buffer-size-max "$BENCHMARK_MAX_SIZE" done - name: Reset the system from benchmarking run: | echo -n "Checking perf_event_max_sample_rate: " cat /proc/sys/kernel/perf_event_max_sample_rate echo 100000 | sudo tee /proc/sys/kernel/perf_event_max_sample_rate > /dev/null echo -n "Updated perf_event_max_sample_rate to: " cat /proc/sys/kernel/perf_event_max_sample_rate echo -n "Checking default_smp_affinity: " cat /proc/irq/default_smp_affinity echo f | sudo tee /proc/irq/default_smp_affinity > /dev/null echo -n "Updated default_smp_affinity to: " cat /proc/irq/default_smp_affinity echo "Restarting irqbalance..." sudo systemctl restart irqbalance systemctl status irqbalance - name: Upload artifacts uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 with: name: benchmark-results path: var ================================================ FILE: .github/workflows/build.yml ================================================ # This workflow is intended for quick building tests. # Use wheels.yml for complete building/uploading tests. 
--- name: Build on: # yamllint disable-line rule:truthy push: branches: "**" pull_request: types: - opened - synchronize - reopened permissions: {} jobs: build: permissions: contents: read packages: read strategy: matrix: os: [macos-26, windows-2025, ubuntu-24.04] python-version: ["3.10", "3.11", "3.12", "3.13", "3.14", "3.14t"] runs-on: ${{ matrix.os }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip pip install setuptools build pip install . pip install ".[test,type]" - name: Test with pytest run: python -m pytest - name: Test type hints with mypy run: mypy --strict tests - name: Test building from the source distribution shell: bash run: | pip uninstall -y mmh3 python -m build --sdist python -m pip install dist/*.tar.gz python -m pytest mypy --strict tests ================================================ FILE: .github/workflows/draft-pdf.yml ================================================ --- name: Draft Paper on: push: branches: - paper workflow_dispatch: permissions: {} jobs: paper: permissions: contents: read packages: read runs-on: ubuntu-latest name: Paper Draft if: github.event_name == 'workflow_dispatch' && github.ref == 'refs/heads/master' steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - name: Build draft PDF uses: openjournals/openjournals-draft-action@85a18372e48f551d8af9ddb7a747de685fbbb01c # v1.0 with: journal: joss # This should be the path to the paper within your repo. 
paper-path: paper/paper.md - name: Upload uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 with: name: paper # This is the output path where Pandoc will write the compiled # PDF. Note, this should be the same directory as the input # paper.md path: paper/paper.pdf ================================================ FILE: .github/workflows/superlinter.yml ================================================ --- name: Super-Linter on: # yamllint disable-line rule:truthy push: branches: "**" pull_request: types: - opened - synchronize - reopened permissions: {} jobs: # Set the job key. The key is displayed as the job name # when a job name is not provided super-lint: # Name the Job name: Lint code base # Set the type of machine to run on runs-on: ubuntu-latest permissions: contents: read packages: read # To report GitHub Actions status checks statuses: write steps: # Checks out a copy of your repository on the ubuntu-latest machine - name: Checkout code uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false # super-linter needs the full git history to get the # list of files that changed across commits fetch-depth: 0 # Runs the Super-Linter action - name: Run Super-Linter uses: super-linter/super-linter@61abc07d755095a68f4987d1c2c3d1d64408f1f9 # v8.5.0 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} LINTER_RULES_PATH: / GITHUB_ACTIONS_CONFIG_FILE: .github/actionlint.yml PYTHON_PYLINT_CONFIG_FILE: pyproject.toml PYTHON_RUFF_CONFIG_FILE: pyproject.toml PYTHON_RUFF_FORMAT_CONFIG_FILE: pyproject.toml # Suppressed because it conflicts with clang-format in some cases VALIDATE_CPP: false # Suppressed because copy/paste is sometimes required at low level VALIDATE_JSCPD: false # Suppressed in favor of Ruff VALIDATE_PYTHON_BLACK: false VALIDATE_PYTHON_FLAKE8: false VALIDATE_PYTHON_ISORT: false # Suppressed because it even accuses book titles VALIDATE_NATURAL_LANGUAGE: false # Suppressed because it does 
not honor the ignore-paths option VALIDATE_PYTHON_PYLINT: false # super-linter 7 does not honor the ignore-paths option of pylint # so we run pylint separately - name: Set up Python uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: python-version: "3.14" - name: Run pylint run: | pip install pylint pylint --recursive=y . ================================================ FILE: .github/workflows/wheels.yml ================================================ --- name: Wheel-Builder on: push: tags: - "v*.*.*" workflow_dispatch: permissions: {} jobs: build_wheels: name: Build wheel for ${{ matrix.platform }} ${{ matrix.archs }} ${{ matrix.build }} (runs on ${{ matrix.os }}) runs-on: ${{ matrix.os }} strategy: matrix: os: [ubuntu-24.04] archs: [x86_64, i686, aarch64, ppc64le, s390x] build: [manylinux, musllinux] include: - os: windows-2025 archs: AMD64 - os: windows-2025 archs: x86 - os: windows-2025 archs: ARM64 - os: macos-26-intel archs: x86_64 - os: macos-26 archs: arm64 - os: macos-26 archs: universal2 - os: ubuntu-24.04 platform: android archs: x86_64 build: android - os: macos-26 platform: android archs: arm64_v8a build: android - os: macos-26 platform: ios archs: arm64_iphoneos - os: macos-26 platform: ios archs: arm64_iphonesimulator - os: macos-26-intel platform: ios archs: x86_64_iphonesimulator steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - name: Set up Python uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: python-version: "3.14" - name: Set dev version for TestPyPI if: github.event_name == 'workflow_dispatch' shell: python run: | import re, datetime timestamp = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%d%H%M") text = open("pyproject.toml", encoding="utf-8").read() m = re.search(r'version\s*=\s*"(.+?)"', text) if not m: raise RuntimeError("version field not found in pyproject.toml") version = 
m.group(1) base_version = version.split("-")[0] new_text = re.sub( r'version\s*=\s*".*?"', f'version = "{base_version}.dev{timestamp}"', text, count=1 ) open("pyproject.toml", "w", encoding="utf-8").write(new_text) - name: Set up QEMU if: runner.os == 'Linux' && matrix.platform != 'android' uses: docker/setup-qemu-action@c7c53464625b32c7a7e944ae62b3e17d2b600130 # v3.7.0 # https://github.blog/changelog/2024-04-02-github-actions-hardware-accelerated-android-virtualization-now-available/ - name: Set up KVM for Android emulation if: runner.os == 'Linux' && matrix.platform == 'android' run: | echo 'KERNEL=="kvm", GROUP="kvm", MODE="0666", OPTIONS+="static_node=kvm"' | sudo tee /etc/udev/rules.d/99-kvm4all.rules sudo udevadm control --reload-rules sudo udevadm trigger --name-match=kvm - name: Build wheels uses: pypa/cibuildwheel@ee02a1537ce3071a004a6b08c41e72f0fdc42d9a # v3.4.0 with: output-dir: wheelhouse env: CIBW_BUILD: "{cp310,cp311,cp312,cp313,cp314,cp314t}-${{ matrix.build }}*" CIBW_PLATFORM: ${{ matrix.platform || 'auto' }} CIBW_ARCHS: ${{ matrix.archs }} CIBW_BUILD_FRONTEND: "build" CIBW_TEST_REQUIRES: "pytest" CIBW_TEST_SOURCES_ANDROID: "./tests" CIBW_TEST_SOURCES_IOS: "./tests" CIBW_TEST_COMMAND: "pytest {project}" CIBW_TEST_COMMAND_ANDROID: "python -m pytest ./tests" CIBW_TEST_COMMAND_IOS: "python -m pytest ./tests" CIBW_TEST_SKIP: "*-win_arm64 *-android_arm64_v8a" - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 with: name: Wheel-${{ matrix.os }}-${{ matrix.platform }}-${{ matrix.build }}${{ matrix.archs }} path: ./wheelhouse/*.whl build_sdist: name: Build a source distribution runs-on: ubuntu-24.04 steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - name: Set up Python uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: python-version: "3.14" - name: Set dev version for TestPyPI if: github.event_name == 
'workflow_dispatch' shell: python run: | import re, datetime timestamp = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%d%H%M") text = open("pyproject.toml", encoding="utf-8").read() m = re.search(r'version\s*=\s*"(.+?)"', text) if not m: raise RuntimeError("version field not found in pyproject.toml") version = m.group(1) base_version = version.split("-")[0] new_text = re.sub( r'version\s*=\s*".*?"', f'version = "{base_version}.dev{timestamp}"', text, count=1 ) open("pyproject.toml", "w", encoding="utf-8").write(new_text) - name: Build sdist run: | python -m pip install --upgrade pip pip install setuptools build python -m build --sdist - name: Test building from the source distribution shell: bash run: | pip install ".[test,type]" pip uninstall -y mmh3 python -m pip install dist/*.tar.gz python -m pytest mypy --strict tests - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 with: path: dist/*.tar.gz publish-to-pypi: name: "Publish artifacts to PyPI" if: startsWith(github.ref, 'refs/tags/') needs: [build_wheels, build_sdist] runs-on: ubuntu-24.04 environment: name: pypi url: https://pypi.org/p/mmh3 permissions: id-token: write # IMPORTANT: this permission is mandatory for trusted publishing steps: - name: Set up built items uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0 with: path: dist merge-multiple: true - name: Publish package distributions to PyPI uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # v1.13.0 publish-to-testpypi: name: "Publish artifacts to TestPyPI" if: github.event_name == 'workflow_dispatch' && github.ref == 'refs/heads/master' needs: [build_wheels, build_sdist] runs-on: ubuntu-24.04 environment: name: testpypi url: https://test.pypi.org/p/mmh3 permissions: id-token: write # IMPORTANT: this permission is mandatory for trusted publishing steps: - name: Set up built items uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 
# v8.0.0 with: path: dist merge-multiple: true - name: Publish package distributions to TestPyPI uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # v1.13.0 with: repository-url: https://test.pypi.org/legacy/ ================================================ FILE: .gitignore ================================================ # From https://github.com/github/gitignore # CC0 1.0 Universal # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # pytest .pytest_cache/ # mypy .mypy_cache/ # macOS .DS_Store # vscode .vscode/ # Directory from an external source created by git submodule update --init # Adding this path is useful for other tools like markdwonlint and prettier util/smhasher/ ================================================ FILE: .gitmodules ================================================ [submodule "src/mmh3/_mmh3/smhasher"] path = util/smhasher url = https://github.com/aappleby/smhasher.git 
================================================ FILE: .markdown-lint.yml ================================================ # MD024/no-duplicate-heading : Multiple headings with the same content : https://github.com/DavidAnson/markdownlint/blob/v0.34.0/doc/md024.md MD024: # Only check sibling headings (default is false) # Set to true to conform to the Keep a Changelog format # See also https://github.com/olivierlacan/keep-a-changelog/issues/274#issuecomment-484065486 siblings_only: true ================================================ FILE: .readthedocs.yml ================================================ # Read the Docs configuration file for MkDocs projects # See https://docs.readthedocs.io/en/stable/config-file/v2.html for detail # Required version: 2 # Set the version of Python and other tools you might need build: os: ubuntu-22.04 tools: python: "3.12" sphinx: builder: html configuration: docs/conf.py # Build all formats formats: - pdf - epub # Optionally declare the Python requirements required to build your docs python: install: - method: pip path: . extra_requirements: - docs ================================================ FILE: CHANGELOG.md ================================================ # Changelog All notable changes to this project will be documented here. For a list of contributors, see the [Contributors](https://mmh3.readthedocs.io/en/stable/CONTRIBUTORS.html) page. The format is based on [Keep a Changelog 1.1.0](https://keepachangelog.com/en/1.1.0/). This project has adhered to [Semantic Versioning 2.0.0](https://semver.org/spec/v2.0.0.html) since version 3.0.0. ## [5.2.1] - 2026-03-06 ### Added - Add support for the Android wheel for Python 3.14. ### Removed - Drop support for Python 3.9, as it has reached the end of life on 2025-10-31. ## [5.2.0] - 2025-07-29 ### Added - Add support for Python 3.14, including 3.14t (no-GIL) wheels. However, thread safety for the no-GIL variant is not fully tested yet. 
Please report any issues you encounter ([#134](https://github.com/hajimes/mmh3/pull/134), [#136](https://github.com/hajimes/mmh3/pull/136)). - Add support for Android (Python 3.13 only) and iOS (Python 3.13 and 3.14) wheels, enabled by the major version update of [cibuildwheel](https://github.com/pypa/cibuildwheel) ([#135](https://github.com/hajimes/mmh3/pull/135)). ## [5.1.0] - 2025-01-25 ### Added - Improve the performance of `hash128()`, `hash64()`, and `hash_bytes()` by using [METH_FASTCALL](https://docs.python.org/3/c-api/structures.html#c.METH_FASTCALL), reducing the overhead of function calls ([#116](https://github.com/hajimes/mmh3/pull/116)). - Add the software paper for this library ([doi:10.21105/joss.06124](https://doi.org/10.21105/joss.06124)), following its publication in the [_Journal of Open Source Software_](https://joss.theoj.org) ([#118](https://github.com/hajimes/mmh3/pull/118)). ### Removed - Drop support for Python 3.8, as it has reached the end of life on 2024-10-07 ([#117](https://github.com/hajimes/mmh3/pull/117)). ## [5.0.1] - 2024-09-22 ### Fixed - Fix the issue that the package cannot be built from the source distribution ([#90](https://github.com/hajimes/mmh3/issues/90)). ## [5.0.0] - 2024-09-18 ### Added - Add support for Python 3.13. - Improve the performance of the `hash()` function with [METH_FASTCALL](https://docs.python.org/3/c-api/structures.html#c.METH_FASTCALL), reducing the overhead of function calls. For data sizes between 1–2 KB (e.g., 48x48 favicons), performance is 10%–20% faster. For smaller data (~500 bytes, like 16x16 favicons), performance increases by approximately 30% ([#87](https://github.com/hajimes/mmh3/pull/87)). - Add `digest` functions that support the new buffer protocol ([PEP 688](https://peps.python.org/pep-0688/)) as input ([#75](https://github.com/hajimes/mmh3/pull/75)). These functions are implemented with `METH_FASTCALL` too, offering improved performance ([#84](https://github.com/hajimes/mmh3/pull/84)). 
- Slightly improve the performance of the `hash_bytes()` function ([#88](https://github.com/hajimes/mmh3/pull/88)) - Add Read the Docs documentation ([#54](https://github.com/hajimes/mmh3/issues/54)). - Document benchmark results ([#53](https://github.com/hajimes/mmh3/issues/53)). ### Changed - **Backward-incompatible**: The `seed` argument is now strictly validated to ensure it falls within the range [0, 0xFFFFFFFF]. A `ValueError` is raised if the seed is out of range ([#84](https://github.com/hajimes/mmh3/pull/84)). - **Backward-incompatible**: Change the constructors of hasher classes to accept a buffer as the first argument ([#83](https://github.com/hajimes/mmh3/pull/83)). - The type of flag arguments has been changed from `bool` to `Any` ([#84](https://github.com/hajimes/mmh3/pull/84)). - Change the format of CHANGELOG.md to conform to the [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) standard ([#63](https://github.com/hajimes/mmh3/pull/63)). ### Deprecated - Deprecate the `hash_from_buffer()` function. Use `mmh3_32_sintdigest()` or `mmh3_32_uintdigest()` as alternatives ([#84](https://github.com/hajimes/mmh3/pull/84)). ### Fixed - Fix a reference leak in the `hash_from_buffer()` function ([#75](https://github.com/hajimes/mmh3/pull/75)). - Fix type hints ([#76](https://github.com/hajimes/mmh3/pull/76), [#77](https://github.com/hajimes/mmh3/pull/77), [#84](https://github.com/hajimes/mmh3/pull/84)). ## [4.1.0] - 2024-01-09 ### Added - Add support for Python 3.12. ### Fixed - Fix issues with Bazel by changing the directory structure of the project ([#50](https://github.com/hajimes/mmh3/issues/50)). - Fix incorrect type hints ([#51](https://github.com/hajimes/mmh3/issues/51)). - Fix invalid results on s390x when the arg `x64arch` of `hash64` or `hash_bytes()` is set to `False` ([#52](https://github.com/hajimes/mmh3/issues/52)). ## [4.0.1] - 2023-07-14 ### Changed - Refactor the project structure ([#48](https://github.com/hajimes/mmh3/issues/48)). 
### Fixed - Fix incorrect type hints. ## [4.0.0] - 2023-05-22 ### Added - Add experimental support for `hashlib`-compliant hasher classes ([#39](https://github.com/hajimes/mmh3/issues/39)). Note that they are not yet fully tuned for performance. - Add support for type hints ([#44](https://github.com/hajimes/mmh3/issues/44)). - Add wheels for more platforms (`musllinux`, `s390x`, `win_arm64`, and `macosx_universal2`). - Add a code of conduct (the ACM Code of Ethics and Professional Conduct). ### Changed - Switch license from CC0 to MIT ([#43](https://github.com/hajimes/mmh3/issues/43)). ### Removed - **Backward-incompatible**: A hash function now returns the same value under big-endian platforms as that under little-endian ones ([#47](https://github.com/hajimes/mmh3/issues/47)). - **Backward-incompatible**: Remove the `__version__` constant from the module ([#42](https://github.com/hajimes/mmh3/issues/42)). Use `importlib.metadata` instead. - Drop support for Python 3.7, as it will reach the end of life on 2023-06-27. ## [3.1.0] - 2023-03-24 ### Added - Add support for Python 3.10 and 3.11 ([#35](https://github.com/hajimes/mmh3/pull/35), [#37](https://github.com/hajimes/mmh3/pull/37)). - Add support for 32-bit architectures such as `i686` and `armv7l`. From now on, `hash()` and `hash_from_buffer()` on these architectures will generate the same hash values as those on other environments ([#40](https://github.com/hajimes/mmh3/pull/40)). - In relation to the above, `manylinux2014_i686` wheels are now available. - Support for hashing huge data (>16GB) ([#34](https://github.com/hajimes/mmh3/pull/34)). ### Removed - Drop support for Python 3.6; remove legacy code for Python 2.x at the source code level. ## [3.0.0] - 2021-02-23 ### Added - Python wheels are now available, thanks to the power of [cibuildwheel](https://github.com/joerick/cibuildwheel). 
- Supported platforms are `manylinux1_x86_64`, `manylinux2010_x86_64`, `manylinux2014_aarch64`, `win32`, `win_amd64`, `macosx_10_9_x86_64`, and `macosx_11_0_arm64` (Apple Silicon). - Add support for newer macOS environments ([#22](https://github.com/hajimes/mmh3/pull/22)). - Add support for Python 3.7, 3.8, and 3.9. ### Changed - Migrate CI from Travis CI and AppVeyor to GitHub Actions. ### Removed - Drop support for Python 2.7, 3.3, 3.4, and 3.5. ## [2.5.1] - 2017-10-31 ### Fixed - Bugfix for `hash_bytes()` ([#15](https://github.com/hajimes/mmh3/pull/15)). ## [2.5] - 2017-10-28 ### Added - Add `hash_from_buffer()` ([#13](https://github.com/hajimes/mmh3/pull/13)). - Add a keyword argument `signed`. ## [2.4] - 2017-05-27 ### Added - Support seeds with 32-bit unsigned integers ([#6](https://github.com/hajimes/mmh3/pull/6)). - Support 64-bit data (under 64-bit environments) - Add unit testing and continuous integration with Travis CI and AppVeyor. ### Fixed - Fix compile errors for Python 3.6 under Windows systems. ## [2.3.2] - 2017-05-26 ### Changed - Relicensed from public domain to CC0-1.0. ## [2.3.1] - 2015-06-07 ### Fixed - Fix compile errors for gcc >=5. ## [2.3] - 2013-12-08 ### Added - Add `hash128()`, which returns a 128-bit signed integer ([#3](https://github.com/hajimes/mmh3/pull/3)). ### Fixed - Fix a misplaced operator which could cause memory leak in a rare condition ([#2](https://github.com/hajimes/mmh3/pull/2)). - Fix a malformed value to a Python/C API function which may cause runtime errors in recent Python 3.x versions. ## [2.2] - 2013-03-03 ### Added - Improve portability to support systems with old gcc (version < 4.4) such as CentOS/RHEL 5.x ([#1](https://github.com/hajimes/mmh3/pull/1)). ## [2.1] - 2013-02-25 ### Added - Add `__version__` constant. Check if it exists when the following revision matters for your application. ### Changed - Incorporate the revision r147, which includes robustness improvement and minor tweaks. 
Beware that due to this revision, **the result of the 32-bit version of 2.1 is NOT the same as that of 2.0**.
https://github.com/hajimes/mmh3/releases/tag/v2.0 [1.0]: https://web.archive.org/web/20110430172027/https://linux.softpedia.com/get/Programming/Libraries/mmh3-68314.shtml ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2011-2026 Hajime Senuma Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
================================================ FILE: README.md ================================================ # mmh3 [![Documentation Status](https://readthedocs.org/projects/mmh3/badge/?version=stable)](https://mmh3.readthedocs.io/en/stable/) [![GitHub Super-Linter](https://github.com/hajimes/mmh3/actions/workflows/superlinter.yml/badge.svg?branch=master)](https://github.com/hajimes/mmh3/actions?query=workflow%3ASuper-Linter+branch%3Amaster) [![Build](https://github.com/hajimes/mmh3/actions/workflows/build.yml/badge.svg?branch=master)](https://github.com/hajimes/mmh3/actions/workflows/build.yml?branch=master) [![PyPi Version](https://img.shields.io/pypi/v/mmh3.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.org/project/mmh3/) [![Python Versions](https://img.shields.io/pypi/pyversions/mmh3.svg)](https://pypi.org/project/mmh3/) [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](https://opensource.org/license/mit/) [![Total Downloads](https://static.pepy.tech/badge/mmh3)](https://pepy.tech/projects/mmh3?versions=*%2C5.*%2C4.*%2C3.*%2C2.*) [![Recent Downloads](https://static.pepy.tech/badge/mmh3/month)](https://pepy.tech/projects/mmh3?versions=*%2C5.*%2C4.*%2C3.*%2C2.*) [![DOI](https://joss.theoj.org/papers/10.21105/joss.06124/status.svg)](https://doi.org/10.21105/joss.06124) `mmh3` is a Python extension for [MurmurHash (MurmurHash3)](https://en.wikipedia.org/wiki/MurmurHash), a set of fast and robust non-cryptographic hash functions invented by Austin Appleby. By combining `mmh3` with probabilistic techniques like [Bloom filter](https://en.wikipedia.org/wiki/Bloom_filter), [MinHash](https://en.wikipedia.org/wiki/MinHash), and [feature hashing](https://en.wikipedia.org/wiki/Feature_hashing), you can develop high-performance systems in fields such as data mining, machine learning, and natural language processing. 
Another popular use of `mmh3` is to [calculate favicon hashes](https://gist.github.com/yehgdotnet/b9dfc618108d2f05845c4d8e28c5fc6a), which are utilized by [Shodan](https://www.shodan.io), the world's first IoT search engine. This page provides a quick start guide. For more comprehensive information, please refer to the [documentation](https://mmh3.readthedocs.io/en/stable/). ## Installation ```shell pip install mmh3 ``` ## Usage ### Basic usage ```pycon >>> import mmh3 >>> mmh3.hash(b"foo") # returns a 32-bit signed int -156908512 >>> mmh3.hash("foo") # accepts str (UTF-8 encoded) -156908512 >>> mmh3.hash(b"foo", 42) # uses 42 as the seed -1322301282 >>> mmh3.hash(b"foo", 0, False) # returns a 32-bit unsigned int 4138058784 ``` `mmh3.mmh3_x64_128_digest()`, introduced in version 5.0.0, efficienlty hashes buffer objects that implement the buffer protocol ([PEP 688](https://peps.python.org/pep-0688/)) without internal memory copying. The function returns a `bytes` object of 16 bytes (128 bits). It is particularly suited for hashing large memory views, such as `bytearray`, `memoryview`, and `numpy.ndarray`, and performs faster than the 32-bit variants like `hash()` on 64-bit machines. ```pycon >>> mmh3.mmh3_x64_128_digest(numpy.random.rand(100)) b'\x8c\xee\xc6z\xa9\xfeR\xe8o\x9a\x9b\x17u\xbe\xdc\xee' ``` Various alternatives are available, offering different return types (e.g., signed integers, tuples of unsigned integers) and optimized for different architectures. For a comprehensive list of functions, refer to the [API Reference](https://mmh3.readthedocs.io/en/stable/api.html). ### `hashlib`-style hashers `mmh3` implements hasher objects with interfaces similar to those in `hashlib` from the standard library, although they are still experimental. See [Hasher Classes](https://mmh3.readthedocs.io/en/stable/api.html#hasher-classes) in the API Reference for more information. 
## Changelog See [Changelog (latest version)](https://mmh3.readthedocs.io/en/latest/changelog.html) for the complete changelog. ### [5.2.1] - 2026-03-06 #### Added - Add support for the Android wheel for Python 3.14. #### Removed - Drop support for Python 3.9, as it has reached the end of life on 2025-10-31. ### [5.2.0] - 2025-07-29 #### Added - Add support for Python 3.14, including 3.14t (no-GIL) wheels. However, thread safety for the no-GIL variant is not fully tested yet. Please report any issues you encounter ([#134](https://github.com/hajimes/mmh3/pull/134), [#136](https://github.com/hajimes/mmh3/pull/136)). - Add support for Android (Python 3.13 only) and iOS (Python 3.13 and 3.14) wheels, enabled by the major version update of [cibuildwheel](https://github.com/pypa/cibuildwheel) ([#135](https://github.com/hajimes/mmh3/pull/135)). ### [5.1.0] - 2025-01-25 #### Added - Improve the performance of `hash128()`, `hash64()`, and `hash_bytes()` by using [METH_FASTCALL](https://docs.python.org/3/c-api/structures.html#c.METH_FASTCALL), reducing the overhead of function calls ([#116](https://github.com/hajimes/mmh3/pull/116)). - Add the software paper for this library ([doi:10.21105/joss.06124](https://doi.org/10.21105/joss.06124)), following its publication in the [_Journal of Open Source Software_](https://joss.theoj.org) ([#118](https://github.com/hajimes/mmh3/pull/118)). #### Removed - Drop support for Python 3.8, as it has reached the end of life on 2024-10-07 ([#117](https://github.com/hajimes/mmh3/pull/117)). ## License [MIT](https://github.com/hajimes/mmh3/blob/master/LICENSE), unless otherwise noted within a file. ## Frequently Asked Questions ### Different results from other MurmurHash3-based libraries By default, `mmh3` returns **signed** values for the 32-bit and 64-bit versions and **unsigned** values for `hash128` due to historical reasons. To get the desired result, use the `signed` keyword argument. 
Starting from version 4.0.0, **`mmh3` is endian-neutral**, meaning that its hash functions return the same values on big-endian platforms as they do on little-endian ones. In contrast, the original C++ library by Appleby is endian-sensitive. If you need results that comply with the original library on big-endian systems, please use version 3.\*. For compatibility with [Google Guava (Java)](https://github.com/google/guava), see . For compatibility with [murmur3 (Go)](https://pkg.go.dev/github.com/spaolacci/murmur3), see . ### Handling errors with negative seeds From the version 5.0.0, `mmh3` functions accept only **unsigned** 32-bit integer seeds to enable faster type-checking and conversion. However, this change may cause issues if you need to calculate hash values using negative seeds within the range of signed 32-bit integers. For instance, [Telegram-iOS](https://github.com/TelegramMessenger/Telegram-iOS) uses `-137723950` as a hard-coded seed (bitwise equivalent to `4157243346`). To handle such cases, you can convert a signed 32-bit integer to its unsigned equivalent by applying a bitwise AND operation with `0xffffffff`. Here's an example: ```pycon >>> mmh3.hash(b"quux", 4294967295) 258499980 >>> d = -1 >>> mmh3.hash(b"quux", d & 0xffffffff) 258499980 ``` Alternatively, if the seed is hard-coded (as in the Telegram-iOS case), you can precompute the unsigned value for simplicity. ## Contributing Guidelines See [Contributing](https://mmh3.readthedocs.io/en/stable/CONTRIBUTING.html). ## Authors MurmurHash3 was originally developed by Austin Appleby and distributed under public domain [https://github.com/aappleby/smhasher](https://github.com/aappleby/smhasher). Ported and modified for Python by Hajime Senuma. ## External Tutorials ### High-performance computing The following textbooks and tutorials are great resources for learning how to use `mmh3` (and other hash algorithms in general) for high-performance computing. 
- Chapter 11: _Using Less Ram_ in Micha Gorelick and Ian Ozsvald. 2014. _High Performance Python: Practical Performant Programming for Humans_. O'Reilly Media. [ISBN: 978-1-4493-6159-4](https://www.amazon.com/dp/1449361595). - 3rd edition of the above (2025). [ISBN: 978-1098165963](https://www.amazon.com/dp/1098165969/). - Max Burstein. February 2, 2013. _[Creating a Simple Bloom Filter](http://www.maxburstein.com/blog/creating-a-simple-bloom-filter/)_. - Duke University. April 14, 2016. _[Efficient storage of data in memory](http://people.duke.edu/~ccc14/sta-663-2016/20B_Big_Data_Structures.html)_. - Bugra Akyildiz. August 24, 2016. _[A Gentle Introduction to Bloom Filter](https://www.kdnuggets.com/2016/08/gentle-introduction-bloom-filter.html)_. KDnuggets. ### Internet of things [Shodan](https://www.shodan.io), the world's first [IoT](https://en.wikipedia.org/wiki/Internet_of_things) search engine, uses MurmurHash3 hash values for [favicons](https://en.wikipedia.org/wiki/Favicon) (icons associated with web pages). [ZoomEye](https://www.zoomeye.org) follows Shodan's convention. [Calculating these values with mmh3](https://gist.github.com/yehgdotnet/b9dfc618108d2f05845c4d8e28c5fc6a) is useful for OSINT and cybersecurity activities. - Jan Kopriva. April 19, 2021. _[Hunting phishing websites with favicon hashes](https://isc.sans.edu/diary/Hunting+phishing+websites+with+favicon+hashes/27326)_. SANS Internet Storm Center. - Nikhil Panwar. May 2, 2022. _[Using Favicons to Discover Phishing & Brand Impersonation Websites](https://bolster.ai/blog/how-to-use-favicons-to-find-phishing-websites)_. Bolster. - Faradaysec. July 25, 2022. _[Understanding Spring4Shell: How used is it?](https://faradaysec.com/understanding-spring4shell/)_. Faraday Security. - Debjeet. August 2, 2022. _[How To Find Assets Using Favicon Hashes](https://payatu.com/blog/favicon-hash/)_. Payatu. 
"""Benchmark module for various hash functions."""

import itertools
import math
import random
import time
from collections.abc import Callable
from typing import Final

K1: Final[int] = 0b1001111000110111011110011011000110000101111010111100101010000111
K2: Final[int] = 0b1100001010110010101011100011110100100111110101001110101101001111
MASK: Final[int] = 0xFFFFFFFFFFFFFFFF


def init_buffer(ba: bytearray) -> bytearray:
    """Initializes a byte array with a deterministic pseudo-random pattern.

    The pattern follows the one used by xxHash's benchmarking code:
    https://github.com/Cyan4973/xxHash/blob/dev/tests/bench/benchHash.c

    Args:
        ba: The byte array to initialize.

    Returns:
        The same byte array, filled in place.
    """
    state = K2
    for idx, _ in enumerate(ba):
        state = (state * K1) & MASK
        # Keep only the most significant byte of the 64-bit state.
        ba[idx] = state >> 56
    return ba


def generate_size(size: int, p: float) -> int:
    """Generate a random buffer size within +/- ``p`` of ``size``.

    Args:
        size: The size of the buffer to hash.
        p: The percentage of the buffer size to vary.

    Returns:
        The random size of the buffer.
    """
    low = math.ceil(size * (1 - p))
    high = math.floor(size * (1 + p))
    return random.randint(low, high)


def perf_hash(loops: int, f: Callable, size: int) -> float:
    """Benchmark a hash function on fixed-size, shifted buffers.

    Args:
        loops: The number of outer loops to run.
        f: The hash function to benchmark.
        size: The size of the buffer to hash.

    Returns:
        The time taken to hash the buffer in fractional seconds.

    Raises:
        ValueError: If ``size`` is not positive.
    """
    if size <= 0:
        raise ValueError("size must be greater than 0")

    repeats = itertools.repeat(None, loops)
    buf = init_buffer(bytearray(size + 9))
    # Ten copies of the data at byte offsets 0..9, so alignment varies.
    chunks = [bytes(buf[off : off + size]) for off in range(10)]

    start = time.perf_counter()
    for _ in repeats:
        for chunk in chunks:
            f(chunk)
    return time.perf_counter() - start


def perf_hash_random(loops: int, f: Callable, size: int) -> float:
    """Benchmark a hash function with varying data sizes.

    Args:
        loops: The number of outer loops to run.
        f: The hash function to benchmark.
        size: The size of the buffer to hash.

    Returns:
        The time taken to hash the buffer in fractional seconds.

    Raises:
        ValueError: If ``size`` is not positive.
    """
    if size <= 0:
        raise ValueError("size must be greater than 0")

    repeats = itertools.repeat(None, loops)
    random.seed(42)
    inner_loops = 10
    extra_size = 255

    buf = init_buffer(bytearray(size + extra_size))
    # Draw all offsets first, then all lengths, so the random stream is
    # consumed in a fixed, reproducible order.
    offsets = [random.randint(0, extra_size) for _ in range(inner_loops)]
    lengths = [generate_size(size, 0.1) for _ in range(inner_loops)]
    chunks = [
        bytes(buf[off : off + length])
        for off, length in zip(offsets, lengths)
    ]

    start = time.perf_counter()
    for _ in repeats:
        for chunk in chunks:
            f(chunk)
    return time.perf_counter() - start


def perf_hash_latency(loops: int, f: Callable, size: int) -> float:
    """Benchmark a hash function with overhead costs with varying data sizes.

    Based on xxHash's ``benchLatency`` function:
    https://github.com/Cyan4973/xxHash/blob/dev/tests/bench/benchHash.c

    Args:
        loops: The number of outer loops to run.
        f: The hash function to benchmark; its result must be indexable
            and ``result[0]`` must be a small integer (e.g. a digest byte).
        size: The size of the buffer to hash.

    Returns:
        The time taken to hash the buffer in fractional seconds.

    Raises:
        ValueError: If ``size`` is not positive.
    """
    if size <= 0:
        raise ValueError("size must be greater than 0")

    repeats = itertools.repeat(None, loops)
    random.seed(42)
    lengths = [generate_size(size, 0.1) for _ in range(10)]

    buf = bytearray(math.floor(size * 1.1) + 255)
    view_to_hash = memoryview(bytes(init_buffer(buf)))

    # Each call's first output byte becomes the next slice offset, creating
    # a serial dependency that exposes per-call latency.
    pos = 0
    start = time.perf_counter()
    for _ in repeats:
        for length in lengths:
            pos = f(view_to_hash[pos : pos + length])[0]
    return time.perf_counter() - start


def add_cmdline_args(cmd: list, args) -> None:
    """Add command line arguments to the runner.

    Args:
        cmd: The command line arguments to extend.
        args: The parsed command line arguments.
    """
    cmd += [
        "--test-hash",
        args.test_hash,
        "--test-type",
        args.test_type,
        "--test-buffer-size-max",
        str(args.test_buffer_size_max),
    ]
import hashlib

import pymmh3
import pyperf
import xxhash

import mmh3

# Mapping from benchmark name to the hash callable under test.
#
# The getattr fallbacks let the benchmark also run against mmh3 4.1.0,
# which predates the *_digest functions, so the current implementation can
# be compared with the old one. They should be removed in the future.
HASHES = {
    "mmh3_base_hash": mmh3.hash,
    "mmh3_32": getattr(mmh3, "mmh3_32_digest", mmh3.hash_bytes),
    "mmh3_128": getattr(mmh3, "mmh3_x64_128_digest", mmh3.hash128),
    "xxh_32": xxhash.xxh32_digest,
    "xxh_64": xxhash.xxh64_digest,
    "xxh3_64": xxhash.xxh3_64_digest,
    "xxh3_128": xxhash.xxh3_128_digest,
    "md5": lambda ba: hashlib.md5(ba).digest(),
    "sha1": lambda ba: hashlib.sha1(ba).digest(),
    "pymmh3_32": pymmh3.hash,
    "pymmh3_128": pymmh3.hash128,
}

# Mapping from benchmark-type name to the corresponding timing routine.
BENCHMARKING_TYPES = {
    "naive": perf_hash,
    "random": perf_hash_random,
    "latency": perf_hash_latency,
}

if __name__ == "__main__":
    runner = pyperf.Runner(add_cmdline_args=add_cmdline_args)
    runner.argparser.add_argument(
        "--test-hash",
        type=str,
        help="Type of hash function to benchmark",
        required=True,
        choices=HASHES.keys(),
    )
    runner.argparser.add_argument(
        "--test-type",
        type=str,
        help="Type of benchmarking to perform (experimental)",
        choices=BENCHMARKING_TYPES.keys(),
        default="random",
    )
    runner.argparser.add_argument(
        "--test-buffer-size-max",
        type=int,
        help="The maximum size of the buffer to hash (default: 1024)",
        default=1024,
    )
    process_args = runner.parse_args()

    # Benchmark at Fibonacci-spaced buffer sizes up to the requested maximum.
    size, successor = 1, 2
    while size <= process_args.test_buffer_size_max:
        runner.bench_time_func(
            f"{size} bytes",
            BENCHMARKING_TYPES[process_args.test_type],
            HASHES[process_args.test_hash],
            size,
            inner_loops=10,
        )
        size, successor = successor, size + successor
""" import argparse import hashlib import os from typing import TypeVar import pandas as pd import pyperf import xxhash import mmh3 T = TypeVar("T") def pad_with_nan(data: dict[T, list[float]]) -> dict[T, list[float]]: """Pad the data with NaN values to make the length of all lists equal. Args: data: The data to pad. Returns: The padded data. """ max_len = max(len(v) for v in data.values()) for k, v in data.items(): data[k] = v + [float("nan")] * (max_len - len(v)) return data def ordered_intersection(list1: list[T], list2: list[T]) -> list[T]: """Return the intersection of two lists in the order of the first list. Args: list1: The first list. list2: The second list. Returns: The intersection of the two lists in the order of the first list. """ return [item for item in list1 if item in list2] DIGEST_SIZES = { "mmh3_base_hash": mmh3.mmh3_32().digest_size, "mmh3_32": mmh3.mmh3_32().digest_size, "mmh3_128": mmh3.mmh3_x64_128().digest_size, "xxh_32": xxhash.xxh32().digest_size, "xxh_64": xxhash.xxh64().digest_size, "xxh3_64": xxhash.xxh3_64().digest_size, "xxh3_128": xxhash.xxh3_128().digest_size, "md5": hashlib.md5().digest_size, "sha1": hashlib.sha1().digest_size, "pymmh3_32": mmh3.mmh3_32().digest_size, "pymmh3_128": mmh3.mmh3_x64_128().digest_size, } XXHASH_REFERENCE = { "mmh3_32": 3.9, "mmh3_128": None, "xxh_32": 9.7, "xxh_64": 9.1, "xxh3_64": 31.5, "xxh3_128": 29.6, "md5": 0.6, "sha1": 0.8, } if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("filenames", nargs="+") args = parser.parse_args() result_latency: dict[str, list[float]] = {} index: list[int] = [] for file_name in args.filenames: suite = pyperf.BenchmarkSuite.load(file_name) base_name = os.path.basename(file_name) hash_name = os.path.splitext(base_name)[0] result_latency[hash_name] = [] index = [] for bench_name in suite.get_benchmark_names(): bench = suite.get_benchmark(bench_name) data_size = int(bench_name.split(" ")[0]) index.append(data_size) latency_seconds = 
bench.median() result_latency[hash_name].append(latency_seconds) result_latency = pad_with_nan(result_latency) ordered_hash_names = ordered_intersection( list(DIGEST_SIZES.keys()), list(result_latency.keys()) ) df_latency = pd.DataFrame(result_latency, index=index) df_latency = df_latency[ordered_hash_names] df_t = df_latency.copy() df_t = df_t[df_t.index <= 256] small_data_velocity = 0.000001 / df_t.mean() max_row = df_latency.iloc[-1] max_row = float(index[-1]) / max_row max_row = max_row / (2**30) input_bandwidth_df = pd.DataFrame(max_row) input_bandwidth_df.index.name = "Hash" input_bandwidth_df.columns = ["Bandwidth"] digest_size_series = pd.Series(DIGEST_SIZES)[ordered_hash_names] input_bandwidth_df["Width"] = digest_size_series * 8 input_bandwidth_df.sort_values("Bandwidth", ascending=False, inplace=True) input_bandwidth_df = input_bandwidth_df[["Width", "Bandwidth"]] input_bandwidth_df["Small Data Velocity"] = small_data_velocity input_bandwidth_df["✕ Width"] = ( input_bandwidth_df["Width"] * input_bandwidth_df["Small Data Velocity"] ).round(0) input_bandwidth_df["cf. Collet (2020)"] = pd.Series(XXHASH_REFERENCE) # Prettify the table input_bandwidth_df["Bandwidth"] = input_bandwidth_df["Bandwidth"].map( lambda x: f"{x:.2f} GiB/s" ) input_bandwidth_df["Small Data Velocity"] = input_bandwidth_df[ "Small Data Velocity" ].map(lambda x: f"{x:.2f}") input_bandwidth_df["cf. Collet (2020)"] = input_bandwidth_df[ "cf. 
Collet (2020)" ].map(lambda x: f"{x:.1f} GiB/s" if pd.notna(x) else "N/A") print(input_bandwidth_df.to_markdown()) ================================================ FILE: benchmark/plot_graph.py ================================================ """Plot the graph of the benchmark results.""" import argparse import hashlib import os from typing import TypeVar import matplotlib.pyplot as plt import pandas as pd import pyperf import xxhash import mmh3 T = TypeVar("T") def pad_with_nan(data: dict[T, list[float]]) -> dict[T, list[float]]: """Pad the data with NaN values to make the length of all lists equal. Args: data: The data to pad. Returns: The padded data. """ max_len = max(len(v) for v in data.values()) for k, v in data.items(): data[k] = v + [float("nan")] * (max_len - len(v)) return data def ordered_intersection(list1: list[T], list2: list[T]) -> list[T]: """Return the intersection of two lists in the order of the first list. Args: list1: The first list. list2: The second list. Returns: The intersection of the two lists in the order of the first list. 
""" return [item for item in list1 if item in list2] DIGEST_SIZES = { "mmh3_base_hash": mmh3.mmh3_32().digest_size, "mmh3_32": mmh3.mmh3_32().digest_size, "mmh3_128": mmh3.mmh3_x64_128().digest_size, "xxh_32": xxhash.xxh32().digest_size, "xxh_64": xxhash.xxh64().digest_size, "xxh3_64": xxhash.xxh3_64().digest_size, "xxh3_128": xxhash.xxh3_128().digest_size, "md5": hashlib.md5().digest_size, "sha1": hashlib.sha1().digest_size, "pymmh3_32": mmh3.mmh3_32().digest_size, "pymmh3_128": mmh3.mmh3_x64_128().digest_size, } THROUGHPUT_FILE_NAME = "throughput.png" THROUGHPUT_SMALL_FILE_NAME = "throughput_small.png" LATENCY_FILE_NAME = "latency.png" LATENCY_SMALL_FILE_NAME = "latency_small.png" if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--output-dir", required=True) parser.add_argument("filenames", nargs="+") args = parser.parse_args() result_latency: dict[str, list[float]] = {} result_throughput: dict[str, list[float]] = {} index: list[int] = [] for file_name in args.filenames: suite = pyperf.BenchmarkSuite.load(file_name) base_name = os.path.basename(file_name) hash_name = os.path.splitext(base_name)[0] result_throughput[hash_name] = [] result_latency[hash_name] = [] index = [] for bench_name in suite.get_benchmark_names(): bench = suite.get_benchmark(bench_name) data_size = int(bench_name.split(" ")[0]) index.append(data_size) latency_seconds = bench.median() result_throughput[hash_name].append( DIGEST_SIZES[hash_name] / latency_seconds ) result_latency[hash_name].append(latency_seconds) result_throughput = pad_with_nan(result_throughput) result_latency = pad_with_nan(result_latency) ordered_hash_names = ordered_intersection( list(DIGEST_SIZES.keys()), list(result_throughput.keys()) ) df_throughput = pd.DataFrame(result_throughput, index=index) df_throughput = df_throughput[ordered_hash_names] df_latency = pd.DataFrame(result_latency, index=index) df_latency = df_latency[ordered_hash_names] plt.rcParams["figure.dpi"] = 72 * 3 
# pylint: disable=R0801
"""An ad-hoc script to plot the graph of the benchmark results for mmh3.hash.

This file should be incorporated into the main plot module in the future.
"""

import argparse
import os
from typing import TypeVar

import matplotlib.pyplot as plt
import pandas as pd
import pyperf

import mmh3

T = TypeVar("T")


def pad_with_nan(data: dict[T, list[float]]) -> dict[T, list[float]]:
    """Pad the data with NaN values to make the length of all lists equal.

    Args:
        data: The data to pad; its lists are replaced in place.

    Returns:
        The padded data (the same dict object).
    """
    longest = max(len(values) for values in data.values())
    for key, values in data.items():
        data[key] = values + [float("nan")] * (longest - len(values))
    return data


def ordered_intersection(list1: list[T], list2: list[T]) -> list[T]:
    """Return the intersection of two lists in the order of the first list.

    Args:
        list1: The first list, which determines the output order.
        list2: The second list.

    Returns:
        The intersection of the two lists in the order of the first list.
    """
    return [element for element in list1 if element in list2]


# All three benchmarked variants are 32-bit hashes (4-byte digests).
DIGEST_SIZES = {
    "mmh3_base_hash_500": mmh3.mmh3_32().digest_size,
    "mmh3_base_hash_410": mmh3.mmh3_32().digest_size,
    "mmh3_32_500": mmh3.mmh3_32().digest_size,
}

LATENCY_FILE_NAME = "latency_hash.png"

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--output-dir", required=True)
    parser.add_argument("filenames", nargs="+")
    args = parser.parse_args()

    # One latency series per input file; the file stem names the series.
    result_latency: dict[str, list[float]] = {}
    index: list[int] = []
    for path in args.filenames:
        suite = pyperf.BenchmarkSuite.load(path)
        hash_name = os.path.splitext(os.path.basename(path))[0]
        result_latency[hash_name] = []
        index = []
        for bench_name in suite.get_benchmark_names():
            bench = suite.get_benchmark(bench_name)
            # Benchmark names look like "<size> bytes".
            index.append(int(bench_name.split(" ")[0]))
            result_latency[hash_name].append(bench.median())

    result_latency = pad_with_nan(result_latency)
    ordered_hash_names = ordered_intersection(
        list(DIGEST_SIZES.keys()), list(result_latency.keys())
    )
    df_latency = pd.DataFrame(result_latency, index=index)
    df_latency = df_latency[ordered_hash_names]

    plt.rcParams["figure.dpi"] = 72 * 3
    plt.figure()

    # Latency in nanoseconds, comparing hash() across versions; the raw
    # mmh3_32 series is excluded from the plot.
    df_latency_small = df_latency * 1000 * 1000 * 1000
    df_latency_small = df_latency_small.drop(columns=["mmh3_32_500"])
    df_latency_small = df_latency_small.rename(
        columns={
            "mmh3_base_hash_410": "hash() in mmh3 4.1.0",
            "mmh3_base_hash_500": "hash() in mmh3 5.0.0",
        }
    )
    df_latency_small = df_latency_small[df_latency_small.index <= 2**12]
    df_latency_small.plot(xlabel="Input size (bytes)", ylabel="Latency (ns)")
    plt.savefig(os.path.join(args.output_dir, LATENCY_FILE_NAME))

    plt.close("all")
df_latency_small.plot(xlabel="Input size (bytes)", ylabel="Latency (ns)") plt.savefig(os.path.join(args.output_dir, LATENCY_FILE_NAME)) plt.close("all") ================================================ FILE: docs/CODE_OF_CONDUCT.md ================================================ # Code of Conduct Contributors to this project are expected to follow the ACM Code of Ethics and Professional Conduct, available at [https://www.acm.org/code-of-ethics](https://www.acm.org/code-of-ethics). The current version of the Code and its guidelines was adopted by the ACM Council on June 22, 2018. ================================================ FILE: docs/CONTRIBUTING.md ================================================ # Contributing Thank you for your interest in contributing to the `mmh3` project. We appreciate your support and look forward to your contributions. Please read [README](https://github.com/hajimes/mmh3/blob/master/README.md) to get an overview of the `mmh3` project, and follow our [Code of Conduct](./CODE_OF_CONDUCT) (ACM Code of Ethics and Professional Conduct). ## Submitting issues We welcome your contributions, whether it's submitting a bug report or suggesting a new feature through the [issue tracker](https://github.com/hajimes/mmh3/issues). Before creating a new issue, please check the [Frequently Asked Questions section in README](https://github.com/hajimes/mmh3#frequently-asked-questions) to see if the problem has already been noted. ## Project structure As of version 5.1.0, the project layout is structured as follows: - `src/mmh3` - `mmh3module.c`: the main file that serves as the interface between Python and the MurmurHash3 c implementations. - `murmurhash.c`: implementations of the MurmurHash3 family. Auto-generated from Austin Appleby's original code. DO NOT edit this file manually. See [README in the util directory](https://github.com/hajimes/mmh3/blob/master/util/README.md) for details. - `murmurhash.h`: headers and macros for MurmurHash3. 
Auto-generated from `util/refresh.py`. DO NOT edit this file manually. - `hashlib.h`: taken from [CPython's code base](https://github.com/python/cpython/blob/9ce0f48e918860ffa32751a85b0fe7967723e2e3/Modules/hashlib.h). - `util` - `refresh.py`: file that generates `src/mmh3/murmurhash3.c` and `src/mmh3/murmurhash3.h` from the original MurmurHash3 C++ code. Edit this file to modify the contents of these files. - `benchmark` - `benchmark.py`: script to run benchmarks. - `plot_graph.py`: script to plot benchmark results. - `docs`: project documentation directory - `paper`: directory containing the academic paper for this project - `.github/workflows`: GitHub Actions workflows ## Project setup Run: ```shell git clone https://github.com/hajimes/mmh3.git ``` This project uses `tox-uv` to automate testing and other tasks. You can install `tox-uv` by running: ```shell pipx install uv uv tool install tox --with tox-uv ``` In addition, `npx` (included with `npm` >= 5.2.0) is required within the `tox` environments to run linters. ## Testing and linting Before submitting your changes, make sure to run the project's tests to ensure everything is working as expected. To run all tests, use the following command: ```shell tox ``` During development, you can run the tests for a specific environment by specifying the environment name. For example, to run tests for a specific version of Python (e.g., Python 3.13), use: ```shell tox -e py313 ``` For type checking, run: ```shell tox -e type ``` To run linters with automated formatting, use: ```shell tox -e lint ``` ### (Optional) Testing on s390x When you have modified the code in a way which may cause endian issues, you may want to locally test on s390x, the only big-endian platform officially supported by Python. [_Emulating a big-endian s390x with QEMU_](https://til.simonwillison.net/docker/emulate-s390x-with-qemu) by Simon Willison is a good introduction to Docker/QEMU settings for emulating s390x.
If the above does not work, you may also want to try the following: ```shell docker run --rm --privileged tonistiigi/binfmt --install all docker buildx create --name mybuilder --use docker run -it multiarch/ubuntu-core:s390x-focal /bin/bash ``` ## Pull request Once you've pushed your changes to your fork, you can [create a pull request (PR)](https://github.com/hajimes/mmh3/pulls) on the main project repository. Please provide a clear and detailed description of your changes in the PR, and reference any related issues. ## util directory ### Algorithm implementations used by the `mmh3` module The `util` directory contains C files that were generated from the [SMHasher](https://github.com/aappleby/smhasher) C++ project by Austin Appleby. The idea of the subproject directory loosely follows the [`hashlib` implementation of CPython](https://github.com/python/cpython/tree/main/Modules/_hacl). ### Updating mmh3 core C code Run `tox -e build_cfiles`. This will fetch Appleby's original SMHasher project as a git submodule and then generate PEP 7-compliant C code from the original project. To perform further edits, add transformation code to the `refresh.py` script, instead of editing `murmurhash3.*` files manually. Then, run `tox -e build_cfiles` again to update the `murmurhash3.*` files. ### Local files 1. `./util/README.md` 1. `./util/refresh.py` 1. `./util/FILE_HEADER` ### Generated files 1. `./src/mmh3/murmurhash3.c` 1. `./src/mmh3/murmurhash3.h` ## Benchmarking To run benchmarks locally, try the following command: ```shell tox -e benchmark -- -o OUTPUT_FILE \ --test-hash HASH_NAME --test-buffer-size-max HASH_SIZE ``` where `OUTPUT_FILE` is the output file name (json formatted), `HASH_NAME` is the name of the hash, and `HASH_SIZE` is the maximum buffer size to be tested in bytes. 
For example, ```shell mkdir -p _results tox -e benchmark -- -o _results/mmh3_128.json \ --test-hash mmh3_128 --test-buffer-size-max 262144 ``` As of version 5.1.0, the following hash function identifiers are available for benchmarking: `mmh3_32`, `mmh3_128`, `xxh_32`, `xxh_64`, `xxh3_64`, `xxh3_128`, `pymmh3_32`, `pymmh3_128`, `md5`, and `sha1`. The owner of the repository can run the benchmark on GitHub Actions by using the workflow defined in `.github/workflows/benchmark.yml`. After obtaining the benchmark results, you can plot graphs with `plot_graph.py`. The following is an example of how to run the script: ```shell tox -e plot -- --output-dir docs/_static RESULT_DIR/*.json ``` where `RESULT_DIR` is the directory containing the benchmark results. The names of json files should be in the format of `HASH_IDENTIFIER.json`, e.g., `mmh3_128.json`. ## Documentation Project documentation files are mainly written in the Markdown format and are located in the `docs` directory. The documentation is automatically built and [hosted on the Read the Docs](https://mmh3.readthedocs.io/en/latest/). To build the documentation locally, use the following command: ```shell tox -e docs ``` To check the result of the built documentation, open `docs/_build/html/index.html` in your browser.
- [Micha Gorelick](https://github.com/mynameisfiber), [#1](https://github.com/hajimes/mmh3/pull/1). - [Danil Shein](https://github.com/dshein-alt), [#40](https://github.com/hajimes/mmh3/pull/40). - [Derek Wilson](https://github.com/underrun), [#2](https://github.com/hajimes/mmh3/pull/2), [#3](https://github.com/hajimes/mmh3/pull/3). - [Dimitri Vorona](https://github.com/alendit), [#13](https://github.com/hajimes/mmh3/pull/13). - [@doozr](https://github.com/doozr), [#15](https://github.com/hajimes/mmh3/pull/15). - [Dušan Nikolić](https://github.com/n-dusan), [#37](https://github.com/hajimes/mmh3/pull/37). - [Matthew Honnibal](https://github.com/honnibal), [#22](https://github.com/hajimes/mmh3/pull/22). - [wouter bolsterlee](https://github.com/wbolster), [#35](https://github.com/hajimes/mmh3/pull/35). ## Community Contributors We would also like to thank the following contributors for their valuable bug reports, feature suggestions, and other contributions: - [Antoine Pitrou](https://github.com/pitrou), [#10](https://github.com/hajimes/mmh3/issues/10). - [Benjamin Bengfort](https://github.com/bbengfort), [#46](https://github.com/hajimes/mmh3/issues/46). - [Christian von Schultz](https://github.com/vonschultz), [#50](https://github.com/hajimes/mmh3/issues/50). - [Dan Blanchard](https://github.com/dan-blanchard), [#8](https://github.com/hajimes/mmh3/issues/8). - [Heather Lapointe](https://github.com/Alphadelta14), [#25](https://github.com/hajimes/mmh3/issues/25). - [Jacques Dark](https://github.com/jqdark), [#12](https://github.com/hajimes/mmh3/issues/12). - [Matej Spiller Muys](https://github.com/matejsp), [#90](https://github.com/hajimes/mmh3/issues/90). - [Niklas Semmler](https://github.com/niklassemmler), [#7](https://github.com/hajimes/mmh3/issues/7). - [Ryan](https://github.com/ryanfwy), [#25](https://github.com/hajimes/mmh3/issues/25). - [Sebastian Kreft](https://github.com/sk-), [#17](https://github.com/hajimes/mmh3/issues/17). 
- [Tom Mitchell](https://github.com/tcmitchell), [#51](https://github.com/hajimes/mmh3/issues/51). - [Varunkumar Nagarajan](https://github.com/varunkumar), [#39](https://github.com/hajimes/mmh3/issues/39). - [@xqdd](https://github.com/xqdd), [#9](https://github.com/hajimes/mmh3/issues/9). - [@yzssbo](https://github.com/yzssbo), [#25](https://github.com/hajimes/mmh3/issues/25). ## Paper Editors and Reviewers We extend our heartfelt thanks to the following editors and reviewers of the [_Journal of Open Source Software_](https://joss.theoj.org) (JOSS), whose feedback greatly enhanced this project: - [Daniel S. Katz](https://github.com/danielskatz) (Managing Editor-in-Chief) - [Vince Knight](https://github.com/drvinceknight) (Editor) - [Marek Šuppa](https://github.com/mrshu) (Reviewer) - [Jules Pénuchot](https://github.com/JPenuchot) (Reviewer) - [Gaëtan Cassiers](https://github.com/cassiersg) (Reviewer) ================================================ FILE: docs/Makefile ================================================ # Minimal makefile for Sphinx documentation # # You can set these variables from the command line, and also # from the environment for the first two. SPHINXOPTS ?= SPHINXBUILD ?= sphinx-build SOURCEDIR = . BUILDDIR = _build # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) ================================================ FILE: docs/api.md ================================================ # API Reference The MurmurHash3 algorithm has three variants: - MurmurHash3_x86_32: Generates 32-bit hashes using 32-bit arithmetic. - MurmurHash3_x64_128: Generates 128-bit hashes using 64-bit arithmetic. 
- MurmurHash3_x86_128: Generates 128-bit hashes using 32-bit arithmetic. The `mmh3` library provides functions and classes for each variant. Although this API reference is comprehensive, you may find the following functions particularly useful: - [mmh3.hash()](#mmh3.hash): Uses the 32-bit variant as its backend and accepts `bytes` or `str` as input (strings are UTF-8 encoded). This function is slower than the x64_128 variant in 64-bit environments but is portable across different architectures. It can also be used to calculate favicon hash footprints for platforms like [Shodan](https://www.shodan.io) and [ZoomEye](https://www.zoomeye.hk). - [mmh3.mmh3_x64_128_digest()](#mmh3.mmh3_x64_128_digest): Uses the x64_128 variant as its backend. This function accepts a buffer (e.g., `bytes`, `bytearray`, `memoryview`, and `numpy` arrays) and returns a 128-bit hash as a `bytes` object, similar to the `hashlib` module in the Python Standard Library. It performs faster than the 32-bit variant on 64-bit machines. Note that **`mmh3` is endian-neutral**, while the original C++ library is endian-sensitive (see also [Frequently Asked Questions](https://github.com/hajimes/mmh3#frequently-asked-questions)). This feature of `mmh3` is essential when portability across different architectures is required, such as when calculating hash footprints for web services. ```{caution} [Buffer-accepting hash functions](#buffer-accepting-hash-functions) (except the deprecated `hash_from_buffer`) accept positional-arguments only. Using keyword arguments will raise a `TypeError`. ``` ```{note} Support for no-GIL mode (officially introduced in Python 3.14) was added in version 5.2.0. - Basic hash functions are inherently thread-safe by design. - Buffer-accepting hash functions are thread-safe, **provided the input buffer is thread-safe**. - Hasher classes are thread-safe, again **assuming the input buffer is thread-safe**. 
However, thread safety under the no-GIL variant has not yet been fully tested as of 5.2.0. If you encounter any issues, please report them via the [issue tracker](https://github.com/hajimes/mmh3/issues). ``` ## Basic Hash Functions The following functions are used to hash immutable types, specifically `bytes` and `str`. String inputs are automatically converted to `bytes` using UTF-8 encoding before hashing. Although `hash128()`, `hash64()`, and `mmh3.hash_bytes()` are provided for compatibility with previous versions and are not marked for deprecation, the [buffer-accepting hash functions](#buffer-accepting-hash-functions) introduced in version 5.1.0 are recommended for new code. ```{eval-rst} .. autofunction:: mmh3.hash .. autofunction:: mmh3.hash128 .. autofunction:: mmh3.hash64 .. autofunction:: mmh3.hash_bytes ``` ## Buffer-Accepting Hash Functions The following functions are used to hash types that implement the buffer protocol such as `bytes`, `bytearray`, `memoryview`, and `numpy` arrays. ```{seealso} The buffer protocol, [originally implemented as a part of Python/C API](https://docs.python.org/3/c-api/buffer.html), was formally defined as a Python-level API in [PEP 688](https://peps.python.org/pep-0688/) in 2022 and its corresponding type hint was introduced as [collections.abc.Buffer](https://docs.python.org/3/library/collections.abc.html#collections.abc.Buffer) in Python 3.12. For earlier Python versions, `mmh3` uses a type alias for the type hint [\_typeshed.ReadableBuffer](https://github.com/python/typeshed/blob/d326c9bd424ad60c2b63c2ca1c5c1006c61c3562/stdlib/_typeshed/__init__.pyi#L281), which is itself an alias for [typing_extensions.Buffer](https://typing-extensions.readthedocs.io/en/stable/#typing_extensions.Buffer), the backported type hint for `collections.abc.Buffer`. ``` ```{eval-rst} .. autofunction:: mmh3.hash_from_buffer .. autofunction:: mmh3.mmh3_32_digest .. autofunction:: mmh3.mmh3_32_sintdigest .. 
autofunction:: mmh3.mmh3_32_uintdigest .. autofunction:: mmh3.mmh3_x64_128_digest .. autofunction:: mmh3.mmh3_x64_128_sintdigest .. autofunction:: mmh3.mmh3_x64_128_stupledigest .. autofunction:: mmh3.mmh3_x64_128_uintdigest .. autofunction:: mmh3.mmh3_x64_128_utupledigest .. autofunction:: mmh3.mmh3_x86_128_digest .. autofunction:: mmh3.mmh3_x86_128_sintdigest .. autofunction:: mmh3.mmh3_x86_128_stupledigest .. autofunction:: mmh3.mmh3_x86_128_uintdigest .. autofunction:: mmh3.mmh3_x86_128_utupledigest ``` ## Hasher Classes `mmh3` implements hashers with interfaces similar to those in `hashlib` from the standard library: `mmh3_32()` for 32-bit hashing, `mmh3_x64_128()` for 128-bit hashing optimized for x64 architectures, and `mmh3_x86_128()` for 128-bit hashing optimized for x86 architectures. In addition to the standard `digest()` method, each hasher provides `sintdigest()`, which returns a signed integer, and `uintdigest()`, which returns an unsigned integer. The 128-bit hashers also include `stupledigest()` and `utupledigest()`, which return two 64 bit integers. Please note that as of version 5.0.0, the implementation is still experimental, and performance may be unsatisfactory (particularly `mmh3_x86_128()`). Additionally, `hexdigest()` is not supported; use `digest().hex()` instead. ```pycon >>> import mmh3 >>> hasher = mmh3.mmh3_x64_128(b"foo", 42) # seed=42 >>> hasher.update(b"bar") >>> hasher.digest() b'\x82_n\xdd \xac\xb6j\xef\x99\xb1e\xc4\n\xc9\xfd' >>> hasher.sintdigest() # 128 bit signed int -2943813934500665152301506963178627198 >>> hasher.uintdigest() # 128 bit unsigned int 337338552986437798311073100468589584258 >>> hasher.stupledigest() # two 64 bit signed ints (7689522670935629698, -159584473158936081) >>> hasher.utupledigest() # two 64 bit unsigned ints (7689522670935629698, 18287159600550615535) ``` ```{eval-rst} .. autoclass:: mmh3.mmh3_32 :members: ``` ```{eval-rst} .. autoclass:: mmh3.mmh3_x64_128 :members: ``` ```{eval-rst} .. 
autoclass:: mmh3.mmh3_x86_128 :members: ``` ================================================ FILE: docs/benchmark.md ================================================ # Benchmarks ## Settings ### Machine - Ubuntu 22.04 instance on GitHub Actions - The benchmarking suites are implemented as GitHub Actions workflows. - [4 processors, 16 GB RAM, 14 GB storage (SSD)](https://docs.github.com/en/actions/using-github-hosted-runners/using-github-hosted-runners/about-github-hosted-runners#standard-github-hosted-runners-for-public-repositories) - According to profiling with `pyperf`, each processor operates at a frequency between 2.4 and 3.3 GHz. - Tuning by the following settings: - All tests in a benchmarking suite are executed within the same GitHub Actions job. For more details, refer to [Rodríguez-Guerra (2021)](https://labs.quansight.org/blog/2021/08/github-actions-benchmarks). - [CPU pinning](https://manuel.bernhardt.io/posts/2023-11-16-core-pinning/) to isolate the benchmarking process. - See the [documentation of pyperf](https://pyperf.readthedocs.io/en/latest/system.html) for more details on the following settings: - Stop `irqbalance`. - Set `/proc/irq/default_smp_affinity` to `3` (CPU 0 and 1), where the benchmarking processes are pinned to CPU 2 and 3. - Set `/proc/sys/kernel/perf_event_max_sample_rate` to `1`. - `/proc/sys/kernel/randomize_va_space` = 2 (default) ### Software - Python environment: - CPython 3.12.5 (64-bit) - Hash libraries: - mmh3 5.0.0-dev - [python-xxhash](https://github.com/ifduyue/python-xxhash) 3.5.0 - [hashlib](https://docs.python.org/3/library/hashlib.html) (Standard library) - `md5` is tested for `lambda x: hashlib.md5(x).digest()`, and so is `sha1`. Therefore, the results for these functions include the overhead of creating the hash object and a function call.
- Benchmarking library: - [pyperf](https://github.com/psf/pyperf) 2.7.0 - Used the [bench_time_func](https://pyperf.readthedocs.io/en/latest/api.html#Runner.bench_time_func) interface to eliminate the overhead of the function call. - Processed times are measured by [time.perf_counter()](https://docs.python.org/3/library/time.html#time.perf_counter) in nanoseconds. ## Method - A benchmarking test is performed for each specified byte size, which is derived from the Fibonacci sequence. - For each input size, the test generates a set of 10 `bytes` instances, where each instance's size is pseudo-randomly selected from the range `[ceil(input * 0.9), floor(input * 1.1)]`. - This randomization is crucial as it increases the difficulty of branch predictions, creating a more realistic scenario. For further details, see [xxHash: Performance comparison](https://github.com/Cyan4973/xxHash/wiki/Performance-comparison#throughput-on-small-data-of-random-length-1-n). - This inner loop of 10 iterations is repeated for a certain number of cycles, referred to as the outer loop, which is auto-calibrated by `pyperf`. - To avoid the overhead during the loop, iterators are pre-generated using `itertools.repeat()` outside the loop. See [Peters (2002)](https://www.oreilly.com/library/view/python-cookbook/0596001673/ch17.html) and the real code of the `timeit` module in the Python Standard Library. - The final results are measured using the median, as it is more robust than the mean, especially on untuned or unstable environments such as GitHub Actions. For more details, see [pyperf: Analyze benchmark results](https://pyperf.readthedocs.io/en/latest/analyze.html). ## Results The resulting graphs are plotted using the `pandas` and `matplotlib` libraries.
### Comparison of Version Improvements JSON files containing the benchmark results are available at: [hajimes/mmh3-benchmarks/results_basic-hash/2024-09-17_6bb9987](https://github.com/hajimes/mmh3-benchmarks/tree/main/results_basic-hash/2024-09-17_6bb9987) ```{figure} _static/latency_hash.png :alt: Latency for hash() in version 4.1.0 and 5.0.0. :align: center Figure 1: Latency for `mmh3.hash()` in version 4.1.0 and 5.0.0. Smaller is better. ``` ### Comparison of Hash Functions Across Libraries JSON files containing the benchmark results are available at: [hajimes/mmh3-benchmarks/results/2024-09-17_30da46e](https://github.com/hajimes/mmh3-benchmarks/tree/main/results/2024-09-17_30da46e) In the following graphs: - `mmh3_32` refers to `mmh3.mmh3_32_digest()`. 32-bit output using 32-bit arithmetic. Developed in 2011. - `mmh3_128` refers to `mmh3.mmh3_x64_128_digest()`. 128-bit output using 64-bit arithmetic. Developed in 2011. - `xxh_32` refers to `xxhash.xxh32_digest()`. 32-bit output using 32-bit arithmetic. Developed in 2014. - `xxh_64` refers to `xxhash.xxh64_digest()`. 64-bit output using 64-bit arithmetic. Developed in 2014. - `xxh3_64` refers to `xxhash.xxh3_64_digest()`. 64-bit output using vectorized arithmetic. Developed in 2020. - `xxh3_128` refers to `xxhash.xxh3_128_digest()`. 128-bit output using vectorized arithmetic. Developed in 2020. - `md5` refers to `hashlib.md5()`. 128-bit output using a cryptographic algorithm. Developed in 1992. - `sha1` refers to `hashlib.sha1()`. 160-bit output using a cryptographic algorithm. Developed in 1995. ```{figure} _static/latency_small.png :alt: Latency for small data :align: center Figure 2: Latency for small data. Smaller is better. ``` ```{figure} _static/latency.png :alt: Latency for large data :align: center Figure 3: Latency for large data. Smaller is better. ``` The following graphs show the throughput, measured as the size of hash output generated per second by each function.
```{figure} _static/throughput_small.png :alt: Throughput for small data :align: center Figure 4: Throughput for small data. Larger is better. ``` ```{figure} _static/throughput.png :alt: Throughput for large data :align: center Figure 5: Throughput for large data. Larger is better. The y-axis is logscale. ``` ## Concluding Remarks Version 5.0.0 of the `mmh3` library has improved the performance of the `hash()` function and other new functions by adopting [METH_FASTCALL](https://docs.python.org/3/c-api/structures.html#c.METH_FASTCALL). This enhancement reduces the overhead of function calls. For data sizes between 1–2 KB (such as 48x48 favicons), performance has improved by 10%–20%. For smaller data (~500 bytes, like 16x16 favicons), performance increases by approximately 30%. However, the performance gain from this revision remains constant, meaning the relative improvement diminishes as data size increases. When comparing hash functions across libraries, `mmh3 5.0.0` is the most performant for small data sizes, while the `xxh3` families in `xxhash 3.5.0` excel with larger data. This is largely due to the new version of `mmh3` utilizing `METH_FASTCALL`, which reduces the overhead of function calls. However, `xxhash` may adopt the same interface in the future, potentially making this advantage temporary. To further improve `mmh3` performance, the core algorithm itself would need an overhaul. Overall, these benchmarking results serve as a useful reference when selecting a hash function for your application, and they provide a solid foundation for future performance enhancements to our library. ## References - Python Standard Library. [timeit](https://docs.python.org/3/library/timeit.html). - [pyperf: Analyze benchmark results](https://pyperf.readthedocs.io/en/latest/analyze.html). - [pyperf: Tune the system for benchmarks](https://pyperf.readthedocs.io/en/latest/system.html). 
- [pyperf issues #1: Use a better measures than average and standard deviation #1](https://github.com/psf/pyperf/issues/1). - [pyperf issues #75: Reconsidering min()?](https://github.com/psf/pyperf/issues/75). - [pytest-benchmark: Usage](https://pytest-benchmark.readthedocs.io/en/latest/usage.html). - [xxHash: Performance comparison](https://github.com/Cyan4973/xxHash/wiki/Performance-comparison). - [xxHash benchmark program](https://github.com/Cyan4973/xxHash/tree/release/tests/bench). - Manuel Bernhardt. 2023. [On pinning and isolating CPU cores](https://manuel.bernhardt.io/posts/2023-11-16-core-pinning/). - Micha Gorelick and Ian Ozsvald. 2020. [High Performance Python: Practical Performant Programming for Humans, 2nd ed](https://www.oreilly.com/library/view/high-performance-python/9781492055013/). O'Reilly Media. ISBN: 978-1492055020. Chapter 2. - Tim Peters. 2002. [Chapter 17. Algorithms: Introduction](https://www.oreilly.com/library/view/python-cookbook/0596001673/ch17.html) in _Python Cookbook_, 3rd ed. O'Reilly Media. ISBN: 978-0596001674. - Jaime Rodríguez-Guerra. 2021. [Is GitHub Actions suitable for running benchmarks?](https://labs.quansight.org/blog/2021/08/github-actions-benchmarks). - Victor Stinner. 2016. [My journey to stable benchmark, part 1 (system)](https://vstinner.github.io/journey-to-stable-benchmark-system.html). - Victor Stinner. 2016. [My journey to stable benchmark, part 3 (average)](https://vstinner.github.io/journey-to-stable-benchmark-average.html). ================================================ FILE: docs/changelog.md ================================================ ```{include} ../CHANGELOG.md ``` ================================================ FILE: docs/conf.py ================================================ # pylint: disable=C0114,C0103 # Configuration file for the Sphinx documentation builder. 
# # For the full list of built-in configuration values, see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html # import os # import sys # -- Project information ----------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information project = "mmh3" project_copyright = "2011-2025, Hajime Senuma" author = "Hajime Senuma" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration extensions = [ "sphinx.ext.autodoc", "sphinx.ext.napoleon", "sphinx_copybutton", "myst_parser", ] templates_path = ["_templates"] exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] # -- Options for HTML output ------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output html_theme = "shibuya" html_static_path = ["_static"] html_theme_options = { "github_url": "https://github.com/hajimes/mmh3", } # -- Options for autodoc ------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/extensions/autodoc.html autodoc_member_order = "groupwise" myst_heading_anchors = 3 ================================================ FILE: docs/index.rst ================================================ mmh3 documentation ================== mmh3 is a Python extension for `MurmurHash (MurmurHash3) `_, a set of fast and robust non-cryptographic hash functions invented by Austin Appleby. .. toctree:: :maxdepth: 2 :caption: User Guideline Quickstart api benchmark Changelog CONTRIBUTORS .. 
toctree:: :maxdepth: 2 :caption: Project documentation CONTRIBUTING CODE_OF_CONDUCT Indices and tables ================== * :ref:`genindex` * :ref:`search` ================================================ FILE: docs/make.bat ================================================ @ECHO OFF pushd %~dp0 REM Command file for Sphinx documentation if "%SPHINXBUILD%" == "" ( set SPHINXBUILD=sphinx-build ) set SOURCEDIR=. set BUILDDIR=_build %SPHINXBUILD% >NUL 2>NUL if errorlevel 9009 ( echo. echo.The 'sphinx-build' command was not found. Make sure you have Sphinx echo.installed, then set the SPHINXBUILD environment variable to point echo.to the full path of the 'sphinx-build' executable. Alternatively you echo.may add the Sphinx directory to PATH. echo. echo.If you don't have Sphinx installed, grab it from echo.https://www.sphinx-doc.org/ exit /b 1 ) if "%1" == "" goto help %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% goto end :help %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% :end popd ================================================ FILE: docs/quickstart.md ================================================ ```{include} ../README.md ``` ================================================ FILE: paper/paper.bib ================================================ @article{adja_blockchain-based_2021, title = {A blockchain-based certificate revocation management and status verification system}, author = {Adja, Yves Christian Elloh and Hammi, Badis and Serhrouchni, Ahmed and Zeadally, Sherali}, year = 2021, journal = {Computers \& Security}, volume = 104, pages = 102209, doi = {10.1016/j.cose.2021.102209}, issn = {0167-4048}, url = {https://www.sciencedirect.com/science/article/pii/S016740482100033X}, keywords = {Authentication, Blockchain, Bloom filter, Certificate, Decentralization, PKI, Revocation, Security, X509} } @misc{appleby_murmurhash3_2011, title = {{MurmurHash3} and {SMHasher}}, author = {Appleby, Austin}, year = 2011, url = 
{https://github.com/aappleby/smhasher} } @misc{Bernhardt2023, title = {On pinning and isolating CPU cores}, author = {Bernhardt, Manuel}, year = 2023, url = {https://manuel.bernhardt.io/posts/2023-11-16-core-pinning/} } @article{Bloom1970, title = {Space/Time Trade-Offs in Hash Coding with Allowable Errors}, author = {Bloom, Burton H.}, year = 1970, month = {jul}, journal = {Commun. ACM}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, volume = 13, number = 7, pages = {422–426}, doi = {10.1145/362686.362692}, issn = {0001-0782}, url = {https://doi.org/10.1145/362686.362692}, issue_date = {July 1970}, numpages = 5, keywords = {retrieval efficiency, storage efficiency, hash addressing, scatter storage, searching, storage layout, retrieval trade-offs, hash coding} } @inproceedings{Broder1997a, title = {On the resemblance and containment of documents}, author = {Broder, Andrei Z.}, year = 1997, booktitle = {Proceedings. Compression and Complexity of {SEQUENCES} 1997}, publisher = {IEEE Comput. 
Soc}, pages = {21--29}, doi = {10.1109/SEQUEN.1997.666900}, isbn = {0-8186-8132-2}, url = {http://ieeexplore.ieee.org/lpdocs/epic03/wrapper.htm?arnumber=666900}, note = {ISSN: 0818681322}, keywords = {MinHash} } @misc{collet_xxhash_2014, title = {{xxHash}}, author = {Collet, Yan}, year = 2014, url = {https://github.com/Cyan4973/xxHash} } @misc{collet_xxhash_comparison_2020, title = {{xxHash}: Performance comparison (2020)}, author = {Collet, Yan}, year = 2020, url = {https://github.com/Cyan4973/xxHash/wiki/Performance-comparison} } @misc{du_xxhash_2014, title = {{xxhash}}, author = {Du, Yue}, year = 2014, url = {https://github.com/ifduyue/python-xxhash} } @misc{faraday_security_understanding_2022, title = {Understanding {Spring4Shell}}, author = {{Faraday Security}}, year = 2022, month = jul, url = {https://faradaysec.com/understanding-spring4shell/} } @book{gorelick_high_2020, title = {High Performance {P}ython: Practical Performant Programming for Humans}, author = {Gorelick, Micha and Ozsvald, Ian}, year = 2020, month = jun, publisher = {O'Reilly Media}, isbn = {978-1-4920-5502-0}, edition = {2nd edition} } @software{hugo_van_kemenade_2024_13624792, author = {Van Kemenade, Hugo and Si, Richard and Dollenstein, Zsolt}, title = {hugovk/top-pypi-packages: Release 2024.09}, month = sep, year = 2024, publisher = {Zenodo}, version = {2024.09}, doi = {10.5281/zenodo.13624792}, url = {https://doi.org/10.5281/zenodo.13624792} } @inproceedings{kakwani_indicnlpsuite_2020, title = {{IndicNLPSuite}: Monolingual Corpora, Evaluation Benchmarks and Pre-trained Multilingual Language Models for {I}ndian Languages}, author = {Kakwani, Divyanshu and Kunchukuttan, Anoop and Golla, Satish and N.C., Gokul and Bhattacharyya, Avik and Khapra, Mitesh M. 
and Kumar, Pratyush}, year = 2020, month = nov, booktitle = {Findings of the {A}ssociation for {C}omputational {L}inguistics: {EMNLP} 2020}, publisher = {Association for Computational Linguistics}, address = {Online}, pages = {4948--4961}, doi = {10.18653/v1/2020.findings-emnlp.445}, url = {https://aclanthology.org/2020.findings-emnlp.445} } @misc{kihlander_pymmh3_2013, title = {{PYMMH3}}, author = {Kihlander, Fredrik and Gusani, Swapnil}, year = 2013, url = {https://github.com/wc-duck/pymmh3} } @techreport{kopriva_hunting_2021, title = {Hunting phishing websites with favicon hashes}, author = {Kopriva, Jan}, year = 2021, month = apr, url = {https://isc.sans.edu/diary/27326}, institution = {SANS Internet Storm Center} } @book{kumar_probabilistic_2021, title = {Probabilistic Data Structures for Blockchain-Based {I}nternet of {T}hings Applications}, author = {Kumar, Neeraj and Miglani, Arzoo}, year = 2021, month = jan, publisher = {CRC Press}, doi = {10.1201/9781003080046}, isbn = {978-0-367-52990-1} } @book{medjedovic_algorithms_2022, title = {Algorithms and Data Structures for Massive Datasets}, author = {Medjedovic, Dzejla and Tahirovic, Emin and Dedovic, Ines}, year = 2022, month = jul, publisher = {Manning}, isbn = {978-1-61729-803-5} } @techreport{Matherly2017, title = {Complete Guide to Shodan: Collect. Analyze. Visualize. 
Make Internet Intelligence Work for You.}, author = {Matherly, John}, year = 2017, edition = {Version 2017-08-23}, institution = {Shodan} } @misc{Matherly2024, title = {Deep Dive: http.favicon}, author = {Matherly, John}, year = 2024, month = {jan}, url = {https://blog.shodan.io/deep-dive-http-favicon/}, institution = {Shodan} } @incollection{Peters2002, author = {Peters, Tim}, title = {Algorithms: Introduction}, chapter = {17}, year = 2002, month = {jul}, booktitle = {Python Cookbook}, publisher = {O'Reilly Media}, editor = {Martelli, Alex and Ascher, David}, edition = {1st edition} } @techreport{RodriguezGuerra2021, title = {Is GitHub Actions suitable for running benchmarks?}, author = {Rodríguez-Guerra, Jaime}, year = 2021, month = {aug}, url = {https://labs.quansight.org/blog/2021/08/github-actions-benchmarks}, institution = {Quansight Labs} } @inproceedings{Senuma2011, title = {K-means Clustering with Feature Hashing}, author = {Senuma, Hajime}, year = 2011, booktitle = {Proceedings of the 49th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics: Student Session}, pages = {122--126}, note = {Issue: June} } @inproceedings{Senuma2016, title = {Learning Succinct Models: Pipelined Compression with {L}1-Regularization, Hashing, {E}lias–{F}ano Indices, and Quantization}, author = {Senuma, Hajime and Aizawa, Akiko}, year = 2016, booktitle = {Proceedings of {COLING} 2016, the 26th International Conference on Computational Linguistics: Technical Papers}, pages = {2774--2784} } @article{Shi2009, title = {Hash Kernels for Structured Data}, author = {Shi, Qinfeng and Petterson, James and Dror, Gideon and Langford, John and Smola, Alex and Vishwanathan, S.V.N.}, year = 2009, journal = {Journal of Machine Learning Research}, volume = 10, pages = {2615--2637} } @misc{shodan_its_2021, title = {it's the {MMH3} hash of the http.html property. 
See: {PyPI} mmh3}, author = {Shodan}, year = 2021, month = may, journal = {Twitter}, url = {https://twitter.com/shodanhq/status/1395501365456261122} } @misc{Stinner2016, title = {My journey to stable benchmark}, author = {Stinner, Victor}, year = 2016, url = {https://vstinner.github.io/journey-to-stable-benchmark-system.html} } @inproceedings{Tang2024, title = {Data Splitting based Double Layer Encryption for Secure Ciphertext Deduplication in Cloud Storage}, author = {Tang, Xin and Jin, Luchao}, year = 2024, journal = {2024 IEEE 17th International Conference on Cloud Computing (CLOUD)}, pages = {153--163}, doi = {10.1109/CLOUD62652.2024.00027} } @inproceedings{Weinberger2009, title = {Feature Hashing for Large Scale Multitask Learning}, author = {Weinberger, Kilian and Dasgupta, Anirban and Langford, John and Smola, Alex and Attenberg, Josh}, year = 2009, booktitle = {Proceedings of the 26th International Conference on Machine Learning}, doi = {10.1145/1553374.1553516}, note = {arXiv: 0902.2206v5} } ================================================ FILE: paper/paper.md ================================================ --- title: "mmh3: A Python extension for MurmurHash3" tags: - Python - hash - high-performance computing - artificial intelligence - natural language processing - internet of things - cybersecurity authors: - name: Hajime Senuma orcid: 0000-0001-8542-1768 affiliation: 1 affiliations: - name: National Institute of Informatics, Japan index: 1 date: 15 Dec 2024 bibliography: paper.bib --- # Summary In recent years, artificial intelligence (AI) has rapidly evolved, particularly in natural language processing (NLP) with services like OpenAI's ChatGPT. Likewise, the Internet of Things (IoT) continues to grow as a key area of ubiquitous computing, exemplified by Shodan, the first IoT search engine. 
Underlying these advancements are high-performance algorithms and data structures relying on non-cryptographic hash functions, which are characteristically fast, produce statistically well-distributed bits, exhibit an avalanche effect (where a one-bit change in the input alters at least half of the output), and are collision resistant. Because cryptographic strength is unnecessary in these cases, they benefit from the efficiency of non-cryptographic hashes. MurmurHash3, together with its test suite SMHasher, was developed by @appleby_murmurhash3_2011 and is one of the earliest and most continuously popular hash functions specifically designed to implement the characteristics mentioned above. `mmh3` was launched in 2011 as a Python extension for MurmurHash3 and has been maintained ever since. Its API is simple to use for Python programmers, as it offers both one-shot hash functions and hasher classes that allow incremental updating, whose methods are compliant with `hashlib`, a part of the Python Standard Library. The library provides Python wheels (i.e., pre-built binary packages) for immediate use on various platforms, including Linux (x86_64, aarch64, i686, ppc64le, and s390x), Windows (win32, win_amd64, and win_arm64), and macOS (Intel Mac and Apple Silicon). From version 4.0.0, `mmh3` has been published under the MIT License, an OSI-approved permissive open-source license. As of September 1, 2024, `mmh3` was being downloaded more than 4 million times per month, and it ranks as the 973rd most downloaded PyPI package (of around 566,000 projects), showing that only 0.17% of the remaining packages in the PyPI ecosystem are more popular [@hugo_van_kemenade_2024_13624792]. According to PePy, as of September 1, 2024, the total downloads of this library exceeded 130 million.
Libraries and organizations that use `mmh3` include Shodan, Microsoft Azure SDK for Python, Apache Iceberg (open table format for analytic datasets), Feast (feature store for machine learning), PyMilvus (Python SDK for Milvus, an open-source vector database), and pocsuite3 (open-source remote vulnerability testing framework). # Statement of need ## AI and High-Performance Computing AI is one of the most resource-demanding fields in computer science and engineering. To mitigate this problem, various techniques are employed under main systems, in which non-cryptographic hash functions play key roles in a number of algorithms and data structures. A notable technique is _feature hashing_ [@Weinberger2009; @Shi2009]. In its simplest usage, when given a string-indexed data vector, it converts the vector into an integer-indexed data vector in which each index is the hash result of the original string index; collision values are summed. Despite its simple and intuitive usage, a machine-learning process with feature hashing is statistically guaranteed to be nearly as accurate as its original process. Feature hashing has been shown to be useful for various situations, including K-means clustering [@Senuma2011] and succinct model learning [@Senuma2016]. Other popular techniques that leverage non-cryptographic hash functions include _Bloom Filter_ [@Bloom1970], a compact data structure that tests whether an element is a member of a certain set (with false positive matches), and _MinHash_ [@Broder1997a], an algorithm that quickly estimates the similarity of two sets. `mmh3` appears in scholarly papers on various topics, including Indian language NLP suites [@kakwani_indicnlpsuite_2020], a secure system based on probabilistic structures [@adja_blockchain-based_2021], as well as secure ciphertext deduplication in cloud storage [@Tang2024]. 
It has also appeared in technical books and computer science texts [@gorelick_high_2020; @kumar_probabilistic_2021; @medjedovic_algorithms_2022]. ## Internet of Things `mmh3` is applicable to the IoT field. According to @shodan_its_2021, Shodan [@Matherly2017] uses `mmh3` as its fingerprint for a favicon (i.e., an icon associated with a web page or website). @Matherly2024 explained the adoption of `mmh3` due to its speed and compact hash size, noting that cryptographic guarantees provided by `md5` and other hashes were not necessary for their use case. ZoomEye, another popular IoT search engine, follows Shodan’s convention. For cybersecurity, @kopriva_hunting_2021 reported a method of discovering possible phishing websites by searching websites with Shodan, whose favicon’s `mmh3` hash value was the same as that of a genuine one. Another use case of `mmh3` in this area includes open-source intelligence (OSINT) activities, such as measuring the popularity of web frameworks and servers, as some users do not change their default favicon settings specified by applications [@faraday_security_understanding_2022]. # Related software `PYMMH` [@kihlander_pymmh3_2013] is a pure Python implementation of the MurmurHash3 algorithms. Among various other Python bindings for non-cryptographic hashes, `python-xxhash` by Yue Du [@du_xxhash_2014] is another popular hash library, featuring xxHash developed by Yan Collet [@collet_xxhash_2014]. # Benchmarks We conducted microbenchmarking experiments to compare the efficiency of Python-C hash libraries, balancing accuracy, reproducibility, and reliability. Our methodology follows practices from microbenchmarking literature, including works by @Peters2002, @Stinner2016, @collet_xxhash_comparison_2020, @gorelick_high_2020, @RodriguezGuerra2021, and @Bernhardt2023. \autoref{bandwidth} and \autoref{latency} summarize the benchmarking results. 
While the `xxh3` family in `python-xxhash 3.5.0` shows superior performance for large inputs, the `mmh3 5.0.0` implementation excels with smaller inputs (common scenarios for non-cryptographic hashes), due to its use of `METH_FASTCALL`, an overhead-reducing interface introduced in Python 3.7. For details, see the documentation of the project: . Additionally, the benchmarking results are publicly available as JSON files in the repository: . : \label{bandwidth}Benchmarking results for Python extensions. Small data velocity is defined as the inverse of the mean latency (in microseconds) for inputs in the range of 1–256 bytes. Collet (2020) refers to the results of original C implementations experimented by the author of xxHash, using a CPU clocked at 3.6–4.9 GHz (ours: 2.4–3.3 GHz). | Hash | Width | Bandwidth | Small Data Velocity | cf. Collet (2020) | | :----------- | -------: | :-------------- | ------------------: | :---------------- | | xxh3_128 | 128 bits | **22.42 GiB/s** | 8.96 | 29.6 GiB/s | | xxh3_64 | 64 bits | 22.41 GiB/s | 9.5 | 31.5 GiB/s | | xxh_64 | 64 bits | 8.90 GiB/s | 9.3 | 9.1 GiB/s | | **mmh3_128** | 128 bits | 6.91 GiB/s | **19.04** | N/A | | xxh_32 | 32 bits | 6.15 GiB/s | 8.91 | 9.7 GiB/s | | **mmh3_32** | 32 bits | 2.86 GiB/s | 18.41 | 3.9 GiB/s | | sha1 | 16 bits | 1.63 GiB/s | 2.4 | 0.8 GiB/s | | md5 | 128 bits | 0.65 GiB/s | 1.95 | 0.6 GiB/s | ![\label{latency}Latency for small to medium-sized inputs. Lower is better.](../docs/_static/latency_small.png) # Acknowledgements The author extends sincere gratitude to Akiko Aizawa for her helpful comments on this paper. Appreciation is also given to all those involved in the development and maintenance of `mmh3`. Special thanks go to Micha Gorelick, who made the first pull request to the project and later introduced the library in her technical book [@gorelick_high_2020]. 
# References ================================================ FILE: pyproject.toml ================================================ [build-system] # setuptools >= 74.1.0 required to build C extensions via pyproject.toml requires = ["setuptools >= 74.1.0", "wheel"] build-backend = "setuptools.build_meta" [project] name = "mmh3" version = "5.2.1" description = "Python extension for MurmurHash (MurmurHash3), a set of fast and robust hash functions." readme = "README.md" license = {file = "LICENSE"} keywords = ["utility", "hash", "MurmurHash"] requires-python = ">=3.10" authors = [ {name = "Hajime Senuma", email="hajime.senuma@gmail.com"} ] classifiers = [ "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", "Programming Language :: Python :: 3.14", "Programming Language :: Python :: Free Threading :: 2 - Beta", "Topic :: Software Development :: Libraries", "Topic :: Utilities" ] [project.optional-dependencies] test = [ "pytest == 9.0.2", "pytest-sugar == 1.1.1" ] lint = [ "actionlint-py == 1.7.11.24", "clang-format == 22.1.0", "codespell == 2.4.1", "pylint == 4.0.5", "ruff == 0.15.4" ] type = [ "mypy == 1.19.1" ] docs = [ "myst-parser == 5.0.0", "shibuya == 2026.1.9", "sphinx == 8.2.3", "sphinx-copybutton == 0.5.2" ] benchmark = [ "pymmh3 == 0.0.5", "pyperf == 2.10.0", "xxhash == 3.6.0" ] plot = [ "matplotlib == 3.10.8", "pandas == 3.0.1" ] [project.urls] Homepage = "https://pypi.org/project/mmh3/" Documentation = "https://mmh3.readthedocs.io/" Repository = "https://github.com/hajimes/mmh3" Changelog = "https://github.com/hajimes/mmh3/blob/master/CHANGELOG.md" "Bug Tracker" = "https://github.com/hajimes/mmh3/issues" [tool.codespell] # As of 2026-03-02, skip has an issue on super-linter # 
https://github.com/super-linter/super-linter/issues/7466 skip = "*/paper.bib,./build" # Collet is a surname, Commun is an abbr for a journal name, # fo is used in several test strings, and Ines is also a surname. ignore-words-list = "Collet,Commun,fo,Ines" [tool.ruff] src = ["src/mmh3/__init__.pyi", "util", "tests", "benchmark", "docs"] [tool.ruff.lint] select = ["E", "W", "F", "I", "UP", "B", "SIM", "C4", "ISC", "NPY"] [tool.ruff.lint.isort] known-first-party = ["mmh3"] [tool.setuptools] include-package-data = true ext-modules = [ {name = "mmh3", sources = ["./src/mmh3/mmh3module.c", "./src/mmh3/murmurhash3.c"]} ] [tool.setuptools.package-data] mmh3 = ["*.h"] [tool.pylint] ignore-paths = [ "^build", "^venv", "^.venv", "^.tox", "^src/mmh3/__init__.pyi" ] # Use multiple processes to speed up Pylint. # The value 0 specifies the number of processors to be auto-detected. # This setting can be found in the template file of super-linter 7.0.0. jobs = 0 # import-error: An error tricky to resolve, especially on super-linter. # wrong-import-order: Respect Ruff's import order. disable = [ "import-error", "wrong-import-order" ] ================================================ FILE: src/mmh3/__init__.pyi ================================================ import sys from typing import Any, final if sys.version_info >= (3, 12): from collections.abc import Buffer else: from _typeshed import ReadableBuffer as Buffer def hash(key: bytes | str, seed: int = 0, signed: Any = True) -> int: ... def hash_from_buffer(key: Buffer | str, seed: int = 0, signed: Any = True) -> int: ... def hash64( key: bytes | str, seed: int = 0, x64arch: Any = True, signed: Any = True ) -> tuple[int, int]: ... def hash128( key: bytes | str, seed: int = 0, x64arch: Any = True, signed: Any = False ) -> int: ... def hash_bytes(key: bytes | str, seed: int = 0, x64arch: Any = True) -> bytes: ... def mmh3_32_digest(key: Buffer | str, seed: int = 0) -> bytes: ... 
# One-shot digest functions, named after the underlying MurmurHash3
# variant (x86_32 / x64_128 / x86_128) and the result representation:
# "digest" -> bytes, "sintdigest" -> signed int, "uintdigest" -> unsigned
# int, and "stupledigest"/"utupledigest" -> a signed/unsigned pair of
# 64-bit ints. Each accepts a read-only buffer (or str) and an int seed.
def mmh3_32_sintdigest(key: Buffer | str, seed: int = 0) -> int: ...
def mmh3_32_uintdigest(key: Buffer | str, seed: int = 0) -> int: ...
def mmh3_x64_128_digest(key: Buffer | str, seed: int = 0) -> bytes: ...
def mmh3_x64_128_sintdigest(key: Buffer | str, seed: int = 0) -> int: ...
def mmh3_x64_128_uintdigest(key: Buffer | str, seed: int = 0) -> int: ...
def mmh3_x64_128_stupledigest(key: Buffer | str, seed: int = 0) -> tuple[int, int]: ...
def mmh3_x64_128_utupledigest(key: Buffer | str, seed: int = 0) -> tuple[int, int]: ...
def mmh3_x86_128_digest(key: Buffer | str, seed: int = 0) -> bytes: ...
def mmh3_x86_128_sintdigest(key: Buffer | str, seed: int = 0) -> int: ...
def mmh3_x86_128_uintdigest(key: Buffer | str, seed: int = 0) -> int: ...
def mmh3_x86_128_stupledigest(key: Buffer | str, seed: int = 0) -> tuple[int, int]: ...
def mmh3_x86_128_utupledigest(key: Buffer | str, seed: int = 0) -> tuple[int, int]: ...

# Base class for the incremental hasher objects below. update() feeds
# additional data; digest()/sintdigest()/uintdigest() expose the hash in
# different representations; copy() returns a new Hasher (per the hashlib
# convention, presumably a snapshot of the current state -- implemented
# in C, see mmh3module.c).
class Hasher:
    def __init__(self, data: Buffer | None = None, seed: int = 0) -> None: ...
    def update(self, data: Buffer) -> None: ...
    def digest(self) -> bytes: ...
    def sintdigest(self) -> int: ...
    def uintdigest(self) -> int: ...
    def copy(self) -> Hasher: ...
    # hashlib-style read-only introspection attributes.
    @property
    def digest_size(self) -> int: ...
    @property
    def block_size(self) -> int: ...
    @property
    def name(self) -> str: ...

# Concrete hashers. The 128-bit variants additionally expose the digest
# as a tuple of two 64-bit integers (signed and unsigned forms).
@final
class mmh3_32(Hasher): ...

@final
class mmh3_x64_128(Hasher):
    def stupledigest(self) -> tuple[int, int]: ...
    def utupledigest(self) -> tuple[int, int]: ...

@final
class mmh3_x86_128(Hasher):
    def stupledigest(self) -> tuple[int, int]: ...
    def utupledigest(self) -> tuple[int, int]: ...
================================================ FILE: src/mmh3/hashlib.h ================================================ // This code was taken from a part of CPython's code base (Modules/hashlib.h) // at commit 9ce0f48e918860ffa32751a85b0fe7967723e2e3 // Below is a copy of the license of CPython // PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2 // -------------------------------------------- // // 1. This LICENSE AGREEMENT is between the Python Software Foundation // ("PSF"), and the Individual or Organization ("Licensee") accessing and // otherwise using this software ("Python") in source or binary form and // its associated documentation. // // 2. Subject to the terms and conditions of this License Agreement, PSF hereby // grants Licensee a nonexclusive, royalty-free, world-wide license to // reproduce, analyze, test, perform and/or display publicly, prepare // derivative works, distribute, and otherwise use Python alone or in any // derivative version, provided, however, that PSF's License Agreement and // PSF's notice of copyright, i.e., "Copyright (c) 2001, 2002, 2003, 2004, // 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, // 2017, 2018, 2019, 2020, 2021, 2022, 2023 Python Software Foundation; All // Rights Reserved" are retained in Python alone or in any derivative version // prepared by Licensee. // // 3. In the event Licensee prepares a derivative work that is based on // or incorporates Python or any part thereof, and wants to make // the derivative work available to others as provided herein, then // Licensee hereby agrees to include in any such work a brief summary of // the changes made to Python. // // 4. PSF is making Python available to Licensee on an "AS IS" // basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR // IMPLIED. 
BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND // DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS // FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT // INFRINGE ANY THIRD PARTY RIGHTS. // // 5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON // FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS // A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON, // OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. // // 6. This License Agreement will automatically terminate upon a material // breach of its terms and conditions. // // 7. Nothing in this License Agreement shall be deemed to create any // relationship of agency, partnership, or joint venture between PSF and // Licensee. This License Agreement does not grant permission to use PSF // trademarks or trade name in a trademark sense to endorse or promote // products or services of Licensee, or any third party. // // 8. By copying, installing or otherwise using Python, Licensee // agrees to be bound by the terms and conditions of this License // Agreement. /* * Given a PyObject* obj, fill in the Py_buffer* viewp with the result * of PyObject_GetBuffer. Sets an exception and issues the erraction * on any errors, e.g. 'return NULL' or 'goto error'. 
*/ #define GET_BUFFER_VIEW_OR_ERROR(obj, viewp, erraction) \ do { \ if (PyUnicode_Check((obj))) { \ PyErr_SetString(PyExc_TypeError, \ "Strings must be encoded before hashing"); \ erraction; \ } \ if (!PyObject_CheckBuffer((obj))) { \ PyErr_SetString(PyExc_TypeError, \ "object supporting the buffer API required"); \ erraction; \ } \ if (PyObject_GetBuffer((obj), (viewp), PyBUF_SIMPLE) == -1) { \ erraction; \ } \ if ((viewp)->ndim > 1) { \ PyErr_SetString(PyExc_BufferError, \ "Buffer must be single dimension"); \ PyBuffer_Release((viewp)); \ erraction; \ } \ } while (0) #define GET_BUFFER_VIEW_OR_ERROUT(obj, viewp) \ GET_BUFFER_VIEW_OR_ERROR(obj, viewp, return NULL) ================================================ FILE: src/mmh3/mmh3module.c ================================================ // To handle 64-bit data; see https://docs.python.org/3/c-api/arg.html #ifndef PY_SSIZE_T_CLEAN #define PY_SSIZE_T_CLEAN #endif #include #include #include #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) #include #endif #include "hashlib.h" #include "murmurhash3.h" #if defined(_MSC_VER) typedef signed __int8 int8_t; typedef signed __int32 int32_t; typedef signed __int64 int64_t; typedef unsigned __int8 uint8_t; typedef unsigned __int32 uint32_t; typedef unsigned __int64 uint64_t; // Other compilers #else // defined(_MSC_VER) #include #endif // defined(_MSC_VER) #define MMH3_32_DIGESTSIZE 4 #define MMH3_128_DIGESTSIZE 16 #define MMH3_32_BLOCKSIZE 12 #define MMH3_128_BLOCKSIZE 32 #define MMH3_VALIDATE_SEED_RETURN_NULL(seed) \ if (seed < 0 || seed > 0xFFFFFFFF) { \ PyErr_SetString(PyExc_ValueError, "seed is out of range"); \ return NULL; \ } #define MMH3_VALIDATE_SEED_RETURN_INT(seed, buf) \ if (seed < 0 || seed > 0xFFFFFFFF) { \ PyBuffer_Release(&buf); \ PyErr_SetString(PyExc_ValueError, "seed is out of range"); \ return -1; \ } // obj: PyObject* // target_str: const char * // len: Py_ssize_t #define MMH3_HASH_VALIDATE_AND_SET_BYTES(obj, target_str, len) \ if 
(PyBytes_Check(obj)) { \ target_str_len = PyBytes_Size(obj); \ target_str = PyBytes_AS_STRING(obj); \ } \ else if (PyUnicode_Check(obj)) { \ target_str_len = PyUnicode_GET_LENGTH(obj); \ target_str = PyUnicode_AsUTF8AndSize(obj, &target_str_len); \ } \ else { \ PyErr_Format(PyExc_TypeError, \ "argument 1 must be read-only bytes-like object, " \ "not '%s'", \ Py_TYPE(obj)->tp_name); \ return NULL; \ } // obj: PyObject* // seed: unsigned long #define MMH3_HASH_VALIDATE_AND_SET_SEED(obj, seed) \ if (!PyLong_Check(obj)) { \ PyErr_Format(PyExc_TypeError, \ "'%s' object cannot be interpreted as an integer", \ Py_TYPE(obj)->tp_name); \ return NULL; \ } \ seed = PyLong_AsUnsignedLong(obj); \ if (seed == (unsigned long)-1 && PyErr_Occurred()) { \ if (PyErr_ExceptionMatches(PyExc_OverflowError)) { \ PyErr_SetString(PyExc_ValueError, "seed is out of range"); \ return NULL; \ } \ } \ if (seed > 0xFFFFFFFF) { \ PyErr_SetString(PyExc_ValueError, "seed is out of range"); \ return NULL; \ } // nargs: Py_ssize_t // name: const char * // pos: int #define MMH3_HASH_VALIDATE_ARG_DUPLICATION(nargs, name, pos) \ if (nargs >= pos) { \ PyErr_Format(PyExc_TypeError, \ "argument for function given by name " \ "('%s') and position (%d)", \ name, pos); \ return NULL; \ } #define MMH3_VALIDATE_ARGS_AND_SET_SEED(nargs, args, seed) \ if (nargs < 1) { \ PyErr_SetString(PyExc_TypeError, \ "function takes at least 1 argument (0 given)"); \ return NULL; \ } \ if (nargs > 2) { \ PyErr_Format(PyExc_TypeError, \ "function takes at most 2 arguments (%d given)", \ (int)nargs); \ return NULL; \ } \ if (nargs == 2) { \ if (!PyLong_Check(args[1])) { \ PyErr_Format(PyExc_TypeError, \ "'%s' object cannot be interpreted as an integer", \ Py_TYPE(args[1])->tp_name); \ return NULL; \ } \ const unsigned long seed_tmp = PyLong_AsUnsignedLong(args[1]); \ if (seed_tmp == (unsigned long)-1 && PyErr_Occurred()) { \ if (PyErr_ExceptionMatches(PyExc_OverflowError)) { \ PyErr_SetString(PyExc_ValueError, "seed is out of 
range"); \ return NULL; \ } \ } \ if (seed_tmp > 0xFFFFFFFF) { \ PyErr_SetString(PyExc_ValueError, "seed is out of range"); \ return NULL; \ } \ seed = (uint32_t)seed_tmp; \ } //----------------------------------------------------------------------------- // Helpers for mutex manipulations for hashers #ifdef Py_GIL_DISABLED #define MMH3_HASHER_LOCK(obj) PyMutex_Lock(&(obj->mutex)) #define MMH3_HASHER_UNLOCK(obj) PyMutex_Unlock(&(obj->mutex)) #define MMH3_HASHER_INIT_MUTEX(obj) \ PyMutex t = {0}; \ obj->mutex = t; #else #define MMH3_HASHER_LOCK(obj) (void)0 #define MMH3_HASHER_UNLOCK(obj) (void)0 #define MMH3_HASHER_INIT_MUTEX(obj) (void)0 #endif //----------------------------------------------------------------------------- // One shot functions PyDoc_STRVAR( mmh3_hash_doc, "hash(key, seed=0, signed=True) -> int\n" "\n" "Return a hash as a 32-bit integer.\n" "\n" "Calculated by the MurmurHash3_x86_32 algorithm.\n" "\n" "Args:\n" " key (bytes | str): The input data to hash.\n" " seed (int): The seed value. Must be an integer in the range\n" " [0, 0xFFFFFFFF].\n" " signed (Any): If True, return a signed integer. Otherwise, return\n" " an unsigned integer.\n" "\n" "Returns:\n" " int: The hash value as a 32-bit integer.\n" "\n" ".. versionchanged:: 5.0.0\n" " The ``seed`` argument is now strictly checked for valid range.\n" " The type of the ``signed`` argument has been changed from\n" " ``bool`` to ``Any``. 
Performance improvements have been made.\n"); static PyObject * mmh3_hash(PyObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) { const char *target_str; Py_ssize_t target_str_len; unsigned long seed = 0; int32_t result[1]; long long_result = 0; int is_signed = 1; #ifndef _MSC_VER #if __LONG_WIDTH__ == 64 || defined(__APPLE__) static uint64_t mask[] = {0x0ffffffff, 0xffffffffffffffff}; #endif #endif if ((nargs < 1) && kwnames == NULL) { PyErr_SetString(PyExc_TypeError, "function missing required argument 'key' (pos 1)"); return NULL; } if (nargs > 3) { PyErr_Format(PyExc_TypeError, "function takes at most 3 arguments (%d given)", (int)nargs); return NULL; } if (nargs >= 1) { MMH3_HASH_VALIDATE_AND_SET_BYTES(args[0], target_str, target_str_len); } if (nargs >= 2) { MMH3_HASH_VALIDATE_AND_SET_SEED(args[1], seed); } if (nargs >= 3) { is_signed = PyObject_IsTrue(args[2]); } if (kwnames) { for (Py_ssize_t i = 0; i < PyTuple_Size(kwnames); i++) { const char *kwname = PyUnicode_AsUTF8(PyTuple_GetItem(kwnames, i)); if (strcmp(kwname, "key") == 0) { MMH3_HASH_VALIDATE_ARG_DUPLICATION(nargs, "key", 1); MMH3_HASH_VALIDATE_AND_SET_BYTES(args[nargs + i], target_str, target_str_len); } else if (strcmp(kwname, "seed") == 0) { MMH3_HASH_VALIDATE_ARG_DUPLICATION(nargs, "seed", 2); MMH3_HASH_VALIDATE_AND_SET_SEED(args[nargs + i], seed); } else if (strcmp(kwname, "signed") == 0) { MMH3_HASH_VALIDATE_ARG_DUPLICATION(nargs, "signed", 3); is_signed = PyObject_IsTrue(args[nargs + i]); } else { PyErr_Format( PyExc_TypeError, "'%s' is an invalid keyword argument for this function", kwname); return NULL; } } } murmurhash3_x86_32(target_str, target_str_len, (uint32_t)seed, result); #if defined(_MSC_VER) /* for Windows envs */ long_result = result[0]; if (is_signed == 1) { return PyLong_FromLong(long_result); } else { return PyLong_FromUnsignedLong(long_result); } #else // defined(_MSC_VER) /* for standard envs */ #if __LONG_WIDTH__ == 64 || defined(__APPLE__) 
long_result = result[0] & mask[is_signed]; return PyLong_FromLong(long_result); #else // __LONG_WIDTH__ == 64 || defined(__APPLE__) long_result = result[0]; if (is_signed == 1) { return PyLong_FromLong(long_result); } else { return PyLong_FromUnsignedLong(long_result); } #endif // __LONG_WIDTH__ == 64 || defined(__APPLE__) #endif // defined(_MSC_VER) } PyDoc_STRVAR( mmh3_hash_from_buffer_doc, "hash_from_buffer(key, seed=0, signed=True) -> int\n" "\n" "Return a hash for the buffer as a 32-bit integer.\n" "\n" "Calculated by the MurmurHash3_x86_32 algorithm. Designed for large " "memory-views such as numpy arrays.\n" "\n" "Args:\n" " key (Buffer | str): The buffer to hash. String inputs are also\n" " supported and are automatically converted to `bytes` using\n" " UTF-8 encoding before hashing.\n" " seed (int): The seed value. Must be an integer in the range\n" " [0, 0xFFFFFFFF].\n" " signed (Any): If True, return a signed integer. Otherwise, return\n" " an unsigned integer.\n" "\n" "Returns:\n" " int: The hash value as a 32-bit integer.\n" "\n" ".. deprecated:: 5.0.0\n" " Use ``mmh3_32_sintdigest()`` or ``mmh3_32_uintdigest()`` instead.\n" "\n" ".. 
versionchanged:: 5.0.0\n" " The ``seed`` argument is now strictly checked for valid range.\n" " The type of the ``signed`` argument has been changed from\n" " ``bool`` to ``Any``.\n"); static PyObject * mmh3_hash_from_buffer(PyObject *self, PyObject *args, PyObject *keywds) { Py_buffer target_buf; long long seed = 0; int32_t result[1]; long long_result = 0; int is_signed = 1; static char *kwlist[] = {"key", "seed", "signed", NULL}; #ifndef _MSC_VER #if __LONG_WIDTH__ == 64 || defined(__APPLE__) static uint64_t mask[] = {0x0ffffffff, 0xffffffffffffffff}; #endif #endif if (!PyArg_ParseTupleAndKeywords(args, keywds, "s*|Lp", kwlist, &target_buf, &seed, &is_signed)) { return NULL; } MMH3_VALIDATE_SEED_RETURN_NULL(seed); murmurhash3_x86_32(target_buf.buf, target_buf.len, (uint32_t)seed, result); PyBuffer_Release(&target_buf); #if defined(_MSC_VER) /* for Windows envs */ long_result = result[0]; if (is_signed == 1) { return PyLong_FromLong(long_result); } else { return PyLong_FromUnsignedLong(long_result); } #else // defined(_MSC_VER) /* for standard envs */ #if __LONG_WIDTH__ == 64 || defined(__APPLE__) long_result = result[0] & mask[is_signed]; return PyLong_FromLong(long_result); #else // __LONG_WIDTH__ == 64 || defined(__APPLE__) long_result = result[0]; if (is_signed == 1) { return PyLong_FromLong(long_result); } else { return PyLong_FromUnsignedLong(long_result); } #endif // __LONG_WIDTH__ == 64 || defined(__APPLE__) #endif // defined(_MSC_VER) } PyDoc_STRVAR( mmh3_hash64_doc, "hash64(key, seed=0, x64arch=True, signed=True) -> tuple[int, int]\n" "\n" "Return a hash as a tuple of two 64-bit integers.\n" "\n" "Calculated by the MurmurHash3_x{64, 86}_128 algorithm.\n" "\n" "Args:\n" " key (bytes | str): The input data to hash.\n" " seed (int): The seed value. Must be an integer in the range\n" " [0, 0xFFFFFFFF].\n" " x64arch (Any): If True, use an algorithm optimized for 64-bit\n" " architecture. 
Otherwise, use one optimized for 32-bit\n" " architecture.\n" " signed (Any): If True, return a signed integer. Otherwise, return\n" " an unsigned integer.\n" "\n" "Returns:\n" " tuple[int, int]: The hash value as a tuple of two 64-bit " "integers.\n" "\n" ".. versionchanged:: 5.1.0\n" " Performance improvements.\n" "\n" ".. versionchanged:: 5.0.0\n" " The ``seed`` argument is now strictly checked for valid range.\n" " The type of the ``x64arch`` and ``signed`` arguments has been\n" " changed from ``bool`` to ``Any``.\n"); static PyObject * mmh3_hash64(PyObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) { const char *target_str; Py_ssize_t target_str_len; long long seed = 0; uint64_t result[2]; int x64arch = 1; int is_signed = 1; static char *valflag[] = {"KK", "LL"}; if ((nargs < 1) && kwnames == NULL) { PyErr_SetString(PyExc_TypeError, "function missing required argument 'key' (pos 1)"); return NULL; } if (nargs > 4) { PyErr_Format(PyExc_TypeError, "function takes at most 4 arguments (%d given)", (int)nargs); return NULL; } if (nargs >= 1) { MMH3_HASH_VALIDATE_AND_SET_BYTES(args[0], target_str, target_str_len); } if (nargs >= 2) { MMH3_HASH_VALIDATE_AND_SET_SEED(args[1], seed); } if (nargs >= 3) { x64arch = PyObject_IsTrue(args[2]); } if (nargs >= 4) { is_signed = PyObject_IsTrue(args[2]); } if (kwnames) { for (Py_ssize_t i = 0; i < PyTuple_Size(kwnames); i++) { const char *kwname = PyUnicode_AsUTF8(PyTuple_GetItem(kwnames, i)); if (strcmp(kwname, "key") == 0) { MMH3_HASH_VALIDATE_ARG_DUPLICATION(nargs, "key", 1); MMH3_HASH_VALIDATE_AND_SET_BYTES(args[nargs + i], target_str, target_str_len); } else if (strcmp(kwname, "seed") == 0) { MMH3_HASH_VALIDATE_ARG_DUPLICATION(nargs, "seed", 2); MMH3_HASH_VALIDATE_AND_SET_SEED(args[nargs + i], seed); } else if (strcmp(kwname, "x64arch") == 0) { MMH3_HASH_VALIDATE_ARG_DUPLICATION(nargs, "x64arch", 3); x64arch = PyObject_IsTrue(args[nargs + i]); } else if (strcmp(kwname, "signed") == 0) { 
MMH3_HASH_VALIDATE_ARG_DUPLICATION(nargs, "signed", 4); is_signed = PyObject_IsTrue(args[nargs + i]); } else { PyErr_Format( PyExc_TypeError, "'%s' is an invalid keyword argument for this function", kwname); return NULL; } } } if (x64arch == 1) { murmurhash3_x64_128(target_str, target_str_len, (uint32_t)seed, result); } else { murmurhash3_x86_128(target_str, target_str_len, (uint32_t)seed, result); } PyObject *retval = Py_BuildValue(valflag[is_signed], result[0], result[1]); return retval; } PyDoc_STRVAR( mmh3_hash128_doc, "hash128(key, seed=0, x64arch=True, signed=False) -> int\n" "\n" "Return a hash as a 128-bit integer.\n\n" "Calculated by the MurmurHash3_x{64, 86}_128 algorithm.\n" "\n" "Args:\n" " key (bytes | str): The input data to hash.\n" " seed (int): The seed value. Must be an integer in the range\n" " [0, 0xFFFFFFFF].\n" " x64arch (Any): If True, use an algorithm optimized for 64-bit\n" " architecture. Otherwise, use one optimized for 32-bit\n" " architecture.\n" " signed (Any): If True, return a signed integer. Otherwise, return\n" " an unsigned integer.\n" "\n" "Returns:\n" " int: The hash value as a 128-bit integer.\n" "\n" ".. versionchanged:: 5.1.0\n" " Performance improvements.\n" "\n" ".. 
versionchanged:: 5.0.0\n" " The ``seed`` argument is now strictly checked for valid range.\n" " The type of the ``x64arch`` and ``signed`` arguments has been\n" " changed from ``bool`` to ``Any``.\n"); static PyObject * mmh3_hash128(PyObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) { const char *target_str; Py_ssize_t target_str_len; long long seed = 0; uint64_t result[2]; int x64arch = 1; int is_signed = 0; if ((nargs < 1) && kwnames == NULL) { PyErr_SetString(PyExc_TypeError, "function missing required argument 'key' (pos 1)"); return NULL; } if (nargs > 4) { PyErr_Format(PyExc_TypeError, "function takes at most 4 arguments (%d given)", (int)nargs); return NULL; } if (nargs >= 1) { MMH3_HASH_VALIDATE_AND_SET_BYTES(args[0], target_str, target_str_len); } if (nargs >= 2) { MMH3_HASH_VALIDATE_AND_SET_SEED(args[1], seed); } if (nargs >= 3) { x64arch = PyObject_IsTrue(args[2]); } if (nargs >= 4) { is_signed = PyObject_IsTrue(args[2]); } if (kwnames) { for (Py_ssize_t i = 0; i < PyTuple_Size(kwnames); i++) { const char *kwname = PyUnicode_AsUTF8(PyTuple_GetItem(kwnames, i)); if (strcmp(kwname, "key") == 0) { MMH3_HASH_VALIDATE_ARG_DUPLICATION(nargs, "key", 1); MMH3_HASH_VALIDATE_AND_SET_BYTES(args[nargs + i], target_str, target_str_len); } else if (strcmp(kwname, "seed") == 0) { MMH3_HASH_VALIDATE_ARG_DUPLICATION(nargs, "seed", 2); MMH3_HASH_VALIDATE_AND_SET_SEED(args[nargs + i], seed); } else if (strcmp(kwname, "x64arch") == 0) { MMH3_HASH_VALIDATE_ARG_DUPLICATION(nargs, "x64arch", 3); x64arch = PyObject_IsTrue(args[nargs + i]); } else if (strcmp(kwname, "signed") == 0) { MMH3_HASH_VALIDATE_ARG_DUPLICATION(nargs, "signed", 4); is_signed = PyObject_IsTrue(args[nargs + i]); } else { PyErr_Format( PyExc_TypeError, "'%s' is an invalid keyword argument for this function", kwname); return NULL; } } } if (x64arch == 1) { murmurhash3_x64_128(target_str, target_str_len, seed, result); } else { murmurhash3_x86_128(target_str, target_str_len, seed, 
result); } #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) result[0] = bswap_64(result[0]); result[1] = bswap_64(result[1]); #endif /** * _PyLong_FromByteArray is not a part of the official Python/C API * and may be removed in the future (although it is practically stable). * cf. * https://mail.python.org/pipermail/python-list/2006-August/372365.html */ PyObject *retval = _PyLong_FromByteArray( (unsigned char *)result, MMH3_128_DIGESTSIZE, 1, is_signed); return retval; } PyDoc_STRVAR( mmh3_hash_bytes_doc, "hash_bytes(key, seed=0, x64arch=True) -> bytes\n" "\n" "Return a 16-byte hash of the ``bytes`` type.\n" "\n" "Args:\n" " key (bytes | str): The input data to hash.\n" " seed (int): The seed value. Must be an integer in the range\n" " [0, 0xFFFFFFFF].\n" " x64arch (Any): If True, use an algorithm optimized for 64-bit\n" " architecture. Otherwise, use one optimized for 32-bit\n" " architecture.\n" "Returns:\n" " bytes: The hash value as the ``bytes`` type with a length of 16\n" " bytes (128 bits).\n") "\n" ".. versionchanged:: 5.1.0\n" " Performance improvements.\n" "\n" ".. 
versionchanged:: 5.0.0\n" " The ``seed`` argument is now strictly checked for valid range.\n" " The type of the ``x64arch`` argument has been changed from\n" " ``bool`` to ``Any``.\n"; static PyObject * mmh3_hash_bytes(PyObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) { const char *target_str; Py_ssize_t target_str_len; long long seed = 0; uint64_t result[2]; int x64arch = 1; if ((nargs < 1) && kwnames == NULL) { PyErr_SetString(PyExc_TypeError, "function missing required argument 'key' (pos 1)"); return NULL; } if (nargs > 3) { PyErr_Format(PyExc_TypeError, "function takes at most 3 arguments (%d given)", (int)nargs); return NULL; } if (nargs >= 1) { MMH3_HASH_VALIDATE_AND_SET_BYTES(args[0], target_str, target_str_len); } if (nargs >= 2) { MMH3_HASH_VALIDATE_AND_SET_SEED(args[1], seed); } if (nargs >= 3) { x64arch = PyObject_IsTrue(args[2]); } if (kwnames) { for (Py_ssize_t i = 0; i < PyTuple_Size(kwnames); i++) { const char *kwname = PyUnicode_AsUTF8(PyTuple_GetItem(kwnames, i)); if (strcmp(kwname, "key") == 0) { MMH3_HASH_VALIDATE_ARG_DUPLICATION(nargs, "key", 1); MMH3_HASH_VALIDATE_AND_SET_BYTES(args[nargs + i], target_str, target_str_len); } else if (strcmp(kwname, "seed") == 0) { MMH3_HASH_VALIDATE_ARG_DUPLICATION(nargs, "seed", 2); MMH3_HASH_VALIDATE_AND_SET_SEED(args[nargs + i], seed); } else if (strcmp(kwname, "x64arch") == 0) { MMH3_HASH_VALIDATE_ARG_DUPLICATION(nargs, "x64arch", 3); x64arch = PyObject_IsTrue(args[nargs + i]); } else { PyErr_Format( PyExc_TypeError, "'%s' is an invalid keyword argument for this function", kwname); return NULL; } } } if (x64arch == 1) { murmurhash3_x64_128(target_str, target_str_len, seed, result); } else { murmurhash3_x86_128(target_str, target_str_len, seed, result); } #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) result[0] = bswap_64(result[0]); result[1] = bswap_64(result[1]); #endif return PyBytes_FromStringAndSize((char *)result, MMH3_128_DIGESTSIZE); } 
//----------------------------------------------------------------------------- // Functions that accept a buffer PyDoc_STRVAR( mmh3_mmh3_32_digest_doc, "mmh3_32_digest(key, seed=0, /) -> bytes\n" "\n" "Return a 4-byte hash of the ``bytes`` type for the buffer.\n" "\n" "Calculated by the MurmurHash3_x86_32 algorithm.\n" "\n" "Args:\n" " key (Buffer): The input buffer to hash.\n" " seed (int): The seed value. Must be an integer in the range\n" " [0, 0xFFFFFFFF].\n" "\n" "Returns:\n" " bytes: The hash value as the ``bytes`` type with a length of\n" " 4 bytes (32 bits).\n" "\n" ".. versionadded:: 5.0.0\n"); static PyObject * mmh3_mmh3_32_digest(PyObject *self, PyObject *const *args, Py_ssize_t nargs) { Py_buffer target_buf; uint32_t seed = 0; char result[MMH3_32_DIGESTSIZE]; MMH3_VALIDATE_ARGS_AND_SET_SEED(nargs, args, seed); GET_BUFFER_VIEW_OR_ERROUT(args[0], &target_buf); murmurhash3_x86_32(target_buf.buf, target_buf.len, seed, result); PyBuffer_Release(&target_buf); #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) ((uint32_t *)result)[0] = bswap_32(((uint32_t *)result)[0]); #endif return PyBytes_FromStringAndSize((char *)result, MMH3_32_DIGESTSIZE); } PyDoc_STRVAR( mmh3_mmh3_32_sintdigest_doc, "mmh3_32_sintdigest(key, seed=0, /) -> int\n" "\n" "Return a hash for the buffer as a 32-bit signed integer.\n" "\n" "Calculated by the MurmurHash3_x86_32 algorithm.\n" "\n" "Args:\n" " key (Buffer): The input buffer to hash.\n" " seed (int): The seed value. Must be an integer in the range\n" " [0, 0xFFFFFFFF].\n" "\n" "Returns:\n" " int: The hash value as a 32-bit signed integer.\n" "\n" ".. 
versionadded:: 5.0.0\n"); static PyObject * mmh3_mmh3_32_sintdigest(PyObject *self, PyObject *const *args, Py_ssize_t nargs) { Py_buffer target_buf; uint32_t seed = 0; int32_t result[1]; MMH3_VALIDATE_ARGS_AND_SET_SEED(nargs, args, seed); GET_BUFFER_VIEW_OR_ERROUT(args[0], &target_buf); murmurhash3_x86_32(target_buf.buf, target_buf.len, seed, result); PyBuffer_Release(&target_buf); return PyLong_FromLong(result[0]); } PyDoc_STRVAR( mmh3_mmh3_32_uintdigest_doc, "mmh3_32_uintdigest(key, seed=0, /) -> int\n" "\n" "Return a hash for the buffer as a 32-bit unsigned integer.\n" "\n" "Calculated by the MurmurHash3_x86_32 algorithm.\n" "\n" "Args:\n" " key (Buffer): The input buffer to hash.\n" " seed (int): The seed value. Must be an integer in the range\n" " [0, 0xFFFFFFFF].\n" "\n" "Returns:\n" " int: The hash value as a 32-bit unsigned integer.\n" "\n" ".. versionadded:: 5.0.0\n"); static PyObject * mmh3_mmh3_32_uintdigest(PyObject *self, PyObject *const *args, Py_ssize_t nargs) { Py_buffer target_buf; uint32_t seed = 0; uint32_t result[1]; MMH3_VALIDATE_ARGS_AND_SET_SEED(nargs, args, seed); GET_BUFFER_VIEW_OR_ERROUT(args[0], &target_buf); murmurhash3_x86_32(target_buf.buf, target_buf.len, seed, result); PyBuffer_Release(&target_buf); return PyLong_FromUnsignedLong(result[0]); } PyDoc_STRVAR( mmh3_mmh3_x64_128_digest_doc, "mmh3_x64_128_digest(key, seed=0, /) -> bytes\n" "\n" "Return a 16-byte hash of the ``bytes`` type for the buffer.\n" "\n" "Calculated by the MurmurHash3_x64_128 algorithm.\n" "\n" "Args:\n" " key (Buffer): The input buffer to hash.\n" " seed (int): The seed value. Must be an integer in the range\n" " [0, 0xFFFFFFFF].\n" "\n" "Returns:\n" " bytes: The hash value as the ``bytes`` type with a length of\n" " 16 bytes (128 bits).\n" "\n" ".. 
versionadded:: 5.0.0\n"); static PyObject * mmh3_mmh3_x64_128_digest(PyObject *self, PyObject *const *args, Py_ssize_t nargs) { Py_buffer target_buf; uint32_t seed = 0; uint64_t result[2]; MMH3_VALIDATE_ARGS_AND_SET_SEED(nargs, args, seed); GET_BUFFER_VIEW_OR_ERROUT(args[0], &target_buf); murmurhash3_x64_128(target_buf.buf, target_buf.len, seed, result); PyBuffer_Release(&target_buf); #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) result[0] = bswap_64(result[0]); result[1] = bswap_64(result[1]); #endif return PyBytes_FromStringAndSize((char *)result, MMH3_128_DIGESTSIZE); } PyDoc_STRVAR( mmh3_mmh3_x64_128_sintdigest_doc, "mmh3_x64_128_sintdigest(key, seed=0, /) -> int\n" "\n" "Return a hash for the buffer as a 128-bit signed integer.\n" "\n" "Calculated by the MurmurHash3_x64_128 algorithm.\n" "\n" "Args:\n" " key (Buffer): The input buffer to hash.\n" " seed (int): The seed value. Must be an integer in the range\n" " [0, 0xFFFFFFFF].\n" "\n" "Returns:\n" " int: The hash value as a 128-bit signed integer.\n" "\n" ".. versionadded:: 5.0.0\n"); static PyObject * mmh3_mmh3_x64_128_sintdigest(PyObject *self, PyObject *const *args, Py_ssize_t nargs) { Py_buffer target_buf; uint32_t seed = 0; uint64_t result[2]; MMH3_VALIDATE_ARGS_AND_SET_SEED(nargs, args, seed); GET_BUFFER_VIEW_OR_ERROUT(args[0], &target_buf); murmurhash3_x64_128(target_buf.buf, target_buf.len, seed, result); PyBuffer_Release(&target_buf); #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) result[0] = bswap_64(result[0]); result[1] = bswap_64(result[1]); #endif /** * _PyLong_FromByteArray is not a part of the official Python/C API * and may be removed in the future (although it is practically stable). * cf. 
* https://mail.python.org/pipermail/python-list/2006-August/372365.html */ PyObject *retval = _PyLong_FromByteArray((unsigned char *)result, MMH3_128_DIGESTSIZE, 1, 1); return retval; } PyDoc_STRVAR( mmh3_mmh3_x64_128_uintdigest_doc, "mmh3_x64_128_uintdigest(key, seed=0, /) -> int\n" "\n" "Return a hash for the buffer as a 128-bit unsigned integer.\n" "\n" "Calculated by the MurmurHash3_x64_128 algorithm.\n" "\n" "Args:\n" " key (Buffer): The input buffer to hash.\n" " seed (int): The seed value. Must be an integer in the range\n" " [0, 0xFFFFFFFF].\n" "\n" "Returns:\n" " int: The hash value as a 128-bit unsigned integer.\n" "\n" ".. versionadded:: 5.0.0\n"); static PyObject * mmh3_mmh3_x64_128_uintdigest(PyObject *self, PyObject *const *args, Py_ssize_t nargs) { Py_buffer target_buf; uint32_t seed = 0; uint64_t result[2]; MMH3_VALIDATE_ARGS_AND_SET_SEED(nargs, args, seed); GET_BUFFER_VIEW_OR_ERROUT(args[0], &target_buf); murmurhash3_x64_128(target_buf.buf, target_buf.len, seed, result); PyBuffer_Release(&target_buf); #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) result[0] = bswap_64(result[0]); result[1] = bswap_64(result[1]); #endif /** * _PyLong_FromByteArray is not a part of the official Python/C API * and may be removed in the future (although it is practically stable). * cf. * https://mail.python.org/pipermail/python-list/2006-August/372365.html */ PyObject *retval = _PyLong_FromByteArray((unsigned char *)result, MMH3_128_DIGESTSIZE, 1, 0); return retval; } PyDoc_STRVAR( mmh3_mmh3_x64_128_stupledigest_doc, "mmh3_x64_128_stupledigest(key, seed=0, /) -> tuple[int, int]\n" "\n" "Return a hash for the buffer as a tuple of two 64-bit signed integers.\n" "\n" "Calculated by the MurmurHash3_x64_128 algorithm.\n" "\n" "Args:\n" " key (Buffer): The input buffer to hash.\n" " seed (int): The seed value. 
Must be an integer in the range\n" " [0, 0xFFFFFFFF].\n" "\n" "Returns:\n" " tuple[int, int]: The hash value as a tuple of two 64-bit signed\n" " integers.\n" "\n" ".. versionadded:: 5.0.0\n"); static PyObject * mmh3_mmh3_x64_128_stupledigest(PyObject *self, PyObject *const *args, Py_ssize_t nargs) { Py_buffer target_buf; uint32_t seed = 0; uint64_t result[2]; MMH3_VALIDATE_ARGS_AND_SET_SEED(nargs, args, seed); GET_BUFFER_VIEW_OR_ERROUT(args[0], &target_buf); murmurhash3_x64_128(target_buf.buf, target_buf.len, seed, result); PyBuffer_Release(&target_buf); PyObject *retval = Py_BuildValue("LL", result[0], result[1]); return retval; } PyDoc_STRVAR( mmh3_mmh3_x64_128_utupledigest_doc, "mmh3_x64_128_utupledigest(key, seed=0, /) -> tuple[int, int]\n" "\n" "Return a hash for the buffer as a tuple of two 64-bit unsigned " "integers.\n" "\n" "Calculated by the MurmurHash3_x64_128 algorithm.\n" "\n" "Args:\n" " key (Buffer): The input buffer to hash.\n" " seed (int): The seed value. Must be an integer in the range\n" " [0, 0xFFFFFFFF].\n" "\n" "Returns:\n" " tuple[int, int]: The hash value as a tuple of two 64-bit unsigned\n" " integers.\n" "\n" ".. versionadded:: 5.0.0\n"); static PyObject * mmh3_mmh3_x64_128_utupledigest(PyObject *self, PyObject *const *args, Py_ssize_t nargs) { Py_buffer target_buf; uint32_t seed = 0; uint64_t result[2]; MMH3_VALIDATE_ARGS_AND_SET_SEED(nargs, args, seed); GET_BUFFER_VIEW_OR_ERROUT(args[0], &target_buf); murmurhash3_x64_128(target_buf.buf, target_buf.len, seed, result); PyBuffer_Release(&target_buf); PyObject *retval = Py_BuildValue("KK", result[0], result[1]); return retval; } PyDoc_STRVAR( mmh3_mmh3_x86_128_digest_doc, "mmh3_x86_128_digest(key, seed=0, /) -> bytes\n" "\n" "Return a 16-byte hash of the ``bytes`` type for the buffer.\n" "\n" "Calculated by the MurmurHash3_x86_128 algorithm.\n" "\n" "Args:\n" " key (Buffer): The input buffer to hash.\n" " seed (int): The seed value. 
Must be an integer in the range\n" " [0, 0xFFFFFFFF].\n" "\n" "Returns:\n" " bytes: The hash value as the ``bytes`` type with a length of\n" " 16 bytes (128 bits).\n" "\n" ".. versionadded:: 5.0.0\n"); static PyObject * mmh3_mmh3_x86_128_digest(PyObject *self, PyObject *const *args, Py_ssize_t nargs) { Py_buffer target_buf; uint32_t seed = 0; uint64_t result[2]; MMH3_VALIDATE_ARGS_AND_SET_SEED(nargs, args, seed); GET_BUFFER_VIEW_OR_ERROUT(args[0], &target_buf); murmurhash3_x86_128(target_buf.buf, target_buf.len, seed, result); PyBuffer_Release(&target_buf); #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) result[0] = bswap_64(result[0]); result[1] = bswap_64(result[1]); #endif return PyBytes_FromStringAndSize((char *)result, MMH3_128_DIGESTSIZE); } PyDoc_STRVAR( mmh3_mmh3_x86_128_sintdigest_doc, "mmh3_x86_128_sintdigest(key, seed=0, /) -> int\n" "\n" "Return a hash for the buffer as a 128-bit signed integer.\n" "\n" "Calculated by the MurmurHash3_x86_128 algorithm.\n" "\n" "Args:\n" " key (Buffer): The input buffer to hash.\n" " seed (int): The seed value. Must be an integer in the range\n" " [0, 0xFFFFFFFF].\n" "\n" "Returns:\n" " int: The hash value as an signed 128-bit integer.\n" "\n" ".. versionadded:: 5.0.0\n"); static PyObject * mmh3_mmh3_x86_128_sintdigest(PyObject *self, PyObject *const *args, Py_ssize_t nargs) { Py_buffer target_buf; uint32_t seed = 0; uint64_t result[2]; MMH3_VALIDATE_ARGS_AND_SET_SEED(nargs, args, seed); GET_BUFFER_VIEW_OR_ERROUT(args[0], &target_buf); murmurhash3_x86_128(target_buf.buf, target_buf.len, seed, result); PyBuffer_Release(&target_buf); #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) result[0] = bswap_64(result[0]); result[1] = bswap_64(result[1]); #endif /** * _PyLong_FromByteArray is not a part of the official Python/C API * and may be removed in the future (although it is practically stable). * cf. 
* https://mail.python.org/pipermail/python-list/2006-August/372365.html */ PyObject *retval = _PyLong_FromByteArray((unsigned char *)result, MMH3_128_DIGESTSIZE, 1, 1); return retval; } PyDoc_STRVAR( mmh3_mmh3_x86_128_uintdigest_doc, "mmh3_x86_128_uintdigest(key, seed=0, /) -> int\n" "\n" "Return a hash for the buffer as a 128-bit unsigned integer.\n" "\n" "Calculated by the MurmurHash3_x86_128 algorithm.\n" "\n" "Args:\n" " key (Buffer): The input buffer to hash.\n" " seed (int): The seed value. Must be an integer in the range\n" " [0, 0xFFFFFFFF].\n" "\n" "Returns:\n" " int: The hash value as a 128-bit unsigned integer.\n" "\n" ".. versionadded:: 5.0.0\n"); static PyObject * mmh3_mmh3_x86_128_uintdigest(PyObject *self, PyObject *const *args, Py_ssize_t nargs) { Py_buffer target_buf; uint32_t seed = 0; uint64_t result[2]; MMH3_VALIDATE_ARGS_AND_SET_SEED(nargs, args, seed); GET_BUFFER_VIEW_OR_ERROUT(args[0], &target_buf); murmurhash3_x86_128(target_buf.buf, target_buf.len, seed, result); PyBuffer_Release(&target_buf); #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) result[0] = bswap_64(result[0]); result[1] = bswap_64(result[1]); #endif /** * _PyLong_FromByteArray is not a part of the official Python/C API * and may be removed in the future (although it is practically stable). * cf. * https://mail.python.org/pipermail/python-list/2006-August/372365.html */ PyObject *retval = _PyLong_FromByteArray((unsigned char *)result, MMH3_128_DIGESTSIZE, 1, 0); return retval; } PyDoc_STRVAR( mmh3_mmh3_x86_128_stupledigest_doc, "mmh3_x86_128_stupledigest(key, seed=0, /) -> tuple[int, int]\n" "\n" "Return a hash for the buffer as a tuple of two 64-bit signed integers.\n" "\n" "Calculated by the MurmurHash3_x86_128 algorithm.\n" "\n" "Args:\n" " key (Buffer): The input buffer to hash.\n" " seed (int): The seed value. 
Must be an integer in the range\n" " [0, 0xFFFFFFFF].\n" "\n" "Returns:\n" " tuple[int, int]: The hash value as a tuple of two 64-bit signed\n" " integers.\n" "\n" ".. versionadded:: 5.0.0\n"); static PyObject * mmh3_mmh3_x86_128_stupledigest(PyObject *self, PyObject *const *args, Py_ssize_t nargs) { Py_buffer target_buf; uint32_t seed = 0; uint64_t result[2]; MMH3_VALIDATE_ARGS_AND_SET_SEED(nargs, args, seed); GET_BUFFER_VIEW_OR_ERROUT(args[0], &target_buf); murmurhash3_x86_128(target_buf.buf, target_buf.len, seed, result); PyBuffer_Release(&target_buf); PyObject *retval = Py_BuildValue("LL", result[0], result[1]); return retval; } PyDoc_STRVAR( mmh3_mmh3_x86_128_utupledigest_doc, "mmh3_x86_128_utupledigest(key, seed=0, /) -> tuple[int, int]\n" "\n" "Return a hash for the buffer as a tuple of two 64-bit unsigned " "integers.\n" "\n" "Calculated by the MurmurHash3_x86_128 algorithm.\n" "\n" "Args:\n" " key (Buffer): The input buffer to hash.\n" " seed (int): The seed value. Must be an integer in the range\n" " [0, 0xFFFFFFFF].\n" "\n" "Returns:\n" " tuple[int, int]: The hash value as a tuple of two 64-bit unsigned\n" " integers.\n" "\n" ".. versionadded:: 5.0.0\n"); static PyObject * mmh3_mmh3_x86_128_utupledigest(PyObject *self, PyObject *const *args, Py_ssize_t nargs) { Py_buffer target_buf; uint32_t seed = 0; uint64_t result[2]; MMH3_VALIDATE_ARGS_AND_SET_SEED(nargs, args, seed); GET_BUFFER_VIEW_OR_ERROUT(args[0], &target_buf); murmurhash3_x86_128(target_buf.buf, target_buf.len, seed, result); PyBuffer_Release(&target_buf); PyObject *retval = Py_BuildValue("KK", result[0], result[1]); return retval; } // Casting to PyCFunction is mandatory for // METH_VARARGS | METH_KEYWORDS functions. 
// See // https://docs.python.org/3/extending/extending.html#keyword-parameters-for-extension-functions static PyMethodDef Mmh3Methods[] = { {"hash", (PyCFunction)mmh3_hash, METH_FASTCALL | METH_KEYWORDS, mmh3_hash_doc}, {"hash_from_buffer", (PyCFunction)mmh3_hash_from_buffer, METH_VARARGS | METH_KEYWORDS, mmh3_hash_from_buffer_doc}, {"hash64", (PyCFunction)mmh3_hash64, METH_FASTCALL | METH_KEYWORDS, mmh3_hash64_doc}, {"hash128", (PyCFunction)mmh3_hash128, METH_FASTCALL | METH_KEYWORDS, mmh3_hash128_doc}, {"hash_bytes", (PyCFunction)mmh3_hash_bytes, METH_FASTCALL | METH_KEYWORDS, mmh3_hash_bytes_doc}, {"mmh3_32_digest", (PyCFunction)mmh3_mmh3_32_digest, METH_FASTCALL, mmh3_mmh3_32_digest_doc}, {"mmh3_32_sintdigest", (PyCFunction)mmh3_mmh3_32_sintdigest, METH_FASTCALL, mmh3_mmh3_32_sintdigest_doc}, {"mmh3_32_uintdigest", (PyCFunction)mmh3_mmh3_32_uintdigest, METH_FASTCALL, mmh3_mmh3_32_uintdigest_doc}, {"mmh3_x64_128_digest", (PyCFunction)mmh3_mmh3_x64_128_digest, METH_FASTCALL, mmh3_mmh3_x64_128_digest_doc}, {"mmh3_x64_128_sintdigest", (PyCFunction)mmh3_mmh3_x64_128_sintdigest, METH_FASTCALL, mmh3_mmh3_x64_128_sintdigest_doc}, {"mmh3_x64_128_uintdigest", (PyCFunction)mmh3_mmh3_x64_128_uintdigest, METH_FASTCALL, mmh3_mmh3_x64_128_uintdigest_doc}, {"mmh3_x64_128_stupledigest", (PyCFunction)mmh3_mmh3_x64_128_stupledigest, METH_FASTCALL, mmh3_mmh3_x64_128_stupledigest_doc}, {"mmh3_x64_128_utupledigest", (PyCFunction)mmh3_mmh3_x64_128_utupledigest, METH_FASTCALL, mmh3_mmh3_x64_128_utupledigest_doc}, {"mmh3_x86_128_digest", (PyCFunction)mmh3_mmh3_x86_128_digest, METH_FASTCALL, mmh3_mmh3_x86_128_digest_doc}, {"mmh3_x86_128_sintdigest", (PyCFunction)mmh3_mmh3_x86_128_sintdigest, METH_FASTCALL, mmh3_mmh3_x86_128_sintdigest_doc}, {"mmh3_x86_128_uintdigest", (PyCFunction)mmh3_mmh3_x86_128_uintdigest, METH_FASTCALL, mmh3_mmh3_x86_128_uintdigest_doc}, {"mmh3_x86_128_stupledigest", (PyCFunction)mmh3_mmh3_x86_128_stupledigest, METH_FASTCALL, mmh3_mmh3_x86_128_stupledigest_doc}, 
{"mmh3_x86_128_utupledigest", (PyCFunction)mmh3_mmh3_x86_128_utupledigest, METH_FASTCALL, mmh3_mmh3_x86_128_utupledigest_doc}, {NULL, NULL, 0, NULL}}; //----------------------------------------------------------------------------- // Hasher classes // // The design of hasher classes are loosely based on the Google Guava // implementation (Java) //----------------------------------------------------------------------------- // Hasher for murmurhash3_x86_32 typedef struct { PyObject_HEAD uint32_t h; uint64_t buffer; uint8_t shift; Py_ssize_t length; #ifdef Py_GIL_DISABLED PyMutex mutex; #endif } MMH3Hasher32; static PyTypeObject MMH3Hasher32Type; static FORCE_INLINE void update32_impl(MMH3Hasher32 *self, Py_buffer *buf) { Py_ssize_t i = 0; uint32_t h1 = 0; uint32_t k1 = 0; const uint32_t c1 = 0xe6546b64; const uint64_t mask = 0xffffffffUL; MMH3_HASHER_LOCK(self); h1 = self->h; for (; i + 4 <= buf->len; i += 4) { k1 = getblock32(buf->buf, i / 4); self->buffer |= (k1 & mask) << self->shift; self->length += 4; h1 ^= mixK1(self->buffer); h1 = mixH1(h1, 0, 13, c1); self->buffer >>= 32; } for (; i < buf->len; i++) { k1 = ((uint8_t *)buf->buf)[i]; self->buffer |= (k1 & mask) << self->shift; self->shift += 8; self->length += 1; if (self->shift >= 32) { h1 ^= mixK1(self->buffer); h1 = mixH1(h1, 0, 13, c1); self->buffer >>= 32; self->shift -= 32; } } self->h = h1; MMH3_HASHER_UNLOCK(self); PyBuffer_Release(buf); return; } static void MMH3Hasher32_dealloc(MMH3Hasher32 *self) { Py_TYPE(self)->tp_free((PyObject *)self); } static PyObject * MMH3Hasher32_new(PyTypeObject *type, PyObject *args, PyObject *kwds) { MMH3Hasher32 *self; self = (MMH3Hasher32 *)type->tp_alloc(type, 0); if (self != NULL) { self->h = 0; self->buffer = 0; self->shift = 0; self->length = 0; MMH3_HASHER_INIT_MUTEX(self); } return (PyObject *)self; } /* It is impossible to add docstring for __init__ in Python C extension. Therefore, the constructor docstring should be described in the class docstring. 
See also https://stackoverflow.com/q/11913492 */ static int MMH3Hasher32_init(MMH3Hasher32 *self, PyObject *args, PyObject *kwds) { Py_buffer target_buf = {0}; long long seed = 0; static char *kwlist[] = {"data", "seed", NULL}; if (!PyArg_ParseTupleAndKeywords(args, kwds, "|y*L", kwlist, &target_buf, &seed)) return -1; MMH3_VALIDATE_SEED_RETURN_INT(seed, target_buf); self->h = (uint32_t)seed; if (target_buf.buf != NULL) { // target_buf will be released in update32_impl update32_impl(self, &target_buf); } return 0; } PyDoc_STRVAR( MMH3Hasher_update_doc, "update(data)\n" "\n" "Update this hash object's state with the provided bytes-like object.\n" "\n" "Args:\n" " data (Buffer): The buffer to hash.\n"); static PyObject * MMH3Hasher32_update(MMH3Hasher32 *self, PyObject *obj) { Py_buffer buf; GET_BUFFER_VIEW_OR_ERROUT(obj, &buf); // buf will be released in update32_impl update32_impl(self, &buf); Py_RETURN_NONE; } static FORCE_INLINE uint32_t digest32_impl(uint32_t h, uint64_t k1, Py_ssize_t length) { h ^= mixK1(k1); h ^= length; h = fmix32(h); return h; } PyDoc_STRVAR(MMH3Hasher_digest_doc, "digest() -> bytes\n" "\n" "Return the digest value as a ``bytes`` object.\n" "\n" "Returns:\n" " bytes: The digest value.\n"); static PyObject * MMH3Hasher32_digest(MMH3Hasher32 *self, PyObject *Py_UNUSED(ignored)) { MMH3_HASHER_LOCK(self); uint32_t h = digest32_impl(self->h, self->buffer, self->length); MMH3_HASHER_UNLOCK(self); char out[MMH3_32_DIGESTSIZE]; #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) ((uint32_t *)out)[0] = bswap_32(h); #else ((uint32_t *)out)[0] = h; #endif return PyBytes_FromStringAndSize(out, MMH3_32_DIGESTSIZE); } PyDoc_STRVAR(MMH3Hasher_sintdigest_doc, "sintdigest() -> int\n" "\n" "Return the digest value as a signed integer.\n" "\n" "Returns:\n" " int: The digest value as a signed integer.\n"); static PyObject * MMH3Hasher32_sintdigest(MMH3Hasher32 *self, PyObject *Py_UNUSED(ignored)) { MMH3_HASHER_LOCK(self); uint32_t h = 
digest32_impl(self->h, self->buffer, self->length); MMH3_HASHER_UNLOCK(self); // Note that simple casting ("(int32_t) h") is an undefined behavior int32_t result = *(int32_t *)&h; return PyLong_FromLong(result); } PyDoc_STRVAR(MMH3Hasher_uintdigest_doc, "uintdigest() -> int\n" "\n" "Return the digest value as an unsigned integer.\n" "\n" "Returns:\n" " int: The digest value as an unsigned integer.\n"); static PyObject * MMH3Hasher32_uintdigest(MMH3Hasher32 *self, PyObject *Py_UNUSED(ignored)) { MMH3_HASHER_LOCK(self); uint32_t h = digest32_impl(self->h, self->buffer, self->length); MMH3_HASHER_UNLOCK(self); return PyLong_FromUnsignedLong(h); } PyDoc_STRVAR(MMH3Hasher32_copy_doc, "copy() -> mmh3_32\n" "\n" "Return a copy of the hash object..\n" "\n" "Returns:\n" " mmh3_32: A copy of this hash object.\n"); static PyObject * MMH3Hasher32_copy(MMH3Hasher32 *self, PyObject *Py_UNUSED(ignored)) { MMH3Hasher32 *p; if ((p = PyObject_New(MMH3Hasher32, &MMH3Hasher32Type)) == NULL) { return NULL; } MMH3_HASHER_LOCK(self); p->h = self->h; p->buffer = self->buffer; p->shift = self->shift; p->length = self->length; MMH3_HASHER_INIT_MUTEX(p); MMH3_HASHER_UNLOCK(self); return (PyObject *)p; } static PyMethodDef MMH3Hasher32_methods[] = { {"update", (PyCFunction)MMH3Hasher32_update, METH_O, MMH3Hasher_update_doc}, { "digest", (PyCFunction)MMH3Hasher32_digest, METH_NOARGS, MMH3Hasher_digest_doc, }, {"sintdigest", (PyCFunction)MMH3Hasher32_sintdigest, METH_NOARGS, MMH3Hasher_sintdigest_doc}, {"uintdigest", (PyCFunction)MMH3Hasher32_uintdigest, METH_NOARGS, MMH3Hasher_uintdigest_doc}, {"copy", (PyCFunction)MMH3Hasher32_copy, METH_NOARGS, MMH3Hasher32_copy_doc}, {NULL} /* Sentinel */ }; static PyObject * MMH3Hasher32_get_digest_size(PyObject *self, void *closure) { return PyLong_FromLong(MMH3_32_DIGESTSIZE); } static PyObject * MMH3Hasher32_get_block_size(PyObject *self, void *closure) { return PyLong_FromLong(MMH3_32_BLOCKSIZE); } static PyObject * MMH3Hasher32_get_name(PyObject 
*self, void *closure) { return PyUnicode_FromStringAndSize("mmh3_32", 7); } static PyGetSetDef MMH3Hasher32_getsetters[] = { {"digest_size", (getter)MMH3Hasher32_get_digest_size, NULL, "int: Number of bytes in this hashes output", NULL}, {"block_size", (getter)MMH3Hasher32_get_block_size, NULL, "int: Number of bytes of the internal block of this algorithm", NULL}, {"name", (getter)MMH3Hasher32_get_name, NULL, "str: The hash algorithm being used by this object", NULL}, {NULL} /* Sentinel */ }; PyDoc_STRVAR( MMH3Hasher32Type_doc, "__init__(data=None, seed=0)\n" "\n" "Hasher for incrementally calculating the murmurhash3_x86_32 hash.\n" "\n" "Args:\n" " data (Buffer | None): The initial data to hash.\n" " seed (int): The seed value. Must be an integer in the range\n" " [0, 0xFFFFFFFF].\n" "\n" ".. versionchanged:: 5.2.0\n" " Experimental no-GIL support; thread safety not fully verified.\n" "\n" ".. versionchanged:: 5.0.0\n" " Added the optional ``data`` parameter as the first argument.\n" " The ``seed`` argument is now strictly checked for valid range.\n"); static PyTypeObject MMH3Hasher32Type = { PyVarObject_HEAD_INIT(NULL, 0).tp_name = "mmh3.mmh3_32", .tp_doc = MMH3Hasher32Type_doc, .tp_basicsize = sizeof(MMH3Hasher32), .tp_itemsize = 0, .tp_flags = Py_TPFLAGS_DEFAULT, .tp_new = MMH3Hasher32_new, .tp_init = (initproc)MMH3Hasher32_init, .tp_dealloc = (destructor)MMH3Hasher32_dealloc, .tp_methods = MMH3Hasher32_methods, .tp_getset = MMH3Hasher32_getsetters, }; //----------------------------------------------------------------------------- // Hasher for murmurhash3_x64_128 typedef struct { PyObject_HEAD uint64_t h1; uint64_t h2; uint64_t buffer1; uint64_t buffer2; uint8_t shift; Py_ssize_t length; #ifdef Py_GIL_DISABLED PyMutex mutex; #endif } MMH3Hasher128x64; static PyTypeObject MMH3Hasher128x64Type; static FORCE_INLINE void update_x64_128_impl(MMH3Hasher128x64 *self, Py_buffer *buf) { Py_ssize_t i = 0; uint64_t h1 = 0; uint64_t h2 = 0; uint64_t k1 = 0; uint64_t k2 = 
0; MMH3_HASHER_LOCK(self); h1 = self->h1; h2 = self->h2; for (; i + 16 <= buf->len; i += 16) { k1 = getblock64(buf->buf, (i / 16) * 2); k2 = getblock64(buf->buf, (i / 16) * 2 + 1); if (self->shift == 0) { // TODO: use bit ops self->buffer1 = k1; self->buffer2 = k2; } else if (self->shift < 64) { self->buffer1 |= k1 << self->shift; self->buffer2 = (k1 >> (64 - self->shift)) | (k2 << self->shift); } else if (self->shift == 64) { self->buffer2 = k1; } else { self->buffer2 |= k1 << (self->shift - 64); } h1 ^= mixK1_x64_128(self->buffer1); h1 = mixH_x64_128(h1, h2, 27, 0x52dce729UL); h2 ^= mixK2_x64_128(self->buffer2); h2 = mixH_x64_128(h2, h1, 31, 0x38495ab5UL); self->length += 16; if (self->shift == 0) { // TODO: use bit ops self->buffer1 = 0; self->buffer2 = 0; } else if (self->shift < 64) { self->buffer1 = k2 >> (64 - self->shift); self->buffer2 = 0; } else if (self->shift == 64) { self->buffer1 = k2; self->buffer2 = 0; } else { self->buffer1 = k1 >> (128 - self->shift) | (k2 << (self->shift - 64)); self->buffer2 = k2 >> (128 - self->shift); } } for (; i < buf->len; i++) { k1 = ((uint8_t *)buf->buf)[i]; if (self->shift < 64) { // TODO: use bit ops self->buffer1 |= k1 << self->shift; } else { self->buffer2 |= k1 << (self->shift - 64); } self->shift += 8; self->length += 1; if (self->shift >= 128) { h1 ^= mixK1_x64_128(self->buffer1); h1 = mixH_x64_128(h1, h2, 27, 0x52dce729UL); h2 ^= mixK2_x64_128(self->buffer2); h2 = mixH_x64_128(h2, h1, 31, 0x38495ab5UL); self->buffer1 = 0; self->buffer2 = 0; self->shift -= 128; } } self->h1 = h1; self->h2 = h2; MMH3_HASHER_UNLOCK(self); PyBuffer_Release(buf); } static void MMH3Hasher128x64_dealloc(MMH3Hasher128x64 *self) { Py_TYPE(self)->tp_free((PyObject *)self); } static PyObject * MMH3Hasher128x64_new(PyTypeObject *type, PyObject *args, PyObject *kwds) { MMH3Hasher128x64 *self; self = (MMH3Hasher128x64 *)type->tp_alloc(type, 0); if (self != NULL) { self->h1 = 0; self->h2 = 0; self->buffer1 = 0; self->buffer2 = 0; self->shift = 
0; self->length = 0; MMH3_HASHER_INIT_MUTEX(self); } return (PyObject *)self; } static int MMH3Hasher128x64_init(MMH3Hasher128x64 *self, PyObject *args, PyObject *kwds) { Py_buffer target_buf = {0}; long long seed = 0; static char *kwlist[] = {"data", "seed", NULL}; if (!PyArg_ParseTupleAndKeywords(args, kwds, "|y*L", kwlist, &target_buf, &seed)) return -1; MMH3_VALIDATE_SEED_RETURN_INT(seed, target_buf); self->h1 = (uint64_t)seed; self->h2 = self->h1; if (target_buf.buf != NULL) { // target_buf will be released in update_x64_128_impl update_x64_128_impl(self, &target_buf); } return 0; } static PyObject * MMH3Hasher128x64_update(MMH3Hasher128x64 *self, PyObject *obj) { Py_buffer buf; GET_BUFFER_VIEW_OR_ERROUT(obj, &buf); // buf will be released in update_x64_128_impl update_x64_128_impl(self, &buf); Py_RETURN_NONE; } static PyObject * MMH3Hasher128x64_digest(MMH3Hasher128x64 *self, PyObject *Py_UNUSED(ignored)) { const char out[MMH3_128_DIGESTSIZE]; MMH3_HASHER_LOCK(self); digest_x64_128_impl(self->h1, self->h2, self->buffer1, self->buffer2, self->length, out); MMH3_HASHER_UNLOCK(self); return PyBytes_FromStringAndSize(out, MMH3_128_DIGESTSIZE); } static PyObject * MMH3Hasher128x64_sintdigest(MMH3Hasher128x64 *self, PyObject *Py_UNUSED(ignored)) { const char out[MMH3_128_DIGESTSIZE]; MMH3_HASHER_LOCK(self); digest_x64_128_impl(self->h1, self->h2, self->buffer1, self->buffer2, self->length, out); MMH3_HASHER_UNLOCK(self); const int little_endian = 1; const int is_signed = 1; /** * _PyLong_FromByteArray is not a part of the official Python/C API * and may be removed in the future (although it is practically stable). * cf. 
* https://mail.python.org/pipermail/python-list/2006-August/372365.html */ PyObject *retval = _PyLong_FromByteArray( (unsigned char *)out, MMH3_128_DIGESTSIZE, little_endian, is_signed); return retval; } static PyObject * MMH3Hasher128x64_uintdigest(MMH3Hasher128x64 *self, PyObject *Py_UNUSED(ignored)) { const char out[MMH3_128_DIGESTSIZE]; MMH3_HASHER_LOCK(self); digest_x64_128_impl(self->h1, self->h2, self->buffer1, self->buffer2, self->length, out); MMH3_HASHER_UNLOCK(self); const int little_endian = 1; const int is_signed = 0; /** * _PyLong_FromByteArray is not a part of the official Python/C API * and may be removed in the future (although it is practically stable). * cf. * https://mail.python.org/pipermail/python-list/2006-August/372365.html */ PyObject *retval = _PyLong_FromByteArray( (unsigned char *)out, MMH3_128_DIGESTSIZE, little_endian, is_signed); return retval; } PyDoc_STRVAR(MMH3Hasher128_stupledigest_doc, "stupledigest() -> tuple[int, int]\n" "\n" "Return the digest value as a tuple of two signed integers.\n" "\n" "Returns:\n" " tuple[int, int]: The digest value as a tuple of two signed\n" " integers.\n"); static PyObject * MMH3Hasher128x64_stupledigest(MMH3Hasher128x64 *self, PyObject *Py_UNUSED(ignored)) { const char out[MMH3_128_DIGESTSIZE]; MMH3_HASHER_LOCK(self); digest_x64_128_impl(self->h1, self->h2, self->buffer1, self->buffer2, self->length, out); MMH3_HASHER_UNLOCK(self); const char *valflag = "LL"; uint64_t result1 = ((uint64_t *)out)[0]; uint64_t result2 = ((uint64_t *)out)[1]; #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) result1 = bswap_64(result1); result2 = bswap_64(result2); #endif return Py_BuildValue(valflag, result1, result2); } PyDoc_STRVAR( MMH3Hasher128_utupledigest_doc, "utupledigest() -> tuple[int, int]\n" "\n" "Return the digest value as a tuple of two unsigned integers.\n" "\n" "Returns:\n" " tuple[int, int]: The digest value as a tuple of two unsigned\n" " integers.\n"); static PyObject * 
MMH3Hasher128x64_utupledigest(MMH3Hasher128x64 *self, PyObject *Py_UNUSED(ignored)) { const char out[MMH3_128_DIGESTSIZE]; MMH3_HASHER_LOCK(self); digest_x64_128_impl(self->h1, self->h2, self->buffer1, self->buffer2, self->length, out); MMH3_HASHER_UNLOCK(self); const char *valflag = "KK"; uint64_t result1 = ((uint64_t *)out)[0]; uint64_t result2 = ((uint64_t *)out)[1]; #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) result1 = bswap_64(result1); result2 = bswap_64(result2); #endif return Py_BuildValue(valflag, result1, result2); } PyDoc_STRVAR(MMH3Hasher128x64_copy_doc, "copy() -> mmh3_128x64\n" "\n" "Return a copy of the hash object..\n" "\n" "Returns:\n" " mmh3_128x64: A copy of this hash object.\n"); static PyObject * MMH3Hasher128x64_copy(MMH3Hasher128x64 *self, PyObject *Py_UNUSED(ignored)) { MMH3Hasher128x64 *p; if ((p = PyObject_New(MMH3Hasher128x64, &MMH3Hasher128x64Type)) == NULL) { return NULL; } MMH3_HASHER_LOCK(self); p->h1 = self->h1; p->h2 = self->h2; p->buffer1 = self->buffer1; p->buffer2 = self->buffer2; p->shift = self->shift; p->length = self->length; MMH3_HASHER_INIT_MUTEX(p); MMH3_HASHER_UNLOCK(self); return (PyObject *)p; } static PyMethodDef MMH3Hasher128x64_methods[] = { {"update", (PyCFunction)MMH3Hasher128x64_update, METH_O, MMH3Hasher_update_doc}, {"digest", (PyCFunction)MMH3Hasher128x64_digest, METH_NOARGS, MMH3Hasher_digest_doc}, {"sintdigest", (PyCFunction)MMH3Hasher128x64_sintdigest, METH_NOARGS, MMH3Hasher_sintdigest_doc}, {"uintdigest", (PyCFunction)MMH3Hasher128x64_uintdigest, METH_NOARGS, MMH3Hasher_uintdigest_doc}, {"stupledigest", (PyCFunction)MMH3Hasher128x64_stupledigest, METH_NOARGS, MMH3Hasher128_stupledigest_doc}, {"utupledigest", (PyCFunction)MMH3Hasher128x64_utupledigest, METH_NOARGS, MMH3Hasher128_utupledigest_doc}, {"copy", (PyCFunction)MMH3Hasher128x64_copy, METH_NOARGS, MMH3Hasher128x64_copy_doc}, {NULL} /* Sentinel */ }; static PyObject * MMH3Hasher128x64_get_digest_size(PyObject *self, void 
*closure) { return PyLong_FromLong(MMH3_128_DIGESTSIZE); } static PyObject * MMH3Hasher128x64_get_block_size(PyObject *self, void *closure) { return PyLong_FromLong(MMH3_128_BLOCKSIZE); } static PyObject * MMH3Hasher128x64_get_name(PyObject *self, void *closure) { return PyUnicode_FromStringAndSize("mmh3_x64_128", 12); } static PyGetSetDef MMH3Hasher128x64_getsetters[] = { {"digest_size", (getter)MMH3Hasher128x64_get_digest_size, NULL, "int: Number of bytes in this hashes output.", NULL}, {"block_size", (getter)MMH3Hasher128x64_get_block_size, NULL, "int: Number of bytes of the internal block of this algorithm.", NULL}, {"name", (getter)MMH3Hasher128x64_get_name, NULL, "str: The hash algorithm being used by this object.", NULL}, {NULL} /* Sentinel */ }; PyDoc_STRVAR( MMH3Hasher128x64Type_doc, "__init__(data=None, seed=0)\n" "\n" "Hasher for incrementally calculating the murmurhash3_x64_128 hash.\n" "\n" "Args:\n" " data (Buffer | None): The initial data to hash.\n" " seed (int): The seed value. Must be an integer in the range\n" " [0, 0xFFFFFFFF].\n" "\n" ".. versionchanged:: 5.2.0\n" " Experimental no-GIL support; thread safety not fully verified.\n" "\n" ".. 
versionchanged:: 5.0.0\n" " Added the optional ``data`` parameter as the first argument.\n" " The ``seed`` argument is now strictly checked for valid range.\n"); static PyTypeObject MMH3Hasher128x64Type = { PyVarObject_HEAD_INIT(NULL, 0).tp_name = "mmh3.mmh3_x64_128", .tp_doc = MMH3Hasher128x64Type_doc, .tp_basicsize = sizeof(MMH3Hasher128x64), .tp_itemsize = 0, .tp_flags = Py_TPFLAGS_DEFAULT, .tp_new = MMH3Hasher128x64_new, .tp_init = (initproc)MMH3Hasher128x64_init, .tp_dealloc = (destructor)MMH3Hasher128x64_dealloc, .tp_methods = MMH3Hasher128x64_methods, .tp_getset = MMH3Hasher128x64_getsetters, }; //----------------------------------------------------------------------------- // Hasher for murmurhash3_x86_128 typedef struct { PyObject_HEAD uint32_t h1; uint32_t h2; uint32_t h3; uint32_t h4; uint32_t buffer1; uint32_t buffer2; uint32_t buffer3; uint32_t buffer4; uint8_t shift; Py_ssize_t length; #ifdef Py_GIL_DISABLED PyMutex mutex; #endif } MMH3Hasher128x86; static PyTypeObject MMH3Hasher128x86Type; static FORCE_INLINE void update_x86_128_impl(MMH3Hasher128x86 *self, Py_buffer *buf) { Py_ssize_t i = 0; uint32_t h1 = 0; uint32_t h2 = 0; uint32_t h3 = 0; uint32_t h4 = 0; uint32_t k1 = 0; MMH3_HASHER_LOCK(self); h1 = self->h1; h2 = self->h2; h3 = self->h3; h4 = self->h4; for (; i < buf->len; i++) { k1 = ((uint8_t *)buf->buf)[i]; if (self->shift < 32) { // TODO: use bit ops self->buffer1 |= k1 << self->shift; } else if (self->shift < 64) { self->buffer2 |= k1 << (self->shift - 32); } else if (self->shift < 96) { self->buffer3 |= k1 << (self->shift - 64); } else { self->buffer4 |= k1 << (self->shift - 96); } self->shift += 8; self->length += 1; if (self->shift >= 128) { const uint32_t c1 = 0x239b961b; const uint32_t c2 = 0xab0e9789; const uint32_t c3 = 0x38b34ae5; const uint32_t c4 = 0xa1e38b93; h1 ^= mixK_x86_128(self->buffer1, 15, c1, c2); h1 = mixH1(h1, h2, 19, 0x561ccd1bUL); h2 ^= mixK_x86_128(self->buffer2, 16, c2, c3); h2 = mixH1(h2, h3, 17, 0x0bcaa747UL); h3 
^= mixK_x86_128(self->buffer3, 17, c3, c4); h3 = mixH1(h3, h4, 15, 0x96cd1c35UL); h4 ^= mixK_x86_128(self->buffer4, 18, c4, c1); h4 = mixH1(h4, h1, 13, 0x32ac3b17UL); self->buffer1 = 0; self->buffer2 = 0; self->buffer3 = 0; self->buffer4 = 0; self->shift -= 128; } } self->h1 = h1; self->h2 = h2; self->h3 = h3; self->h4 = h4; MMH3_HASHER_UNLOCK(self); PyBuffer_Release(buf); } static void MMH3Hasher128x86_dealloc(MMH3Hasher128x86 *self) { Py_TYPE(self)->tp_free((PyObject *)self); } static PyObject * MMH3Hasher128x86_new(PyTypeObject *type, PyObject *args, PyObject *kwds) { MMH3Hasher128x86 *self; self = (MMH3Hasher128x86 *)type->tp_alloc(type, 0); if (self != NULL) { self->h1 = 0; self->h2 = 0; self->h3 = 0; self->h4 = 0; self->buffer1 = 0; self->buffer2 = 0; self->buffer3 = 0; self->buffer4 = 0; self->shift = 0; self->length = 0; MMH3_HASHER_INIT_MUTEX(self); } return (PyObject *)self; } static int MMH3Hasher128x86_init(MMH3Hasher128x86 *self, PyObject *args, PyObject *kwds) { Py_buffer target_buf = {0}; long long seed = 0; static char *kwlist[] = {"data", "seed", NULL}; if (!PyArg_ParseTupleAndKeywords(args, kwds, "|y*L", kwlist, &target_buf, &seed)) return -1; MMH3_VALIDATE_SEED_RETURN_INT(seed, target_buf); self->h1 = (uint32_t)seed; self->h2 = self->h1; self->h3 = self->h1; self->h4 = self->h1; if (target_buf.buf != NULL) { // target_buf will be released in update_x86_128_impl update_x86_128_impl(self, &target_buf); } return 0; } static PyObject * MMH3Hasher128x86_update(MMH3Hasher128x86 *self, PyObject *obj) { Py_buffer buf; GET_BUFFER_VIEW_OR_ERROUT(obj, &buf); // buf will be released in update_x86_128_impl update_x86_128_impl(self, &buf); Py_RETURN_NONE; } static PyObject * MMH3Hasher128x86_digest(MMH3Hasher128x86 *self, PyObject *Py_UNUSED(ignored)) { char out[MMH3_128_DIGESTSIZE]; MMH3_HASHER_LOCK(self); digest_x86_128_impl(self->h1, self->h2, self->h3, self->h4, self->buffer1, self->buffer2, self->buffer3, self->buffer4, self->length, out); 
MMH3_HASHER_UNLOCK(self); return PyBytes_FromStringAndSize(out, MMH3_128_DIGESTSIZE); } static PyObject * MMH3Hasher128x86_sintdigest(MMH3Hasher128x86 *self, PyObject *Py_UNUSED(ignored)) { const char out[MMH3_128_DIGESTSIZE]; MMH3_HASHER_LOCK(self); digest_x86_128_impl(self->h1, self->h2, self->h3, self->h4, self->buffer1, self->buffer2, self->buffer3, self->buffer4, self->length, out); MMH3_HASHER_UNLOCK(self); const int little_endian = 1; const int is_signed = 1; /** * _PyLong_FromByteArray is not a part of the official Python/C API * and may be removed in the future (although it is practically stable). * cf. * https://mail.python.org/pipermail/python-list/2006-August/372365.html */ PyObject *retval = _PyLong_FromByteArray( (unsigned char *)out, MMH3_128_DIGESTSIZE, little_endian, is_signed); return retval; } static PyObject * MMH3Hasher128x86_uintdigest(MMH3Hasher128x86 *self, PyObject *Py_UNUSED(ignored)) { const char out[MMH3_128_DIGESTSIZE]; MMH3_HASHER_LOCK(self); digest_x86_128_impl(self->h1, self->h2, self->h3, self->h4, self->buffer1, self->buffer2, self->buffer3, self->buffer4, self->length, out); MMH3_HASHER_UNLOCK(self); const int little_endian = 1; const int is_signed = 0; /** * _PyLong_FromByteArray is not a part of the official Python/C API * and may be removed in the future (although it is practically stable). * cf. 
* https://mail.python.org/pipermail/python-list/2006-August/372365.html */ PyObject *retval = _PyLong_FromByteArray( (unsigned char *)out, MMH3_128_DIGESTSIZE, little_endian, is_signed); return retval; } static PyObject * MMH3Hasher128x86_stupledigest(MMH3Hasher128x86 *self, PyObject *Py_UNUSED(ignored)) { const char out[MMH3_128_DIGESTSIZE]; MMH3_HASHER_LOCK(self); digest_x86_128_impl(self->h1, self->h2, self->h3, self->h4, self->buffer1, self->buffer2, self->buffer3, self->buffer4, self->length, out); MMH3_HASHER_UNLOCK(self); const char *valflag = "LL"; uint64_t result1 = ((uint64_t *)out)[0]; uint64_t result2 = ((uint64_t *)out)[1]; #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) result1 = bswap_64(result1); result2 = bswap_64(result2); #endif return Py_BuildValue(valflag, result1, result2); } static PyObject * MMH3Hasher128x86_utupledigest(MMH3Hasher128x86 *self, PyObject *Py_UNUSED(ignored)) { const char out[MMH3_128_DIGESTSIZE]; MMH3_HASHER_LOCK(self); digest_x86_128_impl(self->h1, self->h2, self->h3, self->h4, self->buffer1, self->buffer2, self->buffer3, self->buffer4, self->length, out); MMH3_HASHER_UNLOCK(self); const char *valflag = "KK"; uint64_t result1 = ((uint64_t *)out)[0]; uint64_t result2 = ((uint64_t *)out)[1]; #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) result1 = bswap_64(result1); result2 = bswap_64(result2); #endif return Py_BuildValue(valflag, result1, result2); } PyDoc_STRVAR(MMH3Hasher128x86_copy_doc, "copy() -> mmh3_128x86\n" "\n" "Return a copy of the hash object..\n" "\n" "Returns:\n" " mmh3_128x86: A copy of this hash object.\n"); static PyObject * MMH3Hasher128x86_copy(MMH3Hasher128x86 *self, PyObject *Py_UNUSED(ignored)) { MMH3Hasher128x86 *p; if ((p = PyObject_New(MMH3Hasher128x86, &MMH3Hasher128x86Type)) == NULL) { return NULL; } MMH3_HASHER_LOCK(self); p->h1 = self->h1; p->h2 = self->h2; p->h3 = self->h3; p->h4 = self->h4; p->buffer1 = self->buffer1; p->buffer2 = self->buffer2; 
p->buffer3 = self->buffer3; p->buffer4 = self->buffer4; p->shift = self->shift; p->length = self->length; MMH3_HASHER_INIT_MUTEX(p); MMH3_HASHER_UNLOCK(self); return (PyObject *)p; } static PyMethodDef MMH3Hasher128x86_methods[] = { {"update", (PyCFunction)MMH3Hasher128x86_update, METH_O, MMH3Hasher_update_doc}, {"digest", (PyCFunction)MMH3Hasher128x86_digest, METH_NOARGS, MMH3Hasher_digest_doc}, {"sintdigest", (PyCFunction)MMH3Hasher128x86_sintdigest, METH_NOARGS, MMH3Hasher_sintdigest_doc}, {"uintdigest", (PyCFunction)MMH3Hasher128x86_uintdigest, METH_NOARGS, MMH3Hasher_uintdigest_doc}, {"stupledigest", (PyCFunction)MMH3Hasher128x86_stupledigest, METH_NOARGS, MMH3Hasher128_stupledigest_doc}, {"utupledigest", (PyCFunction)MMH3Hasher128x86_utupledigest, METH_NOARGS, MMH3Hasher128_utupledigest_doc}, {"copy", (PyCFunction)MMH3Hasher128x86_copy, METH_NOARGS, MMH3Hasher128x86_copy_doc}, {NULL} /* Sentinel */ }; static PyObject * MMH3Hasher128x86_get_digest_size(PyObject *self, void *closure) { return PyLong_FromLong(MMH3_128_DIGESTSIZE); } static PyObject * MMH3Hasher128x86_get_block_size(PyObject *self, void *closure) { return PyLong_FromLong(MMH3_128_BLOCKSIZE); } static PyObject * MMH3Hasher128x86_get_name(PyObject *self, void *closure) { return PyUnicode_FromStringAndSize("mmh3_x86_128", 12); } static PyGetSetDef MMH3Hasher128x86_getsetters[] = { {"digest_size", (getter)MMH3Hasher128x86_get_digest_size, NULL, "int: Number of bytes in this hashes output", NULL}, {"block_size", (getter)MMH3Hasher128x86_get_block_size, NULL, "int: Number of bytes of the internal block of this algorithm", NULL}, {"name", (getter)MMH3Hasher128x86_get_name, NULL, "str: The hash algorithm being used by this object", NULL}, {NULL} /* Sentinel */ }; PyDoc_STRVAR( MMH3Hasher128x86Type_doc, "__init__(data=None, seed=0)\n" "\n" "Hasher for incrementally calculating the murmurhash3_x86_128 hash.\n" "\n" "Args:\n" " data (Buffer | None): The initial data to hash.\n" " seed (int): The seed value. 
Must be an integer in the range " "[0, 0xFFFFFFFF].\n" "\n" ".. versionchanged:: 5.2.0\n" " Experimental no-GIL support; thread safety not fully verified.\n" "\n" ".. versionchanged:: 5.0.0\n" " Added the optional ``data`` parameter as the first argument.\n" " The ``seed`` argument is now strictly checked for valid range.\n"); static PyTypeObject MMH3Hasher128x86Type = { PyVarObject_HEAD_INIT(NULL, 0).tp_name = "mmh3.mmh3_x86_128", .tp_doc = MMH3Hasher128x86Type_doc, .tp_basicsize = sizeof(MMH3Hasher128x86), .tp_itemsize = 0, .tp_flags = Py_TPFLAGS_DEFAULT, .tp_new = MMH3Hasher128x86_new, .tp_init = (initproc)MMH3Hasher128x86_init, .tp_dealloc = (destructor)MMH3Hasher128x86_dealloc, .tp_methods = MMH3Hasher128x86_methods, .tp_getset = MMH3Hasher128x86_getsetters, }; //----------------------------------------------------------------------------- // Module static struct PyModuleDef mmh3module = { PyModuleDef_HEAD_INIT, "mmh3", "A Python front-end to MurmurHash3.\n" "\n" "A Python front-end to MurmurHash3, " "a fast and robust non-cryptographic hash library " "created by Austin Appleby (http://code.google.com/p/smhasher/).\n" "\n" "Ported by Hajime Senuma . 
" "If you find any bugs, please submit an issue via " "https://github.com/hajimes/mmh3.\n" "\n" "Typical usage example:\n" "\n" " mmh3.hash(\"foobar\", 42)", -1, Mmh3Methods, NULL, NULL, NULL, NULL}; PyMODINIT_FUNC PyInit_mmh3(void) { if (PyType_Ready(&MMH3Hasher32Type) < 0) return NULL; if (PyType_Ready(&MMH3Hasher128x64Type) < 0) return NULL; if (PyType_Ready(&MMH3Hasher128x86Type) < 0) return NULL; PyObject *module = PyModule_Create(&mmh3module); if (module == NULL) return NULL; #ifdef Py_GIL_DISABLED PyUnstable_Module_SetGIL(module, Py_MOD_GIL_NOT_USED); #endif Py_INCREF(&MMH3Hasher32Type); if (PyModule_AddObject(module, "mmh3_32", (PyObject *)&MMH3Hasher32Type) < 0) { Py_DECREF(&MMH3Hasher32Type); Py_DECREF(module); return NULL; } Py_INCREF(&MMH3Hasher128x64Type); if (PyModule_AddObject(module, "mmh3_x64_128", (PyObject *)&MMH3Hasher128x64Type) < 0) { Py_DECREF(&MMH3Hasher128x64Type); Py_DECREF(module); return NULL; } Py_INCREF(&MMH3Hasher128x86Type); if (PyModule_AddObject(module, "mmh3_x86_128", (PyObject *)&MMH3Hasher128x86Type) < 0) { Py_DECREF(&MMH3Hasher128x86Type); Py_DECREF(module); return NULL; } return module; } ================================================ FILE: src/mmh3/murmurhash3.c ================================================ /*** * This file is under MIT Hajime Senuma, just like other files. * See LICENSE for details. * * It was originally written by Austin Appleby in C++ under the public domain, * but ported to PEP 7 C for Python 3.6 and later by the mmh3 project. * * Any issues should be reported to https://github.com/hajimes/mmh3/issues. * * The following is the original public domain notice by Austin Appleby. */ //----------------------------------------------------------------------------- // MurmurHash3 was written by Austin Appleby, and is placed in the public // domain. The author hereby disclaims copyright to this source code. 
// Note - The x86 and x64 versions do _not_ produce the same results, as the // algorithms are optimized for their respective platforms. You can still // compile and run any of them on any platform, but your performance with the // non-native version will be less than optimal. #include "murmurhash3.h" //----------------------------------------------------------------------------- void murmurhash3_x86_32(const void *key, Py_ssize_t len, uint32_t seed, void *out) { const uint8_t *data = (const uint8_t *)key; const Py_ssize_t nblocks = len / 4; uint32_t h1 = seed; const uint32_t c1 = 0xcc9e2d51; const uint32_t c2 = 0x1b873593; //---------- // body const uint32_t *blocks = (const uint32_t *)(data + nblocks * 4); for (Py_ssize_t i = -nblocks; i; i++) { uint32_t k1 = getblock32(blocks, i); k1 *= c1; k1 = ROTL32(k1, 15); k1 *= c2; h1 ^= k1; h1 = ROTL32(h1, 13); h1 = h1 * 5 + 0xe6546b64; } //---------- // tail const uint8_t *tail = (const uint8_t *)(data + nblocks * 4); uint32_t k1 = 0; switch (len & 3) { case 3: k1 ^= tail[2] << 16; case 2: k1 ^= tail[1] << 8; case 1: k1 ^= tail[0]; k1 *= c1; k1 = ROTL32(k1, 15); k1 *= c2; h1 ^= k1; }; //---------- // finalization h1 ^= len; h1 = fmix32(h1); *(uint32_t *)out = h1; } //----------------------------------------------------------------------------- void murmurhash3_x86_128(const void *key, const Py_ssize_t len, uint32_t seed, void *out) { const uint8_t *data = (const uint8_t *)key; const Py_ssize_t nblocks = len / 16; uint32_t h1 = seed; uint32_t h2 = seed; uint32_t h3 = seed; uint32_t h4 = seed; const uint32_t c1 = 0x239b961b; const uint32_t c2 = 0xab0e9789; const uint32_t c3 = 0x38b34ae5; const uint32_t c4 = 0xa1e38b93; //---------- // body const uint32_t *blocks = (const uint32_t *)(data + nblocks * 16); for (Py_ssize_t i = -nblocks; i; i++) { uint32_t k1 = getblock32(blocks, i * 4 + 0); uint32_t k2 = getblock32(blocks, i * 4 + 1); uint32_t k3 = getblock32(blocks, i * 4 + 2); uint32_t k4 = getblock32(blocks, i * 4 + 3); k1 
*= c1; k1 = ROTL32(k1, 15); k1 *= c2; h1 ^= k1; h1 = ROTL32(h1, 19); h1 += h2; h1 = h1 * 5 + 0x561ccd1b; k2 *= c2; k2 = ROTL32(k2, 16); k2 *= c3; h2 ^= k2; h2 = ROTL32(h2, 17); h2 += h3; h2 = h2 * 5 + 0x0bcaa747; k3 *= c3; k3 = ROTL32(k3, 17); k3 *= c4; h3 ^= k3; h3 = ROTL32(h3, 15); h3 += h4; h3 = h3 * 5 + 0x96cd1c35; k4 *= c4; k4 = ROTL32(k4, 18); k4 *= c1; h4 ^= k4; h4 = ROTL32(h4, 13); h4 += h1; h4 = h4 * 5 + 0x32ac3b17; } //---------- // tail const uint8_t *tail = (const uint8_t *)(data + nblocks * 16); uint32_t k1 = 0; uint32_t k2 = 0; uint32_t k3 = 0; uint32_t k4 = 0; switch (len & 15) { case 15: k4 ^= tail[14] << 16; case 14: k4 ^= tail[13] << 8; case 13: k4 ^= tail[12] << 0; k4 *= c4; k4 = ROTL32(k4, 18); k4 *= c1; h4 ^= k4; case 12: k3 ^= tail[11] << 24; case 11: k3 ^= tail[10] << 16; case 10: k3 ^= tail[9] << 8; case 9: k3 ^= tail[8] << 0; k3 *= c3; k3 = ROTL32(k3, 17); k3 *= c4; h3 ^= k3; case 8: k2 ^= tail[7] << 24; case 7: k2 ^= tail[6] << 16; case 6: k2 ^= tail[5] << 8; case 5: k2 ^= tail[4] << 0; k2 *= c2; k2 = ROTL32(k2, 16); k2 *= c3; h2 ^= k2; case 4: k1 ^= tail[3] << 24; case 3: k1 ^= tail[2] << 16; case 2: k1 ^= tail[1] << 8; case 1: k1 ^= tail[0] << 0; k1 *= c1; k1 = ROTL32(k1, 15); k1 *= c2; h1 ^= k1; }; //---------- // finalization h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; h1 += h2; h1 += h3; h1 += h4; h2 += h1; h3 += h1; h4 += h1; h1 = fmix32(h1); h2 = fmix32(h2); h3 = fmix32(h3); h4 = fmix32(h4); h1 += h2; h1 += h3; h1 += h4; h2 += h1; h3 += h1; h4 += h1; #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) ((uint32_t *)out)[0] = h2; ((uint32_t *)out)[1] = h1; ((uint32_t *)out)[2] = h4; ((uint32_t *)out)[3] = h3; #else ((uint32_t *)out)[0] = h1; ((uint32_t *)out)[1] = h2; ((uint32_t *)out)[2] = h3; ((uint32_t *)out)[3] = h4; #endif } //----------------------------------------------------------------------------- void murmurhash3_x64_128(const void *key, const Py_ssize_t len, const uint32_t seed, void *out) { const 
uint8_t *data = (const uint8_t *)key; const Py_ssize_t nblocks = len / 16; uint64_t h1 = seed; uint64_t h2 = seed; const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); //---------- // body const uint64_t *blocks = (const uint64_t *)(data); for (Py_ssize_t i = 0; i < nblocks; i++) { uint64_t k1 = getblock64(blocks, i * 2 + 0); uint64_t k2 = getblock64(blocks, i * 2 + 1); k1 *= c1; k1 = ROTL64(k1, 31); k1 *= c2; h1 ^= k1; h1 = ROTL64(h1, 27); h1 += h2; h1 = h1 * 5 + 0x52dce729; k2 *= c2; k2 = ROTL64(k2, 33); k2 *= c1; h2 ^= k2; h2 = ROTL64(h2, 31); h2 += h1; h2 = h2 * 5 + 0x38495ab5; } //---------- // tail const uint8_t *tail = (const uint8_t *)(data + nblocks * 16); uint64_t k1 = 0; uint64_t k2 = 0; switch (len & 15) { case 15: k2 ^= ((uint64_t)tail[14]) << 48; case 14: k2 ^= ((uint64_t)tail[13]) << 40; case 13: k2 ^= ((uint64_t)tail[12]) << 32; case 12: k2 ^= ((uint64_t)tail[11]) << 24; case 11: k2 ^= ((uint64_t)tail[10]) << 16; case 10: k2 ^= ((uint64_t)tail[9]) << 8; case 9: k2 ^= ((uint64_t)tail[8]) << 0; k2 *= c2; k2 = ROTL64(k2, 33); k2 *= c1; h2 ^= k2; case 8: k1 ^= ((uint64_t)tail[7]) << 56; case 7: k1 ^= ((uint64_t)tail[6]) << 48; case 6: k1 ^= ((uint64_t)tail[5]) << 40; case 5: k1 ^= ((uint64_t)tail[4]) << 32; case 4: k1 ^= ((uint64_t)tail[3]) << 24; case 3: k1 ^= ((uint64_t)tail[2]) << 16; case 2: k1 ^= ((uint64_t)tail[1]) << 8; case 1: k1 ^= ((uint64_t)tail[0]) << 0; k1 *= c1; k1 = ROTL64(k1, 31); k1 *= c2; h1 ^= k1; }; //---------- // finalization h1 ^= len; h2 ^= len; h1 += h2; h2 += h1; h1 = fmix64(h1); h2 = fmix64(h2); h1 += h2; h2 += h1; ((uint64_t *)out)[0] = h1; ((uint64_t *)out)[1] = h2; } //----------------------------------------------------------------------------- ================================================ FILE: src/mmh3/murmurhash3.h ================================================ /*** * This file is under MIT Hajime Senuma, just like other files. * See LICENSE for details. 
* * It was originally written by Austin Appleby in C++ under the public domain, * but ported to PEP 7 C for Python 3.6 and later by the mmh3 project. * * Any issues should be reported to https://github.com/hajimes/mmh3/issues. * * The following is the original public domain notice by Austin Appleby. */ //----------------------------------------------------------------------------- // MurmurHash3 was written by Austin Appleby, and is placed in the public // domain. The author hereby disclaims copyright to this source code. #ifndef _MURMURHASH3_H_ #define _MURMURHASH3_H_ // To handle 64-bit data; see https://docs.python.org/3/c-api/arg.html #ifndef PY_SSIZE_T_CLEAN #define PY_SSIZE_T_CLEAN #endif #include #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) #include #endif //----------------------------------------------------------------------------- // Platform-specific functions and macros // Microsoft Visual Studio #if defined(_MSC_VER) && (_MSC_VER < 1600) typedef signed __int8 int8_t; typedef signed __int32 int32_t; typedef signed __int64 int64_t; typedef unsigned __int8 uint8_t; typedef unsigned __int32 uint32_t; typedef unsigned __int64 uint64_t; // Other compilers #else // defined(_MSC_VER) #include #endif // !defined(_MSC_VER) //----------------------------------------------------------------------------- // Platform-specific functions and macros // Microsoft Visual Studio #if defined(_MSC_VER) #define FORCE_INLINE __forceinline #include #define ROTL32(x, y) _rotl(x, y) #define ROTL64(x, y) _rotl64(x, y) #define BIG_CONSTANT(x) (x) // Other compilers #else // defined(_MSC_VER) #if ((__GNUC__ > 4) || (__GNUC__ == 4 && GNUC_MINOR >= 4)) /* gcc version >= 4.4 4.1 = RHEL 5, 4.4 = RHEL 6. 
Don't inline for RHEL 5 gcc * which is 4.1*/ #define FORCE_INLINE inline __attribute__((always_inline)) #else #define FORCE_INLINE #endif static FORCE_INLINE uint32_t rotl32(uint32_t x, int8_t r) { return (x << r) | (x >> (32 - r)); } static FORCE_INLINE uint64_t rotl64(uint64_t x, int8_t r) { return (x << r) | (x >> (64 - r)); } #define ROTL32(x, y) rotl32(x, y) #define ROTL64(x, y) rotl64(x, y) #define BIG_CONSTANT(x) (x##LLU) #endif // !defined(_MSC_VER) //----------------------------------------------------------------------------- // Block read - if your platform needs to do endian-swapping or can only // handle aligned reads, do the conversion here static FORCE_INLINE uint32_t getblock32(const uint32_t *p, Py_ssize_t i) { #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) return bswap_32(p[i]); #else return p[i]; #endif } static FORCE_INLINE uint64_t getblock64(const uint64_t *p, Py_ssize_t i) { #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) return bswap_64(p[i]); #else return p[i]; #endif } //----------------------------------------------------------------------------- // Building blocks for multiply and rotate (MUR) operations. 
// Names are taken from Google Guava's implementation static FORCE_INLINE uint32_t mixK1(uint32_t k1) { const uint32_t c1 = 0xcc9e2d51; const uint32_t c2 = 0x1b873593; k1 *= c1; k1 = ROTL32(k1, 15); k1 *= c2; return k1; } static FORCE_INLINE uint32_t mixH1(uint32_t h1, const uint32_t h2, const uint8_t shift, const uint32_t c1) { h1 = ROTL32(h1, shift); h1 += h2; h1 = h1 * 5 + c1; return h1; } static FORCE_INLINE uint64_t mixK_x64_128(uint64_t k1, const uint8_t shift, const uint64_t c1, const uint64_t c2) { k1 *= c1; k1 = ROTL64(k1, shift); k1 *= c2; return k1; } static FORCE_INLINE uint64_t mixK1_x64_128(uint64_t k1) { const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); k1 *= c1; k1 = ROTL64(k1, 31); k1 *= c2; return k1; } static FORCE_INLINE uint64_t mixK2_x64_128(uint64_t k2) { const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); k2 *= c2; k2 = ROTL64(k2, 33); k2 *= c1; return k2; } static FORCE_INLINE uint64_t mixH_x64_128(uint64_t h1, uint64_t h2, const uint8_t shift, const uint32_t c) { h1 = ROTL64(h1, shift); h1 += h2; h1 = h1 * 5 + c; return h1; } static FORCE_INLINE uint64_t mixK_x86_128(uint32_t k, const uint8_t shift, const uint32_t c1, const uint32_t c2) { k *= c1; k = ROTL32(k, shift); k *= c2; return k; } //----------------------------------------------------------------------------- // Finalization mix - force all bits of a hash block to avalanche static FORCE_INLINE uint32_t fmix32(uint32_t h) { h ^= h >> 16; h *= 0x85ebca6b; h ^= h >> 13; h *= 0xc2b2ae35; h ^= h >> 16; return h; } //---------- static FORCE_INLINE uint64_t fmix64(uint64_t k) { k ^= k >> 33; k *= BIG_CONSTANT(0xff51afd7ed558ccd); k ^= k >> 33; k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); k ^= k >> 33; return k; } //----------------------------------------------------------------------------- // Finalization function static FORCE_INLINE void digest_x64_128_impl(uint64_t h1, uint64_t 
h2, const uint64_t k1, const uint64_t k2, const Py_ssize_t len, const char *out) { h1 ^= mixK1_x64_128(k1); h2 ^= mixK2_x64_128(k2); h1 ^= len; h2 ^= len; h1 += h2; h2 += h1; h1 = fmix64(h1); h2 = fmix64(h2); h1 += h2; h2 += h1; #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) ((uint64_t *)out)[0] = bswap_64(h1); ((uint64_t *)out)[1] = bswap_64(h2); #else ((uint64_t *)out)[0] = h1; ((uint64_t *)out)[1] = h2; #endif } static FORCE_INLINE void digest_x86_128_impl(uint32_t h1, uint32_t h2, uint32_t h3, uint32_t h4, const uint32_t k1, const uint32_t k2, const uint32_t k3, const uint32_t k4, const Py_ssize_t len, const char *out) { const uint32_t c1 = 0x239b961b; const uint32_t c2 = 0xab0e9789; const uint32_t c3 = 0x38b34ae5; const uint32_t c4 = 0xa1e38b93; h1 ^= mixK_x86_128(k1, 15, c1, c2); h2 ^= mixK_x86_128(k2, 16, c2, c3); h3 ^= mixK_x86_128(k3, 17, c3, c4); h4 ^= mixK_x86_128(k4, 18, c4, c1); h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; h1 += h2; h1 += h3; h1 += h4; h2 += h1; h3 += h1; h4 += h1; h1 = fmix32(h1); h2 = fmix32(h2); h3 = fmix32(h3); h4 = fmix32(h4); h1 += h2; h1 += h3; h1 += h4; h2 += h1; h3 += h1; h4 += h1; #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) ((uint32_t *)out)[0] = bswap_32(h1); ((uint32_t *)out)[1] = bswap_32(h2); ((uint32_t *)out)[2] = bswap_32(h3); ((uint32_t *)out)[3] = bswap_32(h4); #else ((uint32_t *)out)[0] = h1; ((uint32_t *)out)[1] = h2; ((uint32_t *)out)[2] = h3; ((uint32_t *)out)[3] = h4; #endif } //----------------------------------------------------------------------------- void murmurhash3_x86_32(const void *key, Py_ssize_t len, uint32_t seed, void *out); void murmurhash3_x86_128(const void *key, Py_ssize_t len, uint32_t seed, void *out); void murmurhash3_x64_128(const void *key, Py_ssize_t len, uint32_t seed, void *out); //----------------------------------------------------------------------------- #endif // _MURMURHASH3_H_ ================================================ FILE: 
src/mmh3/py.typed
================================================

================================================
FILE: tests/helper.py
================================================
"""Helper functions for tests."""


# see also https://stackoverflow.com/a/1375939
def u32_to_s32(v: int) -> int:
    """Convert unsigned 32-bit integer to signed 32-bit integer.

    Args:
        v: Unsigned 32-bit integer.

    Returns:
        Signed 32-bit representation of the input.
    """
    # Two's-complement reinterpretation: if the sign bit is set, shift the
    # value down by 2**32.
    if v & 0x80000000:
        return -0x100000000 + v
    return v


================================================
FILE: tests/test_doctrings.py
================================================
# pylint: disable=missing-module-docstring,missing-function-docstring
import mmh3


def test_function_docstrings() -> None:
    # Each public function must expose a docstring beginning with its
    # signature line (checked via startswith on the first paragraph).
    assert "__doc__" in dir(mmh3.hash)
    assert mmh3.hash.__doc__ is not None
    assert mmh3.hash.__doc__.startswith("hash(key, seed=0, signed=True) -> int\n\n")
    assert "__doc__" in dir(mmh3.hash_from_buffer)
    assert mmh3.hash_from_buffer.__doc__ is not None
    assert mmh3.hash_from_buffer.__doc__.startswith(
        "hash_from_buffer(key, seed=0, signed=True) -> int\n\n"
    )
    assert "__doc__" in dir(mmh3.hash64)
    assert mmh3.hash64.__doc__ is not None
    assert mmh3.hash64.__doc__.startswith(
        "hash64(key, seed=0, x64arch=True, signed=True) -> tuple[int, int]\n\n"
    )
    assert "__doc__" in dir(mmh3.hash128)
    assert mmh3.hash128.__doc__ is not None
    assert mmh3.hash128.__doc__.startswith(
        "hash128(key, seed=0, x64arch=True, signed=False) -> int\n\n"
    )
    assert "__doc__" in dir(mmh3.hash_bytes)
    assert mmh3.hash_bytes.__doc__ is not None
    assert mmh3.hash_bytes.__doc__.startswith(
        "hash_bytes(key, seed=0, x64arch=True) -> bytes\n\n"
    )


def test_module_docstring() -> None:
    assert "__doc__" in dir(mmh3)
    assert mmh3.__doc__ is not None
    assert mmh3.__doc__.startswith("A Python front-end to MurmurHash3")


================================================
FILE: tests/test_free_threading.py
================================================
# pylint: disable=missing-module-docstring,missing-function-docstring
from collections.abc import Callable
from concurrent.futures import ThreadPoolExecutor
from typing import Any

import mmh3


def run_threaded(func: Callable[..., Any], num_threads: int = 8) -> None:
    # Run func concurrently on num_threads threads; future.result() re-raises
    # any exception raised inside a worker.
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [executor.submit(func) for _ in range(num_threads)]
        for future in futures:
            future.result()  # wait for all threads to complete


def test_parallel_hasher_mmh3_32_update() -> None:
    # A shared hasher updated from 8 threads (1000 updates each) must end in
    # the same state as hashing the concatenated input in one shot.
    hasher = mmh3.mmh3_32()

    def closure() -> None:
        for _ in range(1000):
            hasher.update(b"foo")

    run_threaded(closure, num_threads=8)
    assert hasher.sintdigest() == mmh3.hash(b"foo" * 8000)


def test_parallel_hasher_mmh3_x64_128_update() -> None:
    hasher = mmh3.mmh3_x64_128()

    def closure() -> None:
        for _ in range(1000):
            hasher.update(b"foo")

    run_threaded(closure, num_threads=8)
    assert hasher.sintdigest() == mmh3.hash128(b"foo" * 8000, x64arch=True, signed=True)


def test_parallel_hasher_mmh3_x86_128_update() -> None:
    hasher = mmh3.mmh3_x86_128()

    def closure() -> None:
        for _ in range(1000):
            hasher.update(b"foo")

    run_threaded(closure, num_threads=8)
    assert hasher.sintdigest() == mmh3.hash128(
        b"foo" * 8000, x64arch=False, signed=True
    )


================================================
FILE: tests/test_invalid_inputs.py
================================================
# pylint: disable=missing-module-docstring, missing-function-docstring
# pylint: disable=no-value-for-parameter, too-many-function-args
from typing import no_type_check

import pytest

import mmh3


# Common pattern throughout this file: each function raises TypeError for
# missing/extra/mis-typed arguments, and ValueError for seeds outside the
# unsigned 32-bit range [0, 2**32 - 1].
@no_type_check
def test_hash_raises_typeerror() -> None:
    with pytest.raises(TypeError):
        mmh3.hash()
    with pytest.raises(TypeError):
        mmh3.hash(b"hello, world", "42")
    with pytest.raises(TypeError):
        mmh3.hash(b"hello, world", 42, True, 1234)
    with pytest.raises(TypeError):
        mmh3.hash(b"hello, world", seed="42")
    with pytest.raises(TypeError):
        mmh3.hash([1, 2, 3], 42)
    # pylint: disable=redundant-keyword-arg
    with pytest.raises(TypeError):
        mmh3.hash(b"hello, world", key=b"42")


@no_type_check
def test_hash_raises_valueerror() -> None:
    with pytest.raises(ValueError):
        mmh3.hash(b"hello, world", -1)
    with pytest.raises(ValueError):
        mmh3.hash(b"hello, world", 2**32)


@no_type_check
def test_hash128_raises_typeerror() -> None:
    with pytest.raises(TypeError):
        mmh3.hash128()
    with pytest.raises(TypeError):
        mmh3.hash128(b"hello, world", "42")
    with pytest.raises(TypeError):
        mmh3.hash128(b"hello, world", 42, True, False, 1234)
    with pytest.raises(TypeError):
        mmh3.hash128(b"hello, world", seed="42")
    with pytest.raises(TypeError):
        mmh3.hash128([1, 2, 3], 42)


@no_type_check
def test_hash128_raises_valueerror() -> None:
    with pytest.raises(ValueError):
        mmh3.hash128(b"hello, world", -1)
    with pytest.raises(ValueError):
        mmh3.hash128(b"hello, world", 2**32)


@no_type_check
def test_hash64_raises_typeerror() -> None:
    with pytest.raises(TypeError):
        mmh3.hash64()
    with pytest.raises(TypeError):
        mmh3.hash64(b"hello, world", "42")
    with pytest.raises(TypeError):
        mmh3.hash64(b"hello, world", 42, True, False, 1234)
    with pytest.raises(TypeError):
        mmh3.hash64(b"hello, world", seed="42")
    with pytest.raises(TypeError):
        mmh3.hash64([1, 2, 3], 42)


@no_type_check
def test_hash64_raises_valueerror() -> None:
    with pytest.raises(ValueError):
        mmh3.hash64(b"hello, world", -1)
    with pytest.raises(ValueError):
        mmh3.hash64(b"hello, world", 2**32)


@no_type_check
def test_hash_bytes_raises_typeerror() -> None:
    with pytest.raises(TypeError):
        mmh3.hash_bytes()
    with pytest.raises(TypeError):
        mmh3.hash_bytes(b"hello, world", "42")
    with pytest.raises(TypeError):
        mmh3.hash_bytes(b"hello, world", 42, True, 1234)
    with pytest.raises(TypeError):
        mmh3.hash_bytes(b"hello, world", seed="42")
    with pytest.raises(TypeError):
        mmh3.hash_bytes([1, 2, 3], 42)


@no_type_check
def test_hash_bytes_raises_valueerror() -> None:
    with pytest.raises(ValueError):
        mmh3.hash_bytes(b"hello, world", -1)
    with pytest.raises(ValueError):
        mmh3.hash_bytes(b"hello, world", 2**32)


@no_type_check
def test_hash_from_buffer_raises_typeerror() -> None:
    with pytest.raises(TypeError):
        mmh3.hash_from_buffer()
    with pytest.raises(TypeError):
        mmh3.hash_from_buffer(b"hello, world", "42")
    with pytest.raises(TypeError):
        mmh3.hash_from_buffer(b"hello, world", 42, True, 1234)
    with pytest.raises(TypeError):
        mmh3.hash_from_buffer(b"hello, world", seed="42")
    with pytest.raises(TypeError):
        mmh3.hash_from_buffer([1, 2, 3], 42)


@no_type_check
def test_hash_from_buffer_raises_valueerror() -> None:
    with pytest.raises(ValueError):
        mmh3.hash_from_buffer(b"hello, world", -1)
    with pytest.raises(ValueError):
        mmh3.hash_from_buffer(b"hello, world", 2**32)


# The mmh3_* one-shot digest functions additionally reject str keys (they
# accept buffer objects only).
@no_type_check
def test_mmh3_32_digest_raises_typeerror() -> None:
    with pytest.raises(TypeError):
        mmh3.mmh3_32_digest()
    with pytest.raises(TypeError):
        mmh3.mmh3_32_digest(b"hello, world", 42, 1234)
    with pytest.raises(TypeError):
        mmh3.mmh3_32_digest("hello, world")
    with pytest.raises(TypeError):
        mmh3.mmh3_32_digest(b"hello, world", "42")
    with pytest.raises(TypeError):
        mmh3.mmh3_32_digest([1, 2, 3], 42)


@no_type_check
def test_mmh3_32_digest_raises_valueerror() -> None:
    with pytest.raises(ValueError):
        mmh3.mmh3_32_digest(b"hello, world", -1)
    with pytest.raises(ValueError):
        mmh3.mmh3_32_digest(b"hello, world", 2**32)


@no_type_check
def test_mmh3_32_sintdigest_raises_typeerror() -> None:
    with pytest.raises(TypeError):
        mmh3.mmh3_32_sintdigest()
    with pytest.raises(TypeError):
        mmh3.mmh3_32_sintdigest(b"hello, world", 42, 1234)
    with pytest.raises(TypeError):
        mmh3.mmh3_32_sintdigest("hello, world")
    with pytest.raises(TypeError):
        mmh3.mmh3_32_sintdigest(b"hello, world", "42")
    with pytest.raises(TypeError):
        mmh3.mmh3_32_sintdigest([1, 2, 3], 42)


@no_type_check
def test_mmh3_32_sintdigest_raises_valueerror() -> None:
    with pytest.raises(ValueError):
        mmh3.mmh3_32_sintdigest(b"hello, world", -1)
    with pytest.raises(ValueError):
        mmh3.mmh3_32_sintdigest(b"hello, world", 2**32)


@no_type_check
def test_mmh3_32_uintdigest_raises_typeerror() -> None:
    with pytest.raises(TypeError):
        mmh3.mmh3_32_uintdigest()
    with pytest.raises(TypeError):
        mmh3.mmh3_32_uintdigest(b"hello, world", 42, 1234)
    with pytest.raises(TypeError):
        mmh3.mmh3_32_uintdigest("hello, world")
    with pytest.raises(TypeError):
        mmh3.mmh3_32_uintdigest(b"hello, world", "42")
    with pytest.raises(TypeError):
        mmh3.mmh3_32_uintdigest([1, 2, 3], 42)


@no_type_check
def test_mmh3_32_uintdigest_raises_valueerror() -> None:
    with pytest.raises(ValueError):
        mmh3.mmh3_32_uintdigest(b"hello, world", -1)
    with pytest.raises(ValueError):
        mmh3.mmh3_32_uintdigest(b"hello, world", 2**32)


@no_type_check
def test_mmh3_x64_128_digest_raises_typeerror() -> None:
    with pytest.raises(TypeError):
        mmh3.mmh3_x64_128_digest()
    with pytest.raises(TypeError):
        mmh3.mmh3_x64_128_digest(b"hello, world", 42, 1234)
    with pytest.raises(TypeError):
        mmh3.mmh3_x64_128_digest("hello, world")
    with pytest.raises(TypeError):
        mmh3.mmh3_x64_128_digest(b"hello, world", "42")
    with pytest.raises(TypeError):
        mmh3.mmh3_x64_128_digest([1, 2, 3], 42)


@no_type_check
def test_mmh3_x64_128_digest_raises_valueerror() -> None:
    with pytest.raises(ValueError):
        mmh3.mmh3_x64_128_digest(b"hello, world", -1)
    with pytest.raises(ValueError):
        mmh3.mmh3_x64_128_digest(b"hello, world", 2**32)


@no_type_check
def test_mmh3_x64_128_sintdigest_raises_typeerror() -> None:
    with pytest.raises(TypeError):
        mmh3.mmh3_x64_128_sintdigest()
    with pytest.raises(TypeError):
        mmh3.mmh3_x64_128_sintdigest(b"hello, world", 42, 1234)
    with pytest.raises(TypeError):
        mmh3.mmh3_x64_128_sintdigest("hello, world")
    with pytest.raises(TypeError):
        mmh3.mmh3_x64_128_sintdigest(b"hello, world", "42")
    with pytest.raises(TypeError):
        mmh3.mmh3_x64_128_sintdigest([1, 2, 3], 42)


@no_type_check
def test_mmh3_x64_128_sintdigest_raises_valueerror() -> None:
    with pytest.raises(ValueError):
        mmh3.mmh3_x64_128_sintdigest(b"hello, world", -1)
    with pytest.raises(ValueError):
        mmh3.mmh3_x64_128_sintdigest(b"hello, world", 2**32)


@no_type_check
def test_mmh3_x64_128_uintdigest_raises_typeerror() -> None:
    with pytest.raises(TypeError):
        mmh3.mmh3_x64_128_uintdigest()
    with pytest.raises(TypeError):
        mmh3.mmh3_x64_128_uintdigest(b"hello, world", 42, 1234)
    with pytest.raises(TypeError):
        mmh3.mmh3_x64_128_uintdigest("hello, world")
    with pytest.raises(TypeError):
        mmh3.mmh3_x64_128_uintdigest(b"hello, world", "42")
    with pytest.raises(TypeError):
        mmh3.mmh3_x64_128_uintdigest([1, 2, 3], 42)


@no_type_check
def test_mmh3_x64_128_uintdigest_raises_valueerror() -> None:
    with pytest.raises(ValueError):
        mmh3.mmh3_x64_128_uintdigest(b"hello, world", -1)
    with pytest.raises(ValueError):
        mmh3.mmh3_x64_128_uintdigest(b"hello, world", 2**32)


@no_type_check
def test_mmh3_x64_128_stupledigest_raises_typeerror() -> None:
    with pytest.raises(TypeError):
        mmh3.mmh3_x64_128_stupledigest()
    with pytest.raises(TypeError):
        mmh3.mmh3_x64_128_stupledigest(b"hello, world", 42, 1234)
    with pytest.raises(TypeError):
        mmh3.mmh3_x64_128_stupledigest("hello, world")
    with pytest.raises(TypeError):
        mmh3.mmh3_x64_128_stupledigest(b"hello, world", "42")
    with pytest.raises(TypeError):
        mmh3.mmh3_x64_128_stupledigest([1, 2, 3], 42)


@no_type_check
def test_mmh3_x64_128_stupledigest_raises_valueerror() -> None:
    with pytest.raises(ValueError):
        mmh3.mmh3_x64_128_stupledigest(b"hello, world", -1)
    with pytest.raises(ValueError):
        mmh3.mmh3_x64_128_stupledigest(b"hello, world", 2**32)


@no_type_check
def test_mmh3_x64_128_utupledigest_raises_typeerror() -> None:
    with pytest.raises(TypeError):
        mmh3.mmh3_x64_128_utupledigest()
    with pytest.raises(TypeError):
        mmh3.mmh3_x64_128_utupledigest(b"hello, world", 42, 1234)
    with pytest.raises(TypeError):
        mmh3.mmh3_x64_128_utupledigest("hello, world")
    with pytest.raises(TypeError):
        mmh3.mmh3_x64_128_utupledigest(b"hello, world", "42")
    with pytest.raises(TypeError):
        mmh3.mmh3_x64_128_utupledigest([1, 2, 3], 42)


@no_type_check
def test_mmh3_x64_128_utupledigest_raises_valueerror() -> None:
    with pytest.raises(ValueError):
        mmh3.mmh3_x64_128_utupledigest(b"hello, world", -1)
    with pytest.raises(ValueError):
        mmh3.mmh3_x64_128_utupledigest(b"hello, world", 2**32)


@no_type_check
def test_mmh3_x86_128_digest_raises_typeerror() -> None:
    with pytest.raises(TypeError):
        mmh3.mmh3_x86_128_digest()
    with pytest.raises(TypeError):
        mmh3.mmh3_x86_128_digest(b"hello, world", 42, 1234)
    with pytest.raises(TypeError):
        mmh3.mmh3_x86_128_digest("hello, world")
    with pytest.raises(TypeError):
        mmh3.mmh3_x86_128_digest(b"hello, world", "42")
    with pytest.raises(TypeError):
        mmh3.mmh3_x86_128_digest([1, 2, 3], 42)


@no_type_check
def test_mmh3_x86_128_digest_raises_valueerror() -> None:
    with pytest.raises(ValueError):
        mmh3.mmh3_x86_128_digest(b"hello, world", -1)
    with pytest.raises(ValueError):
        mmh3.mmh3_x86_128_digest(b"hello, world", 2**32)


@no_type_check
def test_mmh3_x86_128_sintdigest_raises_typeerror() -> None:
    with pytest.raises(TypeError):
        mmh3.mmh3_x86_128_sintdigest()
    with pytest.raises(TypeError):
        mmh3.mmh3_x86_128_sintdigest(b"hello, world", 42, 1234)
    with pytest.raises(TypeError):
        mmh3.mmh3_x86_128_sintdigest("hello, world")
    with pytest.raises(TypeError):
        mmh3.mmh3_x86_128_sintdigest(b"hello, world", "42")
    with pytest.raises(TypeError):
        mmh3.mmh3_x86_128_sintdigest([1, 2, 3], 42)


@no_type_check
def test_mmh3_x86_128_sintdigest_raises_valueerror() -> None:
    with pytest.raises(ValueError):
        mmh3.mmh3_x86_128_sintdigest(b"hello, world", -1)
    with pytest.raises(ValueError):
        mmh3.mmh3_x86_128_sintdigest(b"hello, world", 2**32)


@no_type_check
def test_mmh3_x86_128_uintdigest_raises_typeerror() -> None:
    with pytest.raises(TypeError):
        mmh3.mmh3_x86_128_uintdigest()
    with pytest.raises(TypeError):
        mmh3.mmh3_x86_128_uintdigest(b"hello, world", 42, 1234)
    with pytest.raises(TypeError):
        mmh3.mmh3_x86_128_uintdigest("hello, world")
    with pytest.raises(TypeError):
        mmh3.mmh3_x86_128_uintdigest(b"hello, world", "42")
    with pytest.raises(TypeError):
        mmh3.mmh3_x86_128_uintdigest([1, 2, 3], 42)
@no_type_check
def test_mmh3_x86_128_uintdigest_raises_valueerror() -> None:
    with pytest.raises(ValueError):
        mmh3.mmh3_x86_128_uintdigest(b"hello, world", -1)
    with pytest.raises(ValueError):
        mmh3.mmh3_x86_128_uintdigest(b"hello, world", 2**32)


@no_type_check
def test_mmh3_x86_128_stupledigest_raises_typeerror() -> None:
    with pytest.raises(TypeError):
        mmh3.mmh3_x86_128_stupledigest()
    with pytest.raises(TypeError):
        mmh3.mmh3_x86_128_stupledigest(b"hello, world", 42, 1234)
    with pytest.raises(TypeError):
        mmh3.mmh3_x86_128_stupledigest("hello, world")
    with pytest.raises(TypeError):
        mmh3.mmh3_x86_128_stupledigest(b"hello, world", "42")
    with pytest.raises(TypeError):
        mmh3.mmh3_x86_128_stupledigest([1, 2, 3], 42)


@no_type_check
def test_mmh3_x86_128_stupledigest_raises_valueerror() -> None:
    with pytest.raises(ValueError):
        mmh3.mmh3_x86_128_stupledigest(b"hello, world", -1)
    with pytest.raises(ValueError):
        mmh3.mmh3_x86_128_stupledigest(b"hello, world", 2**32)


@no_type_check
def test_mmh3_x86_128_utupledigest_raises_typeerror() -> None:
    with pytest.raises(TypeError):
        mmh3.mmh3_x86_128_utupledigest()
    with pytest.raises(TypeError):
        mmh3.mmh3_x86_128_utupledigest(b"hello, world", 42, 1234)
    with pytest.raises(TypeError):
        mmh3.mmh3_x86_128_utupledigest("hello, world")
    with pytest.raises(TypeError):
        mmh3.mmh3_x86_128_utupledigest(b"hello, world", "42")
    with pytest.raises(TypeError):
        mmh3.mmh3_x86_128_utupledigest([1, 2, 3], 42)


@no_type_check
def test_mmh3_x86_128_utupledigest_raises_valueerror() -> None:
    with pytest.raises(ValueError):
        mmh3.mmh3_x86_128_utupledigest(b"hello, world", -1)
    with pytest.raises(ValueError):
        mmh3.mmh3_x86_128_utupledigest(b"hello, world", 2**32)


# Hasher-object constructors: same invalid-argument contract as the one-shot
# functions.
@no_type_check
def test_mmh3_32_init_raises_typeerror() -> None:
    with pytest.raises(TypeError):
        mmh3.mmh3_32(b"hello, world", 42, 1234)
    with pytest.raises(TypeError):
        mmh3.mmh3_32("hello, world")
    with pytest.raises(TypeError):
        mmh3.mmh3_32(b"hello, world", "42")
    with pytest.raises(TypeError):
        mmh3.mmh3_32([1, 2, 3], 42)


@no_type_check
def test_mmh3_32_init_raises_valueerror() -> None:
    with pytest.raises(ValueError):
        mmh3.mmh3_32(b"hello, world", -1)
    with pytest.raises(ValueError):
        mmh3.mmh3_32(b"hello, world", 2**32)


@no_type_check
def test_mmh3_x64_128_init_raises_typeerror() -> None:
    with pytest.raises(TypeError):
        mmh3.mmh3_x64_128(b"hello, world", 42, 1234)
    with pytest.raises(TypeError):
        mmh3.mmh3_x64_128("hello, world")
    with pytest.raises(TypeError):
        mmh3.mmh3_x64_128(b"hello, world", "42")
    with pytest.raises(TypeError):
        mmh3.mmh3_x64_128([1, 2, 3], 42)


@no_type_check
def test_mmh3_x64_128_init_raises_valueerror() -> None:
    with pytest.raises(ValueError):
        mmh3.mmh3_x64_128(b"hello, world", -1)
    with pytest.raises(ValueError):
        mmh3.mmh3_x64_128(b"hello, world", 2**32)


@no_type_check
def test_mmh3_x86_128_init_raises_typeerror() -> None:
    with pytest.raises(TypeError):
        mmh3.mmh3_x86_128(b"hello, world", 42, 1234)
    with pytest.raises(TypeError):
        mmh3.mmh3_x86_128("hello, world")
    with pytest.raises(TypeError):
        mmh3.mmh3_x86_128(b"hello, world", "42")
    with pytest.raises(TypeError):
        mmh3.mmh3_x86_128([1, 2, 3], 42)


@no_type_check
def test_mmh3_x86_128_init_raises_valueerror() -> None:
    with pytest.raises(ValueError):
        mmh3.mmh3_x86_128(b"hello, world", -1)
    with pytest.raises(ValueError):
        mmh3.mmh3_x86_128(b"hello, world", 2**32)


================================================
FILE: tests/test_mmh3.py
================================================
# pylint: disable=missing-module-docstring,missing-function-docstring
import sys

import mmh3
from helper import u32_to_s32


def test_hash() -> None:
    assert mmh3.hash("foo") == -156908512
    # Test vectors devised by Ian Boyd
    # https://stackoverflow.com/a/31929528
    assert mmh3.hash(b"", seed=0) == 0
    assert mmh3.hash(b"", seed=1) == 0x514E28B7
    assert mmh3.hash(b"", seed=0xFFFFFFFF) == u32_to_s32(0x81F16F39)
    assert mmh3.hash(b"\x21\x43\x65\x87", 0) == u32_to_s32(0xF55B516B)
    assert mmh3.hash(b"\x21\x43\x65\x87", 0x5082EDEE) == u32_to_s32(0x2362F9DE)
    assert mmh3.hash(b"\x21\x43\x65", 0) == u32_to_s32(0x7E4A8634)
    assert mmh3.hash(b"\x21\x43", 0) == u32_to_s32(0xA0F7B07A)
    assert mmh3.hash(b"\x21", 0) == u32_to_s32(0x72661CF4)
    assert mmh3.hash(b"\xff\xff\xff\xff", 0) == u32_to_s32(0x76293B50)
    assert mmh3.hash(b"\x00\x00\x00\x00", 0) == u32_to_s32(0x2362F9DE)
    assert mmh3.hash(b"\x00\x00\x00", 0) == u32_to_s32(0x85F0B427)
    assert mmh3.hash(b"\x00\x00", 0) == u32_to_s32(0x30F4C306)
    assert mmh3.hash(b"\x00", 0) == u32_to_s32(0x514E28B7)
    assert mmh3.hash("aaaa", 0x9747B28C) == u32_to_s32(0x5A97808A)
    assert mmh3.hash("aaa", 0x9747B28C) == u32_to_s32(0x283E0130)
    assert mmh3.hash("aa", 0x9747B28C) == u32_to_s32(0x5D211726)
    assert mmh3.hash("a", 0x9747B28C) == u32_to_s32(0x7FA09EA6)
    assert mmh3.hash("abcd", 0x9747B28C) == u32_to_s32(0xF0478627)
    assert mmh3.hash("abc", 0x9747B28C) == u32_to_s32(0xC84A62DD)
    assert mmh3.hash("ab", 0x9747B28C) == u32_to_s32(0x74875592)
    assert mmh3.hash("a", 0x9747B28C) == u32_to_s32(0x7FA09EA6)
    assert mmh3.hash("Hello, world!", 0x9747B28C) == u32_to_s32(0x24884CBA)
    assert mmh3.hash("ππππππππ".encode(), 0x9747B28C) == u32_to_s32(0xD58063C1)
    assert mmh3.hash("a" * 256, 0x9747B28C) == u32_to_s32(0x37405BDC)
    assert mmh3.hash("abc", 0) == u32_to_s32(0xB3DD93FA)
    assert mmh3.hash(
        "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq", 0
    ) == u32_to_s32(0xEE925B90)
    assert mmh3.hash(
        "The quick brown fox jumps over the lazy dog", 0x9747B28C
    ) == u32_to_s32(0x2FA826CD)


def test_hash_unsigned() -> None:
    assert mmh3.hash("foo", signed=False) == 4138058784
    # Test vectors devised by Ian Boyd
    # https://stackoverflow.com/a/31929528
    assert mmh3.hash(b"", seed=0, signed=False) == 0
    assert mmh3.hash(b"", seed=1, signed=False) == 0x514E28B7
    assert mmh3.hash(b"", seed=0xFFFFFFFF, signed=False) == 0x81F16F39
    assert mmh3.hash(b"\x21\x43\x65\x87", 0, signed=False) == 0xF55B516B
    assert mmh3.hash(b"\x21\x43\x65\x87", 0x5082EDEE, signed=False) == 0x2362F9DE
    assert mmh3.hash(b"\x21\x43\x65", 0, signed=False) == 0x7E4A8634
    assert mmh3.hash(b"\x21\x43", 0, signed=False) == 0xA0F7B07A
    assert mmh3.hash(b"\x21", 0, signed=False) == 0x72661CF4
    assert mmh3.hash(b"\xff\xff\xff\xff", 0, signed=False) == 0x76293B50
    assert mmh3.hash(b"\x00\x00\x00\x00", 0, signed=False) == 0x2362F9DE
    assert mmh3.hash(b"\x00\x00\x00", 0, signed=False) == 0x85F0B427
    assert mmh3.hash(b"\x00\x00", 0, signed=False) == 0x30F4C306
    assert mmh3.hash(b"\x00", 0, signed=False) == 0x514E28B7
    assert mmh3.hash("aaaa", 0x9747B28C, signed=False) == 0x5A97808A
    assert mmh3.hash("aaa", 0x9747B28C, signed=False) == 0x283E0130
    assert mmh3.hash("aa", 0x9747B28C, signed=False) == 0x5D211726
    assert mmh3.hash("a", 0x9747B28C, signed=False) == 0x7FA09EA6
    assert mmh3.hash("abcd", 0x9747B28C, signed=False) == 0xF0478627
    assert mmh3.hash("abc", 0x9747B28C, signed=False) == 0xC84A62DD
    assert mmh3.hash("ab", 0x9747B28C, signed=False) == 0x74875592
    assert mmh3.hash("a", 0x9747B28C, signed=False) == 0x7FA09EA6
    assert mmh3.hash("Hello, world!", 0x9747B28C, signed=False) == 0x24884CBA
    assert mmh3.hash("ππππππππ".encode(), 0x9747B28C, signed=False) == 0xD58063C1
    assert mmh3.hash("a" * 256, 0x9747B28C, signed=False) == 0x37405BDC
    assert mmh3.hash("abc", 0, signed=False) == 0xB3DD93FA
    assert (
        mmh3.hash(
            "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq", 0, signed=False
        )
        == 0xEE925B90
    )
    assert (
        mmh3.hash(
            "The quick brown fox jumps over the lazy dog", 0x9747B28C, signed=False
        )
        == 0x2FA826CD
    )
    # NOTE(review): the following assert is an exact duplicate of the one
    # above — candidate for removal.
    assert (
        mmh3.hash(
            "The quick brown fox jumps over the lazy dog", 0x9747B28C, signed=False
        )
        == 0x2FA826CD
    )


# NOTE(review): test_hash2 is an exact duplicate of test_hash — candidate for
# removal.
def test_hash2() -> None:
    assert mmh3.hash("foo") == -156908512
    # Test vectors devised by Ian Boyd
    # https://stackoverflow.com/a/31929528
    assert mmh3.hash(b"", seed=0) == 0
    assert mmh3.hash(b"", seed=1) == 0x514E28B7
    assert mmh3.hash(b"", seed=0xFFFFFFFF) == u32_to_s32(0x81F16F39)
    assert mmh3.hash(b"\x21\x43\x65\x87", 0) == u32_to_s32(0xF55B516B)
    assert mmh3.hash(b"\x21\x43\x65\x87", 0x5082EDEE) == u32_to_s32(0x2362F9DE)
    assert mmh3.hash(b"\x21\x43\x65", 0) == u32_to_s32(0x7E4A8634)
    assert mmh3.hash(b"\x21\x43", 0) == u32_to_s32(0xA0F7B07A)
    assert mmh3.hash(b"\x21", 0) == u32_to_s32(0x72661CF4)
    assert mmh3.hash(b"\xff\xff\xff\xff", 0) == u32_to_s32(0x76293B50)
    assert mmh3.hash(b"\x00\x00\x00\x00", 0) == u32_to_s32(0x2362F9DE)
    assert mmh3.hash(b"\x00\x00\x00", 0) == u32_to_s32(0x85F0B427)
    assert mmh3.hash(b"\x00\x00", 0) == u32_to_s32(0x30F4C306)
    assert mmh3.hash(b"\x00", 0) == u32_to_s32(0x514E28B7)
    assert mmh3.hash("aaaa", 0x9747B28C) == u32_to_s32(0x5A97808A)
    assert mmh3.hash("aaa", 0x9747B28C) == u32_to_s32(0x283E0130)
    assert mmh3.hash("aa", 0x9747B28C) == u32_to_s32(0x5D211726)
    assert mmh3.hash("a", 0x9747B28C) == u32_to_s32(0x7FA09EA6)
    assert mmh3.hash("abcd", 0x9747B28C) == u32_to_s32(0xF0478627)
    assert mmh3.hash("abc", 0x9747B28C) == u32_to_s32(0xC84A62DD)
    assert mmh3.hash("ab", 0x9747B28C) == u32_to_s32(0x74875592)
    assert mmh3.hash("a", 0x9747B28C) == u32_to_s32(0x7FA09EA6)
    assert mmh3.hash("Hello, world!", 0x9747B28C) == u32_to_s32(0x24884CBA)
    assert mmh3.hash("ππππππππ".encode(), 0x9747B28C) == u32_to_s32(0xD58063C1)
    assert mmh3.hash("a" * 256, 0x9747B28C) == u32_to_s32(0x37405BDC)
    assert mmh3.hash("abc", 0) == u32_to_s32(0xB3DD93FA)
    assert mmh3.hash(
        "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq", 0
    ) == u32_to_s32(0xEE925B90)
    assert mmh3.hash(
        "The quick brown fox jumps over the lazy dog", 0x9747B28C
    ) == u32_to_s32(0x2FA826CD)


def test_hash_from_buffer() -> None:
    mview = memoryview(b"foo")
    assert mmh3.hash_from_buffer(mview) == -156908512
    assert mmh3.hash_from_buffer(mview, signed=False) == 4138058784


def test_hash_bytes() -> None:
    assert mmh3.hash_bytes("foo") == b"aE\xf5\x01W\x86q\xe2\x87}\xba+\xe4\x87\xaf~"
    assert (
        mmh3.hash_bytes("foo", 0, True)
        == b"aE\xf5\x01W\x86q\xe2\x87}\xba+\xe4\x87\xaf~"
    )
    # Test vectors from https://github.com/PeterScott/murmur3/blob/master/test.c
    assert mmh3.hash_bytes("Hello, world!", 123, x64arch=False) == (
        0x9E37C886A41621625A1AACD761C9129E
    ).to_bytes(16, "little")
    assert mmh3.hash_bytes("", 123, x64arch=False) == (
        0x26F3E79926F3E79926F3E799FEDC5245
    ).to_bytes(16, "little")


def test_hash64() -> None:
    assert mmh3.hash64("foo") == (-2129773440516405919, 9128664383759220103)
    assert mmh3.hash64("foo", signed=False) == (
        16316970633193145697,
        9128664383759220103,
    )
    assert mmh3.hash64("The quick brown fox jumps over the lazy dog", 0x9747B28C) == (
        8325606756057297185,
        -484854449282476315,
    )
    assert mmh3.hash64(
        "The quick brown fox jumps over the lazy dog", 0x9747B28C, signed=False
    ) == (
        8325606756057297185,
        17961889624427075301,
    )
    assert mmh3.hash64("foo", signed=False, x64arch=True) == (
        16316970633193145697,
        9128664383759220103,
    )
    # Test vectors from https://github.com/PeterScott/murmur3/blob/master/test.c
    assert mmh3.hash64("Hello, world!", 123, signed=False, x64arch=False) == (
        0x5A1AACD761C9129E,
        0x9E37C886A4162162,
    )
    assert mmh3.hash64("", 123, False, False) == (
        0x26F3E799FEDC5245,
        0x26F3E79926F3E799,
    )


def test_hash128() -> None:
    assert mmh3.hash128("foo") == 168394135621993849475852668931176482145
    assert mmh3.hash128("foo", 42) == 215966891540331383248189432718888555506
    assert (
        mmh3.hash128("foo", 42, signed=False) == 215966891540331383248189432718888555506
    )
    assert (
        mmh3.hash128("foo", 42, signed=True) == -124315475380607080215185174712879655950
    )
    # Test vectors from https://github.com/PeterScott/murmur3/blob/master/test.c
    assert (
        mmh3.hash128("Hello, world!", 123, signed=False, x64arch=False)
        == 0x9E37C886A41621625A1AACD761C9129E
    )
    assert mmh3.hash128("", 123, False, False) == 0x26F3E79926F3E79926F3E799FEDC5245


def test_mmh3_32_digest() -> None:
    # Same Ian Boyd vectors as test_hash, checked against the little-endian
    # 4-byte digest form.
    assert mmh3.mmh3_32_digest(b"") == b"\0\0\0\0"
    assert mmh3.mmh3_32_digest(b"", 0) == b"\0\0\0\0"
    assert mmh3.mmh3_32_digest(b"\x21\x43\x65\x87", 0) == (0xF55B516B).to_bytes(
        4, "little"
    )
    assert mmh3.mmh3_32_digest(b"\x21\x43\x65\x87", u32_to_s32(0x5082EDEE)) == (
        0x2362F9DE
    ).to_bytes(4, "little")
    assert mmh3.mmh3_32_digest(b"\x21\x43\x65", 0) == (0x7E4A8634).to_bytes(4, "little")
    assert mmh3.mmh3_32_digest(b"\x21\x43", 0) == (0xA0F7B07A).to_bytes(4, "little")
    assert mmh3.mmh3_32_digest(b"\x21", 0) == (0x72661CF4).to_bytes(4, "little")
    assert mmh3.mmh3_32_digest(b"\xff\xff\xff\xff", 0) == (0x76293B50).to_bytes(
        4, "little"
    )
    assert mmh3.mmh3_32_digest(b"\x00\x00\x00\x00", 0) == (0x2362F9DE).to_bytes(
        4, "little"
    )
    assert mmh3.mmh3_32_digest(b"\x00\x00\x00", 0) == (0x85F0B427).to_bytes(4, "little")
    assert mmh3.mmh3_32_digest(b"\x00\x00", 0) == (0x30F4C306).to_bytes(4, "little")
    assert mmh3.mmh3_32_digest(b"\x00", 0) == (0x514E28B7).to_bytes(4, "little")
    assert mmh3.mmh3_32_digest(b"aaaa", 0x9747B28C) == (0x5A97808A).to_bytes(
        4, "little"
    )
    assert mmh3.mmh3_32_digest(b"aaa", 0x9747B28C) == (0x283E0130).to_bytes(4, "little")
    assert mmh3.mmh3_32_digest(b"aa", 0x9747B28C) == (0x5D211726).to_bytes(4, "little")
    assert mmh3.mmh3_32_digest(b"a", 0x9747B28C) == (0x7FA09EA6).to_bytes(4, "little")
    assert mmh3.mmh3_32_digest(b"abcd", 0x9747B28C) == (0xF0478627).to_bytes(
        4, "little"
    )
    assert mmh3.mmh3_32_digest(b"abc", 0x9747B28C) == (0xC84A62DD).to_bytes(4, "little")
    assert mmh3.mmh3_32_digest(b"ab", 0x9747B28C) == (0x74875592).to_bytes(4, "little")
    assert mmh3.mmh3_32_digest(b"a", 0x9747B28C) == (0x7FA09EA6).to_bytes(4, "little")
    assert mmh3.mmh3_32_digest(b"Hello, world!", 0x9747B28C) == (0x24884CBA).to_bytes(
        4, "little"
    )
    assert mmh3.mmh3_32_digest("ππππππππ".encode(), 0x9747B28C) == (
        0xD58063C1
    ).to_bytes(4, "little")
    assert mmh3.mmh3_32_digest(b"a" * 256, 0x9747B28C) == (0x37405BDC).to_bytes(
        4, "little"
    )
    assert mmh3.mmh3_32_digest(b"abc", 0) == (0xB3DD93FA).to_bytes(4, "little")
    assert mmh3.mmh3_32_digest(
        b"abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq", 0
    ) == (0xEE925B90).to_bytes(4, "little")
    assert mmh3.mmh3_32_digest(
        b"The quick brown fox jumps over the lazy dog", 0x9747B28C
    ) == (0x2FA826CD).to_bytes(4, "little")
    # Buffer-protocol inputs: bytearray and (sliced) memoryview.
    assert mmh3.mmh3_32_digest(bytearray(b"aaaa"), 0x9747B28C) == (0x5A97808A).to_bytes(
        4, "little"
    )
    v = memoryview(b"aaaa")
    assert mmh3.mmh3_32_digest(v, 0x9747B28C) == (0x5A97808A).to_bytes(4, "little")
    assert mmh3.mmh3_32_digest(v[1:3], 0x9747B28C) == (0x5D211726).to_bytes(4, "little")


def test_mmh3_sintdigest() -> None:
    assert mmh3.mmh3_32_sintdigest(b"foo") == -156908512
    assert mmh3.mmh3_32_sintdigest(bytearray(b"foo")) == -156908512
    assert mmh3.mmh3_32_sintdigest(memoryview(b"foobar")[0:3]) == -156908512
    # Test vectors devised by Ian Boyd
    # https://stackoverflow.com/a/31929528
    assert mmh3.mmh3_32_sintdigest(b"", 0) == 0
    assert mmh3.mmh3_32_sintdigest(b"", 1) == 0x514E28B7
    assert mmh3.mmh3_32_sintdigest(b"", 0xFFFFFFFF) == u32_to_s32(0x81F16F39)
    assert mmh3.mmh3_32_sintdigest(b"\x21\x43\x65\x87", 0) == u32_to_s32(0xF55B516B)
    assert mmh3.mmh3_32_sintdigest(
        b"\x21\x43\x65\x87", u32_to_s32(0x5082EDEE)
    ) == u32_to_s32(0x2362F9DE)
    assert mmh3.mmh3_32_sintdigest(b"\x21\x43\x65", 0) == u32_to_s32(0x7E4A8634)
    assert mmh3.mmh3_32_sintdigest(b"\x21\x43", 0) == u32_to_s32(0xA0F7B07A)
    assert mmh3.mmh3_32_sintdigest(b"\x21", 0) == u32_to_s32(0x72661CF4)
    assert mmh3.mmh3_32_sintdigest(b"\xff\xff\xff\xff", 0) == u32_to_s32(0x76293B50)
    assert mmh3.mmh3_32_sintdigest(b"\x00\x00\x00\x00", 0) == u32_to_s32(0x2362F9DE)
    assert mmh3.mmh3_32_sintdigest(b"\x00\x00\x00", 0) == u32_to_s32(0x85F0B427)
    assert mmh3.mmh3_32_sintdigest(b"\x00\x00", 0) == u32_to_s32(0x30F4C306)
    assert mmh3.mmh3_32_sintdigest(b"\x00", 0) == u32_to_s32(0x514E28B7)
    assert mmh3.mmh3_32_sintdigest(b"aaaa", 0x9747B28C) == u32_to_s32(0x5A97808A)
    assert mmh3.mmh3_32_sintdigest(b"aaa", 0x9747B28C) == u32_to_s32(0x283E0130)
    assert mmh3.mmh3_32_sintdigest(b"aa", 0x9747B28C) == u32_to_s32(0x5D211726)
    assert mmh3.mmh3_32_sintdigest(b"a", 0x9747B28C) == u32_to_s32(0x7FA09EA6)
    assert mmh3.mmh3_32_sintdigest(b"abcd", 0x9747B28C) == u32_to_s32(0xF0478627)
    assert mmh3.mmh3_32_sintdigest(b"abc", 0x9747B28C) == u32_to_s32(0xC84A62DD)
    assert mmh3.mmh3_32_sintdigest(b"ab", 0x9747B28C) == u32_to_s32(0x74875592)
    assert mmh3.mmh3_32_sintdigest(b"a", 0x9747B28C) == u32_to_s32(0x7FA09EA6)
    assert mmh3.mmh3_32_sintdigest(b"Hello, world!", 0x9747B28C) == u32_to_s32(
        0x24884CBA
    )
    assert mmh3.mmh3_32_sintdigest("ππππππππ".encode(), 0x9747B28C) == u32_to_s32(
        0xD58063C1
    )
    assert mmh3.mmh3_32_sintdigest(b"a" * 256, 0x9747B28C) == u32_to_s32(0x37405BDC)
    assert mmh3.mmh3_32_sintdigest(b"abc", 0) == u32_to_s32(0xB3DD93FA)
    assert mmh3.mmh3_32_sintdigest(
        b"abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq", 0
    ) == u32_to_s32(0xEE925B90)
    assert mmh3.mmh3_32_sintdigest(
        b"The quick brown fox jumps over the lazy dog", 0x9747B28C
    ) == u32_to_s32(0x2FA826CD)


def test_mmh3_uintdigest() -> None:
    assert mmh3.mmh3_32_uintdigest(b"foo") == 4138058784
    assert mmh3.mmh3_32_uintdigest(bytearray(b"foo")) == 4138058784
    assert mmh3.mmh3_32_uintdigest(memoryview(b"foobar")[0:3]) == 4138058784
    # Test vectors devised by Ian Boyd
    # https://stackoverflow.com/a/31929528
    assert mmh3.mmh3_32_uintdigest(b"") == 0
    assert mmh3.mmh3_32_uintdigest(b"", 0) == 0
    assert mmh3.mmh3_32_uintdigest(b"", 1) == 0x514E28B7
    assert mmh3.mmh3_32_uintdigest(b"", 0xFFFFFFFF) == 0x81F16F39
    assert mmh3.mmh3_32_uintdigest(b"\x21\x43\x65\x87", 0) == 0xF55B516B
    assert mmh3.mmh3_32_uintdigest(b"\x21\x43\x65\x87", 0x5082EDEE) == 0x2362F9DE
    assert mmh3.mmh3_32_uintdigest(b"\x21\x43\x65", 0) == 0x7E4A8634
    assert mmh3.mmh3_32_uintdigest(b"\x21\x43", 0) == 0xA0F7B07A
    assert mmh3.mmh3_32_uintdigest(b"\x21", 0) == 0x72661CF4
    assert mmh3.mmh3_32_uintdigest(b"\xff\xff\xff\xff", 0) == 0x76293B50
    assert mmh3.mmh3_32_uintdigest(b"\x00\x00\x00\x00", 0) == 0x2362F9DE
    assert mmh3.mmh3_32_uintdigest(b"\x00\x00\x00", 0) == 0x85F0B427
    assert mmh3.mmh3_32_uintdigest(b"\x00\x00", 0) == 0x30F4C306
    assert mmh3.mmh3_32_uintdigest(b"\x00", 0) == 0x514E28B7
    assert mmh3.mmh3_32_uintdigest(b"aaaa", 0x9747B28C) == 0x5A97808A
    assert mmh3.mmh3_32_uintdigest(b"aaa", 0x9747B28C) == 0x283E0130
    assert mmh3.mmh3_32_uintdigest(b"aa", 0x9747B28C) == 0x5D211726
    assert mmh3.mmh3_32_uintdigest(b"a", 0x9747B28C) == 0x7FA09EA6
    assert mmh3.mmh3_32_uintdigest(b"abcd", 0x9747B28C) == 0xF0478627
    assert mmh3.mmh3_32_uintdigest(b"abc", 0x9747B28C) == 0xC84A62DD
    assert mmh3.mmh3_32_uintdigest(b"ab", 0x9747B28C) == 0x74875592
    assert mmh3.mmh3_32_uintdigest(b"a", 0x9747B28C) == 0x7FA09EA6
    assert mmh3.mmh3_32_uintdigest(b"Hello, world!", 0x9747B28C) == 0x24884CBA
    assert mmh3.mmh3_32_uintdigest("ππππππππ".encode(), 0x9747B28C) == 0xD58063C1
    assert mmh3.mmh3_32_uintdigest(b"a" * 256, 0x9747B28C) == 0x37405BDC
    assert mmh3.mmh3_32_uintdigest(b"abc", 0) == 0xB3DD93FA
    assert (
        mmh3.mmh3_32_uintdigest(
            b"abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq", 0
        )
        == 0xEE925B90
    )
    assert (
        mmh3.mmh3_32_uintdigest(
            b"The quick brown fox jumps over the lazy dog", 0x9747B28C
        )
        == 0x2FA826CD
    )
    # NOTE(review): the following assert is an exact duplicate of the one
    # above — candidate for removal.
    assert (
        mmh3.mmh3_32_uintdigest(
            b"The quick brown fox jumps over the lazy dog", 0x9747B28C
        )
        == 0x2FA826CD
    )


def test_mmh3_x64_128_digest() -> None:
    assert (
        mmh3.mmh3_x64_128_digest(b"foo")
        == b"aE\xf5\x01W\x86q\xe2\x87}\xba+\xe4\x87\xaf~"
    )
    assert (
        mmh3.mmh3_x64_128_digest(
            b"The quick brown fox jumps over the lazy dog", 0x9747B28C
        )
        == b"!1c\xd2;\x7f\x8as\xe5\x16\xc0~rsE\xf9"
    )
    # A memoryview slice must reflect mutations of the underlying buffer.
    v = bytearray(b"bar boo bar")
    mv = memoryview(v)
    v[4] = ord("f")
    assert (
        mmh3.mmh3_x64_128_digest(mv[4:7])
        == b"aE\xf5\x01W\x86q\xe2\x87}\xba+\xe4\x87\xaf~"
    )


def test_mmh3_x64_128_sintdigest() -> None:
    assert mmh3.mmh3_x64_128_sintdigest(b"") == 0
    assert (
        mmh3.mmh3_x64_128_sintdigest(
            b"The quick brown fox jumps over the lazy dog", 0x9747B28C
        )
        == -8943985938913228316176695348732677855
    )


def test_mmh3_x64_128_uintdigest() -> None:
    assert mmh3.mmh3_x64_128_uintdigest(b"") == 0
    assert (
        mmh3.mmh3_x64_128_uintdigest(b"foo", 42)
        == 215966891540331383248189432718888555506
    )


def test_mmh3_x64_128_stupledigest() -> None:
    assert mmh3.mmh3_x64_128_stupledigest(b"") == (0, 0)
    assert
mmh3.mmh3_x64_128_stupledigest( memoryview(b"The quick brown fox jumps over the lazy dog"), 0x9747B28C ) == ( 8325606756057297185, -484854449282476315, ) def test_mmh3_x64_128_utupledigest() -> None: assert mmh3.mmh3_x64_128_utupledigest(b"") == (0, 0) assert mmh3.mmh3_x64_128_utupledigest(memoryview(b"foo")) == ( 16316970633193145697, 9128664383759220103, ) def test_mmh3_x86_128_digest() -> None: assert mmh3.mmh3_x86_128_digest(b"", 123) == ( 0x26F3E79926F3E79926F3E799FEDC5245 ).to_bytes(16, "little") assert mmh3.mmh3_x86_128_digest(b"Hello, world!", 123) == ( 0x9E37C886A41621625A1AACD761C9129E ).to_bytes(16, "little") assert mmh3.mmh3_x86_128_digest(bytearray(b"Hello, world!"), 123) == ( 0x9E37C886A41621625A1AACD761C9129E ).to_bytes(16, "little") v = bytearray(b"hello, world!!!") mv = memoryview(v) v[0] = ord("H") assert mmh3.mmh3_x86_128_digest(mv[0:13], 123) == ( 0x9E37C886A41621625A1AACD761C9129E ).to_bytes(16, "little") def test_mmh3_x86_128_sintdigest() -> None: assert mmh3.mmh3_x64_128_sintdigest(b"") == 0 assert ( mmh3.mmh3_x64_128_sintdigest( b"The quick brown fox jumps over the lazy dog", 0x9747B28C ) == -8943985938913228316176695348732677855 ) def test_mmh3_x86_128_uintdigest() -> None: assert mmh3.mmh3_x64_128_uintdigest(b"", 0) == 0 # Test vector from https://github.com/PeterScott/murmur3/blob/master/test.c assert ( mmh3.mmh3_x86_128_uintdigest(b"Hello, world!", 123) == 0x9E37C886A41621625A1AACD761C9129E ) def test_mmh3_x86_128_stupledigest() -> None: assert mmh3.mmh3_x86_128_stupledigest(b"", 0) == (0, 0) assert mmh3.mmh3_x86_128_stupledigest( memoryview(b"The quick brown fox jumps over the lazy dog"), 0x9747B28C ) == ( 5528275682885686622, -3623575540584727908, ) def test_mmh3_x86_128_utupledigest() -> None: assert mmh3.mmh3_x86_128_utupledigest(b"", 0) == (0, 0) # Test vector from https://github.com/PeterScott/murmur3/blob/master/test.c assert mmh3.mmh3_x86_128_utupledigest(memoryview(b"Hello, world!"), 123) == ( 0x5A1AACD761C9129E, 
0x9E37C886A4162162, ) def test_64bit() -> None: if sys.maxsize < (1 << 32): # Skip this test under 32-bit environments return a = bytes(2**32 + 1) assert mmh3.hash(a) == -1710109261 assert ( mmh3.hash_bytes(a) == b"\x821\x93\x0c\xe7\xa8\x02\x9d\xe5 \xa6\xf9\xeb8\xd6\x0e" ) # from hex string "0xff00de" to integer def hex_to_int(hex_str: str) -> int: return int(hex_str, 16) ================================================ FILE: tests/test_mmh3_hasher.py ================================================ # pylint: disable=missing-module-docstring,missing-function-docstring import mmh3 from helper import u32_to_s32 def test_mmh3_32_digest() -> None: hasher = mmh3.mmh3_32() hasher.update(b"") assert hasher.digest() == b"\x00\x00\x00\x00" # Test vectors devised by Ian Boyd # https://stackoverflow.com/a/31929528 hasher = mmh3.mmh3_32(seed=0x9747B28C) hasher.update(b"Hello, world!") assert hasher.digest() == b"\xba\x4c\x88\x24" hasher = mmh3.mmh3_32(seed=0x9747B28C) hasher.update(b"Hello,") hasher.update(b" world!") assert hasher.digest() == b"\xba\x4c\x88\x24" hasher = mmh3.mmh3_32(b"", 0x9747B28C) hasher.update(b"Hello,") hasher.update(b" world!") assert hasher.digest() == b"\xba\x4c\x88\x24" hasher = mmh3.mmh3_32(b"Hello,", 0x9747B28C) hasher.update(b" world!") assert hasher.digest() == b"\xba\x4c\x88\x24" hasher = mmh3.mmh3_32(b"Hello,", seed=0x9747B28C) hasher.update(b" world!") assert hasher.digest() == b"\xba\x4c\x88\x24" def test_mmh3_32_sintdigest() -> None: hasher = mmh3.mmh3_32() hasher.update(b"foo") assert hasher.sintdigest() == -156908512 # Test vectors devised by Ian Boyd # https://stackoverflow.com/a/31929528 hasher = mmh3.mmh3_32() hasher.update(b"") assert hasher.sintdigest() == 0 hasher = mmh3.mmh3_32(seed=1) hasher.update(b"") assert hasher.sintdigest() == 0x514E28B7 hasher = mmh3.mmh3_32() hasher.update(b"\x21\x43") hasher.update(b"\x65") assert hasher.sintdigest() == u32_to_s32(0x7E4A8634) hasher = mmh3.mmh3_32() hasher.update(b"\x21\x43\x65\x87") 
    assert hasher.sintdigest() == u32_to_s32(0xF55B516B)

    # Splitting the same input across updates must not change the digest.
    hasher = mmh3.mmh3_32()
    hasher.update(b"\x21\x43")
    hasher.update(b"\x65\x87")
    assert hasher.sintdigest() == u32_to_s32(0xF55B516B)

    hasher = mmh3.mmh3_32(seed=0x9747B28C)
    hasher.update(b"Hello, world!")
    assert hasher.sintdigest() == u32_to_s32(0x24884CBA)

    hasher = mmh3.mmh3_32(seed=0x9747B28C)
    hasher.update(b"Hello,")
    hasher.update(b" world!")
    assert hasher.sintdigest() == u32_to_s32(0x24884CBA)

    hasher = mmh3.mmh3_32(seed=0x9747B28C)
    hasher.update(b"The quick brown fo")
    hasher.update(b"x jumps over the lazy dog")
    assert hasher.sintdigest() == u32_to_s32(0x2FA826CD)


def test_mmh3_32_uintdigest() -> None:
    hasher = mmh3.mmh3_32()
    hasher.update(b"foo")
    assert hasher.uintdigest() == 4138058784

    # Test vectors devised by Ian Boyd
    # https://stackoverflow.com/a/31929528
    hasher = mmh3.mmh3_32()
    hasher.update(b"")
    assert hasher.uintdigest() == 0

    hasher = mmh3.mmh3_32(seed=1)
    hasher.update(b"")
    assert hasher.uintdigest() == 0x514E28B7

    hasher = mmh3.mmh3_32()
    hasher.update(b"\x21\x43")
    hasher.update(b"\x65")
    assert hasher.uintdigest() == 0x7E4A8634

    hasher = mmh3.mmh3_32()
    hasher.update(b"\x21\x43\x65\x87")
    assert hasher.uintdigest() == 0xF55B516B

    hasher = mmh3.mmh3_32()
    hasher.update(b"\x21\x43")
    hasher.update(b"\x65\x87")
    assert hasher.uintdigest() == 0xF55B516B

    hasher = mmh3.mmh3_32(seed=0x9747B28C)
    hasher.update(b"Hello, world!")
    assert hasher.uintdigest() == 0x24884CBA

    hasher = mmh3.mmh3_32(seed=0x9747B28C)
    hasher.update(b"Hello,")
    hasher.update(b" world!")
    assert hasher.uintdigest() == 0x24884CBA

    hasher = mmh3.mmh3_32(seed=0x9747B28C)
    hasher.update(b"The quick brown fo")
    hasher.update(b"x jumps over the lazy dog")
    assert hasher.uintdigest() == 0x2FA826CD


def test_mmh3_32_copy() -> None:
    # A copied hasher must share history up to the copy point but evolve
    # independently afterwards.
    hasher = mmh3.mmh3_32(seed=0x9747B28C)
    hasher.update(b"The quick brown fox")
    hasher2 = hasher.copy()
    hasher.update(b" jumps over the lazy dog")
    assert hasher.uintdigest() == 0x2FA826CD
    hasher2.update(b" jumps over the lazy dog")
    assert hasher2.uintdigest() == 0x2FA826CD


def test_mmh3_x64_128_basic_ops() -> None:
    hasher = mmh3.mmh3_x64_128()
    assert hasher.digest_size == 16
    assert hasher.block_size == 32
    assert hasher.name == "mmh3_x64_128"


def test_mmh3_x64_128_digest() -> None:
    hasher = mmh3.mmh3_x64_128()
    hasher.update(b"foo")
    assert hasher.digest() == b"aE\xf5\x01W\x86q\xe2\x87}\xba+\xe4\x87\xaf~"

    hasher = mmh3.mmh3_x64_128(seed=0x9747B28C)
    hasher.update(b"The quick brown fox jumps over the lazy dog")
    assert hasher.digest() == b"!1c\xd2;\x7f\x8as\xe5\x16\xc0~rsE\xf9"

    # Constructor-supplied data must be equivalent to an initial update().
    hasher = mmh3.mmh3_x64_128(b"", 0x9747B28C)
    hasher.update(b"The quick brown fox jumps over the lazy dog")
    assert hasher.digest() == b"!1c\xd2;\x7f\x8as\xe5\x16\xc0~rsE\xf9"

    hasher = mmh3.mmh3_x64_128(b"The quick brown ", seed=0x9747B28C)
    hasher.update(b"fox jumps over the lazy dog")
    assert hasher.digest() == b"!1c\xd2;\x7f\x8as\xe5\x16\xc0~rsE\xf9"

    hasher = mmh3.mmh3_x64_128(b"The quick brown ", 0x9747B28C)
    hasher.update(b"fox jumps over the lazy dog")
    assert hasher.digest() == b"!1c\xd2;\x7f\x8as\xe5\x16\xc0~rsE\xf9"


def test_mmh3_x64_128_sintdigest() -> None:
    hasher = mmh3.mmh3_x64_128()
    hasher.update(b"")
    assert hasher.sintdigest() == 0

    hasher = mmh3.mmh3_x64_128(seed=0x9747B28C)
    hasher.update(b"The quick brown fox jumps over the lazy dog")
    assert hasher.sintdigest() == -8943985938913228316176695348732677855

    hasher = mmh3.mmh3_x64_128(seed=0x9747B28C)
    hasher.update(b"The quick brown fox j")
    hasher.update(b"umps over the lazy dog")
    assert hasher.sintdigest() == -8943985938913228316176695348732677855


def test_mmh3_x64_128_uintdigest() -> None:
    hasher = mmh3.mmh3_x64_128()
    hasher.update(b"")
    assert hasher.uintdigest() == 0

    hasher = mmh3.mmh3_x64_128(seed=1)
    hasher.update(b"")
    assert hasher.uintdigest() == 108177238965372658051732455265379769525

    hasher = mmh3.mmh3_x64_128()
    hasher.update(b"foo")
    assert hasher.uintdigest() == 168394135621993849475852668931176482145

    hasher = mmh3.mmh3_x64_128()
    hasher.update(b"fo")
    hasher.update(b"o")
    assert hasher.uintdigest() == 168394135621993849475852668931176482145

    hasher = mmh3.mmh3_x64_128()
    hasher.update(b"fooo")
    assert hasher.uintdigest() == 93757880664175803030724836966881520758

    hasher = mmh3.mmh3_x64_128()
    hasher.update(b"fooofooo")
    assert hasher.uintdigest() == 211983152696995059280678248292944636041

    hasher = mmh3.mmh3_x64_128()
    hasher.update(b"fooo")
    hasher.update(b"fooo")
    assert hasher.uintdigest() == 211983152696995059280678248292944636041

    hasher = mmh3.mmh3_x64_128()
    hasher.update(b"fooofoooo")
    assert hasher.uintdigest() == 338423359992422647011971677127905553798

    hasher = mmh3.mmh3_x64_128()
    hasher.update(b"fooo")
    hasher.update(b"foooo")
    assert hasher.uintdigest() == 338423359992422647011971677127905553798

    hasher = mmh3.mmh3_x64_128(seed=0x9747B28C)
    hasher.update(b"The quick brown fox jumps over the lazy dog")
    assert hasher.uintdigest() == 331338380982025235147197912083035533601

    # Split points below exercise the internal 16-byte block buffering at
    # several offsets.
    hasher = mmh3.mmh3_x64_128(seed=0x9747B28C)
    hasher.update(b"T")
    hasher.update(b"he quick brown fox jumps over the lazy dog")
    assert hasher.uintdigest() == 331338380982025235147197912083035533601

    hasher = mmh3.mmh3_x64_128(seed=0x9747B28C)
    hasher.update(b"The quic")  # 8 bytes
    hasher.update(b"k brown fox jumps over the lazy dog")
    assert hasher.uintdigest() == 331338380982025235147197912083035533601

    hasher = mmh3.mmh3_x64_128(seed=0x9747B28C)
    hasher.update(b"The quick")
    hasher.update(b" brown fox jumps over the lazy dog")
    assert hasher.uintdigest() == 331338380982025235147197912083035533601


def test_mmh3_x64_128_stupledigest() -> None:
    hasher = mmh3.mmh3_x64_128()
    hasher.update(b"")
    assert hasher.stupledigest() == (0, 0)

    hasher = mmh3.mmh3_x64_128(seed=0x9747B28C)
    hasher.update(b"The quick brown fox jumps over the lazy dog")
    assert hasher.stupledigest() == (8325606756057297185, -484854449282476315)

    hasher = mmh3.mmh3_x64_128(seed=0x9747B28C)
    hasher.update(b"The quic")
    hasher.update(b"k brown fox jumps over the lazy dog")
    assert hasher.stupledigest() == (8325606756057297185, -484854449282476315)


def test_mmh3_x64_128_utupledigest() -> None:
    hasher = mmh3.mmh3_x64_128()
    hasher.update(b"")
    assert hasher.utupledigest() == (0, 0)

    hasher = mmh3.mmh3_x64_128(seed=0x9747B28C)
    hasher.update(b"The quick brown fox jumps over the lazy dog")
    assert hasher.utupledigest() == (8325606756057297185, 17961889624427075301)

    hasher = mmh3.mmh3_x64_128(seed=0x9747B28C)
    hasher.update(b"The quic")
    hasher.update(b"k brown fox jumps over the lazy dog")
    assert hasher.utupledigest() == (8325606756057297185, 17961889624427075301)


def test_mmh3_x64_128_copy() -> None:
    hasher = mmh3.mmh3_x64_128(seed=0x9747B28C)
    hasher.update(b"The quick brown fox")
    hasher2 = hasher.copy()
    hasher.update(b" jumps over the lazy dog")
    assert hasher.digest() == b"!1c\xd2;\x7f\x8as\xe5\x16\xc0~rsE\xf9"
    hasher2.update(b" jumps over the lazy dog")
    assert hasher2.digest() == b"!1c\xd2;\x7f\x8as\xe5\x16\xc0~rsE\xf9"


def test_mmh3_x86_128_basic_ops() -> None:
    hasher = mmh3.mmh3_x86_128()
    assert hasher.digest_size == 16
    assert hasher.block_size == 32
    assert hasher.name == "mmh3_x86_128"


def test_mmh3_x86_128_digest() -> None:
    hasher = mmh3.mmh3_x86_128()
    hasher.update(b"")
    assert (
        hasher.digest()
        == b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
    )

    hasher = mmh3.mmh3_x86_128(seed=1)
    hasher.update(b"")
    assert hasher.digest() == b"\xec\xad\xc4\x88\xb9\x01\xd2T\xb9\x01\xd2T\xb9\x01\xd2T"

    hasher = mmh3.mmh3_x86_128()
    hasher.update(b"foo")
    assert hasher.digest() == b"%\x1b|We%\xb6`e%\xb6`e%\xb6`"

    hasher = mmh3.mmh3_x86_128(seed=0x9747B28C)
    hasher.update(b"The quick brown")  # 15 bytes
    assert (
        hasher.digest() == b"2\xc3\n\xdaW\xc2\xcb\xa9\xc4\xbe\x12\xb9\xdc\x01\xe1\x8e"
    )

    hasher = mmh3.mmh3_x86_128(seed=0x9747B28C)
    hasher.update(b"The quick brown ")  # 16 bytes
    assert hasher.digest() == b"u\xb6\xf9\x07\xf5|\x93,\x0e\xf5\xf1\xf0k\x98\x83\x19"

    hasher = mmh3.mmh3_x86_128(seed=0x9747B28C)
    hasher.update(b"The quick brown")  # 15 bytes
    hasher.update(b" fox jumps over the lazy dog")
    assert hasher.digest() == b"^\xd5\xd4\x8aqa\xb8L\x9c:\xa7\x8e>y\xb6\xcd"

    hasher = mmh3.mmh3_x86_128(seed=0x9747B28C)
    hasher.update(b"The quick brown ")  # 16 bytes
    hasher.update(b"fox jumps over the lazy dog")
    assert hasher.digest() == b"^\xd5\xd4\x8aqa\xb8L\x9c:\xa7\x8e>y\xb6\xcd"

    hasher = mmh3.mmh3_x86_128(seed=0x9747B28C)
    hasher.update(b"The quick brown fox jumps over the lazy dog")
    assert hasher.digest() == b"^\xd5\xd4\x8aqa\xb8L\x9c:\xa7\x8e>y\xb6\xcd"

    hasher = mmh3.mmh3_x86_128(seed=0x9747B28C)
    hasher.update(b"The quick brown fox ju")
    hasher.update(b"mps ove")
    hasher.update(b"r the la")
    hasher.update(b"zy dog")
    assert hasher.digest() == b"^\xd5\xd4\x8aqa\xb8L\x9c:\xa7\x8e>y\xb6\xcd"

    hasher = mmh3.mmh3_x86_128(b"", 0x9747B28C)
    hasher.update(b"The quick brown fox ju")
    hasher.update(b"mps ove")
    hasher.update(b"r the la")
    hasher.update(b"zy dog")
    assert hasher.digest() == b"^\xd5\xd4\x8aqa\xb8L\x9c:\xa7\x8e>y\xb6\xcd"

    hasher = mmh3.mmh3_x86_128(b"The quick brown fox ju", seed=0x9747B28C)
    hasher.update(b"mps ove")
    hasher.update(b"r the la")
    hasher.update(b"zy dog")
    assert hasher.digest() == b"^\xd5\xd4\x8aqa\xb8L\x9c:\xa7\x8e>y\xb6\xcd"

    hasher = mmh3.mmh3_x86_128(b"The quick brown fox ju", 0x9747B28C)
    hasher.update(b"mps ove")
    hasher.update(b"r the la")
    hasher.update(b"zy dog")
    assert hasher.digest() == b"^\xd5\xd4\x8aqa\xb8L\x9c:\xa7\x8e>y\xb6\xcd"


def test_mmh3_x86_128_sintdigest() -> None:
    hasher = mmh3.mmh3_x86_128(seed=0x9747B28C)
    hasher.update(b"The quick brown fox jumps over the lazy dog")
    assert hasher.sintdigest() == -66843170628920214366208380873156012706


def test_mmh3_x86_128_uintdigest() -> None:
    hasher = mmh3.mmh3_x86_128(seed=0x9747B28C)
    hasher.update(b"The quick brown fox jumps over the lazy dog")
    assert hasher.uintdigest() == 273439196292018249097166226558612198750


def test_mmh3_x86_128_stupledigest() -> None:
    hasher = mmh3.mmh3_x86_128(seed=0x9747B28C)
    hasher.update(b"The quick brown fox jumps over the lazy dog")
    assert hasher.stupledigest() == (5528275682885686622, -3623575540584727908)


def test_mmh3_x86_128_utupledigest() -> None:
    hasher = mmh3.mmh3_x86_128(seed=0x9747B28C)
    hasher.update(b"The quick brown fox jumps over the lazy dog")
    assert hasher.utupledigest() == (5528275682885686622, 14823168533124823708)


================================================
FILE: tox.ini
================================================
[tox]
requires = tox>=4
envlist = lint, type, py{310,311,312,313,314,314t}

[testenv]
description = run unit tests
commands_pre = uv pip install ".[test]"
commands = pytest {posargs}

[testenv:lint]
description = run linters with formatting
skip_install = true
allowlist_externals =
    find
    npx
commands_pre = uv pip install ".[lint]"
commands =
    ruff format .
    ruff check --fix .
    find ./src/mmh3 -name '*.[ch]' -exec clang-format -i {} +
    npx prettier --write .
    pylint --recursive=y .
    npx markdownlint --config .markdown-lint.yml \
        --ignore-path .gitignore **/*.md
    codespell
    actionlint

[testenv:type]
description = run type checks
commands_pre = uv pip install ".[test,type]"
commands = mypy --strict tests

[testenv:docs]
description = run documentation build
allowlist_externals = make
commands_pre = uv pip install ".[docs]"
commands =
    make -C docs clean
    make -C docs html

[testenv:build_cfiles]
allowlist_externals =
    find
    git
commands_pre = uv pip install ".[lint]"
commands =
    git submodule update --init
    python util/refresh.py
    find ./src/mmh3 -name '*.[ch]' -exec clang-format -i {} +

[testenv:benchmark]
description = run benchmarks
commands_pre = uv pip install ".[benchmark]"
commands = python benchmark/benchmark.py {posargs}

[testenv:plot]
description = plot benchmark results
commands_pre = uv pip install ".[benchmark,plot]"
commands = python benchmark/plot_graph.py {posargs}

================================================
FILE: util/FILE_HEADER
================================================
/***
 * This file is under MIT Hajime Senuma, just like other files.
 * See LICENSE for details.
* * It was originally written by Austin Appleby in C++ under the public domain, * but ported to PEP 7 C for Python 3.6 and later by the mmh3 project. * * Any issues should be reported to https://github.com/hajimes/mmh3/issues. * * The following is the original public domain notice by Austin Appleby. */ //----------------------------------------------------------------------------- // MurmurHash3 was written by Austin Appleby, and is placed in the public // domain. The author hereby disclaims copyright to this source code. ================================================ FILE: util/refresh.py ================================================ # pylint: disable=missing-function-docstring """A script to generate Murmurhash3 C files from the original C++ source.""" # For forward references from __future__ import annotations import os import re import textwrap from collections.abc import Callable ### # Simple classes to handle the transformation of the original code. # class MMH3Source: """A data class to represent the original source code of MurmurHash3. Lines to be retrieved are hard-coded, as the original code is effectively frozen. 
""" def __init__(self, code: str) -> None: self._code_lines = code.split("\n") @property def note_comment(self) -> str: return "\n".join(self._code_lines[4:8]) @property def header_include(self) -> str: return "\n".join(self._code_lines[9:10]) @property def macros(self) -> str: return "\n".join(self._code_lines[11:50]) @property def getblock_functions(self) -> str: return "\n".join(self._code_lines[50:64]) @property def finalization_mixes(self) -> str: return "\n".join(self._code_lines[64:91]) @property def body(self) -> str: return "\n".join(self._code_lines[91:336]) @property def finalization_x86_128(self) -> str: return "\n".join(self._code_lines[233:246]) @property def finalization_x64_128(self) -> str: return "\n".join(self._code_lines[318:329]) @property def constants_x86_128(self) -> str: return "\n".join(self._code_lines[160:164]) @property def constants_x64_128(self) -> str: return "\n".join(self._code_lines[263:265]) class MMH3Header: """A data class to represent the original header code of MurmurHash3. Lines to be retrieved are hard-coded, as the original code is effectively frozen. 
""" def __init__(self, code: str) -> None: self._code_lines = code.split("\n") @property def header_guards_begin(self) -> str: return "\n".join(self._code_lines[4:7]) @property def stdint(self) -> str: return "\n".join(self._code_lines[7:26]) @property def declarations(self) -> str: return "\n".join(self._code_lines[26:36]) @property def header_guards_end(self) -> str: return "\n".join(self._code_lines[36:37]) class MMH3CodeBuilder: """A builder class to generate the new MurmurHash3 C code.""" def __init__(self) -> None: self._code: list[tuple[str, list[Callable[[str], str]]]] = [] def add( self, subcode: str, transforms: list[Callable[[str], str]] | None = None ) -> MMH3CodeBuilder: if transforms is None: transforms = [] self._code.append((subcode, transforms)) return self def build(self) -> str: new_code = "" for subcode, transforms in self._code: for tr in transforms: subcode = tr(subcode) new_code += subcode + "\n\n" return new_code ### # The following functions are used to transform the original MurmurHash3 code. # def append_python_directives(subcode: str) -> str: """Append Python.h, as well as a macro definition to handle 64-bit data. Args: subcode (str): The code to be appended. Returns: str: The appended code. """ subcode += "\n\n" subcode += textwrap.dedent("""\ // To handle 64-bit data; see https://docs.python.org/3/c-api/arg.html #ifndef PY_SSIZE_T_CLEAN #define PY_SSIZE_T_CLEAN #endif #include """) return subcode def append_byteswap_header(subcode: str) -> str: """Append a header to the code that includes byteswap.h if the system is big endian. Args: subcode (str): The code to be appended. Returns: str: The appended code. """ subcode += "\n" subcode += textwrap.dedent("""\ #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) #include #endif """) return subcode def introduce_py_ssize_t(subcode: str) -> str: """Use Py_ssize_t instead of int as the index type. Py_ssize_t is the type used by Python to represent the size of objects. 
    It is required to handle 64-bit data in Python extensions. See
    https://docs.python.org/3/c-api/intro.html#c.Py_ssize_t and
    https://peps.python.org/pep-0353/

    Args:
        subcode (str): The code to be transformed.

    Returns:
        str: The transformed code.
    """
    # Plain substring replacement: each pair is [original, replacement].
    # These patterns must match the vendored SMHasher source verbatim.
    transformations = [
        ["int len", "Py_ssize_t len"],
        ["const int nblocks", "const Py_ssize_t nblocks"],
        ["for(int i", "for(Py_ssize_t i"],
    ]

    for tr in transformations:
        subcode = subcode.replace(tr[0], tr[1])

    return subcode


def transform_getblocks(subcode: str) -> str:
    """Revise getblock functions so that they handle big endian and 64-bit data.

    Args:
        subcode (str): The code to be transformed.

    Returns:
        str: The transformed code.
    """
    # pylint: disable=invalid-name
    transformations = [
        ["FORCE_INLINE", "static FORCE_INLINE"],
        ["int i", "Py_ssize_t i"],
    ]

    for tr in transformations:
        subcode = subcode.replace(tr[0], tr[1])

    # Replacement body for each getblock function. The "\1" backreference
    # carries the width suffix captured below (e.g. "32"/"64") into the
    # matching bswap_NN call. NOTE: whitespace inside this literal becomes
    # the generated C indentation.
    BYTE_SWAP_IF_BIG_ENDIAN = textwrap.dedent("""\
        #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
            return bswap_\\1(p[i]);
        #else
            return p[i];
        #endif
    """)

    # Rewrite every "getblock...{...}" function body wholesale with the
    # endian-aware version; DOTALL lets ".*?" span the multi-line body.
    subcode = re.sub(
        r"getblock(.*?)(\s\(.*?\{\n).*?\}",
        "getblock\\1\\2" + BYTE_SWAP_IF_BIG_ENDIAN + "}",
        subcode,
        flags=re.DOTALL | re.MULTILINE,
    )

    return subcode


def transform_finalization_mixes(subcode: str) -> str:
    """Revise the finalization operations in MurmurHash3.

    Args:
        subcode (str): The code to be transformed.

    Returns:
        str: The transformed code.
    """
    transformations = [
        ["FORCE_INLINE", "static FORCE_INLINE"],
        ["int i", "Py_ssize_t i"],
    ]

    for tr in transformations:
        subcode = subcode.replace(tr[0], tr[1])

    return subcode


def transform_x86_128_return(subcode: str) -> str:
    """Revise the return block of MurmurHash3_x86_128 so that it handles big
    endian.

    Args:
        subcode (str): The code to be transformed.

    Returns:
        str: The transformed code.
    """
    # pylint: disable=invalid-name
    # On big-endian targets the four 32-bit words are stored in swapped
    # pair order (h2,h1,h4,h3); "\1" re-emits the captured original
    # little-endian store sequence unchanged in the #else branch.
    BYTE_SWAP_IF_BIG_ENDIAN = textwrap.dedent("""\
        #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
            ((uint32_t *)out)[0] = h2;
            ((uint32_t *)out)[1] = h1;
            ((uint32_t *)out)[2] = h4;
            ((uint32_t *)out)[3] = h3;
        #else
            \\1
        #endif
    """)

    # Capture the whole original four-store sequence from "[0] = h1;" to
    # "[3] = h4;" and wrap it in the endian-conditional block above.
    subcode = re.sub(
        r"(\(\(uint32_t\*\)out\)\[0\] = h1;[\s\S]*\(\(uint32_t\*\)out\)\[3\] = h4;)",
        BYTE_SWAP_IF_BIG_ENDIAN,
        subcode,
        flags=re.DOTALL | re.MULTILINE,
    )

    return subcode


def expand_win_stdint_typedefs(subcode: str) -> str:
    """Delineate int type definitions for the older versions of the VS compiler.

    Args:
        subcode (str): The code to be transformed.

    Returns:
        str: The transformed code.
    """
    # pylint: disable=invalid-name
    MSC_STDINT_TYPEDEFS = textwrap.dedent("""\
        typedef signed __int8 int8_t;
        typedef signed __int32 int32_t;
        typedef signed __int64 int64_t;
        typedef unsigned __int8 uint8_t;
        typedef unsigned __int32 uint32_t;
        typedef unsigned __int64 uint64_t;
    """)

    # Replace the original span from the first "typedef unsigned char" line
    # through the trailing "uint64_t;" with the full signed+unsigned set.
    return re.sub(
        r"typedef unsigned char(.*)uint64_t;",
        MSC_STDINT_TYPEDEFS,
        subcode,
        flags=re.DOTALL,
    )


def append_mur_macros(subcode: str) -> str:
    """Append building blocks for multiply and rotate (MUR) operations.

    These functions are used by mmh3 hashers. In future updates, they may be
    also used by one-shot hash functions, although performance tests must be
    employed before such refactoring.

    Args:
        subcode (str): The code to be transformed.

    Returns:
        str: The transformed code.
    """
    subcode += "\n\n"
    subcode += textwrap.dedent("""\
        //-----------------------------------------------------------------------------
        // Building blocks for multiply and rotate (MUR) operations.
        // Names are taken from Google Guava's implementation
    """)
    subcode += "\n"
    subcode += textwrap.dedent("""\
        static FORCE_INLINE uint32_t
        mixK1(uint32_t k1)
        {
            const uint32_t c1 = 0xcc9e2d51;
            const uint32_t c2 = 0x1b873593;

            k1 *= c1;
            k1 = ROTL32(k1, 15);
            k1 *= c2;
            return k1;
        }
    """)
    subcode += textwrap.dedent("""\
        static FORCE_INLINE uint32_t
        mixH1(uint32_t h1, const uint32_t h2, const uint8_t shift,
              const uint32_t c1)
        {
            h1 = ROTL32(h1, shift);
            h1 += h2;
            h1 = h1 * 5 + c1;
            return h1;
        }
    """)
    subcode += textwrap.dedent("""\
        static FORCE_INLINE uint64_t
        mixK_x64_128(uint64_t k1, const uint8_t shift, const uint64_t c1,
                     const uint64_t c2)
        {
            k1 *= c1;
            k1 = ROTL64(k1, shift);
            k1 *= c2;
            return k1;
        }
    """)
    subcode += textwrap.dedent("""\
        static FORCE_INLINE uint64_t
        mixK1_x64_128(uint64_t k1)
        {
            const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
            const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);

            k1 *= c1;
            k1 = ROTL64(k1, 31);
            k1 *= c2;
            return k1;
        }
    """)
    subcode += textwrap.dedent("""\
        static FORCE_INLINE uint64_t
        mixK2_x64_128(uint64_t k2)
        {
            const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
            const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);

            k2 *= c2;
            k2 = ROTL64(k2, 33);
            k2 *= c1;
            return k2;
        }
    """)
    subcode += textwrap.dedent("""\
        static FORCE_INLINE uint64_t
        mixH_x64_128(uint64_t h1, uint64_t h2, const uint8_t shift,
                     const uint32_t c)
        {
            h1 = ROTL64(h1, shift);
            h1 += h2;
            h1 = h1 * 5 + c;
            return h1;
        }
    """)
    # NOTE(review): return type is uint64_t although the mix operates on a
    # uint32_t value — presumably an implicit widening on return; confirm
    # whether uint32_t was intended.
    subcode += textwrap.dedent("""\
        static FORCE_INLINE uint64_t
        mixK_x86_128(uint32_t k, const uint8_t shift, const uint32_t c1,
                     const uint32_t c2)
        {
            k *= c1;
            k = ROTL32(k, shift);
            k *= c2;
            return k;
        }
    """)

    return subcode


def generate_hasher_digest_x86_128_pre(subcode: str) -> str:
    """Generate the first part of the digest function for x86_128.

    Args:
        subcode (str): The constants in mmh3_x86_128.

    Returns:
        str: The first part of the digest function for x86_128.
    """
    hasher_digests = "\n\n"
    # Opening brace only; generate_hasher_digest_x86_128_main emits the
    # matching "}" so the two parts must be added in order by the caller.
    hasher_digests += textwrap.dedent("""\
        static FORCE_INLINE void
        digest_x86_128_impl(uint32_t h1, uint32_t h2, uint32_t h3, uint32_t h4,
                            const uint32_t k1, const uint32_t k2,
                            const uint32_t k3, const uint32_t k4,
                            const Py_ssize_t len, const char *out)
        {
    """)
    hasher_digests += subcode + "\n"

    return hasher_digests


def generate_hasher_digest_x86_128_main(subcode: str) -> str:
    """Generate the main part of the digest function for x86_128.

    Args:
        subcode (str): The finalization code in mmh3 x86_128.

    Returns:
        str: The main part of the digest function for x86_128.
    """
    hasher_digests = ""
    hasher_digests += textwrap.dedent("""\
        h1 ^= mixK_x86_128(k1, 15, c1, c2);
        h2 ^= mixK_x86_128(k2, 16, c2, c3);
        h3 ^= mixK_x86_128(k3, 17, c3, c4);
        h4 ^= mixK_x86_128(k4, 18, c4, c1);
    """)
    hasher_digests += subcode + "\n"
    hasher_digests += textwrap.dedent("""\
        #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
            ((uint32_t *)out)[0] = bswap_32(h1);
            ((uint32_t *)out)[1] = bswap_32(h2);
            ((uint32_t *)out)[2] = bswap_32(h3);
            ((uint32_t *)out)[3] = bswap_32(h4);
        #else
            ((uint32_t *)out)[0] = h1;
            ((uint32_t *)out)[1] = h2;
            ((uint32_t *)out)[2] = h3;
            ((uint32_t *)out)[3] = h4;
        #endif
    """)
    # Closes the brace opened by generate_hasher_digest_x86_128_pre.
    hasher_digests += "\n}"

    return hasher_digests


def generate_hasher_digest_x64_128(subcode: str) -> str:
    """Generate the digest function for x64_128.

    Args:
        subcode (str): The finalization code in mmh3 x64_128.

    Returns:
        str: The digest function for x64_128.
    """
    hasher_digests = "\n\n"
    hasher_digests += textwrap.dedent("""\
        //-----------------------------------------------------------------------------
        // Finalization function
    """)
    hasher_digests += "\n"
    hasher_digests += textwrap.dedent("""\
        static FORCE_INLINE void
        digest_x64_128_impl(uint64_t h1, uint64_t h2, const uint64_t k1,
                            const uint64_t k2, const Py_ssize_t len,
                            const char *out)
        {
    """)
    hasher_digests += textwrap.dedent("""\
        h1 ^= mixK1_x64_128(k1);
        h2 ^= mixK2_x64_128(k2);
    """)
    hasher_digests += subcode + "\n"
    hasher_digests += textwrap.dedent("""\
        #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
            ((uint64_t *)out)[0] = bswap_64(h1);
            ((uint64_t *)out)[1] = bswap_64(h2);
        #else
            ((uint64_t *)out)[0] = h1;
            ((uint64_t *)out)[1] = h2;
        #endif
    """)
    hasher_digests += "\n}"

    return hasher_digests


def fix_non_win_force_inline(subcode: str) -> str:
    """Fix the FORCE_INLINE macro so that it works on old GCC and RHEL.

    Based on a commit from Micha Gorelick (@mynameisfiber).
    https://github.com/hajimes/mmh3/pull/1

    Args:
        subcode (str): The code to be transformed.

    Returns:
        str: The transformed code.
    """
    # pylint: disable=invalid-name
    NON_WIN_FORCE_INLINE_ORIGINAL = (
        "#define FORCE_INLINE inline __attribute__((always_inline))"
    )

    NON_WIN_FORCE_INLINE_REVISED = textwrap.dedent("""\
        #if ((__GNUC__ > 4) || (__GNUC__ == 4 && GNUC_MINOR >= 4))
        /* gcc version >= 4.4 4.1 = RHEL 5, 4.4 = RHEL 6. Don't inline for RHEL 5 gcc
         * which is 4.1*/
        #define FORCE_INLINE inline __attribute__((always_inline))
        #else
        #define FORCE_INLINE
        #endif
    """)

    return subcode.replace(NON_WIN_FORCE_INLINE_ORIGINAL, NON_WIN_FORCE_INLINE_REVISED)


def force_inline_force_inline(subcode: str) -> str:
    """Force inline to use static FORCE_INLINE.

    Args:
        subcode (str): The code to be transformed.

    Returns:
        str: The transformed code.
    """
    # Only rewrites declarations whose "inline " starts a line (MULTILINE).
    return re.sub(r"^inline ", "static FORCE_INLINE ", subcode, flags=re.MULTILINE)


def lowercase_function_names(subcode: str) -> str:
    """Lowercase function names.

    Purely for style.

    Args:
        subcode (str): The code to be transformed.

    Returns:
        str: The transformed code.
    """
    function_names = [
        "MurmurHash3_x86_32",
        "MurmurHash3_x86_128",
        "MurmurHash3_x64_128",
    ]

    for fn in function_names:
        subcode = subcode.replace(fn, fn.lower())

    return subcode


if __name__ == "__main__":
    file_path = os.path.realpath(__file__)
    dir_path = os.path.dirname(file_path)

    # Inputs: the vendored SMHasher sources plus the shared license header.
    original_source_path = os.path.join(dir_path, "smhasher/src/MurmurHash3.cpp")
    original_header_path = os.path.join(dir_path, "smhasher/src/MurmurHash3.h")

    NEW_SOURCE_NAME = "murmurhash3.c"
    NEW_HEADER_NAME = "murmurhash3.h"
    FILE_HEADER_NAME = "FILE_HEADER"

    new_source_path = os.path.join(dir_path, "../src/mmh3", NEW_SOURCE_NAME)
    new_header_path = os.path.join(dir_path, "../src/mmh3", NEW_HEADER_NAME)
    file_header_path = os.path.join(dir_path, FILE_HEADER_NAME)

    with (
        open(original_source_path, encoding="utf-8") as source_file,
        open(original_header_path, encoding="utf-8") as header_file,
        open(file_header_path, encoding="utf-8") as file_header_file,
    ):
        source = MMH3Source(source_file.read())
        header = MMH3Header(header_file.read())
        file_header = file_header_file.read()

    # Assemble murmurhash3.c: each add() takes a section of the original
    # source and an optional pipeline of transformation callables.
    new_source_builder = MMH3CodeBuilder()
    new_source_builder.add(file_header)
    new_source_builder.add(source.note_comment)
    new_source_builder.add(source.header_include, [str.lower])
    new_source_builder.add(
        source.body,
        [introduce_py_ssize_t, transform_x86_128_return, lowercase_function_names],
    )

    # Assemble murmurhash3.h. Section order matters: e.g. the MUR macros
    # must precede the digest functions that the x86_128 parts emit in two
    # halves (pre, then main).
    new_header_builder = MMH3CodeBuilder()
    new_header_builder.add(file_header)
    new_header_builder.add(
        header.header_guards_begin,
        [append_python_directives, append_byteswap_header],
    )
    new_header_builder.add(
        header.stdint,
        [expand_win_stdint_typedefs],
    )
    new_header_builder.add(
        source.macros,
        [fix_non_win_force_inline, force_inline_force_inline],
    )
    new_header_builder.add(
        source.getblock_functions,
        [transform_getblocks],
    )
    new_header_builder.add(
        "",
        [append_mur_macros],
    )
    new_header_builder.add(
        source.finalization_mixes,
        [transform_finalization_mixes],
    )
    new_header_builder.add(
        source.finalization_x64_128,
        [generate_hasher_digest_x64_128],
    )
    new_header_builder.add(
        source.constants_x86_128,
        [generate_hasher_digest_x86_128_pre],
    )
    new_header_builder.add(
        source.finalization_x86_128,
        [generate_hasher_digest_x86_128_main],
    )
    new_header_builder.add(
        header.declarations,
        [lowercase_function_names, introduce_py_ssize_t],
    )
    new_header_builder.add(header.header_guards_end)

    with open(new_source_path, "w", encoding="utf-8") as f:
        f.write(new_source_builder.build())

    with open(new_header_path, "w", encoding="utf-8") as f:
        f.write(new_header_builder.build())