Repository: thorn-oss/perception Branch: main Commit: b17dcb841435 Files: 75 Total size: 366.2 KB Directory structure: gitextract_4qwyzu2o/ ├── .dockerignore ├── .git-blame-ignore-revs ├── .gitattributes ├── .github/ │ ├── dependabot.yaml │ └── workflows/ │ ├── ci.yaml │ ├── gh-pages.yaml │ └── release.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── build.py ├── docs/ │ ├── api/ │ │ ├── benchmarking.rst │ │ ├── hashers.rst │ │ ├── index.rst │ │ └── tools.rst │ ├── conf.py │ ├── examples/ │ │ ├── benchmarking.rst │ │ ├── deduplication.rst │ │ ├── detecting_csam.rst │ │ └── index.rst │ ├── index.rst │ └── requirements.txt ├── perception/ │ ├── __init__.py │ ├── approximate_deduplication/ │ │ ├── __init__.py │ │ ├── _graph_backend.py │ │ ├── debug.py │ │ ├── index.py │ │ └── serve.py │ ├── benchmarking/ │ │ ├── __init__.py │ │ ├── common.py │ │ ├── extensions.pyx │ │ ├── image.py │ │ ├── image_transforms.py │ │ ├── video.py │ │ └── video_transforms.py │ ├── extensions.pyx │ ├── hashers/ │ │ ├── __init__.py │ │ ├── hasher.py │ │ ├── image/ │ │ │ ├── __init__.py │ │ │ ├── average.py │ │ │ ├── dhash.py │ │ │ ├── opencv.py │ │ │ ├── pdq.py │ │ │ ├── phash.py │ │ │ └── wavelet.py │ │ ├── tools.py │ │ └── video/ │ │ ├── __init__.py │ │ ├── framewise.py │ │ └── tmk.py │ ├── local_descriptor_deduplication.py │ ├── py.typed │ ├── testing/ │ │ ├── __init__.py │ │ ├── images/ │ │ │ └── README.md │ │ ├── logos/ │ │ │ └── README.md │ │ └── videos/ │ │ ├── README.md │ │ ├── rgb.m4v │ │ ├── v1.m4v │ │ └── v2.m4v │ ├── tools.py │ └── utils.py ├── poetry.toml ├── pyproject.toml ├── setup.py └── tests/ ├── test_approximate_deduplication.py ├── test_benchmarking.py ├── test_hashers.py ├── test_local_descriptor_deduplication.py ├── test_tmk.py └── test_tools.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .dockerignore ================================================ notebooks .venv/ ================================================ FILE: .git-blame-ignore-revs ================================================ # Format with black 6c03f96a9335e548685ece233474125fe453c262 ================================================ FILE: .gitattributes ================================================ perception/_version.py export-subst ================================================ FILE: .github/dependabot.yaml ================================================ version: 2 updates: - package-ecosystem: "github-actions" directory: "/" schedule: # Check for updates to GitHub Actions every week. interval: "weekly" ================================================ FILE: .github/workflows/ci.yaml ================================================ name: ci on: push: branches: - "**" tags-ignore: - v* jobs: test: strategy: matrix: python-version: ["3.10", "3.11", "3.12", "3.13"] os: ["ubuntu-latest", "windows-latest", "macos-latest"] runs-on: ${{ matrix.os }} steps: - name: checkout uses: actions/checkout@v6 - name: Setup Poetry uses: abatilo/actions-poetry@v4 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} cache: poetry cache-dependency-path: poetry.lock - name: Setup FFMPEG uses: FedericoCarboni/setup-ffmpeg@v3 if: ${{ ! 
startsWith(matrix.os, 'macos') }} - name: Setup Dependencies with Homebrew if: startsWith(matrix.os, 'macos') run: | brew install llvm ffmpeg echo "CC=$(brew --prefix)/opt/llvm/bin/clang" >> $GITHUB_ENV echo "CXX=$(brew --prefix)/opt/llvm/bin/clang++" >> $GITHUB_ENV - name: Setup Project run: make init-project - name: Normalize OpenCV package run: | poetry run python -m pip uninstall -y opencv-python-headless poetry run python -m pip install --no-deps --force-reinstall opencv-contrib-python-headless - name: Run precommit run: make precommit ================================================ FILE: .github/workflows/gh-pages.yaml ================================================ name: Deploy Sphinx documentation to Pages on: push: branches: - dunnack/sphinx-to-github-pages - main paths: - .github/workflows/gh-pages.yaml - docs/** jobs: pages: runs-on: ubuntu-latest environment: name: github-pages url: ${{ steps.deployment.outputs.page_url }} permissions: contents: read pages: write id-token: write steps: - uses: actions/checkout@v6 with: fetch-depth: 0 - id: deployment uses: sphinx-notes/pages@v3 with: checkout: false documentation_path: docs requirements_path: docs/requirements.txt ================================================ FILE: .github/workflows/release.yaml ================================================ name: release on: release: types: [published] workflow_dispatch: jobs: build-wheels: runs-on: ${{ matrix.os }} strategy: matrix: python-version: ["3.10", "3.11", "3.12", "3.13"] os: ["ubuntu-latest", "windows-latest", "macos-latest"] name: Build for ${{ matrix.os }} on Python ${{ matrix.python-version }} steps: - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - name: Setup Poetry uses: abatilo/actions-poetry@v4 - name: Setup FFMPEG uses: FedericoCarboni/setup-ffmpeg@v3 if: ${{ ! 
startsWith(matrix.os, 'macos') }} - name: Setup Dependencies with Homebrew if: startsWith(matrix.os, 'macos') run: | brew install llvm ffmpeg echo "CC=$(brew --prefix)/opt/llvm/bin/clang" >> $GITHUB_ENV echo "CXX=$(brew --prefix)/opt/llvm/bin/clang++" >> $GITHUB_ENV - uses: actions/checkout@v6 with: # Full clone for version calculation fetch-depth: 0 fetch-tags: true ref: ${{ github.event_name == 'release' && format('refs/tags/{0}', github.event.release.tag_name) || github.ref }} - name: Build Project run: make build-wheel - uses: actions/upload-artifact@v7 with: name: package-wheels-${{ matrix.os }}-${{ matrix.python-version }} path: dist/* build-sdist: runs-on: ubuntu-latest name: Build sdist steps: - name: Set up Python uses: actions/setup-python@v6 with: python-version: "3.13" - name: Setup Poetry uses: abatilo/actions-poetry@v4 - uses: actions/checkout@v6 with: # Full clone for version calculation fetch-depth: 0 fetch-tags: true ref: ${{ github.event_name == 'release' && format('refs/tags/{0}', github.event.release.tag_name) || github.ref }} - name: Build Project run: make build-sdist - uses: actions/upload-artifact@v7 with: name: package-sdist path: dist/* publish: needs: [build-wheels, build-sdist] runs-on: ubuntu-latest if: ${{ github.repository_owner == 'thorn-oss' && github.event_name == 'release' }} steps: - uses: actions/checkout@v6 with: # Full clone for version calculation fetch-depth: 0 fetch-tags: true ref: refs/tags/${{ github.event.release.tag_name }} - uses: actions/setup-python@v6 with: python-version: "3.13" - name: Setup Poetry uses: abatilo/actions-poetry@v4 - name: Setup Dynamic Versioning run: poetry self add "poetry-dynamic-versioning[plugin]" - name: Download wheels uses: actions/download-artifact@v8 with: path: dist pattern: package-* merge-multiple: true - name: Load PyPI Token uses: 1password/load-secrets-action@v4 with: # Export loaded secrets as environment variables export-env: true env: OP_SERVICE_ACCOUNT_TOKEN: ${{ secrets.DATA_SCIENCE_OP_SERVICE_ACCOUNT_TOKEN }} POETRY_PYPI_TOKEN_PYPI: op://data-science-oss/perception-pypi-api-key/secret/value - name: Verify artifacts run: | mapfile -t artifacts < <(find dist -type f \( -name "*.whl" -o -name "*.tar.gz" \)) if [ ${#artifacts[@]} -eq 0 ]; then echo "No artifacts found in dist" exit 1 fi printf '%s\n' "${artifacts[@]}" if printf '%s\n' "${artifacts[@]}" | grep -E -- '-0\.0\.0([.-]|$)'; then echo "Refusing to publish placeholder version 0.0.0 artifacts" exit 1 fi - name: Publish package run: poetry publish -n ================================================ FILE: .gitignore ================================================ # MacOS stuff .DS_Store # Python artifacts *.egg-info # Cache .mypy_cache .pytest_cache __pycache__ .ipynb_checkpoints dist # Any temporary images or CSV files notebooks # Local environment .venv .python-version # Coverage file .coverage # Versioneer artifacts /versioneer.pyc # Build artifacts /build # Docs build artifacts /docs/_build # Remove .vscode folder .vscode # Extension artifacts *.c *.cpp *.so debug-image* ================================================ FILE: .pre-commit-config.yaml ================================================ # See https://pre-commit.com for more information # See https://pre-commit.com/hooks.html for more hooks repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.5.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer - id: check-yaml - id: check-added-large-files - repo: https://github.com/psf/black rev: 26.3.1 hooks: - id: black 
language_version: python3 - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. rev: v0.11.13 hooks: # Run the linter. - id: ruff args: [ --fix ] - repo: https://github.com/pre-commit/mirrors-mypy rev: v1.8.0 hooks: - id: mypy
================================================ FILE: .readthedocs.yaml ================================================ version: 2 # Build documentation in the docs/ directory with Sphinx sphinx: configuration: docs/conf.py formats: all # Installs the package and the docs requirements. python: version: 3.9 install: - requirements: docs/requirements.txt - method: pip path: . system_packages: true
================================================ FILE: CHANGELOG.md ================================================ # Changelog All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [0.4.0] - 2020-10-17 This release switches from using false positive rates in benchmarking to reporting precision, which is more intuitive. ### Breaking changes All references to fpr_threshold now refer to precision_threshold. ### Bug fixes The PDQHash hasher now correctly returns the hash vector instead of the (vector, quality) tuple. ## [0.3.0] - 2020-04-27 This release adds significantly more support for video. ### Breaking changes - Previously, `read_video` returned `(frame, index, timestamp)` tuples where `index` reflected the index of the yielded frame (i.e., it always increased by exactly 1). It now reflects the index of the frame in the original video. This means that, if the requested framerate is higher than the encoded video framerate, this index may repeat the same value, indicating that we have repeated the same frame. ### Enhancements - We now include a `SimpleSceneDetection` hasher that can wrap other video hashers using scene detection. - `compute_metrics` is much faster now for integer-valued hashes that use a euclidean distance metric. - We now include an unsigned 8-bit integer version of `PHash`, called `PHashU8`. This provides a useful framewise hasher for averaging across frames (e.g., using TMK) while being more compact than `PHashF`. - We include more thorough support for benchmarking video hashes. ### Bug fixes - When using `hasher.vector_to_string` with hashers that return multiple hashes, the `hash_format` argument was not respected. - The `compute_threshold_recall` and `show_histograms` functions did not work properly when `grouping=[]`. ## [0.2.0] - 2019-12-20 This release adds more support for hashing videos (including TMK L1 and TMK L2). As part of that, it also includes a refactor to separate `benchmarking.BenchmarkDataset` and `benchmarking.BenchmarkTransforms` into image and video variants. ## [0.1.0] - 2019-11-04 Initial release
================================================ FILE: CODE_OF_CONDUCT.md ================================================ # Contributor Covenant Code of Conduct ## Our Pledge In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to make participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation.
## Our Standards Examples of behavior that contributes to creating a positive environment include: * Using welcoming and inclusive language * Being respectful of differing viewpoints and experiences * Gracefully accepting constructive criticism * Focusing on what is best for the community * Showing empathy towards other community members Examples of unacceptable behavior by participants include: * The use of sexualized language or imagery and unwelcome sexual attention or advances * Trolling, insulting/derogatory comments, and personal or political attacks * Public or private harassment * Publishing others' private information, such as a physical or electronic address, without explicit permission * Other conduct which could reasonably be considered inappropriate in a professional setting ## Our Responsibilities Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. ## Scope This Code of Conduct applies within all project spaces, and it also applies when an individual is representing the project or its community in public spaces. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. ## Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at conduct@thorn.org. All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. ## Attribution This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html [homepage]: https://www.contributor-covenant.org For answers to common questions about this code of conduct, see https://www.contributor-covenant.org/faq ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 https://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. 
For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. 
If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. 
You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS Copyright 2019 Thorn Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at https://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: MANIFEST.in ================================================ include perception/testing/images/* include perception/testing/videos/* include perception/testing/logos/* include perception/**/*.pyx include perception/*.pyx include perception/py.typed exclude tests/* ================================================ FILE: Makefile ================================================ TEST_SCOPE?=tests/ .PHONY: build build-wheel build-sdist verify-version init-project init test lint_check type_check format format_check precommit init-project: poetry install --all-extras init: init-project poetry run pre-commit install test: poetry run pytest $(TEST_SCOPE) lint_check: poetry run ruff check perception tests type_check: poetry run mypy perception format: poetry run black . format_check: poetry run black --check . || (echo '\nUnexpected format.' 
&& exit 1) precommit: poetry check make lint_check make type_check make format_check make test verify-version: @echo "Poetry: $$(poetry --version)" @echo "Poetry plugins:" poetry self show plugins @echo "Git describe: $$(git describe --tags --always)" @poetry self show plugins | grep -q "poetry-dynamic-versioning" build-wheel: poetry run pip -q install repairwheel poetry self add "poetry-dynamic-versioning[plugin]" $(MAKE) verify-version poetry build --format="wheel" --output="dist-tmp" poetry run repairwheel -o dist dist-tmp/*.whl @find dist -name "*.whl" -type f | sed -n "s/\(.*\)\.linux.*\.whl$$/& \1.whl/p" | xargs -r -n 2 mv # Fix wheel name @rm -rf dist-tmp build-sdist: poetry self add "poetry-dynamic-versioning[plugin]" $(MAKE) verify-version poetry build --format="sdist" --output="dist" build: build-wheel build-sdist
================================================ FILE: README.md ================================================ # perception ![ci](https://github.com/thorn-oss/perception/workflows/ci/badge.svg) `perception` provides flexible, well-documented, and comprehensively tested tooling for perceptual hashing research, development, and production use. See [the documentation](https://perception.thorn.engineering/en/latest/) for details. ## Background `perception` was initially developed at [Thorn](https://www.thorn.org) as part of our work to eliminate child sexual abuse material from the internet. For more information on the issue, check out [our CEO's TED talk](https://www.thorn.org/blog/time-is-now-eliminate-csam/). ## Getting Started ### Installation `pip install perception` ### Hashing Hashing with different functions is simple with `perception`. ```python from perception import hashers file1, file2 = 'test1.jpg', 'test2.jpg' hasher = hashers.PHash() hash1, hash2 = hasher.compute(file1), hasher.compute(file2) distance = hasher.compute_distance(hash1, hash2) ```
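Hashes are computed as strings, and each hasher defines its own distance metric, so distances are only comparable within a single hasher. Here is a minimal sketch (the file paths are placeholders) comparing a few of the bundled hashers on the same pair of files:

```python
from perception import hashers

file1, file2 = 'test1.jpg', 'test2.jpg'  # placeholder paths

# Each hasher computes its own hash and defines its own distance
# metric, so only compare distances produced by the same hasher.
for hasher in [hashers.PHash(), hashers.DHash(), hashers.AverageHash()]:
    hash1, hash2 = hasher.compute(file1), hasher.compute(file2)
    print(type(hasher).__name__, hasher.compute_distance(hash1, hash2))
```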
### Examples See below for end-to-end examples covering common use cases for perceptual hashes. - [Detecting child sexual abuse material](https://perception.thorn.engineering/en/latest/examples/detecting_csam.html) - [Deduplicating media](https://perception.thorn.engineering/en/latest/examples/deduplication.html) - [Benchmarking perceptual hashes](https://perception.thorn.engineering/en/latest/examples/benchmarking.html) ## Supported Hashing Algorithms `perception` currently ships with: - pHash (DCT hash) (`perception.hashers.PHash`) - Facebook's PDQ Hash (`perception.hashers.PDQ`) - dHash (difference hash) (`perception.hashers.DHash`) - aHash (average hash) (`perception.hashers.AverageHash`) - Marr-Hildreth (`perception.hashers.MarrHildreth`) - Color Moment (`perception.hashers.ColorMoment`) - Block Mean (`perception.hashers.BlockMean`) - wHash (wavelet hash) (`perception.hashers.WaveletHash`) ## Contributing To work on the project, start by doing the following. ```bash # Install local dependencies for # code completion, etc. make init ``` - To do a (close to) comprehensive check before committing code, you can use `make precommit`. To implement new features, please first file an issue proposing your change for discussion. To report problems, please file an issue with sample code, expected results, actual results, and a complete traceback. ## Alternatives There are other packages worth checking out to see if they meet your needs for perceptual hashing. Here are some examples. - [dedupe](https://github.com/dedupeio/dedupe) - [imagededup](https://idealo.github.io/imagededup/) - [ImageHash](https://github.com/JohannesBuchner/imagehash) - [PhotoHash](https://github.com/bunchesofdonald/photohash)
================================================ FILE: build.py ================================================ from Cython.Build import cythonize import numpy as np compiler_directives = {"language_level": 3, "embedsignature": True} def build(setup_kwargs): setup_kwargs.update( { "ext_modules": cythonize( "perception/**/extensions.pyx", compiler_directives=compiler_directives ), "include_dirs": [np.get_include()], } )
================================================ FILE: docs/api/benchmarking.rst ================================================ Benchmarking ************ .. autoclass:: perception.benchmarking.BenchmarkImageDataset :members: :inherited-members: .. autoclass:: perception.benchmarking.BenchmarkImageTransforms :members: :inherited-members: .. autoclass:: perception.benchmarking.BenchmarkVideoDataset :members: :inherited-members: .. autoclass:: perception.benchmarking.BenchmarkVideoTransforms :members: :inherited-members: .. autoclass:: perception.benchmarking.BenchmarkHashes :members: :inherited-members: Video Transforms ================ Transforming videos can be more complex, so we provide the following tools. .. automodule:: perception.benchmarking.video_transforms :members: get_simple_transform, get_black_frame_padding_transform, get_slideshow_transform
================================================ FILE: docs/api/hashers.rst ================================================ Hashers ******* All hashers inherit from the :code:`Hasher` class. .. autoclass:: perception.hashers.hasher.Hasher :members: Images ~~~~~~ All image hashers inherit from the :code:`ImageHasher` class. .. autoclass:: perception.hashers.hasher.ImageHasher :members: The following image hash functions are included in the package. .. automodule:: perception.hashers.image :members: :imported-members: Videos ~~~~~~ All video hashers inherit from the :code:`VideoHasher` class. .. autoclass:: perception.hashers.hasher.VideoHasher :members: The following video hash functions are included in the package. .. automodule:: perception.hashers.video :members: :imported-members: Tools ~~~~~ These utility functions are only used by the hashers but are documented here for completeness. .. automodule:: perception.hashers.tools :members:
================================================ FILE: docs/api/index.rst ================================================ API *** .. toctree:: :maxdepth: 2 :caption: Contents: hashers benchmarking tools
================================================ FILE: docs/api/tools.rst ================================================ Tools ***** .. automodule:: perception.tools :members:
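As a quick orientation before the generated reference, a minimal sketch of a deduplication call is shown below; the file paths are placeholders, and the :code:`(hasher, threshold)` pairing follows the deduplication example rather than being required values.

.. code-block:: python

    from perception import hashers, tools

    # Find pairs of likely-duplicate files: each (hasher, threshold) pair
    # marks two files as duplicates when their hash distance falls within
    # the threshold.
    duplicate_pairs = tools.deduplicate(
        files=['a.jpg', 'b.jpg', 'c.jpg'],
        hashers=[(hashers.PHash(hash_size=16), 0.2)],
    )
    for file1, file2 in duplicate_pairs:
        print(file1, file2)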
================================================ FILE: docs/conf.py ================================================ # -*- coding: utf-8 -*- # # Configuration file for the Sphinx documentation builder. # # This file does only contain a selection of the most common options. For a # full list see the documentation: # http://www.sphinx-doc.org/en/master/config # -- Path setup -------------------------------------------------------------- # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # # -- Project information ----------------------------------------------------- project = "perception" copyright = "2019, thorn" author = "thorn" # The short X.Y version version = "" # The full version, including alpha/beta/rc tags release = "" # -- General configuration --------------------------------------------------- # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ "sphinx.ext.autodoc", "sphinx.ext.imgmath", "sphinx.ext.napoleon", "sphinx_autodoc_typehints", "m2r", ] # The suffix(es) of source filenames. # You can specify multiple suffixes as a list of strings: # # source_suffix = ['.rst', '.md'] source_suffix = ".rst" # The master toctree document. master_doc = "index" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. language = None # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] # The name of the Pygments (syntax highlighting) style to use. pygments_style = None html_theme = "sphinx_rtd_theme" html_theme_options = {"navigation_depth": 4, "collapse_navigation": False}
================================================ FILE: docs/examples/benchmarking.rst ================================================ Benchmarking ************ This package provides a fair amount of infrastructure for benchmarking different hashers to evaluate their performance. Image Hashing ============= The below example does the following: - Download a benchmarking dataset (we provide a dataset with images that have compatible licensing for this example) - Load the dataset. If you are using your own datasets, you may wish to call `deduplicate` on it to ensure no duplicates are included. - Transform the dataset to generate synthetic images. - Define a new custom hasher that we want to evaluate. It's not very good -- but it demonstrates how you can evaluate your own custom hash functions. - Compute all the hashes. - Report metrics for each image category / hasher / transformation combination. .. code-block:: python import os import glob import zipfile import urllib.request import cv2 import albumentations import tabulate # Optional: Only used for generating tables for the Sphinx documentation import numpy as np from perception import benchmarking, hashers from perception.hashers.image.pdq import PDQHash urllib.request.urlretrieve( "https://thorn-perception.s3.amazonaws.com/thorn-perceptual-benchmark-v0.zip", "thorn-perceptual-benchmark-v0.zip" ) with zipfile.ZipFile('thorn-perceptual-benchmark-v0.zip') as f: f.extractall('.') # Load the dataset dataset = benchmarking.BenchmarkImageDataset.from_tuples(files=[ (filepath, filepath.split(os.path.sep)[-2]) for filepath in glob.glob( os.path.join('thorn-perceptual-benchmark-v0', '**', '*.jpg') ) ]) # Define the transforms we want to use for # evaluating hash quality.
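# The watermark transform below renders text onto a transparent RGBA
# placeholder and alpha-blends it onto the input image; the vignette
# transform builds a 2-D Gaussian mask from the outer product of two
# 1-D Gaussian kernels and darkens the image toward its edges.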
def watermark(image): fontScale = 5 thickness = 5 text = "TEST" fontFace = cv2.FONT_HERSHEY_SIMPLEX targetWidth = 0.2*image.shape[1] (textWidth, textHeight), _ = cv2.getTextSize( text=text, fontFace=fontFace, fontScale=fontScale, thickness=thickness ) fontScaleCorr = targetWidth / textWidth textHeight *= fontScaleCorr textWidth *= fontScaleCorr fontScale *= fontScaleCorr org = ( textHeight, image.shape[0] - textHeight ) org = tuple(map(int, org)) color = (0, 0, 0, 200) placeholder = cv2.putText( img=np.zeros(image.shape[:2] + (4, ), dtype='uint8'), text=text, org=org, color=color, fontFace=fontFace, fontScale=fontScale, thickness=thickness ).astype('float32') augmented = ( (image.astype('float32')[..., :3]*(255 - placeholder[..., 3:]) + placeholder[..., :3]*placeholder[..., 3:]) ) / 255 return augmented.astype('uint8') def vignette(image): height, width = image.shape[:2] a = cv2.getGaussianKernel(height, height/2) b = cv2.getGaussianKernel(width, width/2) c = (b.T*a)[..., np.newaxis] d = c/c.max() e = image*d return e.astype('uint8') transforms={ 'watermark': watermark, 'blur2': albumentations.GaussianBlur(sigma_limit=2.0, p=1), 'vignette': vignette, 'gamma2': albumentations.RandomGamma(gamma_limit=2, p=1), 'jpeg95': albumentations.ImageCompression(quality=95, p=1), 'pad0.2': albumentations.CropAndPad(percent=(0.2, 2), p=1), 'crop0.05': albumentations.CropAndPad(percent=-0.05, p=1), 'noise0.2': albumentations.GaussNoise(noise_scale_factor=0.2, p=1), 'rotate4': albumentations.Affine(rotate=4, p=1), 'noop': albumentations.NoOp(p=1), } # Compute the transformed versions of the images. # This takes a while but you can reload the # generated dataset without recomputing it (see next line). transformed = dataset.transform( transforms=transforms, storage_dir='transformed', errors="raise" ) # We don't actually have to do this, but it shows # how to reload the transformed dataset later. transformed = benchmarking.BenchmarkImageTransforms.load( path_to_zip_or_directory='transformed', verify_md5=False ) # Create a new hash that we want to evaluate. # perception will handle most of the plumbing but # we do have to specify a few things. class ShrinkHash(hashers.ImageHasher): """This is a simple hash to demonstrate how you can create your own hasher and compare it to others. It just shrinks images to 8x8 pixels and then flattens the result. """ # We have to let perception know # the shape and type of our hash. hash_length = 64 dtype = 'uint8' # We need to specify how distance is # computed between hashes. distance_metric = 'euclidean' def _compute(self, image): gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) resized = cv2.resize(gray, dsize=(8, 8)) return resized.flatten() hashers_dict = { 'ahash': hashers.AverageHash(hash_size=16), 'dhash': hashers.DHash(hash_size=16), 'pdq': PDQHash(), 'phash': hashers.PHash(hash_size=16), 'marrhildreth': hashers.MarrHildreth(), 'wavelet': hashers.WaveletHash(hash_size=16), 'blockmean': hashers.BlockMean(), 'shrinkhash': ShrinkHash() } # Compute the hashes hashes = transformed.compute_hashes(hashers=hashers_dict) # Get performance metrics (i.e., recall) for each hash function based on # a minimum precision threshold. Here we use 99.99%. precision_threshold = 99.99 # The metrics are just pandas dataframes. We use tabulate here to obtain the tables # formatted for the documentation.
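# compute_threshold_recall finds, for each group, the largest distance
# threshold at which precision stays at or above precision_threshold,
# and reports the recall achieved at that threshold.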
metrics = hashes.compute_threshold_recall(precision_threshold=precision_threshold).reset_index() print(tabulate.tabulate(metrics, showindex=False, headers=metrics.columns, tablefmt='rst')) metrics_by_transform = hashes.compute_threshold_recall(grouping=['transform_name'], precision_threshold=precision_threshold).reset_index() print(tabulate.tabulate(metrics_by_transform, showindex=False, headers=metrics_by_transform.columns, tablefmt='rst')) metrics_simple = hashes.compute_threshold_recall(grouping=[], precision_threshold=precision_threshold).reset_index() print(tabulate.tabulate(metrics_simple, showindex=False, headers=metrics_simple.columns, tablefmt='rst')) =========== ================ ============= ============ ======== =========== ============= category transform_name hasher_name threshold recall precision n_exemplars =========== ================ ============= ============ ======== =========== ============= paintings blur2 ahash 0.0078125 51.724 100 2204 paintings blur2 blockmean 0.0123967 85.753 100 2204 paintings blur2 dhash 0.105469 100 100 2204 paintings blur2 marrhildreth 0.0989583 100 100 2204 paintings blur2 pdq 0.117188 100 100 2204 paintings blur2 phash 0.0390625 100 100 2204 paintings blur2 shrinkhash 60.8112 43.33 100 2204 paintings blur2 wavelet 0.0117188 66.379 100 2204 paintings crop0.05 ahash 0.00390625 0.045 100 2204 paintings crop0.05 blockmean 0.0123967 0.227 100 2204 paintings crop0.05 dhash 0.210938 7.577 100 2204 paintings crop0.05 marrhildreth 0.213542 3.584 100 2204 paintings crop0.05 pdq 0.257812 8.439 100 2204 paintings crop0.05 phash 0.226562 6.76 100 2204 paintings crop0.05 shrinkhash 95.0053 2.269 100 2204 paintings crop0.05 wavelet 0.0078125 0 nan 2204 paintings gamma2 ahash 0.00390625 0.998 100 2204 paintings gamma2 blockmean 0.0072314 1.724 100 2204 paintings gamma2 dhash 0.167969 98.639 100 2204 paintings gamma2 marrhildreth 0.159722 99.41 100 2204 paintings gamma2 pdq 0.164062 100 100 2204 paintings gamma2 phash 0.164062 100 100 2204 paintings gamma2 shrinkhash 46.5296 0 nan 2204 paintings gamma2 wavelet 0.0117188 18.512 100 2204 paintings jpeg95 ahash 0.00390625 4.22 100 2204 paintings jpeg95 blockmean 0.0134298 28.811 100 2204 paintings jpeg95 dhash 0.191406 94.782 100 2204 paintings jpeg95 marrhildreth 0.168403 82.985 100 2204 paintings jpeg95 pdq 0.257812 100 100 2204 paintings jpeg95 phash 0.234375 100 100 2204 paintings jpeg95 shrinkhash 66.053 55.172 100 2204 paintings jpeg95 wavelet 0 0 nan 2204 paintings noise0.2 ahash 0.00390625 2.677 100 2204 paintings noise0.2 blockmean 0.00826446 6.987 100 2204 paintings noise0.2 dhash 0.25 93.648 100 2204 paintings noise0.2 marrhildreth 0.170139 73.911 100 2204 paintings noise0.2 pdq 0.257812 99.229 100 2204 paintings noise0.2 phash 0.257812 100 100 2204 paintings noise0.2 shrinkhash 169.387 3.312 100 2204 paintings noise0.2 wavelet 0.0078125 1.407 100 2204 paintings noop ahash 0 100 100 2204 paintings noop blockmean 0 100 100 2204 paintings noop dhash 0 100 100 2204 paintings noop marrhildreth 0 100 100 2204 paintings noop pdq 0 100 100 2204 paintings noop phash 0 100 100 2204 paintings noop shrinkhash 0 100 100 2204 paintings noop wavelet 0 100 100 2204 paintings pad0.2 ahash 0.0703125 0 nan 2204 paintings pad0.2 blockmean 0.0795455 0 nan 2204 paintings pad0.2 dhash 0.210938 1.089 100 2204 paintings pad0.2 marrhildreth 0.177083 0 nan 2204 paintings pad0.2 pdq 0.289062 1.86 100 2204 paintings pad0.2 phash 0.273438 2.541 100 2204 paintings pad0.2 shrinkhash 146.325 0.181 100 2204 paintings pad0.2 wavelet 
0.109375 0 nan 2204 paintings resize0.5 ahash 0.0078125 76.089 100 2204 paintings resize0.5 blockmean 0.0144628 98.185 100 2204 paintings resize0.5 dhash 0.0976562 100 100 2204 paintings resize0.5 marrhildreth 0.154514 99.819 100 2204 paintings resize0.5 pdq 0.1875 100 100 2204 paintings resize0.5 phash 0.09375 100 100 2204 paintings resize0.5 shrinkhash 56.9034 76.27 100 2204 paintings resize0.5 wavelet 0.0117188 84.71 100 2204 paintings rotate4 ahash 0.0390625 2.949 100 2204 paintings rotate4 blockmean 0.0382231 2.949 100 2204 paintings rotate4 dhash 0.207031 36.298 100 2204 paintings rotate4 marrhildreth 0.227431 61.978 100 2204 paintings rotate4 pdq 0.273438 56.08 100 2204 paintings rotate4 phash 0.257812 61.615 100 2204 paintings rotate4 shrinkhash 69.1737 2.813 100 2204 paintings rotate4 wavelet 0.03125 0.136 100 2204 paintings vignette ahash 0.0429688 6.171 100 2204 paintings vignette blockmean 0.0475207 8.122 100 2204 paintings vignette dhash 0.121094 32.305 100 2204 paintings vignette marrhildreth 0.177083 77.904 100 2204 paintings vignette pdq 0.132812 100 100 2204 paintings vignette phash 0.132812 100 100 2204 paintings vignette shrinkhash 102.186 3.267 100 2204 paintings vignette wavelet 0.046875 3.085 100 2204 paintings watermark ahash 0.00390625 20.054 100 2204 paintings watermark blockmean 0.0123967 45.145 100 2204 paintings watermark dhash 0.0585938 100 100 2204 paintings watermark marrhildreth 0.0625 100 100 2204 paintings watermark pdq 0.273438 98.866 100 2204 paintings watermark phash 0.28125 99.456 100 2204 paintings watermark shrinkhash 104.398 75.998 100 2204 paintings watermark wavelet 0.0117188 51.27 100 2204 photographs blur2 ahash 0.015625 76.727 100 1650 photographs blur2 blockmean 0.0330579 98 100 1650 photographs blur2 dhash 0.0859375 98.97 100 1650 photographs blur2 marrhildreth 0.107639 97.576 100 1650 photographs blur2 pdq 0.304688 100 100 1650 photographs blur2 phash 0.179688 100 100 1650 photographs blur2 shrinkhash 117.627 44 100 1650 photographs blur2 wavelet 0.0195312 79.879 100 1650 photographs crop0.05 ahash 0.0078125 0.182 100 1650 photographs crop0.05 blockmean 0.0258264 0.788 100 1650 photographs crop0.05 dhash 0.0976562 1.091 100 1650 photographs crop0.05 marrhildreth 0.173611 3.152 100 1650 photographs crop0.05 pdq 0.304688 30.606 100 1650 photographs crop0.05 phash 0.320312 63.697 100 1650 photographs crop0.05 shrinkhash 125.94 1.152 100 1650 photographs crop0.05 wavelet 0.015625 0.182 100 1650 photographs gamma2 ahash 0.015625 8.182 100 1650 photographs gamma2 blockmean 0.0268595 17.212 100 1650 photographs gamma2 dhash 0.101562 90.303 100 1650 photographs gamma2 marrhildreth 0.105903 90.909 100 1650 photographs gamma2 pdq 0.210938 100 100 1650 photographs gamma2 phash 0.234375 100 100 1650 photographs gamma2 shrinkhash 119.683 0.545 100 1650 photographs gamma2 wavelet 0.0195312 18.424 100 1650 photographs jpeg95 ahash 0.0117188 29.879 100 1650 photographs jpeg95 blockmean 0.0278926 76.788 100 1650 photographs jpeg95 dhash 0.121094 84.182 100 1650 photographs jpeg95 marrhildreth 0.104167 69.576 100 1650 photographs jpeg95 pdq 0.296875 99.879 100 1650 photographs jpeg95 phash 0.28125 99.879 100 1650 photographs jpeg95 shrinkhash 131.031 89.212 100 1650 photographs jpeg95 wavelet 0.0195312 40.242 100 1650 photographs noise0.2 ahash 0.015625 27.636 100 1650 photographs noise0.2 blockmean 0.036157 75.091 100 1650 photographs noise0.2 dhash 0.121094 54.121 100 1650 photographs noise0.2 marrhildreth 0.0989583 46.364 100 1650 photographs noise0.2 pdq 
0.296875 99.697 100 1650 photographs noise0.2 phash 0.304688 99.818 100 1650 photographs noise0.2 shrinkhash 210.661 57.576 100 1650 photographs noise0.2 wavelet 0.0234375 27.03 100 1650 photographs noop ahash 0 100 100 1650 photographs noop blockmean 0 100 100 1650 photographs noop dhash 0 100 100 1650 photographs noop marrhildreth 0 100 100 1650 photographs noop pdq 0 100 100 1650 photographs noop phash 0 100 100 1650 photographs noop shrinkhash 0 100 100 1650 photographs noop wavelet 0 100 100 1650 photographs pad0.2 ahash 0.0429688 0.061 100 1650 photographs pad0.2 blockmean 0.0320248 0 nan 1650 photographs pad0.2 dhash 0.105469 0.545 100 1650 photographs pad0.2 marrhildreth 0.177083 0.121 100 1650 photographs pad0.2 pdq 0.28125 1.455 100 1650 photographs pad0.2 phash 0.289062 3.515 100 1650 photographs pad0.2 shrinkhash 114.721 0.061 100 1650 photographs pad0.2 wavelet 0.0820312 0 nan 1650 photographs resize0.5 ahash 0.015625 87.697 100 1650 photographs resize0.5 blockmean 0.0330579 99.152 100 1650 photographs resize0.5 dhash 0.0898438 98.485 100 1650 photographs resize0.5 marrhildreth 0.111111 95.394 100 1650 photographs resize0.5 pdq 0.328125 99.818 100 1650 photographs resize0.5 phash 0.234375 100 100 1650 photographs resize0.5 shrinkhash 132.117 80.242 100 1650 photographs resize0.5 wavelet 0.0195312 88.97 100 1650 photographs rotate4 ahash 0.0273438 1.818 100 1650 photographs rotate4 blockmean 0.0371901 3.879 100 1650 photographs rotate4 dhash 0.09375 2.97 100 1650 photographs rotate4 marrhildreth 0.149306 4.606 100 1650 photographs rotate4 pdq 0.304688 73.394 100 1650 photographs rotate4 phash 0.3125 89.818 100 1650 photographs rotate4 shrinkhash 130.211 4.424 100 1650 photographs rotate4 wavelet 0.0078125 0.061 100 1650 photographs vignette ahash 0.0273438 8.242 100 1650 photographs vignette blockmean 0.0320248 10 100 1650 photographs vignette dhash 0.0703125 22 100 1650 photographs vignette marrhildreth 0.0954861 38.727 100 1650 photographs vignette pdq 0.117188 100 100 1650 photographs vignette phash 0.125 100 100 1650 photographs vignette shrinkhash 138.989 11.939 100 1650 photographs vignette wavelet 0.0195312 4.242 100 1650 photographs watermark ahash 0.015625 42.667 100 1650 photographs watermark blockmean 0.0247934 60.788 100 1650 photographs watermark dhash 0.078125 100 100 1650 photographs watermark marrhildreth 0.112847 98.727 100 1650 photographs watermark pdq 0.3125 99.818 100 1650 photographs watermark phash 0.3125 99.758 100 1650 photographs watermark shrinkhash 142.046 79.576 100 1650 photographs watermark wavelet 0.0195312 53.455 100 1650 =========== ================ ============= ============ ======== =========== ============= ================ ============= ============ ======== =========== ============= transform_name hasher_name threshold recall precision n_exemplars ================ ============= ============ ======== =========== ============= blur2 ahash 0.0078125 49.014 100 3854 blur2 blockmean 0.0123967 80.773 100 3854 blur2 dhash 0.0859375 99.196 100 3854 blur2 marrhildreth 0.107639 98.962 100 3854 blur2 pdq 0.234375 99.948 100 3854 blur2 phash 0.179688 100 100 3854 blur2 shrinkhash 60.8112 28.412 100 3854 blur2 wavelet 0.0117188 62.247 100 3854 crop0.05 ahash 0.00390625 0.052 100 3854 crop0.05 blockmean 0.0123967 0.208 100 3854 crop0.05 dhash 0.0976562 0.493 100 3854 crop0.05 marrhildreth 0.173611 1.635 100 3854 crop0.05 pdq 0.257812 9.03 100 3854 crop0.05 phash 0.226562 7.058 100 3854 crop0.05 shrinkhash 95.0053 1.427 100 3854 crop0.05 wavelet 
0.0078125 0 nan 3854 gamma2 ahash 0.00390625 0.934 100 3854 gamma2 blockmean 0.0072314 1.713 100 3854 gamma2 dhash 0.101562 90.036 100 3854 gamma2 marrhildreth 0.105903 94.24 100 3854 gamma2 pdq 0.210938 100 100 3854 gamma2 phash 0.234375 100 100 3854 gamma2 shrinkhash 108.457 0.156 100 3854 gamma2 wavelet 0.0117188 14.997 100 3854 jpeg95 ahash 0.00390625 5.319 100 3854 jpeg95 blockmean 0.0134298 32.045 100 3854 jpeg95 dhash 0.121094 74.079 100 3854 jpeg95 marrhildreth 0.104167 59.263 100 3854 jpeg95 pdq 0.257812 99.896 100 3854 jpeg95 phash 0.234375 99.896 100 3854 jpeg95 shrinkhash 66.053 40.296 100 3854 jpeg95 wavelet 0.00390625 3.71 100 3854 noise0.2 ahash 0.00390625 2.984 100 3854 noise0.2 blockmean 0.00826446 8.563 100 3854 noise0.2 dhash 0.121094 40.088 100 3854 noise0.2 marrhildreth 0.0989583 33.083 100 3854 noise0.2 pdq 0.257812 99.222 100 3854 noise0.2 phash 0.273438 99.896 100 3854 noise0.2 shrinkhash 169.387 4.385 100 3854 noise0.2 wavelet 0.0078125 1.894 100 3854 noop ahash 0 100 100 3854 noop blockmean 0 100 100 3854 noop dhash 0 100 100 3854 noop marrhildreth 0 100 100 3854 noop pdq 0 100 100 3854 noop phash 0 100 100 3854 noop shrinkhash 0 100 100 3854 noop wavelet 0 100 100 3854 pad0.2 ahash 0.0429688 0.026 100 3854 pad0.2 blockmean 0.0320248 0 nan 3854 pad0.2 dhash 0.105469 0.234 100 3854 pad0.2 marrhildreth 0.177083 0.052 100 3854 pad0.2 pdq 0.28125 1.349 100 3854 pad0.2 phash 0.273438 2.387 100 3854 pad0.2 shrinkhash 114.721 0.052 100 3854 pad0.2 wavelet 0.0820312 0 nan 3854 resize0.5 ahash 0.0078125 70.784 100 3854 resize0.5 blockmean 0.0144628 95.226 100 3854 resize0.5 dhash 0.0898438 99.299 100 3854 resize0.5 marrhildreth 0.112847 97.846 100 3854 resize0.5 pdq 0.265625 99.844 100 3854 resize0.5 phash 0.234375 100 100 3854 resize0.5 shrinkhash 56.9034 51.453 100 3854 resize0.5 wavelet 0.0117188 80.747 100 3854 rotate4 ahash 0.0273438 1.297 100 3854 rotate4 blockmean 0.0371901 3.036 100 3854 rotate4 dhash 0.09375 1.401 100 3854 rotate4 marrhildreth 0.149306 3.762 100 3854 rotate4 pdq 0.273438 54.489 100 3854 rotate4 phash 0.257812 59.626 100 3854 rotate4 shrinkhash 69.1737 1.894 100 3854 rotate4 wavelet 0.0078125 0.026 100 3854 vignette ahash 0.0273438 4.67 100 3854 vignette blockmean 0.0320248 6.098 100 3854 vignette dhash 0.0703125 12.195 100 3854 vignette marrhildreth 0.0954861 30.54 100 3854 vignette pdq 0.132812 100 100 3854 vignette phash 0.132812 100 100 3854 vignette shrinkhash 103.005 4.541 100 3854 vignette wavelet 0.0195312 1.946 100 3854 watermark ahash 0.00390625 18.5 100 3854 watermark blockmean 0.0123967 41.593 100 3854 watermark dhash 0.078125 100 100 3854 watermark marrhildreth 0.112847 99.455 100 3854 watermark pdq 0.273438 99.014 100 3854 watermark phash 0.28125 99.377 100 3854 watermark shrinkhash 104.398 71.199 100 3854 watermark wavelet 0.0117188 46.912 100 3854 ================ ============= ============ ======== =========== ============= ============= =========== ======== =========== ============= hasher_name threshold recall precision n_exemplars ============= =========== ======== =========== ============= ahash 0.00390625 17.578 100 42394 blockmean 0.00826446 27.714 100 42394 dhash 0.0859375 51.981 99.9952 42394 marrhildreth 0.100694 55.942 99.9957 42394 pdq 0.257812 77.181 99.9969 42394 phash 0.273438 81.967 99.9942 42394 shrinkhash 56.9034 22.378 100 42394 wavelet 0.00390625 18.467 100 42394 ============= =========== ======== =========== ============= Video Hashing ============= The below example does the following: - Download a benchmarking 
dataset. Here we use the Charades dataset, which contains over 9,000 videos. - Load the dataset. - Transform the dataset to generate synthetically altered videos. Our hashers are responsible for matching the altered videos with the originals. - Define some hashers we want to evaluate. - Compute all the hashes. - Report metrics for each video category / hasher / transformation combination to see how well our hashers can match the altered videos to the original ("no-op" videos). .. code-block:: python import os import zipfile import urllib.request import pandas as pd import perception.benchmarking import perception.hashers if not os.path.isdir('Charades_v1_480'): # Download the dataset since it appears we do not have it. Note that # these are large files (> 13GB). urllib.request.urlretrieve( url='http://ai2-website.s3.amazonaws.com/data/Charades_v1_480.zip', filename='Charades_v1_480.zip' ) with zipfile.ZipFile('Charades_v1_480.zip') as zfile: zfile.extractall('.') urllib.request.urlretrieve( url='http://ai2-website.s3.amazonaws.com/data/Charades.zip', filename='Charades.zip' ) with zipfile.ZipFile('Charades.zip') as zfile: zfile.extractall('.') # These are files that we've identified as having identical subsequences, typically # when a person is out of frame and the backgrounds are the same. duplicates = [ ('0HVVN.mp4', 'UZRQD.mp4'), ('ZIOET.mp4', 'YGXX6.mp4'), ('82XPD.mp4', 'E7QDZ.mp4'), ('FQDS1.mp4', 'AIOTI.mp4'), ('PBV4T.mp4', 'XXYWL.mp4'), ('M0P0H.mp4', 'STY6W.mp4'), ('3Q92U.mp4', 'GHPO3.mp4'), ('NFIQM.mp4', 'I2DHG.mp4'), ('PIRMO.mp4', '0GFE8.mp4'), ('LRPBA.mp4', '9VK0J.mp4'), ('UI0QG.mp4', 'FHXKQ.mp4'), ('Y05U8.mp4', '4RVZB.mp4'), ('J6TVB.mp4', '2ZBL5.mp4'), ('A8T8V.mp4', 'IGOQK.mp4'), ('H8QM1.mp4', 'QYMWC.mp4'), ('O45BC.mp4', 'ZS7X6.mp4'), ('NOP6W.mp4', 'F7KFE.mp4'), ('4MPPQ.mp4', 'A3M94.mp4'), ('L8FFR.mp4', 'M8MP0.mp4'), ('EHYXP.mp4', 'O8PO3.mp4'), ('MGBLJ.mp4', 'RIEG6.mp4'), ('53FPM.mp4', 'BLFEV.mp4'), ('UIIF3.mp4', 'TKEKQ.mp4'), ('GVX7E.mp4', '7GPSY.mp4'), ('T7HZB.mp4', '6KGZA.mp4'), ('65M4K.mp4', 'UDGP2.mp4'), ('6SS4H.mp4', 'CK6OL.mp4'), ('OVHFT.mp4', 'GG1X2.mp4'), ('VEHER.mp4', 'XBPEJ.mp4'), ('WN38A.mp4', '2QI8F.mp4'), ('UMXKN.mp4', 'EOKJ0.mp4'), ('OSIKP.mp4', 'WT2C0.mp4'), ('H5V2Y.mp4', 'ZXN6A.mp4'), ('XS6PF.mp4', '1WJ6O.mp4'), ('S2XJW.mp4', 'YH0BX.mp4'), ('UO607.mp4', 'Z5JZD.mp4'), ('XN64E.mp4', 'CSRZM.mp4'), ('YXI7M.mp4', 'IKQLJ.mp4'), ('1B9C8.mp4', '004QE.mp4'), ('V1SQH.mp4', '48WOM.mp4'), ('107YZ.mp4', 'I049A.mp4'), ('3S6WL.mp4', 'SC5YW.mp4'), ('OY50Q.mp4', '5T607.mp4'), ('XKH7W.mp4', '028CE.mp4'), ('X8XQE.mp4', 'J0VXY.mp4'), ('STB0G.mp4', 'J0VXY.mp4'), ('UNXLF.mp4', 'J0VXY.mp4'), ('56PK0.mp4', 'M1TZR.mp4'), ('FVITB.mp4', 'R0M34.mp4'), ('BPZE3.mp4', 'R0M34.mp4'), ('VS7DA.mp4', '1X0M3.mp4'), ('I7MEA.mp4', 'YMM1Z.mp4'), ('9N76L.mp4', '0LDP7.mp4'), ('AXS82.mp4', 'W8WRK.mp4'), ('8TSU4.mp4', 'MXATD.mp4'), ('80FWF.mp4', '18HFG.mp4'), ('RO3A2.mp4', 'V4HY4.mp4'), ('HU409.mp4', 'BDWIX.mp4'), ('3YY88.mp4', 'EHHRS.mp4'), ('65RS3.mp4', 'SLIH4.mp4'), ('LR0L8.mp4', 'Y665P.mp4'), ('DVPL2.mp4', 'EI5M3.mp4'), ('0EGNU.mp4', 'CU3JE.mp4'), ('94KP4.mp4', '94KP4.mp4'), ('79QDP.mp4', '79QDP.mp4'), ('GKBX9.mp4', 'GKBX9.mp4'), ('RX6R8.mp4', 'RX6R8.mp4'), ('PMVT7.mp4', 'PMVT7.mp4'), ('XNXW6.mp4', 'XNXW6.mp4'), ('I005F.mp4', 'I005F.mp4'), ('TF95Y.mp4', 'TF95Y.mp4'), ('79QDP.mp4', '79QDP.mp4'), ('LQGMM.mp4', 'LQGMM.mp4'), ('QCAUL.mp4', 'QCAUL.mp4'), ('GFVSV.mp4', 'GFVSV.mp4'), ('4UYGY.mp4', '4UYGY.mp4'), ('BYDSE.mp4', 'BYDSE.mp4'), ('PV3KQ.mp4', 'PV3KQ.mp4'), ('1X0M3.mp4', '1X0M3.mp4'), ('T5FHD.mp4', 'T5FHD.mp4'),
'T5FHD.mp4'), ('QRHJJ.mp4', 'QRHJJ.mp4'), ('JYBGS.mp4', 'JYBGS.mp4'), ('N2XCF.mp4', 'N2XCF.mp4'), ('OZPA9.mp4', 'OZPA9.mp4'), ('297S4.mp4', '297S4.mp4'), ('LHU7D.mp4', 'LHU7D.mp4'), ('TSKZL.mp4', 'TSKZL.mp4'), ('BCONW.mp4', 'BCONW.mp4'), ('KBPDM.mp4', 'KBPDM.mp4'), ('7FTBS.mp4', '7FTBS.mp4'), ('099Y1.mp4', '099Y1.mp4'), ('S2RIQ.mp4', 'S2RIQ.mp4'), ('22FJU.mp4', '22FJU.mp4'), ('99UA6.mp4', '99UA6.mp4'), ('WJ13E.mp4', 'WJ13E.mp4'), ('5OLVC.mp4', '5OLVC.mp4'), ('YQ6Z6.mp4', 'YQ6Z6.mp4'), ('T5MLJ.mp4', 'T5MLJ.mp4'), ('0VOQC.mp4', '0VOQC.mp4'), ('S2RIQ.mp4', 'S2RIQ.mp4'), ('2VNXF.mp4', '2VNXF.mp4'), ('G87XG.mp4', 'G87XG.mp4'), ('RRS54.mp4', 'RRS54.mp4'), ('TXJK7.mp4', 'TXJK7.mp4'), ('G4KE3.mp4', 'G4KE3.mp4'), ('3SNSC.mp4', '3SNSC.mp4'), ('U2FA5.mp4', 'U2FA5.mp4'), ('9AFQ7.mp4', '9AFQ7.mp4') ] blacklist = [fp1 for fp1, fp2 in duplicates] df = pd.concat([pd.read_csv('Charades/Charades_v1_test.csv'), pd.read_csv('Charades/Charades_v1_train.csv')]) df = df[~(df['id'] + '.mp4').isin(blacklist)] df['filepath'] = df['id'].apply(lambda video_id: os.path.join('Charades_v1_480', video_id + '.mp4')) assert df['filepath'].apply(os.path.isfile).all(), 'Some video files are missing.' dataset = perception.benchmarking.BenchmarkVideoDataset.from_tuples( files=df[['filepath', 'scene']].itertuples(index=False) ) if not os.path.isdir('benchmarking_videos'): # We haven't computed the transforms yet, so we do that # now. Below, we create the following files for each of # the videos in our dataset. Note that the only required # transform is `noop` (see documentation for # perception.benchmarking.BenchmarkVideoDataset.transform). # # noop: This is the base video we'll actually use in benchmarking, rather # than using the raw video. It is the same as the raw video but downsampled # to a size that is reasonable for hashing (240p). This is because all # of our hashers downsample to a size smaller than this anyway, so there # is no benefit to a higher resolution. Also, we limit the length to the # first five minutes of the video, which speeds everything up significantly. # shrink: Shrink the noop video down to 70% of its original size. # clip0.2: Clip the first 20% and last 20% of the noop video off. # slideshow: Create a slideshow version of the video that grabs frames periodically # from the original. # black_frames: Add black frames at the start and end of the video. # gif: Create a GIF from the video (similar to slideshow but with re-encoding) # black_padding: Add black bars to the top and bottom of the video.
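# As a worked example of the noop scale expressions below (hypothetical
# 640x480 input): min(240/max(640, 480), 1) = 0.375, so
# width = ceil(0.375*640/2)*2 = 240 and height = ceil(0.375*480/2)*2 = 180.
# Inputs already smaller than 240px are left unscaled because the scale
# factor is capped at 1.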
pad_width = 240 pad_height = 320 transforms = { 'noop': perception.benchmarking.video_transforms.get_simple_transform( width='ceil(min(240/max(iw, ih), 1)*iw/2)*2', height='ceil(min(240/max(iw, ih), 1)*ih/2)*2', codec='h264', output_ext='.m4v', sar='1/1', clip_s=(None, 60*5) ), 'shrink': perception.benchmarking.video_transforms.get_simple_transform( width='ceil(0.7*iw/2)*2', height='ceil(0.7*ih/2)*2' ), 'clip0.2': perception.benchmarking.video_transforms.get_simple_transform(clip_pct=(0.2, 0.8)), 'slideshow': perception.benchmarking.video_transforms.get_slideshow_transform( frame_input_rate=1/2.5, frame_output_rate=0.5, max_frames=10, offset=1.3), 'black_frames': perception.benchmarking.video_transforms.get_black_frame_padding_transform(0.5, 0.05), 'gif': perception.benchmarking.video_transforms.get_simple_transform( output_ext='.gif', codec='gif', clip_s=(1.2, 10.2), fps=1/2.5 ), 'black_padding': perception.benchmarking.video_transforms.get_simple_transform( width=f'(iw*sar)*min({pad_width}/(iw*sar),{pad_height}/ih)', height=f'ih*min({pad_width}/(iw*sar),{pad_height}/ih)', pad=f'{pad_width}:{pad_height}:({pad_width}-iw*min({pad_width}/iw,{pad_height}/ih))/2:({pad_height}-ih*min({pad_width}/iw,{pad_height}/ih))/2' ) } # Save the transforms for later. transformed = dataset.transform(transforms=transforms, storage_dir='benchmarking_videos') transformed = perception.benchmarking.BenchmarkVideoTransforms.load('benchmarking_videos', verify_md5=False) phashu8 = perception.hashers.PHashU8(exclude_first_term=False, freq_shift=1, hash_size=12) hashers = { 'phashu8_framewise': perception.hashers.FramewiseHasher( frames_per_second=1, frame_hasher=phashu8, interframe_threshold=50, quality_threshold=90), 'phashu8_tmkl1': perception.hashers.TMKL1( frames_per_second=5, frame_hasher=phashu8, distance_metric='euclidean', dtype='uint8', norm=None, quality_threshold=90) } if not os.path.isfile('hashes.csv'): # We haven't computed the hashes, so we do that now. hashes = transformed.compute_hashes(hashers=hashers, max_workers=5) # Save the hashes for later. It took a long time after all!
hashes.save('hashes.csv') hashes = perception.benchmarking.BenchmarkHashes.load('hashes.csv') hashes.compute_threshold_recall(precision_threshold=99.9, grouping=['transform_name']) ================ ================= =========== ======== =========== ============= transform_name hasher_name threshold recall precision n_exemplars ================ ================= =========== ======== =========== ============= black_frames phashu8_framewise 51.0979 88.12 99.9069 278644 black_frames phashu8_tmkl1 55.7584 99.918 99.9079 403768 black_padding phashu8_framewise 74.6391 7.662 100 277399 black_padding phashu8_tmkl1 53.8702 99.898 99.9079 406899 clip0.2 phashu8_framewise 54.8635 90.741 99.9098 224264 clip0.2 phashu8_tmkl1 59.0424 99.724 99.9077 324251 gif phashu8_framewise 55.4437 68.21 99.9088 82232 gif phashu8_tmkl1 55.4887 81.029 99.9103 39757 noop phashu8_framewise 0 100 100 282658 noop phashu8_tmkl1 0 100 100 408871 shrink phashu8_framewise 24.7184 100 100 281731 shrink phashu8_tmkl1 49.8999 99.836 99.9078 400650 slideshow phashu8_framewise 56.9825 99.713 99.9076 172829 slideshow phashu8_tmkl1 56.8683 95.934 99.9035 90684 ================ ================= =========== ======== =========== ============= ================================================ FILE: docs/examples/deduplication.rst ================================================ Media Deduplication ******************* Perceptual hashes can be used to deduplicate sets of images. Below we provide two examples (one simple, one larger scale). **For most use cases, we recommend using PHash with** :code:`hash_size=16` **and with 0.2 as the distance threshold as in the example below.** You may wish to adjust this threshold up or down based on your tolerance for false negatives / positives. In practice, deduplicating in memory on your machine by the methods below may be impractical. For larger-scale applications, you may wish to use tools like `FAISS `_, `Annoy `_, or databases with functionality for querying based on distance such as `MemSQL `_. For the supported hashers, below are our recommended thresholds with expected false positive rates of <1%. ====================== =========== hasher threshold ====================== =========== ahash (hash_size=16) 0.008 blockmean 0.008 dhash (hash_size=16) 0.07 marrhildreth 0.1 pdq 0.2 phash (hash_size=16) 0.2 wavelet (hash_size=16) 0.02 ====================== =========== Simple example ============== In this example, we download a ZIP file containing 18 images. One of the images is duplicated twice and another image is duplicated once. .. code-block:: python import os import glob import zipfile import urllib.request import tabulate import pandas as pd from perception import tools, hashers urllib.request.urlretrieve( "https://thorn-perception.s3.amazonaws.com/thorn-perceptual-deduplication-example.zip", "thorn-perceptual-deduplication-example.zip" ) with zipfile.ZipFile('thorn-perceptual-deduplication-example.zip') as f: f.extractall('.') filepaths = glob.glob('thorn-perceptual-deduplication-example/*.jpg') duplicate_pairs = tools.deduplicate(files=filepaths, hashers=[(hashers.PHash(hash_size=16), 0.2)]) print(tabulate.tabulate(pd.DataFrame(duplicate_pairs), showindex=False, headers=['file1', 'file2'], tablefmt='rst')) # Now we can do whatever we want with the duplicates. We could just delete # the first entry in each pair or manually verify the pairs to ensure they # are, in fact duplicates. 
=============================================== =============================================== file1 file2 =============================================== =============================================== thorn-perceptual-deduplication-example/309b.jpg thorn-perceptual-deduplication-example/309.jpg thorn-perceptual-deduplication-example/309b.jpg thorn-perceptual-deduplication-example/309a.jpg thorn-perceptual-deduplication-example/309a.jpg thorn-perceptual-deduplication-example/309.jpg thorn-perceptual-deduplication-example/315a.jpg thorn-perceptual-deduplication-example/315.jpg =============================================== =============================================== Real-world example ================== In the example below, we use the `Caltech 256 Categories `_ dataset. Like most other public image datasets, it contains a handful of duplicates in some categories. The code below will: 1. Download the dataset 2. Group all the filepaths by category (the dataset is provided in folders) 3. Within each group, find duplicates using PHash. We will compare not just the original images, but also the 8 isometric transformations for each image. .. code-block:: python import os import tarfile from glob import glob import urllib.request import tqdm from perception import hashers, tools urllib.request.urlretrieve( "http://www.vision.caltech.edu/Image_Datasets/Caltech256/256_ObjectCategories.tar", "256_ObjectCategories.tar" ) with tarfile.open('256_ObjectCategories.tar') as tfile: tfile.extractall() files = glob('256_ObjectCategories/**/*.jpg') # To reduce the number of pairwise comparisons, # we can deduplicate within each image category # (i.e., we don't need to compare images of # butterflies with images of chess boards). filepath_group = [ ( filepath, os.path.normpath(filepath).split(os.sep)[-2] ) for filepath in files ] groups = list(set([group for _, group in filepath_group])) # We consider any pair of images with a PHash distance of < 0.2 # as a duplicate. comparison_hashers = [(hashers.PHash(hash_size=16), 0.2)] duplicate_pairs = [] for current_group in groups: current_filepaths = [ filepath for filepath, group in filepath_group if group == current_group ] current_duplicate_pairs = tools.deduplicate( files=current_filepaths, hashers=comparison_hashers, isometric=True, progress=tqdm.tqdm ) duplicate_pairs.extend(current_duplicate_pairs) # Now we can do whatever we want with the duplicates. We could just delete # the first entry in each pair or manually verify the pairs to ensure they # are, in fact, duplicates. Video deduplication =================== Video deduplication requires more thought depending on your tolerance for false positives and how important temporal relationships are. Below is one example approach for deduplicating a group of videos by taking frames from each video that are sufficiently different from each other (to avoid keeping too many) and then using them all to find pairs of videos that have matching frames. .. code-block:: python import urllib.request import zipfile import glob import tqdm import perception.hashers import perception.tools # Download some example videos.
urllib.request.urlretrieve( "https://thorn-perception.s3.amazonaws.com/thorn-perceptual-video-deduplication-example.zip", "thorn-perceptual-video-deduplication-example.zip" ) with zipfile.ZipFile('thorn-perceptual-video-deduplication-example.zip') as f: f.extractall('.') frame_hasher = perception.hashers.PHash(hash_size=16) hasher = perception.hashers.FramewiseHasher(frames_per_second=1, frame_hasher=frame_hasher, interframe_threshold=50, quality_threshold=90) # Set a threshold for matching frames within videos and across videos. filepaths = glob.glob('thorn-perceptual-video-deduplication-example/*.m4v') + \ glob.glob('thorn-perceptual-video-deduplication-example/*.gif') # Returns a list of dicts with a "filepath" and "hash" key. "hash" contains a # list of hashes. hashes = hasher.compute_parallel(filepaths=filepaths, progress=tqdm.tqdm) # Flatten the hashes into a list of (filepath, hash) tuples. hashes_flattened = perception.tools.flatten([ [(hash_group['filepath'], hash_string) for hash_string in hash_group['hash']] for hash_group in hashes ]) duplicates = perception.tools.deduplicate_hashes( hashes=hashes_flattened, threshold=50, hasher=hasher ) ================================================ FILE: docs/examples/detecting_csam.rst ================================================ Detecting Child Sexual Abuse Material ************************************* Using `perception` and a subscription to Thorn's Safer service, you can easily check for child sexual abuse material against a database of known bad content **without** having to send any images to a third party. You do this by sending compact, irreversible image hashes to get matches with a high degree of precision. We support matching using 16x16 PHash hashes and md5 hashes. See usage example below. Please contact info@getsafer.io to discuss Thorn's Safer service and subscription options and visit `getsafer.io `_ to learn more. .. code-block:: python from perception import tools matcher = tools.SaferMatcher( api_key='YOUR_API_KEY', url='MATCHING_SERVICE_URL' ) matches = matcher.match(['myfile.jpg']) In some cases, you may have a username/password instead of an API key, in which case you can pass those instead (see API documentation for details). ================================================ FILE: docs/examples/index.rst ================================================ Examples ******** .. toctree:: :maxdepth: 2 :caption: Contents: deduplication detecting_csam benchmarking ================================================ FILE: docs/index.rst ================================================ perception ========== :code:`perception` provides flexible, well-documented, and comprehensively tested tooling for perceptual hashing research, development, and production use. It provides a common wrapper around existing, popular perceptual hashes (such as those implemented by `ImageHash `_) along with tools to compare their performance and use them for common tasks. Perceptual hashes are used to create compact image "fingerprints" which are invariant to small alterations to the original image. Typically, the representations are compact enough that they are irreversible, which makes them useful for deduplication and detecting abusive content while preserving the privacy of content owners. Installation ************ You can install :code:`perception` using pip. You must install OpenCV separately (e.g., with :code:`pip install opencv-python`). ..
code-block:: bash # Install from PyPI pip install perception # Install from GitHub pip install git+https://github.com/thorn-oss/perception.git#egg=perception To install with the necessary dependencies for benchmarking, use: .. code-block:: bash # Install from PyPI pip install perception[benchmarking] # Install from GitHub pip install opencv-python git+https://github.com/thorn-oss/perception.git#egg=perception[benchmarking] Getting Started *************** Please see the examples for code snippets for common use cases. .. toctree:: :maxdepth: 2 :caption: Contents: examples/index api/index ================================================ FILE: docs/requirements.txt ================================================ sphinx-autodoc-typehints==3.2.0 # sphinx-autobuild==3.0.2 # sphinx==1.8.3 sphinx_rtd_theme==3.0.2 m2r==0.3.1 opencv-contrib-python-headless tqdm albumentations ffmpeg-python typing-extensions faiss-cpu aiohttp python-json-logger networkit ================================================ FILE: perception/__init__.py ================================================ from importlib import metadata __version__ = metadata.version("perception") ================================================ FILE: perception/approximate_deduplication/__init__.py ================================================ import logging import math import os.path as op import typing import faiss import numpy as np import tqdm import typing_extensions from ._graph_backend import get_graph_backend LOGGER = logging.getLogger(__name__) DEFAULT_PCT_PROBE = 0 # For faiss training on datasets larger than 50,000 vectors, we take a random sub-sample. TRAIN_LARGE_SIZE: int = 50_000 class ClusterAssignment(typing_extensions.TypedDict): cluster: int id: typing.Any def build_index( X: np.ndarray, pct_probe: float = DEFAULT_PCT_PROBE, approximate: bool = True, use_gpu: bool = True, ): """Build a FAISS index from an array of vectors. Args: X: The vectors to add to the index. pct_probe: The minimum fraction of nearest lists to search. If the product of pct_probe and the number of lists is less than 1, one list will be searched. approximate: Whether to build an approximate or exact index. use_gpu: Whether to attempt building the index on a GPU, falling back to the CPU if GPU support is unavailable. Returns: The FAISS index, or None if X is None. """ if X is None: return None X = X.astype("float32") d = X.shape[1] if approximate: ntotal = X.shape[0] nlist = int(max(min(4 * np.sqrt(ntotal), ntotal / 39), 1)) quantizer = faiss.IndexFlatL2(d) index = faiss.IndexIVFFlat(quantizer, d, nlist) gpu = False if use_gpu: try: res = faiss.StandardGpuResources() index = faiss.index_cpu_to_gpu(res, 0, index) gpu = True except AttributeError: LOGGER.info("Building approximate FAISS index on CPU.") if X.shape[0] > TRAIN_LARGE_SIZE: # Take random sample of 50,000 or 39 points per centroid. # 39 points per centroid is the minimum for not getting warnings. # https://github.com/facebookresearch/faiss/wiki/FAQ#can-i-ignore-warning-clustering-xxx-points-to-yyy-centroids
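# Worked example (hypothetical): for ntotal = 1_000_000 vectors,
# nlist = int(max(min(4*sqrt(1e6), 1e6/39), 1)) = 4000, so the training
# sample below is max(39 * 4000, 50_000) = 156_000 vectors.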
sample_size = max(39 * nlist, TRAIN_LARGE_SIZE) index.train(X[np.random.choice(X.shape[0], sample_size, replace=False)]) else: index.train(X) batch_size = 10_000 for i in range(0, X.shape[0], batch_size): index.add(X[i : i + batch_size]) if gpu: index = faiss.index_gpu_to_cpu(index) nprobe = max(math.ceil(pct_probe * nlist), 1) faiss.ParameterSpace().set_index_parameter(index, "nprobe", nprobe) else: index = faiss.IndexFlat(d) index.add(X) return index def compute_euclidean_pairwise_duplicates_approx( X, counts, threshold, minimum_overlap, Y=None, y_counts=None, pct_probe=0.1, use_gpu: bool = True, faiss_cache_path: str | None = None, show_progress: bool = False, ): """Provides the same result as perception.extensions.compute_pairwise_duplicates_simple but uses an approximate search instead of an exhaustive search, which can dramatically reduce processing time. Args: X: An array of vectors to compute pairs for. Y: If provided, we search in X for the Y vectors. counts: A list of counts of vectors for separate files in the vectors (should add up to the length of X). y_counts: Counts for the Y vectors, analogous to counts for X. threshold: The threshold for a match as a euclidean distance. minimum_overlap: The minimum overlap between two files to qualify as a match. pct_probe: The minimum percentage of sublists to search for matches. The larger the value, the more exhaustive the search. use_gpu: Whether to attempt to build the index on a GPU. faiss_cache_path: If provided load any existing faiss index from this path, and if it does not exist then save the generated faiss index to the path. show_progress: Whether or not to show a progress bar while computing pairs Returns: A list of pairs of matching file indexes. """ assert ( counts.sum() == X.shape[0] ), "Length of counts incompatible with vectors shape." assert (Y is None) == ( y_counts is None ), "Must provide both or neither for y, y_counts." if X.dtype != "float32": # Only make the copy if we have to. X = X.astype("float32") if Y is not None and Y.dtype != "float32": # Only make the copy if we have to. Y = Y.astype("float32") lookup_ = [] for idx, count in enumerate(counts): lookup_.extend([idx] * count) lookup = np.array(lookup_) if faiss_cache_path is not None and op.exists(faiss_cache_path): LOGGER.debug("Loading cached FAISS index from %s", faiss_cache_path) index = faiss.read_index(faiss_cache_path) assert ( X.shape[0] == index.ntotal ), "Cached FAISS index does not match provided X." else: LOGGER.debug("Building FAISS index.") index = build_index(X=X, pct_probe=pct_probe, approximate=True, use_gpu=use_gpu) if faiss_cache_path is not None: faiss.write_index(index, faiss_cache_path) LOGGER.debug("FAISS index ready, starting approximate search.") pairs = [] # Only use y_counts if present. if y_counts is None: iterator_counts = counts M = X else: iterator_counts = y_counts M = Y for end, length, query in tqdm.tqdm( zip(iterator_counts.cumsum(), iterator_counts, range(len(iterator_counts))), total=len(iterator_counts), disable=not show_progress, desc="Vectors", ): if length == 0: continue Xq = M[end - length : end] lims, _, idxs = index.range_search(Xq, threshold**2) lims = lims.astype("int32") matched = [ match for match in np.unique(lookup[list(set(idxs))]) # type: ignore if match != query or Y is not None # Protect self matches if Y is not present.
] query_in_match: typing.Mapping[int, set] = {m: set() for m in matched} match_in_query: typing.Mapping[int, set] = {m: set() for m in matched} for query_idx in range(length): for match_idx in idxs[lims[query_idx] : lims[query_idx + 1]]: match = lookup[match_idx] if ( match == query and Y is None ): # Protect self matches if Y is not present. continue match_in_query[match].add(match_idx) query_in_match[match].add(query_idx) for match in matched: overlap = min( [ len(query_in_match[match]) / length, len(match_in_query[match]) / counts[match], ] ) if overlap >= minimum_overlap and overlap > 0: if Y is None: pairs.append(tuple(sorted([query, match]))) else: pairs.append(tuple([query, match])) return list(set(pairs)) def pairs_to_clusters( ids: typing.Iterable[str], pairs: typing.Iterable[tuple[str, str]], strictness: typing_extensions.Literal[ "clique", "community", "component" ] = "clique", max_clique_batch_size: int = 1000, ) -> list[ClusterAssignment]: """Given a list of pairs of matching files, compute sets of cliques where all files in a clique are connected. Args: ids: A list of node ids (e.g., filepaths). pairs: A list of pairs of node ids, each pair is assumed to have an edge strictness: The level at which groups will be clustered. "component" means that all clusters will be connected components. "community" will select clusters of files within components that are clustered together. "clique" will result in clusters where every file is connected to every other file. max_clique_batch_size: The maximum batch size for identifying cliques. Returns: A list of cluster assignments (dicts with id and cluster entries). """ assert strictness in ["component", "community", "clique"], "Invalid strictness." list_ids = list(ids) id_to_node_map = {v: i for i, v in enumerate(list_ids)} node_to_id_map = {v: k for k, v in id_to_node_map.items()} LOGGER.debug("Building graph.") node_pairs = {(id_to_node_map[pair[0]], id_to_node_map[pair[1]]) for pair in pairs} backend = get_graph_backend() graph = backend.build_graph(len(list_ids), node_pairs) assignments: list[ClusterAssignment] = [] cluster_index = 0 components = backend.connected_components(graph) for component in components: LOGGER.debug("Got component with size: %s", len(component)) if strictness == "component": assignments.extend( [{"id": node_to_id_map[n], "cluster": cluster_index} for n in component] ) cluster_index += 1 continue communities = backend.communities(graph, component) for community_members in communities: LOGGER.debug("Got community with size: %s", len(community_members)) if strictness == "community": assignments.extend( [ {"id": node_to_id_map[n], "cluster": cluster_index} for n in community_members ] ) cluster_index += 1 continue for clique_members in backend.maximal_cliques( graph, community_members, max_clique_batch_size=max_clique_batch_size, ): assignments.extend( [ { "id": node_to_id_map[n], "cluster": cluster_index, } for n in clique_members ] ) cluster_index += 1 return assignments ================================================ FILE: perception/approximate_deduplication/_graph_backend.py ================================================ import sys import typing from abc import ABC, abstractmethod class GraphBackend(ABC): @abstractmethod def build_graph( self, node_count: int, edges: typing.Iterable[tuple[int, int]] ) -> typing.Any: ... @abstractmethod def connected_components(self, graph: typing.Any) -> list[list[int]]: ... 
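# Concrete backends supply the three graph operations that pairs_to_clusters
# relies on: connected components, community detection, and maximal-clique
# extraction.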
@abstractmethod def communities( self, graph: typing.Any, component: list[int] ) -> list[list[int]]: ... @abstractmethod def maximal_cliques( self, graph: typing.Any, community_nodes: list[int], max_clique_batch_size: int, ) -> list[list[int]]: ... class NetworkitGraphBackend(GraphBackend): def __init__(self): import networkit as nk self.nk = nk def build_graph( self, node_count: int, edges: typing.Iterable[tuple[int, int]] ) -> typing.Any: graph = self.nk.Graph(node_count) for start, end in edges: graph.addEdge(start, end) return graph def connected_components(self, graph: typing.Any) -> list[list[int]]: cc_query = self.nk.components.ConnectedComponents(graph) cc_query.run() return cc_query.getComponents() def communities(self, graph: typing.Any, component: list[int]) -> list[list[int]]: component_node_map = dict(enumerate(component)) subgraph = self.nk.graphtools.subgraphFromNodes(graph, component, compact=True) algo = self.nk.community.PLP(subgraph, maxIterations=32) algo.run() communities = algo.getPartition() return [ [component_node_map[node] for node in communities.getMembers(community)] for community in communities.subsetSizeMap().keys() ] def maximal_cliques( self, graph: typing.Any, community_nodes: list[int], max_clique_batch_size: int, ) -> list[list[int]]: cliques: list[list[int]] = [] for start in range(0, len(community_nodes), max_clique_batch_size): batch_nodes = community_nodes[start : start + max_clique_batch_size] community_node_map = dict(enumerate(batch_nodes)) subgraph = self.nk.graphtools.subgraphFromNodes( graph, batch_nodes, compact=True ) while subgraph.numberOfNodes() > 0: clique = self.nk.clique.MaximalCliques(subgraph, maximumOnly=True) clique.run() clique_members = clique.getCliques()[0] cliques.append([community_node_map[node] for node in clique_members]) for node in clique_members: subgraph.removeNode(node) return cliques class NetworkxGraphBackend(GraphBackend): def __init__(self): import networkx as nx self.nx = nx def build_graph( self, node_count: int, edges: typing.Iterable[tuple[int, int]] ) -> typing.Any: graph = self.nx.Graph() graph.add_nodes_from(range(node_count)) graph.add_edges_from(edges) return graph def connected_components(self, graph: typing.Any) -> list[list[int]]: return [list(component) for component in self.nx.connected_components(graph)] def communities(self, graph: typing.Any, component: list[int]) -> list[list[int]]: subgraph = graph.subgraph(component) return [ list(community) for community in self.nx.algorithms.community.asyn_lpa_communities( subgraph, seed=0 ) ] def maximal_cliques( self, graph: typing.Any, community_nodes: list[int], max_clique_batch_size: int, ) -> list[list[int]]: cliques: list[list[int]] = [] for start in range(0, len(community_nodes), max_clique_batch_size): batch_nodes = community_nodes[start : start + max_clique_batch_size] subgraph = graph.subgraph(batch_nodes).copy() while subgraph.number_of_nodes() > 0: clique_members = max( self.nx.find_cliques(subgraph), key=lambda clique: ( len(clique), tuple(sorted(clique)), ), ) cliques.append(list(clique_members)) subgraph.remove_nodes_from(clique_members) return cliques def get_graph_backend() -> GraphBackend: if sys.platform == "darwin": return NetworkxGraphBackend() return NetworkitGraphBackend() ================================================ FILE: perception/approximate_deduplication/debug.py ================================================ import logging import random import cv2 import numpy as np import perception.local_descriptor_deduplication as ldd 
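# Usage sketch (hypothetical filepaths; assumes `df` is a reference dataframe
# from the local descriptor deduplication pipeline, indexed by filepath with
# one row per image):
#
#   img = vizualize_pair(df.loc["a.jpg"], df.loc["b.jpg"], ratio=0.5)
#   cv2.imwrite("pair.png", img)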
LOGGER = logging.getLogger(__name__) # Set a fixed size for drawing, we don't have the real descriptor size. KEYPOINT_SIZE: int = 8 def vizualize_pair( features_1, features_2, ratio: float, match_metadata=None, local_path_col: str | None = None, sanitized: bool = False, include_all_points=False, circle_size=KEYPOINT_SIZE, ): """Given two rows from a reference df vizualize their overlap. Currently recalcs overlap using cv2 default logic. Args: features_1: The row from a reference df for one image. features_2: The row from a reference df for the other image. ratio: Value for ratio test, suggest re-using value from matching. match_metadata: metadata returned from matching, if None will redo brute force matching. local_path_col: column in df with path to the image. If None will use the index: features_1.name and features_2.name sanitized: if True images themselves will not be rendered, only the points. include_all_points: if True will draw all points, not just matched points. circle_size: size of the circle to draw around keypoints. Returns: An image of the two images concatted together and matching keypoints drawn. """ # Set a fixed size for drawing, we don't have the real descriptor size. if local_path_col is not None: features_1_path = features_1[local_path_col] features_2_path = features_2[local_path_col] else: features_1_path = features_1.name features_2_path = features_2.name img1 = np.zeros( (features_1.dimensions[1], features_1.dimensions[0], 1), dtype="uint8" ) img2 = np.zeros( (features_2.dimensions[1], features_2.dimensions[0], 1), dtype="uint8" ) if not sanitized: try: img1 = ldd.load_and_preprocess( features_1_path, max_size=max(features_1.dimensions), grayscale=False ) except Exception: LOGGER.warning("Failed to load image %s", features_1_path) try: img2 = ldd.load_and_preprocess( features_2_path, max_size=max(features_2.dimensions), grayscale=False ) except Exception: LOGGER.warning("Failed to load image %s", features_2_path) if match_metadata is not None: img_matched = viz_match_data( features_1, features_2, img1, img2, match_metadata, include_all_points=include_all_points, circle_size=circle_size, ) else: LOGGER.warning("""No match_metadata provided, recalculating match points, won't match perception match points.""") img_matched = viz_brute_force(features_1, features_2, img1, img2, ratio=ratio) return img_matched def viz_match_data( features_1, features_2, img1, img2, match_metadata, include_all_points=False, circle_size=KEYPOINT_SIZE, ): """Given match data viz matching points. Args: features_1: The row from a reference df for one image. features_2: The row from a reference df for the other image. img1: cv2 of first image img2: cv2 of second image match_metadata: metadata returned from matching, if None will redo brute force matching. include_all_points: if True will draw all points, not just matched points. circle_size: size of the circle to draw around keypoints. Returns: cv2 img with matching keypoints drawn. """ # NOTE: could refactor to put matches in to correct format and use: cv2.drawMatchesKnn, # but python docs on necessary class not clear. 
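# (For reference: cv2.drawMatchesKnn expects a list of lists of cv2.DMatch,
# each carrying queryIdx/trainIdx indices into the two keypoint lists, so the
# point coordinates in match_metadata would need to be mapped back to keypoint
# indices first.)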
# Pad img1 or img2 vertically with black pixels to match the height of the other image if img1.shape[0] > img2.shape[0]: img2 = np.pad( img2, ((0, img1.shape[0] - img2.shape[0]), (0, 0), (0, 0)), mode="constant", constant_values=0, ) elif img1.shape[0] < img2.shape[0]: img1 = np.pad( img1, ((0, img2.shape[0] - img1.shape[0]), (0, 0), (0, 0)), mode="constant", constant_values=0, ) # draw two images h concat: img_matched = np.concatenate((img1, img2), axis=1) overlay = img_matched.copy() if include_all_points: # draw all points in kp_1 for k in features_1["keypoints"]: new_color = ( random.randint(0, 255), random.randint(0, 255), random.randint(0, 255), ) # Draw semi transparent circle cv2.circle(img_matched, (int(k[0]), int(k[1])), circle_size, new_color, 1) # draw all points in kp_2 for k in features_2["keypoints"]: new_color = ( random.randint(0, 255), random.randint(0, 255), random.randint(0, 255), ) cv2.circle( img_matched, (int(k[0] + features_1.dimensions[0]), int(k[1])), circle_size, new_color, 1, ) # draw lines between matching points for i in range(len(match_metadata["final_matched_b_pts"])): new_color = ( random.randint(0, 255), random.randint(0, 255), random.randint(0, 255), ) a_pt = ( int(match_metadata["final_matched_a_pts"][i][0]), int(match_metadata["final_matched_a_pts"][i][1]), ) b_pt = ( int(match_metadata["final_matched_b_pts"][i][0] + features_1.dimensions[0]), int(match_metadata["final_matched_b_pts"][i][1]), ) cv2.circle(img_matched, a_pt, circle_size, new_color, 1) cv2.circle(img_matched, b_pt, circle_size, new_color, 1) cv2.line( img_matched, a_pt, b_pt, new_color, 1, ) # Re-overlay original image to add some transparency effect to lines and circles. alpha = 0.4 # Transparency factor. # Following line overlays transparent rectangle over the image img_matched = cv2.addWeighted(overlay, alpha, img_matched, 1 - alpha, 0) return img_matched def viz_brute_force(features_1, features_2, img1, img2, ratio: float): """ Given two rows from a reference df vizualize their overlap. NOTE: It redoes matching using cv2 bruteforce, so will not match the same as the perception matching code. Args: features_1: The row from a reference df for one image. features_2: The row from a reference df for the other image. img1: cv2 of first image img2: cv2 of second image ratio: Value for ratio test, suggest re-using value from matching. Returns: An image of the two images concatted together and matching keypoints drawn. 
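Example (hypothetical reference-df rows and preloaded cv2 images): img = viz_brute_force(df.loc["a.jpg"], df.loc["b.jpg"], img1, img2, ratio=0.5)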
""" # Convert numpy keypoints to cv2.KeyPoints kp1_fixed = [] for k in features_1["keypoints"]: kp1_fixed.append(cv2.KeyPoint(k[0], k[1], KEYPOINT_SIZE)) kp2_fixed = [] for k in features_2["keypoints"]: kp2_fixed.append(cv2.KeyPoint(k[0], k[1], KEYPOINT_SIZE)) brute_force_matcher = cv2.BFMatcher() kn_matches = brute_force_matcher.knnMatch( features_1["descriptors"], features_2["descriptors"], k=2 ) # Apply ratio test good = [] for nearest_match, next_nearest_match in kn_matches: if nearest_match.distance < ratio * next_nearest_match.distance: good.append([nearest_match]) img_matched = cv2.drawMatchesKnn( # type: ignore[call-overload] img1, kp1_fixed, img2, kp2_fixed, good, None, flags=cv2.DrawMatchesFlags_DRAW_RICH_KEYPOINTS, ) return img_matched ================================================ FILE: perception/approximate_deduplication/index.py ================================================ import time import typing import warnings import faiss import numpy as np import pandas as pd import typing_extensions import perception.hashers.tools as pht class QueryInput(typing_extensions.TypedDict): id: str hash: str class QueryMatch(typing_extensions.TypedDict): id: typing.Any matches: list[dict] class TuningFailure(Exception): pass class QueryDecodingFailure(Exception): pass def build_query(table, ids, paramstyle, columns): query = "SELECT {} FROM {} WHERE id in {}" if paramstyle == "pyformat": sql = query.format(",".join(columns), table, "%(ids)s") params = {"ids": tuple(ids)} elif paramstyle == "qmark": params = ids sql = query.format(",".join(columns), table, f"({','.join('?' * len(ids))})") else: raise NotImplementedError("Unsupported paramstyle.") return sql, params def query_by_id(con, table, ids, paramstyle, extra_columns=None) -> pd.DataFrame: """Get data from the database. Args: con: A connection to the database table: The table in which to look up hashes ids: The list of IDs to pull paramstyle: The paramstyle for the database extra_columns: A list of additional (non-ID) columns to pull. """ columns = ["id"] if extra_columns is not None: columns += extra_columns if isinstance(ids, np.ndarray): # If it's a numpy array, coerce to a list. ids = ids.tolist() dfs = [] batch_size = 1000 for start in range(0, len(ids), batch_size): sql, params = build_query( table=table, ids=ids[start : start + batch_size], paramstyle=paramstyle, columns=columns, ) dfs.append(pd.read_sql(con=con, sql=sql, params=params)) return pd.concat(dfs, ignore_index=True).set_index("id") class ApproximateNearestNeighbors: """A wrapper for a FAISS index. Args: con: A database connection from which to obtain metadata for matched hashes. table: The table in the database that we should query for metadata. paramstyle: The parameter style for the given database index: A FAISS index (or filepath to a FAISS index) hash_length: The length of the hash that is being matched against. metadata_columns: The metadata that should be returned for queries. dtype: The data type for the vectors distance_metric: The distance metric for the vectors """ def __init__( self, con, table, paramstyle, index, hash_length, metadata_columns=None, dtype="uint8", distance_metric="euclidean", ): assert ( dtype == "uint8" ), "Only unsigned 8-bit integer hashes are supported at this time." assert ( distance_metric == "euclidean" ), "Only euclidean distance is supported at this time." 
if isinstance(index, str): index = faiss.read_index(index) self.con = con self.index = index self.distance_metric = distance_metric self.hash_length = hash_length self.dtype = dtype self.table = table self.metadata_columns = metadata_columns self.paramstyle = paramstyle assert ( self.index.d == self.hash_length ), "Index is incompatible with hash length." @classmethod def from_database( cls, con, table, paramstyle, hash_length, ids_train=None, train_size=None, chunksize=100000, metadata_columns=None, index=None, gpu=False, dtype="uint8", distance_metric="euclidean", ): """Train and build a FAISS index from a database connection. Args: con: A database connection from which to obtain metadata for matched hashes. table: The table in the database that we should query for metadata. paramstyle: The paramstyle for the given database hash_length: The length of the hash that is being matched against. ids_train: The IDs for the vectors to train on. train_size: The number of vectors to use for training; the vectors are selected at random from the database. Ignored if ids_train is not None. chunksize: The chunks of data to draw from the database at a time when adding vectors to the index. metadata_columns: The metadata that should be returned for queries. index: If a pretrained index is provided, training will be skipped, any existing vectors will be discarded, and the index will be repopulated with the current contents of the database. gpu: If true, will attempt to carry out training on a GPU. dtype: The data type for the vectors distance_metric: The distance metric for the vectors """ assert ( dtype == "uint8" ), "Only unsigned 8-bit integer hashes are supported at this time." assert ( distance_metric == "euclidean" ), "Only euclidean distance is supported at this time." if index is None: # Train the index using the practices from # https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index#if-below-1m-vectors-ivfx ntotal = pd.read_sql( sql=f"select count(*) as count from {table}", con=con ).iloc[0]["count"] nlist = int(min(4 * np.sqrt(ntotal), ntotal / 39)) min_train_size = 39 * nlist if ids_train is not None: train_size = len(ids_train) if train_size is None: train_size = min_train_size assert ( train_size <= ntotal ), "Cannot train on more hashes than are available." assert ( train_size >= min_train_size ), f"Training an index used for {ntotal} hashes requires at least {min_train_size} training hashes." if ids_train is None: ids_train = np.random.choice( np.arange(ntotal), size=train_size, replace=False ) df_train = query_by_id( con=con, table=table, ids=ids_train, paramstyle=paramstyle, extra_columns=["hash"], ) x_train = np.array( [np.frombuffer(h, dtype=dtype) for h in df_train["hash"]] ).astype("float32") assert x_train.shape[1] == hash_length, "Hashes are of incorrect length." index = faiss.IndexIVFFlat( faiss.IndexFlatL2(hash_length), hash_length, nlist ) if gpu: res = faiss.StandardGpuResources() gpu_index = faiss.index_cpu_to_gpu(res, 0, index) gpu_index.train(x_train) index = faiss.index_gpu_to_cpu(gpu_index) else: index.train(x_train) else: index.reset() # Add hashes to the index in chunks.
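# (add_with_ids keys each vector by its database id, so search results can be
# joined back to their metadata rows.)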
for df_add in pd.read_sql( sql=f"SELECT id, hash FROM {table}", con=con, chunksize=chunksize ): x_add = np.array( [np.frombuffer(h, dtype=dtype) for h in df_add["hash"]] ).astype("float32") index.add_with_ids(x_add, df_add["id"].values) return cls( con=con, index=index, hash_length=hash_length, distance_metric=distance_metric, dtype=dtype, table=table, paramstyle=paramstyle, metadata_columns=metadata_columns, ) def query_by_id( self, ids, include_metadata=True, include_hashes=False ) -> pd.DataFrame: """Get data from the database. Args: ids: The hash IDs to get from the database. include_metadata: Whether to include metadata columns. include_hashes: Whether to include the hashes """ if not self.metadata_columns and include_metadata and not include_hashes: # There won't be anything to return. return pd.DataFrame() extra_columns = [] if self.metadata_columns and include_metadata: extra_columns += self.metadata_columns if include_hashes: extra_columns += ["hash"] return query_by_id( con=self.con, table=self.table, ids=ids, paramstyle=self.paramstyle, extra_columns=extra_columns, ) def string_to_vector(self, s: str, hash_format="base64") -> np.ndarray: """Convert a string to vector form. Args: s: The hash string hash_format: The format for the hash string """ return pht.string_to_vector( s, hash_format=hash_format, dtype=self.dtype, hash_length=self.hash_length ) def vector_to_string(self, vector, hash_format="base64") -> str | None: """Convert a vector back to string Args: vector: The hash vector hash_format: The format for the hash """ return pht.vector_to_string(vector, dtype=self.dtype, hash_format=hash_format) def search( self, queries: list[QueryInput], threshold: int | None = None, threshold_func: typing.Callable[[np.ndarray], np.ndarray] | None = None, hash_format="base64", k=1, ): """Search the index and return matches. Args: queries: A list of queries in the form of {"id": , "hash": ""} threshold: The threshold to use for matching. Takes precedence over threshold_func. threshold_func: A function that, given a query vector, returns the desired match threshold for that query. hash_format: The hash format used for the strings in the query. k: The number of nearest neighbors to return. Returns: Matches in the form of a list of dicts of the form: { "id": , "matches": [{"distance": , "id": , "metadata": {}}]} The metadata consists of the contents of the metadata columns specified for this matching instance. 
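Example (hypothetical id and hash string, assuming a populated index): matches = ann.search(queries=[{"id": "query-1", "hash": "..."}], threshold=100, k=5)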
""" try: xq = np.array( [ self.string_to_vector(h["hash"], hash_format=hash_format) for h in queries ] ).astype("float32") except Exception as exc: raise QueryDecodingFailure("Failed to parse hash query.") from exc thresholds: np.ndarray = np.ones((len(xq), 1)) * np.inf if threshold: thresholds = np.ones((len(xq), 1)) * threshold if not threshold and threshold_func: thresholds = threshold_func(xq) else: thresholds = np.ones((len(xq), 1)) * np.inf distances, indices = self.index.search(xq, k=k) distances = np.sqrt(distances) metadata = ( None if not self.metadata_columns else self.query_by_id(ids=np.unique(indices[distances < thresholds])) ) matches: list[QueryMatch] = [] for match_distances, match_ids, q, q_threshold in zip( distances, indices, queries, thresholds ): match_filter = match_distances < q_threshold match_ids = match_ids[match_filter] match_distances = match_distances[match_filter] match: QueryMatch = {"id": q["id"], "matches": []} for match_id, distance in zip(match_ids, match_distances): entry = {"distance": float(distance), "id": match_id} if metadata is not None: entry["metadata"] = metadata.loc[match_id].to_dict() match["matches"].append(entry) matches.append(match) return matches def tune(self, n_query=100, min_recall=99, max_noise=3): """Obtain minimum value for nprobe that achieves a target level of recall. Args: n_query: The number of hashes to use as test hashes. min_recall: The minimum desired recall for the index. max_noise: The maximum amount of noise to add to each test hash Returns: A tuple of recall, latency (in ms), and nprobe where the nprobe value is the one that achieved the resulting recall. Raises: TuningFailure if no suitable nprobe value is found. """ assert ( n_query <= self.ntotal ), "Cannot use a test larger than ntotal (total number of hashes)." # Pick a random set of query hashes ids = np.random.choice( np.arange(1, self.ntotal + 1), size=n_query, replace=False ) df = self.query_by_id(ids, include_metadata=False, include_hashes=True) xq = np.array( [np.frombuffer(v, dtype=self.dtype) for v in df["hash"]], dtype=np.uint8 ) noise = np.random.randint( low=(-xq.astype("int32")).clip(-max_noise, max_noise), high=(255 - xq.astype("float32")).clip(-max_noise, max_noise), ) xq = (xq.astype("int32") + noise).astype("uint8").astype("float32") if min_recall == 100: warnings.warn( "100% recall can only be ensured with exhaustive search.", UserWarning ) self.set_nprobe(self.nlist) start = time.time() self.index.search(xq, k=1) latency = time.time() - start return (100, 1000 * latency, self.nlist) # Make the search exhaustive so we get ground truth. self.set_nprobe(self.nlist) _, expected = self.index.search(xq, k=1) for nprobe in range(1, self.nlist): self.set_nprobe(nprobe) start = time.time() _, actual = self.index.search(xq, k=1) latency = time.time() - start recall = 100 * (actual[:, 0] == expected).sum() / xq.shape[0] if recall >= min_recall: break else: # If we never break, it means we never reached the target recall # for this query. raise TuningFailure( "Failed to find suitable parameters for selected recall." ) return recall, 1000 * latency, nprobe def save(self, filepath): """Save an index to disk. Args: filepath: Where to save the index. """ faiss.write_index(self.index, filepath) def set_nprobe(self, nprobe) -> int: """Set the value of nprobe. 
Args: nprobe: The new value for nprobe """ faiss.ParameterSpace().set_index_parameter(self.index, "nprobe", nprobe) return faiss.downcast_index(self.index).nprobe @property def nlist(self): """The number of lists in the index.""" return faiss.downcast_index(self.index).nlist @property def nprobe(self): """The current value of nprobe.""" return faiss.downcast_index(self.index).nprobe @property def ntotal(self): """The number of vectors in the index.""" return self.index.ntotal ================================================ FILE: perception/approximate_deduplication/serve.py ================================================ import asyncio import functools import json import logging import typing import aiohttp.web import numpy as np from pythonjsonlogger import jsonlogger import perception.hashers.tools as pht from .index import ApproximateNearestNeighbors def is_similarity_valid(data, index: ApproximateNearestNeighbors): """Validates input to the similarity endpoint.""" hash_format = data.get("hash_format", "base64") expected_string_length = pht.get_string_length( hash_length=index.hash_length, dtype=index.dtype, hash_format=hash_format ) return ( isinstance(data, dict) and "queries" in data and isinstance(data["queries"], list) and all(isinstance(x.get("hash", None), str) for x in data["queries"]) and hash_format in ["hex", "base64"] and all( len(x.get("hash", None)) == expected_string_length for x in data["queries"] ) ) async def similarity(request): """Responds to a vector similarity query of the form: ``` { "queries": [{"id": str, "hash": "base64_encoded_hash1"}, ...], "k": int, "threshold": float, "hash_format": "base64" } ``` with information about similar vectors in the index in the form: ``` { "queries": [{"id": str, "matches": [{"metadata": {json metadata}, "distance": float},...],...] } ``` """ try: request_data = await request.json() except json.JSONDecodeError: return aiohttp.web.json_response({"reason": "Malformed JSON"}, status=400) index = request.app["index"] try: assert is_similarity_valid(request_data, index) except Exception: return aiohttp.web.json_response({"reason": "Invalid JSON request"}, status=400) async with request.app["query_semaphore"]: matches = await asyncio.get_event_loop().run_in_executor( None, functools.partial( index.search, queries=request_data["queries"], threshold=request_data.get( "threshold", request.app["default_threshold"] ), threshold_func=request.app["default_threshold_func"], k=request_data.get("k", request.app["default_k"]), hash_format=request_data.get("hash_format", "base64"), ), ) matches = json.loads(json.dumps({"queries": matches})) return aiohttp.web.json_response(matches) def get_logger(name, log_level): logger = logging.Logger(name=name, level=log_level) handler = logging.StreamHandler() handler.setFormatter( jsonlogger.JsonFormatter( "%(asctime)s:%(levelname)s:%(name)s:%(message)s%(exc_info)" ) ) logger.addHandler(handler) return logger async def serve( index: ApproximateNearestNeighbors, default_threshold: int | None = None, default_threshold_func: typing.Callable[[np.ndarray], np.ndarray] | None = None, default_k: int = 1, concurrency: int = 2, log_level=logging.INFO, host="localhost", port=8080, ): """Serve an index as a web API. This function does not block. If you wish to use the function in a blocking manner, you can do something like .. code-block:: python loop = asyncio.get_event_loop() loop.run_until_complete(serve(...)) loop.run_forever() You can query the API with something like: .. 
code-block:: bash curl --header "Content-Type: application/json" \\ --request POST \\ --data '{"queries": [{"hash": "", "id": "bar"}], "threshold": 1200}' \\ http://localhost:8080/v1/similarity Args: index: The underlying index default_threshold: The default threshold for matches default_threshold_func: A function that, given an array of query vectors, returns per-query match thresholds (used when no explicit threshold is provided) default_k: The default number of nearest neighbors to look for concurrency: The number of concurrent requests served log_level: The log level to use for the logger host: The host for the service port: The port for the service """ logger = get_logger(name="serve", log_level=log_level) logger.info("Initializing web service") app = aiohttp.web.Application() app.router.add_post("/v1/similarity", similarity, name="similarity") # Store globals in the application object app["default_threshold"] = default_threshold app["logger"] = logger app["default_k"] = default_k app["default_threshold_func"] = default_threshold_func app["index"] = index app["query_semaphore"] = asyncio.Semaphore(concurrency) logger.info("Entering web service listener loop.") runner = aiohttp.web.AppRunner(app, logger=logger) await runner.setup() site = aiohttp.web.TCPSite(runner, host, port) await site.start() return site ================================================ FILE: perception/benchmarking/__init__.py ================================================ from perception.benchmarking import video_transforms from perception.benchmarking import video from perception.benchmarking import image from perception.benchmarking.image import ( BenchmarkImageDataset, BenchmarkImageTransforms, ) from perception.benchmarking.video import ( BenchmarkVideoDataset, BenchmarkVideoTransforms, ) from perception.benchmarking.common import BenchmarkHashes __all__ = [ "BenchmarkImageDataset", "BenchmarkImageTransforms", "BenchmarkVideoDataset", "BenchmarkVideoTransforms", "BenchmarkHashes", "video_transforms", "video", "image", ] ================================================ FILE: perception/benchmarking/common.py ================================================ import itertools import logging import os import shutil import tempfile import uuid import warnings import zipfile from abc import ABC import matplotlib.pyplot as plt import numpy as np import pandas as pd import tqdm from scipy import spatial, stats from ..hashers.tools import compute_md5, string_to_vector try: from . import extensions # type: ignore except ImportError: warnings.warn( "C extensions were not built. Some metrics will be computed more slowly. " "Please install from wheels or set up a compiler prior to installation " "from source to use extensions." ) extensions = None log = logging.getLogger(__name__) def create_mask(transformed_guids, noop_guids): """Given a list of transformed guids and noop guids, computes an MxN array indicating whether noop n has the same guid as transform m. Used for applying a mask to a distance matrix for efficient computation of recall at different thresholds.
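For example (hypothetical guids), transformed_guids=["a", "a", "b"] and noop_guids=["a", "b"] produce [[True, False], [True, False], [False, True]]. Both iterables are expected to be sorted by guid.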
Args: transformed_guids: An iterable of transformed guids noop_guids: An iterable of noop guids Returns: A boolean array of shape `(len(transformed_guids), len(noop_guids))` """ n_noops = len(noop_guids) previous_guid = None start = None end = 0 mask = np.zeros((len(transformed_guids), len(noop_guids)), dtype="bool") for current_guid, row in zip(transformed_guids, mask): if previous_guid is None or current_guid != previous_guid: start = end end = start + next( ( other_index for other_index, guid in enumerate(noop_guids[start:]) if guid != current_guid ), n_noops, ) previous_guid = current_guid row[start:end] = True return mask def compute_threshold_precision_recall(pos, neg, precision_threshold=99.9): # Sort both arrays according to the positive distance neg = neg[pos.argsort()] pos = pos[pos.argsort()] # Compute the precision for every candidate threshold in pos tp = np.arange(1, len(pos) + 1) fp = np.array([(neg <= t).sum() for t in pos]) precision = 100 * tp / (tp + fp) # Choose the optimal threshold bad_threshold_idxs = np.where(precision < precision_threshold)[0] if len(bad_threshold_idxs) > 0 and bad_threshold_idxs[0] > 0: optimal_threshold = pos[bad_threshold_idxs[0] - 1] recovered = (pos <= optimal_threshold).sum() if recovered == 0: optimal_precision = np.nan else: optimal_precision = precision[pos <= optimal_threshold].min() optimal_recall = round(100 * recovered / len(pos), 3) elif len(bad_threshold_idxs) > 0: # The closest hash was a false positive. optimal_threshold = pos[0] optimal_recall = 0 optimal_precision = np.nan else: optimal_precision = 100 optimal_threshold = pos.max() optimal_recall = 100 return optimal_threshold, optimal_precision, optimal_recall class Filterable(ABC): _df: pd.DataFrame expected_columns: list def __init__(self, df): assert sorted(df.columns) == sorted( self.expected_columns ), f"Column mismatch: Expected {sorted(self.expected_columns)}, found {sorted(df.columns)}." self._df = df @property def categories(self): """The categories included in the dataset""" return self._df["category"].unique() def filter(self, **kwargs): """Obtain a new dataset filtered with the given keyword arguments.""" df = self._df.copy() for field, included in kwargs.items(): existing = self._df[field].unique() if not all(inc in existing for inc in included): missing = ", ".join( [str(inc) for inc in included if inc not in existing] ) message = f"Did not find {missing} in column {field} of the dataset." warnings.warn(message, UserWarning) df = df[df[field].isin(included)] return self.__class__(df.copy()) class Saveable(Filterable): @classmethod def load( cls, path_to_zip_or_directory: str, storage_dir: str | None = None, verify_md5=True, ): """Load a dataset from a ZIP file or directory. Args: path_to_zip_or_directory: Pretty self-explanatory storage_dir: If providing a ZIP file, where to extract the contents. If None, contents will be extracted to a folder with the same name as the ZIP file in the same directory as the ZIP file. verify_md5: Verify md5s when loading """ # Load index whether from inside ZIP file or from directory. if os.path.splitext(path_to_zip_or_directory)[1] == ".zip": if storage_dir is None: storage_dir = os.path.join( os.path.dirname(os.path.abspath(path_to_zip_or_directory)), os.path.splitext(os.path.basename(path_to_zip_or_directory))[0], ) os.makedirs(storage_dir, exist_ok=True) with zipfile.ZipFile(path_to_zip_or_directory, "r") as z: # Try extracting only the index at first so we can # compare md5.
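# (If every file is already present locally with a matching md5, the full
# extraction is skipped.)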
z.extract("index.csv", os.path.join(storage_dir)) index: pd.DataFrame = pd.read_csv( os.path.join(storage_dir, "index.csv") ) index["filepath"] = index["filename"].apply( lambda fn: ( os.path.join(storage_dir, fn) if not pd.isnull(fn) else None ) ) do_zip_extraction = True if index["filepath"].apply(os.path.isfile).all(): if verify_md5: do_zip_extraction = not all( row["md5"] == compute_md5(row["filepath"]) for _, row in tqdm.tqdm( index.iterrows(), desc="Checking cache" ) ) else: do_zip_extraction = False if do_zip_extraction: z.extractall(storage_dir) else: log.info("Found all files already extracted. Skipping extraction.") verify_md5 = False else: assert ( storage_dir is None ), "Storage directory only valid if path is to ZIP file." index = pd.read_csv(os.path.join(path_to_zip_or_directory, "index.csv")) index["filepath"] = index["filename"].apply( lambda fn: ( os.path.join(path_to_zip_or_directory, fn) if not pd.isnull(fn) else None ) ) if verify_md5: assert all( row["md5"] == compute_md5(row["filepath"]) for _, row in tqdm.tqdm( index.iterrows(), desc="Performing final md5 integrity check.", total=len(index.index), ) ), "An md5 mismatch has occurred." return cls(index.drop(["filename", "md5"], axis=1)) def save(self, path_to_zip_or_directory): """Save a dataset to a directory or ZIP file. Args: path_to_zip_or_directory: Pretty self-explanatory """ df = self._df assert "filepath" in df.columns, "Index dataframe must contain filepath." # Build index using filename instead of filepath. index = df.copy() index["filename"] = df["filepath"].apply( lambda filepath: ( os.path.basename(filepath) if not pd.isnull(filepath) else None ) ) if index["filename"].dropna().duplicated().sum() > 0: warnings.warn("Changing filenames to UUID due to duplicates.", UserWarning) index["filename"] = [ ( str(uuid.uuid4()) + os.path.splitext(row["filename"])[1] if not pd.isnull(row["filename"]) else None ) for _, row in index.iterrows() ] index["md5"] = [ compute_md5(filepath) if not pd.isnull(filepath) else None for filepath in tqdm.tqdm(index["filepath"], desc="Computing md5s.") ] # Add all files as well as the dataframe index to # a ZIP file if path is to ZIP file or to the directory if it is # not a ZIP file. if os.path.splitext(path_to_zip_or_directory)[1] == ".zip": with zipfile.ZipFile(path_to_zip_or_directory, "w") as f: with tempfile.TemporaryFile(mode="w+") as index_file: index.drop("filepath", axis=1).to_csv(index_file, index=False) index_file.seek(0) f.writestr("index.csv", index_file.read()) for _, row in tqdm.tqdm( index.iterrows(), desc="Saving files", total=len(df) ): if pd.isnull(row["filepath"]): # There was an error associated with this file. continue f.write(row["filepath"], row["filename"]) else: os.makedirs(path_to_zip_or_directory, exist_ok=True) index.drop("filepath", axis=1).to_csv( os.path.join(path_to_zip_or_directory, "index.csv"), index=False ) for _, row in tqdm.tqdm( index.iterrows(), desc="Saving files", total=len(df) ): if pd.isnull(row["filepath"]): # There was an error associated with this file. continue if row["filepath"] == os.path.join( path_to_zip_or_directory, row["filename"] ): # The source file is the same as the target file. continue shutil.copy( row["filepath"], os.path.join(path_to_zip_or_directory, row["filename"]), ) class BenchmarkHashes(Filterable): """A dataset of hashes for transformed images. 
It is essentially a wrapper around a `pandas.DataFrame` with the following columns: - guid - error - filepath - input_filepath - category - transform_name - hasher_name - hasher_dtype - hasher_distance_metric - hasher_hash_length - hash """ expected_columns = [ "error", "filepath", "hash", "hasher_name", "hasher_dtype", "hasher_distance_metric", "category", "guid", "input_filepath", "transform_name", "hasher_hash_length", ] def __init__(self, df: pd.DataFrame): super().__init__(df) self._metrics: pd.DataFrame | None = None def __add__(self, other): return BenchmarkHashes(df=pd.concat([self._df, other._df]).drop_duplicates()) def __radd__(self, other): return self.__add__(other) @classmethod def load(cls, filepath: str): return cls(pd.read_csv(filepath)) def save(self, filepath): self._df.to_csv(filepath, index=False) def compute_metrics( self, custom_distance_metrics: dict | None = None ) -> pd.DataFrame: if self._metrics is not None: return self._metrics metrics = [] hashsets = self._df.sort_values("guid") n_dropped = hashsets["hash"].isnull().sum() if n_dropped > 0: hashsets = hashsets.dropna(subset=["hash"]) warnings.warn(f"Dropping {n_dropped} invalid / empty hashes.", UserWarning) for (hasher_name, transform_name, category), hashset in tqdm.tqdm( hashsets.groupby(["hasher_name", "transform_name", "category"]), desc="Computing metrics.", ): # Note the guid filtering below. We need to include only guids # for which we have the transform *and* the noop. One of them # may have been dropped due to being invalid. noops = hashsets[ (hashsets["transform_name"] == "noop") & (hashsets["hasher_name"] == hasher_name) & (hashsets["guid"].isin(hashset["guid"])) ] valid_hashset = hashset[hashset["guid"].isin(noops["guid"])] dtype, distance_metric, hash_length = valid_hashset.iloc[0][ ["hasher_dtype", "hasher_distance_metric", "hasher_hash_length"] ] n_noops = len(noops.guid) n_hashset = len(valid_hashset.guid) noop_guids = noops.guid.values mask = create_mask(valid_hashset.guid.values, noops.guid.values) if distance_metric != "custom": X_trans = np.array( valid_hashset.hash.apply( string_to_vector, # type: ignore[arg-type] hash_length=int(hash_length), dtype=dtype, hash_format="base64", ).tolist() ) X_noop = np.array( noops.hash.apply( string_to_vector, # type: ignore[arg-type] dtype=dtype, hash_format="base64", hash_length=int(hash_length), ).tolist() ) if ( distance_metric != "euclidean" or "int" not in dtype or extensions is None ): distance_matrix = spatial.distance.cdist( XA=X_trans, XB=X_noop, metric=distance_metric ) distance_to_closest_image = distance_matrix.min(axis=1) distance_to_correct_image = np.ma.masked_array( distance_matrix, np.logical_not(mask) ).min(axis=1) distance_matrix_incorrect_image: np.ndarray = np.ma.masked_array( distance_matrix, mask ) distance_to_incorrect_image = distance_matrix_incorrect_image.min( axis=1 ) closest_incorrect_guid = noop_guids[ distance_matrix_incorrect_image.argmin(axis=1) ] else: distances, indexes = extensions.compute_euclidean_metrics( X_noop.astype("int32"), X_trans.astype("int32"), mask ) distance_to_correct_image = distances[:, 1] distance_to_incorrect_image = distances[:, 0] distance_to_closest_image = distances.min(axis=1) closest_incorrect_guid = [noop_guids[idx] for idx in indexes[:, 0]] else: assert ( custom_distance_metrics is not None and hasher_name in custom_distance_metrics ), f"You must provide a custom distance metric for {hasher_name}."
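# Custom metrics operate on the raw hash values rather than decoded
# vectors, so the distance matrix is built with an explicit pairwise loop.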
noops_hash_values = noops.hash.values hashset_hash_values = valid_hashset.hash.values distance_matrix = np.zeros((n_hashset, n_noops)) distance_function = custom_distance_metrics[hasher_name] for i1, i2 in itertools.product(range(n_hashset), range(n_noops)): distance_matrix[i1, i2] = distance_function( hashset_hash_values[i1], noops_hash_values[i2] ) distance_to_closest_image = distance_matrix.min(axis=1) distance_to_correct_image = np.ma.masked_array( distance_matrix, np.logical_not(mask) ).min(axis=1) distance_matrix_incorrect_image = np.ma.masked_array( distance_matrix, mask ) distance_to_incorrect_image = distance_matrix_incorrect_image.min( axis=1 ) closest_incorrect_guid = noop_guids[ distance_matrix_incorrect_image.argmin(axis=1) ] metrics.append( pd.DataFrame( { "guid": valid_hashset["guid"].values, "transform_name": transform_name, "hasher_name": hasher_name, "category": category, "distance_to_closest_correct_image": distance_to_correct_image, "distance_to_closest_incorrect_image": distance_to_incorrect_image, "distance_to_closest_image": distance_to_closest_image, "closest_incorrect_guid": closest_incorrect_guid, } ) ) metrics_df = pd.concat(metrics) self._metrics = metrics_df return metrics_df def show_histograms(self, grouping=None, precision_threshold=99.9, **kwargs): """Plot histograms for true and false positives, similar to https://tech.okcupid.com/evaluating-perceptual-image-hashes-okcupid/ Additional arguments passed to compute_metrics. Args: grouping: List of fields to group by. By default, all fields are used (category, and transform_name). """ if grouping is None: grouping = ["category", "transform_name"] metrics = self.compute_metrics(**kwargs) hasher_names = metrics["hasher_name"].unique().tolist() bounds = ( metrics.groupby("hasher_name")[ ["distance_to_closest_image", "distance_to_closest_incorrect_image"] ] .max() .max(axis=1) ) if grouping: group_names = [ ":".join(map(str, row.values)) for idx, row in metrics[grouping].drop_duplicates().iterrows() ] else: group_names = [""] ncols = len(hasher_names) nrows = len(group_names) fig, axs = plt.subplots( ncols=ncols, nrows=nrows, figsize=(ncols * 4, nrows * 3), sharey=True ) for group_name, subset in metrics.groupby(["hasher_name"] + grouping): # Get names of group and hasher if grouping: hasher_name = group_name[0] group_name = ":".join(map(str, group_name[1:])) else: hasher_name = group_name group_name = "" # Get the correct axis. 
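# matplotlib returns a 2-D array of axes only when both grid dimensions
# exceed one, a bare Axes object for a 1x1 grid, and a 1-D array otherwise.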
colIdx = hasher_names.index(hasher_name) rowIdx = group_names.index(group_name) if ncols > 1 and nrows > 1: ax = axs[rowIdx, colIdx] elif ncols == 1 and nrows == 1: ax = axs else: ax = axs[rowIdx if nrows > 1 else colIdx] # Plot the charts inner_keys = ["guid"] + ( ["transform_name"] if "transform_name" in subset.columns else [] ) pos, neg = ( subset.groupby(inner_keys)[ [ "distance_to_closest_correct_image", "distance_to_closest_incorrect_image", ] ] .min() .values.T ) optimal_threshold, _, optimal_recall = compute_threshold_precision_recall( pos=pos, neg=neg, precision_threshold=precision_threshold ) optimal_threshold = optimal_threshold.round(3) emd = stats.wasserstein_distance(pos, neg).round(2) ax.hist(neg, label="neg", bins=10) ax.hist(pos, label="pos", bins=10) ax.text( 0.5, 0.5, f"Recall: {optimal_recall:.0f}% @ {optimal_threshold}\nemd: {emd:.2f}", horizontalalignment="center", color="black", verticalalignment="center", transform=ax.transAxes, fontsize=12, fontweight=1000, ) ax.set_xlim(-0.05 * bounds[hasher_name], bounds[hasher_name]) if rowIdx == 0: ax.set_title(hasher_name) ax.legend() if colIdx == 0: ax.set_ylabel(group_name) fig.tight_layout() def compute_threshold_recall( self, precision_threshold=99.9, grouping=None, **kwargs ) -> pd.DataFrame: """Compute a table for threshold and recall for each category, hasher, and transformation combinations. Additional arguments passed to compute_metrics. Args: precision_threshold: The precision threshold to use for choosing a distance threshold for each hasher. grouping: List of fields to group by. By default, all fields are used (category, and transform_name). Returns: A pandas DataFrame with 7 columns. The key columns are threshold (The optimal distance threshold for detecting a match for this combination), recall (the number of correct matches divided by the number of possible matches), and precision (the number correct matches divided by the total number of matches whether correct or incorrect). """ if grouping is None: grouping = ["category", "transform_name"] def group_func(subset): inner_keys = ["guid"] + ( ["transform_name"] if "transform_name" in subset.columns else [] ) pos, neg = ( subset.groupby(inner_keys)[ [ "distance_to_closest_correct_image", "distance_to_closest_incorrect_image", ] ] .min() .values.T ) ( optimal_threshold, optimal_precision, optimal_recall, ) = compute_threshold_precision_recall( pos=pos, neg=neg, precision_threshold=precision_threshold ) return pd.Series( { "threshold": optimal_threshold, "recall": optimal_recall, "precision": optimal_precision, "n_exemplars": len(subset), } ) return ( self.compute_metrics(**kwargs) .groupby(grouping + ["hasher_name"]) .apply(group_func) ) class BenchmarkDataset(Saveable): """A dataset of images separated into categories. It is essentially a wrapper around a pandas dataframe with the following columns: - filepath - category """ expected_columns = ["filepath", "category"] @classmethod def from_tuples(cls, files: list[tuple[str, str]]): """Build dataset from a set of files. Args: files: A list of tuples where each entry is a pair filepath and category. """ df = pd.DataFrame.from_records( [{"filepath": f, "category": c} for f, c in files] ) return cls(df) def transform(self, transforms, storage_dir, errors): raise NotImplementedError() class BenchmarkTransforms(Saveable): """A dataset of transformed images. 
Essentially wraps a DataFrame with the following columns: - guid - filepath - category - transform_name - input_filepath (for memo purposes only) """ expected_columns = [ "filepath", "category", "transform_name", "input_filepath", "guid", ] def compute_hashes(self, hashers, max_workers): raise NotImplementedError() ================================================ FILE: perception/benchmarking/extensions.pyx ================================================ # cython: language_level=3 import cython import numpy as np from cython.parallel import parallel, prange cimport numpy as np from libc.math cimport sqrt from libc.stdlib cimport abort, free, malloc cdef extern from "limits.h": int INT_MAX ctypedef np.uint8_t uint8 @cython.boundscheck(False) @cython.wraparound(False) def compute_euclidean_metrics(int[:, :] X_noop, int[:, :] X_tran, uint8[:, :] mask): """Compute the positive / negative distance metrics between two sets of vectors using euclidean distance. This function obtains the necessary metrics roughly 10x faster than using scipy.spatial.distance.cdist and numpy functions. Args: X_noop: The vectors for the noop hashes with shape (N, K) X_tran: The vectors for the transformed instances with shape (M, K) mask: A (M, N) array indicating whether noop n corresponds to transform m Returns: distances: An M by 2 array with the closest false positive and closest true positive for each transform. indexes: An M by 2 array with the index for the closest false positive noop and the closest true positive noop. """ cdef Py_ssize_t n_noop = X_noop.shape[0] cdef Py_ssize_t d_noop = X_noop.shape[1] cdef Py_ssize_t n_tran = X_tran.shape[0] cdef Py_ssize_t d_tran = X_tran.shape[1] cdef Py_ssize_t n_mask_tran = mask.shape[0] cdef Py_ssize_t n_mask_noop = mask.shape[1] cdef Py_ssize_t i_mask_tran cdef Py_ssize_t i_mask_noop cdef int n_pos cdef int current_distance cdef int current_closest_fp cdef int current_closest_tp cdef int[:] x cdef int[:] y cdef uint8 is_pos cdef Py_ssize_t i_noop, i_tran, i_d cdef Py_ssize_t i_closest_fp = 0 cdef Py_ssize_t i_closest_tp = 1 cdef Py_ssize_t i_closest_fp_idx = 0 cdef Py_ssize_t i_closest_tp_idx = 1 cdef int * local_buf cdef size_t size = 5 cdef float NAN NAN = float("NaN") assert d_noop == d_tran, "Dimensionality of vectors must match." assert n_mask_tran == n_tran, "Dimension 0 of mask must correspond to n_transforms." assert n_mask_noop == n_noop, "Dimension 1 of mask must correspond to n_noops." for i_mask_tran in range(n_mask_tran): n_pos = 0 for i_mask_noop in range(n_mask_noop): if mask[i_mask_tran, i_mask_noop] == True: n_pos += 1 assert n_pos > 0, "All transforms must have at least one positive noop." assert n_pos < n_mask_noop, "All transforms must have at least one negative noop." 
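# Output buffers: column 0 holds the closest false positive and column 1
# the closest true positive for each transform (see i_closest_fp and
# i_closest_tp above).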
distances = np.zeros((n_tran, 2), dtype=np.float32) indexes = np.zeros((n_tran, 2), dtype=np.int32) cdef np.float32_t[:, :] distances_view = distances cdef int[:, :] indexes_view = indexes with nogil, parallel(): local_buf = malloc(sizeof(int) * size) if local_buf is NULL: abort() for i_tran in prange(n_tran): local_buf[1] = INT_MAX # Smallest false positive distance local_buf[2] = INT_MAX # Smallest true positive distance local_buf[3] = 0 # Smallest false positive index local_buf[4] = 0 # Smallest true positive index for i_noop in range(n_noop): local_buf[0] = 0 # Current distance is_pos = mask[i_tran, i_noop] == True for i_d in range(d_noop): local_buf[0] += (X_noop[i_noop, i_d] - X_tran[i_tran, i_d]) ** 2 if is_pos and (local_buf[0] < local_buf[2]): local_buf[2] = local_buf[0] local_buf[4] = i_noop if not is_pos and (local_buf[0] < local_buf[1]): local_buf[1] = local_buf[0] local_buf[3] = i_noop # I do not think that a distance can ever actually be # greater than INT_MAX but we'll leave the check in. if local_buf[1] < INT_MAX: distances_view[i_tran, i_closest_fp] = sqrt(local_buf[1]) else: distances_view[i_tran, i_closest_fp] = NAN if local_buf[2] < INT_MAX: distances_view[i_tran, i_closest_tp] = sqrt(local_buf[2]) else: distances_view[i_tran, i_closest_tp] = NAN indexes_view[i_tran, i_closest_fp_idx] = local_buf[3] indexes_view[i_tran, i_closest_tp_idx] = local_buf[4] free(local_buf) return distances, indexes ================================================ FILE: perception/benchmarking/image.py ================================================ import logging import os import uuid import warnings import cv2 import albumentations import pandas as pd from tqdm import tqdm from ..hashers import tools from ..hashers.hasher import ImageHasher from ..tools import deduplicate, flatten from .common import BenchmarkDataset, BenchmarkHashes, BenchmarkTransforms log = logging.getLogger(__name__) class BenchmarkImageTransforms(BenchmarkTransforms): def compute_hashes( self, hashers: dict[str, ImageHasher], max_workers: int = 5 ) -> BenchmarkHashes: """Compute hashes for a series of files given some set of hashers. Args: hashers: A dictionary of hashers. max_workers: Maximum number of workers for parallel hash computation. Returns: hashes: A BenchmarkHashes object.
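Example (a minimal sketch; assumes `transforms` is an existing
BenchmarkImageTransforms instance):

    from perception.hashers import PHash

    hashes = transforms.compute_hashes(hashers={"phash": PHash(hash_size=16)})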
""" hashsets = [] filepaths = self._df["filepath"] for hasher_name, hasher in hashers.items(): hash_dicts = hasher.compute_parallel( filepaths, progress=tqdm, progress_desc=f"Computing hashes for {hasher_name}", max_workers=max_workers, ) if not hasher.returns_multiple: hashes_df = pd.DataFrame.from_records(hash_dicts) else: hash_groups = [ hash_dict["hash"] if hash_dict["error"] is None else [None] for hash_dict in hash_dicts ] hash_group_sizes = [len(hash_group) for hash_group in hash_groups] current_hashes = flatten(hash_groups) current_filepaths = flatten( [ [hash_dict["filepath"]] * hash_group_size for hash_dict, hash_group_size in zip( hash_dicts, hash_group_sizes ) ] ) current_errors = flatten( [ [hash_dict["error"]] * hash_group_size for hash_dict, hash_group_size in zip( hash_dicts, hash_group_sizes ) ] ) hashes_df = pd.DataFrame( { "error": current_errors, "filepath": current_filepaths, "hash": current_hashes, } ) hashset = hashes_df.assign( hasher_name=hasher_name, hasher_hash_length=hasher.hash_length, hasher_dtype=hasher.dtype, hasher_distance_metric=hasher.distance_metric, ) hashset = hashset.merge(self._df, on="filepath") hashsets.append(hashset) return BenchmarkHashes(pd.concat(hashsets, sort=True)) class BenchmarkImageDataset(BenchmarkDataset): def deduplicate( self, hasher: ImageHasher, threshold=0.001, isometric=False ) -> tuple["BenchmarkImageDataset", set[tuple[str, str]]]: """Remove duplicate files from dataset. Args: files: A list of file paths hasher: A hasher to use for finding a duplicate threshold: The threshold required for a match isometric: Whether to compute the rotated versions of the images Returns: A list where each entry is a list of files that are duplicates of each other. We keep only the last entry. """ pairs: set[tuple[str, str]] = set() for _, group in tqdm( self._df.groupby(["category"]), desc="Deduplicating categories." ): pairs = pairs.union( set( deduplicate( files=group["filepath"].tolist(), hashers=[(hasher, threshold)], isometric=isometric, ) ) ) removed = [pair[0] for pair in pairs] return ( BenchmarkImageDataset(self._df[~self._df["filepath"].isin(removed)].copy()), pairs, ) def transform( self, transforms: dict[str, albumentations.BasicTransform], storage_dir: str, errors: str = "raise", ) -> BenchmarkImageTransforms: """Prepare files to be used as part of benchmarking run. Args: transforms: A dictionary of transformations. The only required key is `noop` which determines how the original, untransformed image is saved. For a true copy, simply make the `noop` key `albumentations.NoOp` storage_dir: A directory to store all the images along with their transformed counterparts. errors: How to handle errors reading files. If "raise", exceptions are raised. If "warn", the error is printed as a warning. Returns: transforms: A BenchmarkImageTransforms object """ assert ( "noop" in transforms ), "You must provide a no-op transform such as `lambda img: img`." os.makedirs(storage_dir, exist_ok=True) files = self._df.copy() files["guid"] = [str(uuid.uuid4()) for n in range(len(files))] def apply_transform(files, transform_name): transform = transforms[transform_name] transformed_arr = [] for _, row in tqdm( files.iterrows(), desc=f"Creating files for {transform_name}", total=len(files), ): filepath, guid, category = row[["filepath", "guid", "category"]] try: image = tools.read(filepath) except Exception as exception: message = f"An error occurred reading {filepath}." 
if errors == "raise": raise exception warnings.warn(message, UserWarning) continue try: transformed = transform(image=image) # If albumentations, output is a dict with 'image' key if isinstance(transformed, dict) and "image" in transformed: transformed = transformed["image"] except Exception as e: raise RuntimeError( f"An exception occurred while processing {filepath} " f"with transform {transform_name}." ) from e transformed_path = os.path.join( storage_dir, f"{guid}_{transform_name}.jpg" ) cv2.imwrite( transformed_path, cv2.cvtColor(transformed, cv2.COLOR_RGB2BGR) ) transformed_arr.append( { "guid": guid, "transform_name": transform_name, "input_filepath": filepath, "filepath": transformed_path, "category": category, } ) return pd.DataFrame.from_records(transformed_arr) results = [apply_transform(files, transform_name="noop")] for transform_name in transforms.keys(): if transform_name == "noop": continue results.append(apply_transform(results[0], transform_name=transform_name)) benchmark_transforms = BenchmarkImageTransforms( df=pd.concat(results, axis=0, ignore_index=True) ) benchmark_transforms.save(storage_dir) return benchmark_transforms ================================================ FILE: perception/benchmarking/image_transforms.py ================================================ import cv2 import numpy as np def apply_watermark(watermark, alpha: float = 1.0, size: float = 1.0): """Apply a watermark to the bottom right of images. Based on the work provided at https://www.pyimagesearch.com/2016/04/25/watermarking-images-with-opencv-and-python/ Args: watermark: The watermark to overlay alpha: The strength of the overlay size: The maximum proportion of the image taken by the watermark. """ assert watermark.shape[-1] == 4, "Watermark must have an alpha channel." # Why do we have to do this? It's not clear. But the process doesn't work # without it. B, G, R, A = cv2.split(watermark) B = cv2.bitwise_and(B, B, mask=A) G = cv2.bitwise_and(G, G, mask=A) R = cv2.bitwise_and(R, R, mask=A) watermark = cv2.merge([B, G, R, A]) def transform(image): # Add alpha channel h, w = image.shape[:2] wh, ww = watermark.shape[:2] scale = size * min(h / wh, w / ww) image = np.dstack([image, np.ones((h, w), dtype="uint8") * 255]) # Construct an overlay that is the same size as the input. overlay = np.zeros((h, w, 4), dtype="uint8") scaled = cv2.resize(watermark, (int(scale * ww), int(scale * wh))) sh, sw = scaled.shape[:2] overlay[max(h - sh, 0) :, max(w - sw, 0) : w] = scaled # Blend the two images together using transparent overlays output = image.copy() cv2.addWeighted(overlay, alpha, output, 1.0, 0, output) return cv2.cvtColor(output, cv2.COLOR_RGBA2RGB) return transform ================================================ FILE: perception/benchmarking/video.py ================================================ import concurrent.futures import os import typing import uuid import pandas as pd import tqdm from ..hashers import VideoHasher, tools from ..tools import flatten from .common import BenchmarkDataset, BenchmarkHashes, BenchmarkTransforms def _process_row(row, hashers, framerates): error = None try: assert not pd.isnull(row["filepath"]), "No filepath provided." 
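# Compute hashes for all hashers together; the precomputed common
# framerates let hashers with compatible rates share decoded frames.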
hashes = tools.compute_synchronized_video_hashes( filepath=row["filepath"], hashers=hashers, framerates=framerates, hash_format="base64", ) except Exception as exception: error = str(exception) hashes = { hasher_name: [None] if hasher.returns_multiple else None for hasher_name, hasher in hashers.items() } base_dict = { "guid": row["guid"], "filepath": row["filepath"], "error": error, "category": row["category"], "transform_name": row["transform_name"], "input_filepath": row["input_filepath"], } hash_dicts = [] for hasher_name, hasher in hashers.items(): base_hash_dict = { "hasher_name": hasher_name, "hasher_dtype": hasher.dtype, "hasher_distance_metric": hasher.distance_metric, "hasher_hash_length": hasher.hash_length, } if not hasher.returns_multiple: hash_dicts.append( { **{ "hash": hashes[hasher_name], }, **base_hash_dict, } ) else: for hash_value in hashes[hasher_name]: hash_dicts.append( { **{ "hash": hash_value, }, **base_hash_dict, } ) return [{**hash_dict, **base_dict} for hash_dict in hash_dicts] class BenchmarkVideoDataset(BenchmarkDataset): def transform( self, transforms: dict[str, typing.Callable], storage_dir: str, errors: str = "raise", ): """Prepare files to be used as part of benchmarking run. Args: transforms: A dictionary of transformations. The only required key is `noop` which determines how the original, untransformed video is saved. Each transform should be a callable function with that accepts an `input_filepath` and `output_filepath` argument and it should return the `output_filepath` (which may have a different extension appended by the transform function). storage_dir: A directory to store all the videos along with their transformed counterparts. errors: How to handle errors reading files. If "raise", exceptions are raised. If "warn", the error is printed as a warning. Returns: transforms: A BenchmarkVideoTransforms object """ assert "noop" in transforms, "You must provide a no-op transform." os.makedirs(storage_dir, exist_ok=True) files = self._df.copy() files["guid"] = [str(uuid.uuid4()) for n in range(len(files))] def apply_transform_to_file(input_filepath, guid, transform_name, category): if input_filepath is None: # This can happen if the noop transform did not yield # a file. We don't want to drop the records so we # keep them. 
return { "guid": guid, "error": "No source file provided", "transform_name": transform_name, "input_filepath": input_filepath, "filepath": None, "category": category, } try: output_filepath = transforms[transform_name]( input_filepath, output_filepath=os.path.join( storage_dir, f"{guid}_{transform_name}" ), ) error = None except Exception as e: output_filepath = None error = str(e) return { "guid": guid, "error": error, "transform_name": transform_name, "input_filepath": input_filepath, "filepath": output_filepath, "category": category, } def apply_transform_to_files(files, transform_name): return pd.DataFrame.from_records( [ apply_transform_to_file( input_filepath=row["filepath"], guid=row["guid"], transform_name=transform_name, category=row["category"], ) for _, row in tqdm.tqdm( files.iterrows(), desc=f"Creating files for {transform_name}", total=len(files), ) ] ) results = [apply_transform_to_files(files, transform_name="noop")] for transform_name in transforms.keys(): if transform_name == "noop": continue results.append( apply_transform_to_files(results[0], transform_name=transform_name) ) benchmark_transforms = BenchmarkVideoTransforms( df=pd.concat(results, axis=0, ignore_index=True) ) benchmark_transforms.save(storage_dir) return benchmark_transforms class BenchmarkVideoTransforms(BenchmarkTransforms): expected_columns = [ "filepath", "category", "transform_name", "input_filepath", "guid", "error", ] def compute_hashes( self, hashers: dict[str, VideoHasher], max_workers: int = 5 ) -> BenchmarkHashes: """Compute hashes for a series of files given some set of hashers. Args: hashers: A dictionary of hashers. max_workers: Maximum number of workers for parallel hash computation. Returns: hashes: A BenchmarkHashes object. """ id_rates = { hasher_name: hasher.frames_per_second for hasher_name, hasher in hashers.items() if hasher.frames_per_second is not None } if id_rates: framerates = tools.get_common_framerates( { hasher_name: hasher.frames_per_second for hasher_name, hasher in hashers.items() if hasher.frames_per_second is not None } ) else: framerates = {} with concurrent.futures.ProcessPoolExecutor( max_workers=max_workers ) as executor: futures = [ executor.submit( _process_row, row=row, framerates=framerates, hashers=hashers ) for index, row in self._df.iterrows() ] return BenchmarkHashes( pd.DataFrame.from_records( flatten( [ future.result() for future in tqdm.tqdm( concurrent.futures.as_completed(futures), desc="Computing hashes.", total=len(self._df), ) ] ) ) ) ================================================ FILE: perception/benchmarking/video_transforms.py ================================================ import os import cv2 import ffmpeg from ..hashers.tools import read_video def probe(filepath): """Get the output of ffprobe.""" return ffmpeg.probe(filepath) def sanitize_output_filepath(input_filepath, output_filepath, output_ext=None): """Get a suitable output filepath with an extension based on an input filepath. Args: input_filepath: The filepath for the source file. output_filepath: The filepath for the output file. 
output_ext: A new extension to add (e.g., '.gif') """ _, input_ext = os.path.splitext(input_filepath) if not output_filepath.lower().endswith(output_ext or input_ext): output_filepath += output_ext or input_ext return output_filepath def get_simple_transform( width: str | int = -1, height: str | int = -1, pad: str | None = None, codec: str | None = None, clip_pct: tuple[float, float] | None = None, clip_s: tuple[float, float] | None = None, sar=None, fps=None, output_ext=None, ): """Resize to a specific size and re-encode. Args: width: The target width (-1 to maintain aspect ratio) height: The target height (-1 to maintain aspect ratio) pad: An ffmpeg pad argument provided as a string. codec: The codec for encoding the video. fps: The new frame rate for the video. clip_pct: The video start and end in percentages of video duration. clip_s: The video start and end in seconds (used over clip_pct if both are provided). sar: Whether to make all videos have a common sample aspect ratio (i.e., for all square pixels, set this to '1/1'). output_ext: The extension to use when re-encoding (used to select video format). It should include the leading '.'. """ def transform(input_filepath, output_filepath): output_filepath = sanitize_output_filepath( input_filepath, output_filepath, output_ext ) data = None if codec is None: data = data or probe(input_filepath) output_codec = [s for s in data["streams"] if s["codec_type"] == "video"][ 0 ]["codec_name"] else: output_codec = codec format_kwargs = {"codec:v": output_codec} if clip_pct is not None or clip_s is not None: pct_start, pct_end, pos_start, pos_end = None, None, None, None if clip_pct is not None: pct_start, pct_end = clip_pct if clip_s is not None: pos_start, pos_end = clip_s if pct_start is not None: assert 0 <= pct_start <= 1, "Start position must be between 0 and 1." if pct_end is not None: assert 0 <= pct_end <= 1, "End position must be between 0 and 1." if pct_start is not None and pct_end is not None: assert pct_start < pct_end, "End must be greater than start." if (pct_start is not None and pos_start is None) or ( pct_end is not None and pos_end is None ): # We only want to get the duration for the video if we need # it. data = data or probe(input_filepath) duration = float(data["streams"][0]["duration"]) if pct_start is not None or pos_start is not None: format_kwargs["ss"] = pos_start or pct_start * duration # type: ignore if pct_end is not None or pos_end is not None: format_kwargs["t"] = pos_end or pct_end * duration # type: ignore stream = ffmpeg.input(input_filepath) if not (width == -1 and height == -1): stream = stream.filter("scale", width, height) if pad is not None: stream = stream.filter("pad", *pad.split(":")) if fps is not None: stream = stream.filter("fps", fps) if sar is not None: stream = stream.filter("setsar", sar) stream = stream.output(output_filepath, **format_kwargs).overwrite_output() ffmpeg.run(stream) if os.path.isfile(output_filepath): return output_filepath return None return transform def get_slideshow_transform( frame_input_rate, frame_output_rate, max_frames=None, offset=0 ): """Get a slideshow transform to create slideshows from videos. Args: frame_input_rate: The rate at which frames will be sampled from the source video (e.g., a rate of 1 means we collect one frame per second of the input video). frame_output_rate: The rate at which the sampled frames are played in the slideshow (e.g., a rate of 0.5 means each frame will appear for 2 seconds). max_frames: The maximum number of frames to write. 
offset: The number of seconds to wait before beginning the slide show. """ def transform(input_filepath, output_filepath): output_filepath = sanitize_output_filepath( input_filepath, output_filepath, output_ext=".avi" ) writer = None frame_count = 0 try: for frame, _, timestamp in read_video( filepath=input_filepath, frames_per_second=frame_input_rate ): if timestamp < offset: continue if writer is None: writer = cv2.VideoWriter( filename=output_filepath, fourcc=cv2.VideoWriter_fourcc(*"MJPG"), # type: ignore[attr-defined] fps=frame_output_rate, frameSize=tuple(frame.shape[:2][::-1]), isColor=True, ) writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)) frame_count += 1 if max_frames is not None and frame_count >= max_frames: break finally: if writer is not None: writer.release() if os.path.isfile(output_filepath): return output_filepath return None return transform def get_black_frame_padding_transform(duration_s=0, duration_pct=0): """Get a transform that adds black frames at the start and end of a video. Args: duration_s: The duration of the black frames in seconds. duration_pct: The duration of the black frames as a percentage of video duration. If both duration_s and duration_pct are provided, the maximum value is used. """ def transform(input_filepath, output_filepath): output_filepath = sanitize_output_filepath(input_filepath, output_filepath) stream = next( stream for stream in probe(input_filepath)["streams"] if stream["codec_type"] == "video" ) assert stream["sample_aspect_ratio"] == "1:1", "SAR is not 1:1." width = stream["width"] height = stream["height"] duration = max(duration_s, duration_pct * float(stream["duration"])) ffmpeg.input(input_filepath).output( output_filepath, vf=( "color=c=black:s={width}x{height}:d={duration} [pre] ; " "color=c=black:s={width}x{height}:d={duration} [post] ; " "[pre] [in] [post] concat=n=3" ).format(width=width, height=height, duration=duration), fps_mode="vfr", ).overwrite_output().run() if os.path.isfile(output_filepath): return output_filepath return None return transform ================================================ FILE: perception/extensions.pyx ================================================ # cython: language_level=3 # cython: language=c++ import math import sys import cython import numpy as np from cython.parallel import parallel, prange cimport numpy as np from libc.stdlib cimport abort, free, malloc from libcpp cimport bool as cppbool from libcpp.vector cimport vector cdef extern from "limits.h": int INT_MAX ctypedef np.uint8_t uint8 @cython.boundscheck(False) @cython.wraparound(False) def compute_euclidean_pairwise_duplicates(int[:, :] X, float threshold, counts: np.uint32_t[:] = None, compute_overlap=False): """Find the pairwise overlap within an array of vectors, where there may be multiple vectors for the same file. This function is faster than using scipy.spatial.distance because it computes distances in parallel, avoids computing full distances when they're not necessary, skips computing distances for pairs of hashes that are for the same file, and skips computing distances for vectors if both have already been matched. Args: X: The vectors with shape (N, D). Vectors for the same file need to be supplied sequentially so that we can use the counts argument to determine which vectors are for the same file. counts: For each file, the number of sequential vectors in X. If not provided, each vector is assumed to be for a different file (i.e., this is equivalent to `counts = np.ones(N)`). 
threshold: The maximum distance between two vectors to allow for a match. compute_overlap: If True, the values returned will be divided by the number of hashes in each file. If False, the raw duplicate counts will be returned. Returns: duplicates: An array of shape (M!/(2*((M-2)!)), 2) indicating the fraction of vectors for each file found in another file. The indexing matches that of scipy.spatial.pdist. M is the number of files. So if M = 4, the array will represent comparisons of the file indexes as follows: [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]. So (assuming compute_overlap=True), a possible return would be [(1.0, 1.0), (0, 0), (0, 0), (0.66, 1.0), (0, 0), (0.5, 0.25)] which means that: - There was 100% overlap between file 0 and file 1 - 66% of file 1 was in file 2 and 100% of file 2 was in file 1 - 50% of file 2 was in file 3 and 25% of file 3 was in file 2 """ if counts is None: counts = np.ones(X.shape[0], dtype=np.uint32) cdef Py_ssize_t n = X.shape[0] cdef Py_ssize_t m = counts.shape[0] cdef Py_ssize_t d = X.shape[1] n_pairs_python = int(math.factorial(m)/(2*math.factorial(m-2))) assert n_pairs_python < sys.maxsize, 'Too many files were provided for deduplication.' cdef Py_ssize_t n_pairs = n_pairs_python cdef Py_ssize_t max_counts = np.max(counts) cdef int compute_overlap_int = 0 if compute_overlap: compute_overlap_int = 1 # i_1 is the index of file1, i_2 is the index of file2, i_d is the # index of the vector dimension we're on, i_i is used to compute # the starting index in the flattened vector in the different threads. # i_1_subhash is the index of the hash on file1, i_2_subhash is # the index of the hash on file2. cdef Py_ssize_t i_1, i_2, i_d, i_i, i_1_sub, i_2_sub, i_1_offset duplicate_arr = np.zeros((n_pairs, 2), dtype=np.double) cdef double[:, :] duplicate = duplicate_arr offsets_arr = np.zeros(m, dtype=np.int32) cdef np.int32_t[:] offsets = offsets_arr for i_1 in range(m): for i_i in range(i_1): offsets[i_1] += counts[i_i] # local_buf will contain distance, flattened array offset, index_offset_1, index_offset_2 cdef size_t local_buf_size = 4 cdef float threshold2 = threshold ** 2 with nogil, parallel(): local_buf = malloc(sizeof(np.uint64_t) * local_buf_size) # An array of flags indicating whether a vector in file 1 was # matched. matched_1 = malloc(sizeof(int) * max_counts) # An array of flags indicating whether a vector in file 2 was # matched. matched_2 = malloc(sizeof(int) * max_counts) if local_buf is NULL or matched_1 is NULL or matched_2 is NULL: abort() # Iterate over all of the files. for i_1 in prange(m-1): local_buf[1] = 0 local_buf[2] = offsets[i_1] # Compute the index of the output vector # where we will count the number of duplicates. for i_i in range(i_1): local_buf[1] += m - i_i - 1 # Iterate over all the other files to compare. for i_2 in range(i_1 + 1, m): local_buf[3] = offsets[i_2] # Initialize all match flags to zero for # both file 1 and file 2. for i_1_sub in range(counts[i_1]): matched_1[i_1_sub] = 0 for i_2_sub in range(counts[i_2]): matched_2[i_2_sub] = 0 # Iterate over all the hashes in file1 for i_1_sub in range(counts[i_1]): # Iterate over all the hashes in file2 for i_2_sub in range(counts[i_2]): local_buf[0] = 0 if matched_1[i_1_sub] == 1 and matched_2[i_2_sub] == 1: # Both the vectors in this pair have already been matched, so # there is nothing to gain from this comparison.
continue for i_d in range(d): local_buf[0] += (X[local_buf[2] + i_1_sub, i_d] - X[local_buf[3] + i_2_sub, i_d]) ** 2 if local_buf[0] > threshold2: # If we're already beyond the distance threshold, # we don't need to continue computing squared # distances. break if local_buf[0] < threshold2: # A match was found. Set flags for both vectors # to 1. matched_1[i_1_sub] = 1 matched_2[i_2_sub] = 1 # Add up the number of matches for file 1. for i_1_sub in range(counts[i_1]): duplicate[local_buf[1], 0] += matched_1[i_1_sub] # Add up the number of matches for file 2. for i_2_sub in range(counts[i_2]): duplicate[local_buf[1], 1] += matched_2[i_2_sub] # Divide by the total number of vectors for each file. if compute_overlap_int: duplicate[local_buf[1], 0] /= counts[i_1] duplicate[local_buf[1], 1] /= counts[i_2] # Advance to the next pair index. local_buf[1] += 1 free(local_buf) free(matched_1) free(matched_2) return duplicate_arr @cython.boundscheck(False) @cython.wraparound(False) def compute_euclidean_pairwise_duplicates_simple(int[:, :] X, float threshold, np.uint32_t[:] counts = None, float minimum_overlap = 0): """Find the pairwise overlap within an array of vectors, where there may be multiple vectors for the same file. This function is similar to compute_euclidean_pairwise_duplicates but uses much less memory. Args: X: The vectors with shape (N, D). Vectors for the same file need to be supplied sequentially so that we can use the counts argument to determine which vectors are for the same file. threshold: The maximum distance between two vectors to allow for a match. counts: For each of the M files, the number of sequential vectors in X. If not provided, each vector is assumed to be for a different file (i.e., this is equivalent to `counts = np.ones(N)` which also implies M == N). Otherwise, assumed to have length M. The counts should add up to N. minimum_overlap: The minimum overlap between two groups of hashes to call it a match. Returns: pairs: Pairs of indexes that met the matching criteria. """ if counts is None: counts_arr = np.ones(X.shape[0], dtype=np.uint32) counts = counts_arr cdef Py_ssize_t n = X.shape[0] cdef Py_ssize_t m = counts.shape[0] cdef Py_ssize_t d = X.shape[1] n_pairs_python = int(math.factorial(m)/(2*math.factorial(m-2))) assert n_pairs_python < sys.maxsize, 'Too many files were provided for deduplication.' cdef Py_ssize_t n_pairs = n_pairs_python cdef Py_ssize_t max_counts = np.max(counts) # i_1 is the index of file1, i_2 is the index of file2, i_d is the # index of the vector dimension we're on, i_i is used to compute # the starting index in the flattened vector in the different threads. # i_1_subhash is the index of the hash on file1, i_2_subhash is # the index of the hash on file2. cdef Py_ssize_t i_1, i_2, i_d, i_i, i_1_sub, i_2_sub cdef vector[cppbool] duplicate duplicate.resize(n_pairs) offsets_arr = np.zeros(m, dtype=np.uint64) cdef np.uint64_t[:] offsets = offsets_arr cdef np.int32_t expected_n = 0 for i_1 in range(m): for i_i in range(i_1): offsets[i_1] += counts[i_i] expected_n += counts[i_1] assert expected_n == n, "Provided value for counts is inconsistent with X." # local_buf will contain: # distance, flattened array offset, # index_offset_1, index_offset_2, # early-termination flag cdef size_t local_buf_size = 5 cdef float threshold2 = threshold ** 2 with nogil, parallel(): local_buf = malloc(sizeof(np.uint64_t) * local_buf_size) # An array of flags indicating whether a vector in file 1 was # matched.
matched_1 = malloc(sizeof(int) * max_counts) # An array of flags indicating whether a vector in file 2 was # matched. matched_2 = malloc(sizeof(int) * max_counts) # Pair overlap and minimum required overlap overlap = malloc(sizeof(float) * 4) if local_buf is NULL or matched_1 is NULL or matched_2 is NULL or overlap is NULL: abort() # Iterate over all of the files. for i_1 in prange(m-1): local_buf[1] = 0 local_buf[2] = offsets[i_1] # Compute the index of the output vector # where we will count the number of duplicates. for i_i in range(i_1): local_buf[1] += m - i_i - 1 # Iterate over all the other files to compare. for i_2 in range(i_1 + 1, m): # Set the current and minimum overlaps overlap[0] = 0 overlap[1] = 0 overlap[2] = minimum_overlap * counts[i_1] overlap[3] = minimum_overlap * counts[i_2] local_buf[3] = offsets[i_2] # Set early termination flag. local_buf[4] = 0 # Initialize all match flags to zero for # both file 1 and file 2. for i_1_sub in range(counts[i_1]): matched_1[i_1_sub] = 0 for i_2_sub in range(counts[i_2]): matched_2[i_2_sub] = 0 # Iterate over all the hashes in file1 for i_1_sub in range(counts[i_1]): # Stop early if there's no way to get enough # matches from i1 to i2 if overlap[0] + counts[i_1] - i_1_sub < overlap[2]: break # Stop early if we've already reached the minimum overlap if overlap[0] >= overlap[2] and overlap[1] >= overlap[3] and overlap[0] > 0 and overlap[1] > 0: break # Iterate over all the hashes in file2 for i_2_sub in range(counts[i_2]): local_buf[0] = 0 if matched_1[i_1_sub] == 1 and matched_2[i_2_sub] == 1: # Both the vectors in this pair have already been matched, so # there is nothing to gain from this comparison. continue for i_d in range(d): local_buf[0] += (X[local_buf[2] + i_1_sub, i_d] - X[local_buf[3] + i_2_sub, i_d]) ** 2 if local_buf[0] > threshold2: # If we're already beyond the distance threshold, # we don't need to continue computing squared # distances. break if local_buf[0] < threshold2: # A match was found. Set flags for both vectors # to 1 and increment the overlap. if matched_1[i_1_sub] != 1: overlap[0] += 1 if matched_2[i_2_sub] != 1: overlap[1] += 1 matched_1[i_1_sub] = 1 matched_2[i_2_sub] = 1 if overlap[0] >= overlap[2] and overlap[1] >= overlap[3] and overlap[0] > 0 and overlap[1] > 0: duplicate[local_buf[1]] = 1 local_buf[1] += 1 free(matched_1) free(matched_2) free(overlap) free(local_buf) cdef int n_duplicates = 0 cdef Py_ssize_t i_offset = 0 for i_offset in range(n_pairs): if duplicate[i_offset] > 0: n_duplicates += 1 pairs_arr = np.zeros((n_duplicates, 2), dtype=np.int32) cdef np.int32_t[:, :] pairs = pairs_arr i_offset = 0 cdef Py_ssize_t pair_offset = 0 for i_1 in range(m-1): # Compute the index of the output vector # where we will count the number of duplicates. 
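# Here the flat pair index (i_offset) simply advances in the same order
# used above; emit an (i_1, i_2) row for every entry flagged as a duplicate.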
for i_2 in range(i_1 + 1, m): if duplicate[i_offset] > 0: pairs[pair_offset][0] = i_1 pairs[pair_offset][1] = i_2 pair_offset += 1 i_offset += 1 return pairs_arr ================================================ FILE: perception/hashers/__init__.py ================================================ from .hasher import ImageHasher, VideoHasher from .image.average import AverageHash from .image.dhash import DHash from .image.opencv import BlockMean, ColorMoment, MarrHildreth from .image.phash import PHash, PHashF, PHashU8 from .image.wavelet import WaveletHash from .video.framewise import FramewiseHasher from .video.tmk import TMKL1, TMKL2 __all__ = [ "ImageHasher", "VideoHasher", "AverageHash", "PHash", "WaveletHash", "MarrHildreth", "BlockMean", "ColorMoment", "DHash", "FramewiseHasher", "TMKL1", "TMKL2", "PHashU8", "PHashF", ] try: from .image.pdq import PDQHash as PDQHash, PDQHashF as PDQHashF except ImportError: pass else: __all__.extend(["PDQHash", "PDQHashF"]) ================================================ FILE: perception/hashers/hasher.py ================================================ import concurrent.futures import typing import warnings from abc import ABC, abstractmethod from logging import warning import numpy as np import scipy.spatial import tqdm from perception.hashers import tools class Hasher(ABC): """All hashers implement a common set of methods from the Hasher base class. """ #: The metric to use when computing distance between two hashes. All hashers #: must supply this parameter. distance_metric: str #: The numpy type to use when converting from string to array form. #: All hashers must supply this parameter. dtype: str #: Indicates the length of the hash vector hash_length: int #: Whether or not this hash returns multiple values returns_multiple: bool = False #: Indicates whether the hashes can be computed in parallel allow_parallel: bool = True def string_to_vector(self, hash_string: str, hash_format: str = "base64"): """Convert hash string to vector. Args: hash_string: The input hash string hash_format: One of 'base64' or 'hex' """ return tools.string_to_vector( hash_string, dtype=self.dtype, hash_length=self.hash_length, hash_format=hash_format, ) def vector_to_string( self, vector: np.ndarray, hash_format: str = "base64" ) -> str | None: """Convert vector to hash string. Args: vector: Input vector hash_format: One of 'base64' or 'hex' """ return tools.vector_to_string(vector, dtype=self.dtype, hash_format=hash_format) def compute_distance( self, hash1: np.ndarray | str, hash2: np.ndarray | str, hash_format="base64", ): """Compute the distance between two hashes. Args: hash1: The first hash or vector hash2: The second hash or vector hash_format: If either or both of the hashes are hash strings, what format the string is encoded in. 
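Example (a minimal sketch; assumes two images on disk):

    from perception.hashers import PHash

    hasher = PHash()
    distance = hasher.compute_distance(
        hasher.compute("a.jpg"), hasher.compute("b.jpg")
    )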
""" hash1 = ( self.string_to_vector(hash1, hash_format=hash_format) if isinstance(hash1, str) else hash1 ) # makes mypy happy hash2 = ( self.string_to_vector(hash2, hash_format=hash_format) if isinstance(hash2, str) else hash2 ) if self.distance_metric == "sqeuclidean": return scipy.spatial.distance.sqeuclidean( hash1.astype("float32"), hash2.astype("float32") ) if self.distance_metric == "euclidean": return scipy.spatial.distance.euclidean( hash1.astype("float32"), hash2.astype("float32") ) if self.distance_metric == "hamming": return scipy.spatial.distance.hamming(hash1, hash2) if self.distance_metric == "cosine": return scipy.spatial.distance.cosine( hash1.astype("float32"), hash2.astype("float32") ) if self.distance_metric == "custom": return self._compute_distance(hash1, hash2) raise NotImplementedError( f"Distance metric: {self.distance_metric} not supported." ) def _compute_distance(self, vector1, vector2): raise ValueError("Called a custom distance function but it is not implemented.") @typing.no_type_check def compute_parallel( self, filepaths: list[str], progress: tqdm.tqdm | None = None, progress_desc: str | None = None, max_workers: int = 5, isometric: bool = False, ): """Compute hashes in a parallelized fashion. Args: filepaths: A list of paths to images or videos (depending on the hasher). progress: A tqdm-like wrapper for reporting progress. If None, progress is not reported. progress_desc: The title of the progress bar. max_workers: The maximum number of workers isometric: Whether to compute all eight isometric transforms for each image. """ if not self.allow_parallel and max_workers != 1: warnings.warn( message="This hash cannot be used in parallel. Setting max_workers to 1.", category=UserWarning, ) max_workers = 1 assert all( isinstance(p, str) for p in filepaths ), "All images should be provided as paths." if isinstance(self, VideoHasher) and isometric: raise ValueError("Computing isometric hashes for videos is not supported.") # We can use a with statement to ensure threads are cleaned up promptly records = [] if isinstance(self, VideoHasher): executor_class = concurrent.futures.ProcessPoolExecutor else: executor_class = concurrent.futures.ThreadPoolExecutor with executor_class(max_workers=max_workers) as executor: # Start the load operations and mark each future with its filepath compute: typing.Callable = ( self.compute_isometric if isometric else self.compute ) future_to_path: dict = { executor.submit(compute, path): path for path in filepaths } generator = concurrent.futures.as_completed(future_to_path) if progress is not None: generator = progress( generator, total=len(filepaths), desc=progress_desc ) for future in generator: path = future_to_path[future] try: hash_value = future.result() except Exception as exc: records.append({"filepath": path, "hash": None, "error": str(exc)}) else: records.append( {"filepath": path, "hash": hash_value, "error": None} ) return records class ImageHasher(Hasher): @abstractmethod def _compute(self, image: np.ndarray) -> np.ndarray: """Compute hash from an image. Args: image: A numpy array representing an image as of shape (H, W, 3) where channels are ordered as RGB or a filepath to an image. """ def compute_isometric_from_hash(self, hash_string_or_vector, hash_format="base64"): """For supported hashes, obtain the hashes for the dihedral transformations of the original image. 
They are provided in the following order: - Vertical flip - Horizontal flip - 180 degree rotation - 90 degree rotation - 90 degree rotation and vertical flip - 90 degree rotation and horizontal flip - 270 degree rotation Args: hash_string_or_vector: The hash string or vector hash_format: One of 'base64' or 'hex' """ if not hasattr(self, "_compute_isometric_from_hash"): raise NotImplementedError("This hasher does not support hash rotation.") rotations = self._compute_isometric_from_hash( # type: ignore hash_string_or_vector if isinstance(hash_string_or_vector, np.ndarray) else self.string_to_vector(hash_string_or_vector, hash_format=hash_format) ) return { transform_name: self.vector_to_string(vector, hash_format=hash_format) for transform_name, vector in rotations.items() } def compute_isometric(self, image: tools.ImageInputType): image = tools.to_image_array(image) if hasattr(self, "_compute_isometric"): hashes = self._compute_isometric(image) # type: ignore elif hasattr(self, "_compute_isometric_from_hash"): hashes = self._compute_isometric_from_hash( # type: ignore self._compute(image) ) else: transforms = tools.get_isometric_transforms(image) for name, transform in transforms.items(): transforms[name] = self._compute(transform) hashes = transforms return { transform_name: self.vector_to_string(vector) for transform_name, vector in hashes.items() } def compute( self, image: tools.ImageInputType, hash_format="base64" ) -> np.ndarray | str | None | list[str | None]: """Compute a hash from an image. Args: image: An image represented as a filepath, a PIL image object, or as an np.ndarray object. If it is an np.ndarray object, it must be in RGB color order (note the OpenCV default is BGR). hash_format: One of 'base64', 'hex', or 'vector' """ vector = self._compute(tools.to_image_array(image)) if hash_format == "vector": # Take care of this separately because we took out `vector` # as valid return type to vector_to_string(). # The .tolist() might seem unnecessary for the # ndarray `vector` but downstream expects a list and it # stays consistent with original, so keeping for now. # return (vector.tolist() if self.returns_multiple # else vector) return vector # should iterate the same as vector.tolist() if self.returns_multiple: return [self.vector_to_string(v, hash_format=hash_format) for v in vector] return self.vector_to_string(vector, hash_format=hash_format) def compute_with_quality( self, image: tools.ImageInputType, hash_format="base64" ) -> tuple[ (np.ndarray | str | None | list[str | None]), int, ]: """Compute hash and hash quality from image. Args: image: An image represented as a filepath, a PIL image object, or as an np.ndarray object. If it is an np.ndarray object, it must be in RGB color order (note the OpenCV default is BGR).
hash_format: One of 'base64', 'hex', or 'vector' Returns: A tuple of (hash, quality) """ vector, quality = self._compute_with_quality(tools.to_image_array(image)) if hash_format == "vector": return vector, quality if self.returns_multiple: return ( [self.vector_to_string(v, hash_format=hash_format) for v in vector], quality, ) return (self.vector_to_string(vector, hash_format=hash_format), quality) def _compute_with_quality(self, image: np.ndarray) -> tuple[np.ndarray, int]: return self._compute(image), tools.compute_quality(image) class VideoHasher(Hasher): #: The frame rate at which videos are read frames_per_second: float = 1 @abstractmethod def process_frame( self, frame: np.ndarray, frame_index: int | None, frame_timestamp: float | None, state: dict | None = None, ) -> dict: """Called for each frame in the video. For all but the first frame, a state is provided recording the state from the previous frame. Args: frame: The current frame as an RGB ndarray frame_index: The current frame index frame_timestamp: The current frame timestamp state: The state from the last call to process_frame """ @abstractmethod def hash_from_final_state(self, state: dict) -> np.ndarray: """Called after all frames have been processed. Returns the final feature vector. Args: state: The state dictionary at the end of processing. """ def compute( self, filepath, errors="raise", hash_format="base64", scenes=None, **kwargs, ): """Compute a hash for a video at a given filepath. All other arguments are passed to perception.hashers.tools.read_video. Args: filepath: Path to video file errors: One of "raise", "ignore", or "warn". Passed to perception.hashers.tools.read_video. hash_format: One of "vector", "base64", or "hex" max_duration: The maximum length of the video to hash. max_size: The maximum size of frames to queue scenes: An array used to pass scene info back to wrapper functions """ frame_timestamp, state = None, None # Iterate through the video, aggregating scene info in the state # dict for frame, frame_index, frame_timestamp in tools.read_video( filepath=filepath, frames_per_second=self.frames_per_second, errors=errors, **kwargs, ): state = self.process_frame( frame=frame, frame_index=frame_index, frame_timestamp=frame_timestamp, state=state, ) if state is None: if errors == "raise": raise ValueError( f"Video processing failed for {filepath}, State is None." ) if errors == "warn": warning(f"Video processing failed for {filepath}, State is None.") return None # Persist the final timestamp in the state to allow us to pass along # duration state["end"] = frame_timestamp vectors = self.hash_from_final_state(state=state) if scenes is not None: scenes += state.get("scenes", []) if hash_format == "vector": # Take care of this separately because we took out `vector` # as valid return type to vector_to_string(). # The .tolist() might seem unnecessary for the # ndarray `vector` but downstream expects a list and it # stays consistent with original, so keeping for now.
# return (vector.tolist() if self.returns_multiple # else vector) return vectors # should iterate the same as vector.tolist() if self.returns_multiple: return [self.vector_to_string(v, hash_format=hash_format) for v in vectors] return self.vector_to_string(vectors, hash_format=hash_format) ================================================ FILE: perception/hashers/image/__init__.py ================================================ from .average import AverageHash from .dhash import DHash from .opencv import BlockMean, ColorMoment, MarrHildreth from .phash import PHash, PHashF, PHashU8 from .wavelet import WaveletHash __all__ = [ "AverageHash", "PHash", "WaveletHash", "MarrHildreth", "BlockMean", "ColorMoment", "DHash", "PHashF", "PHashU8", ] ================================================ FILE: perception/hashers/image/average.py ================================================ import cv2 from .. import tools from ..hasher import ImageHasher class AverageHash(ImageHasher): """Computes a simple hash comparing the intensity of each pixel in a resized version of the image to the mean. Implementation based on that of `ImageHash `_.""" distance_metric = "hamming" dtype = "bool" def __init__(self, hash_size=8): assert hash_size >= 2, "Hash size must be greater than or equal to 2." self.hash_size = hash_size self.hash_length = hash_size * hash_size def _compute(self, image): image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) image = cv2.resize( image, dsize=(self.hash_size, self.hash_size), interpolation=cv2.INTER_AREA ) diff = image > image.mean() return diff.flatten() def _compute_isometric_from_hash(self, vector): return { transform_name: diff.flatten() for transform_name, diff in tools.get_isometric_transforms( vector.reshape(self.hash_size, self.hash_size, 1), require_color=False ).items() } ================================================ FILE: perception/hashers/image/dhash.py ================================================ import cv2 from ..hasher import ImageHasher class DHash(ImageHasher): """A hash based on the differences between adjacent pixels. Implementation based on that of `ImageHash `_. """ dtype = "bool" distance_metric = "hamming" def __init__(self, hash_size=8): assert hash_size > 1, "Hash size must be greater than 1." self.hash_size = hash_size self.hash_length = hash_size * hash_size def _compute(self, image): image = cv2.resize( image, dsize=(self.hash_size + 1, self.hash_size), interpolation=cv2.INTER_AREA, ) image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) previous = image[:, :-1] current = image[:, 1:] difference = previous > current return difference.flatten() ================================================ FILE: perception/hashers/image/opencv.py ================================================ import cv2 import numpy as np from ..hasher import ImageHasher class OpenCVHasher(ImageHasher): allow_parallel = False def __init__(self): if not hasattr(cv2, "img_hash"): raise RuntimeError( "You do not appear to have opencv-contrib installed. It is required for pure OpenCV hashers." ) class MarrHildreth(OpenCVHasher): """A wrapper around OpenCV's Marr-Hildreth hash. See `paper `_ for details.""" dtype = "bool" distance_metric = "hamming" hash_length = 576 def __init__(self): super().__init__() self.hasher = cv2.img_hash.MarrHildrethHash.create() # type: ignore[attr-defined] def _compute(self, image): return np.unpackbits(self.hasher.compute(image)[0]) class ColorMoment(OpenCVHasher): """A wrapper around OpenCV's Color Moments hash. 
See `paper `_ for details.""" dtype = "float32" distance_metric = "euclidean" hash_length = 42 def __init__(self): super().__init__() self.hasher = cv2.img_hash.ColorMomentHash.create() # type: ignore[attr-defined] def _compute(self, image): return 10000 * self.hasher.compute(image)[0] class BlockMean(OpenCVHasher): """A wrapper around OpenCV's Block Mean hash. See `paper `_ for details.""" dtype = "bool" distance_metric = "hamming" hash_length = 968 def __init__(self): super().__init__() self.hasher = cv2.img_hash.BlockMeanHash.create(1) # type: ignore[attr-defined] def _compute(self, image): # https://stackoverflow.com/questions/54762896/why-cv2-norm-hamming-gives-different-value-than-actual-hamming-distance return np.unpackbits(self.hasher.compute(image)[0]) ================================================ FILE: perception/hashers/image/pdq.py ================================================ import pdqhash from ..hasher import ImageHasher class PDQHash(ImageHasher): """The Facebook PDQ hash. Based on the original implementation located at the `official repository `_. """ distance_metric = "hamming" dtype = "bool" hash_length = 256 def _compute(self, image): return pdqhash.compute(image)[0] > 0 def _compute_with_quality(self, image): hash_vector, quality = pdqhash.compute(image) return hash_vector > 0, quality def _compute_isometric(self, image): hash_vectors, _ = pdqhash.compute_dihedral(image) names = ["r0", "r90", "r180", "r270", "fv", "fh", "r90fv", "r90fh"] return dict(zip(names, hash_vectors)) class PDQHashF(PDQHash): dtype = "float32" distance_metric = "euclidean" hash_length = 256 def _compute(self, image): return pdqhash.compute_float(image)[0] ================================================ FILE: perception/hashers/image/phash.py ================================================ import cv2 import numpy as np import scipy.fftpack from .. import tools from ..hasher import ImageHasher class PHash(ImageHasher): """Also known as the DCT hash, a hash based on discrete cosine transforms of images. See `complete paper `_ for details. Implementation based on that of `ImageHash `_. Args: hash_size: The number of DCT elements to retain (the hash length will be hash_size * hash_size). highfreq_factor: The multiple of the hash size to resize the input image to before computing the DCT. exclude_first_term: Whether to exclude the first term of the DCT freq_shift: The number of DCT low frequency elements to skip. """ distance_metric = "hamming" dtype = "bool" def __init__( self, hash_size=8, highfreq_factor=4, exclude_first_term=False, freq_shift=0 ): assert hash_size >= 2, "Hash size must be greater than or equal to 2" assert ( freq_shift <= highfreq_factor * hash_size - hash_size ), "Frequency shift is too large for this hash size / highfreq_factor combination."
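# Illustrative note (not part of the constructor): with the defaults
# hash_size=8 and highfreq_factor=4, the image is resized to 32x32 before
# the DCT, so the assertion above allows freq_shift values up to
# 32 - 8 = 24:
#     >>> PHash(hash_size=8, highfreq_factor=4, freq_shift=24)  # largest valid shift
#     >>> PHash(freq_shift=25)  # doctest: +SKIP (raises AssertionError)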
self.hash_size = hash_size self.highfreq_factor = highfreq_factor self.exclude_first_term = exclude_first_term self.hash_length = hash_size * hash_size self.freq_shift = freq_shift if exclude_first_term: self.hash_length -= 1 def _compute_dct(self, image): img_size = self.hash_size * self.highfreq_factor image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) image = cv2.resize( image, dsize=(img_size, img_size), interpolation=cv2.INTER_AREA ) dct = scipy.fftpack.dct(scipy.fftpack.dct(image, axis=0), axis=1) return dct[ self.freq_shift : self.hash_size + self.freq_shift, self.freq_shift : self.hash_size + self.freq_shift, ] def _dct_to_hash(self, dct): dct = dct.flatten() if self.exclude_first_term: dct = dct[1:] return dct > np.median(dct) def _compute(self, image): dct = self._compute_dct(image) return self._dct_to_hash(dct) def _compute_isometric(self, image): return { transform_name: self._dct_to_hash(dct) for transform_name, dct in tools.get_isometric_dct_transforms( self._compute_dct(image) ).items() } class PHashF(PHash): """A real-valued version of PHash. It returns the raw 32-bit floats in the DCT. For a more compact approach, see PHashU8.""" dtype = "float32" distance_metric = "euclidean" def _dct_to_hash(self, dct): dct = dct.flatten() if self.exclude_first_term: dct = dct[1:] if (dct == 0).all(): return None return dct class PHashU8(PHash): """A real-valued version of PHash. It uses minimum / maximum scaling to convert DCT values to unsigned 8-bit integers (more compact than the 32-bit floats used by PHashF at the cost of precision).""" dtype = "uint8" distance_metric = "euclidean" def _dct_to_hash(self, dct): dct = dct.flatten() if self.exclude_first_term: dct = dct[1:] if (dct == 0).all(): return None min_value = dct.min() max_value = dct.max() dct = np.uint8(255 * (dct - min_value) / (max_value - min_value)) return dct ================================================ FILE: perception/hashers/image/wavelet.py ================================================ import cv2 import numpy as np import pywt from ..hasher import ImageHasher class WaveletHash(ImageHasher): """Similar to PHash but using wavelets instead of DCT. Implementation based on that of `ImageHash `_. """ distance_metric = "hamming" dtype = "bool" def __init__(self, hash_size=8, image_scale=None, mode="haar"): assert hash_size & (hash_size - 1) == 0, "Hash size must be a power of 2." if image_scale is not None: assert ( image_scale & (image_scale - 1) == 0 ), "Image scale must be a power of 2." assert ( image_scale >= hash_size ), "Image scale must be greater than or equal to the hash size."
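# Illustrative note: the assertions above use the standard bit trick
# n & (n - 1) == 0 to require powers of two, so WaveletHash(hash_size=8,
# image_scale=64) is valid while WaveletHash(hash_size=12) raises an
# AssertionError.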
self.hash_size = hash_size self.image_scale = image_scale self.mode = mode self.hash_length = hash_size * hash_size def _compute(self, image): if self.image_scale is None: image_scale = max(2 ** int(np.log2(min(image.shape[:2]))), self.hash_size) else: image_scale = self.image_scale image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) image = cv2.resize( image, dsize=(image_scale, image_scale), interpolation=cv2.INTER_AREA ) image = np.float32(image) / 255 ll_max_level = int(np.log2(image_scale)) level = int(np.log2(self.hash_size)) dwt_level = ll_max_level - level if self.mode == "haar": coeffs = pywt.wavedec2(image, "haar", level=ll_max_level) coeffs = list(coeffs) coeffs[0] *= 0 image = pywt.waverec2(coeffs, "haar") coeffs = pywt.wavedec2(image, self.mode, level=dwt_level) dwt_low = coeffs[0] # Subtract median and compute hash med = np.median(dwt_low) diff = dwt_low > med return diff.flatten() ================================================ FILE: perception/hashers/tools.py ================================================ import base64 import fractions import functools import hashlib import io import itertools import json import logging import math import os import queue import shlex import subprocess import tempfile import threading import typing import warnings from collections import Counter from http import client from numbers import Number from urllib import request import cv2 import numpy as np import PIL import PIL.Image import validators LOGGER = logging.getLogger(__name__) ImageInputType = typing.Union[ str, np.ndarray, "PIL.Image.Image", io.BytesIO, tempfile.SpooledTemporaryFile ] SIZES = {"float32": 32, "uint8": 8, "bool": 1} # Map codec names to the CUDA-accelerated version. Obtain # from ffmpeg -codecs after building using CUDA. CUDA_CODECS = { "h264": "h264_cuvid", "hevc": "hevc_cuvid", "mjpeg": "mjpeg_cuvid", "mpeg1video": "mpeg1_cuvid", "mpeg2video": "mpeg2_cuvid", "mpeg4": "mpeg4_cuvid", "vc1": "vc1_cuvid", "vp8": "vp8_cuvid", "vp9": "vp9_cuvid", } FramesWithIndexesAndTimestamps = typing.Generator[ tuple[np.ndarray, int | None, float | None], None, None ] def get_ffprobe(): return os.environ.get("PERCEPTION_FFPROBE_BINARY", "ffprobe") def get_ffmpeg(): return os.environ.get("PERCEPTION_FFMPEG_BINARY", "ffmpeg") def compute_quality(image) -> int: """Compute a quality metric, using the calculation proposed by `Facebook `_ for their PDQ hash algorithm.""" if len(image.shape) == 3: image = cv2.cvtColor(image, code=cv2.COLOR_RGB2GRAY) if image.shape[0] != 64 or image.shape[1] != 64: image = cv2.resize(src=image, dsize=(64, 64)).astype("float32") dx = 100 * np.abs(image[:, 1:] - image[:, :-1]) / 255 dy = 100 * np.abs(image[1:] - image[:-1]) / 255 dx = dx.astype("int").sum() dy = dy.astype("int").sum() return int(np.clip(a=int((dx + dy) / 90), a_min=0, a_max=100)) def compute_md5(filepath) -> str: """Compute the md5 hash for a file at `filepath`. Args: filepath: The path to the file """ with open(filepath, "rb") as f: hash_str = hashlib.md5(f.read()).hexdigest() return hash_str def get_string_length(hash_length: int, dtype: str, hash_format="hex") -> int: """Compute the expected length of a hash string. 
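The string length follows from packing the vector into bytes
(``hash_bytes = ceil(hash_length * bits_per_element / 8)``), with 2
characters per byte for hex and padded 4/3 expansion for base64. For
example, a 64-element boolean hash packs into 8 bytes:

    >>> get_string_length(64, dtype="bool", hash_format="hex")
    16
    >>> get_string_length(64, dtype="bool", hash_format="base64")
    12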
Args: hash_length: The length of the hash vector dtype: The dtype of the vector hash_format: One of 'base64' or 'hex' Returns: The expected string length """ hash_bytes = math.ceil(hash_length * SIZES[dtype] / 8) if hash_format == "base64": return int((4 * hash_bytes / 3) + 3) & ~3 if hash_format == "hex": return 2 * hash_bytes raise NotImplementedError("Unknown hash format: " + hash_format) def vector_to_string(vector: np.ndarray, dtype: str, hash_format: str) -> str | None: """Convert vector to hash. Args: vector: Input vector dtype: The data type of the vector hash_format: One of 'base64' or 'hex' """ # At times, a vector returned by a hasher is None (e.g., for hashes # that depend on the image not being featureless). In those cases, # we simply return None. if vector is None: return None if hash_format == "vector": # return vector.astype(dtype) # old behavior raise DeprecationWarning("`hash_format` `vector` has been removed.") if dtype == "uint8": vector_bytes = vector.astype("uint8") elif dtype == "float32": vector_bytes = vector.astype("float32") elif dtype == "bool": vector_bytes = np.packbits(vector.astype("bool")) else: raise NotImplementedError(f"Cannot convert hash of type {dtype}.") if hash_format == "base64": return base64.b64encode(vector_bytes.tobytes()).decode("utf-8") if hash_format == "hex": return vector_bytes.tobytes().hex() raise NotImplementedError(f"Cannot convert to string format: {hash_format}.") def string_to_vector( hash_string: str, dtype: str, hash_length: int, hash_format: str, verify_length: bool = True, ) -> np.ndarray: """Convert hash back to vector. Args: hash_string: The input hash string dtype: The data type of the hash hash_length: The length of the hash vector hash_format: The input format of the hash (base64 or hex) verify_length: Whether to verify the string length """ assert not verify_length or len(hash_string) == get_string_length( hash_length=hash_length, hash_format=hash_format, dtype=dtype ), "Incorrect string length for this hash format." if hash_format == "base64": vector_bytes = np.frombuffer( base64.b64decode(hash_string), dtype="uint8" if dtype in ["bool", "uint8"] else dtype, ) elif hash_format == "hex": vector_bytes = np.frombuffer( bytearray.fromhex(hash_string), dtype="uint8" if dtype in ["bool", "uint8"] else dtype, ) else: raise NotImplementedError(f"Cannot convert to string format: {hash_format}") if dtype == "uint8": return vector_bytes[:hash_length] if dtype == "float32": return vector_bytes[:hash_length] if dtype == "bool": return np.unpackbits(vector_bytes)[:hash_length].astype("bool") raise NotImplementedError(f"Cannot convert hash of type {dtype}.") def hex_to_b64( hash_string: str, dtype: str, hash_length: int, verify_length: bool = True ): """Convert a hex-encoded hash to base64. Args: hash_string: The input hex hash string dtype: The data type of the hash hash_length: The length of the hash vector verify_length: Whether to verify the string length """ return vector_to_string( string_to_vector( hash_string, hash_length=hash_length, hash_format="hex", dtype=dtype, verify_length=verify_length, ), dtype=dtype, hash_format="base64", ) def b64_to_hex( hash_string: str, dtype: str, hash_length: int, verify_length: bool = True ): """Convert a base64-encoded hash to hex.
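This is the inverse of :code:`hex_to_b64`. For example, for an 8-bit
boolean hash (the single byte 0b10101010):

    >>> b64_to_hex("qg==", dtype="bool", hash_length=8)
    'aa'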
Args: hash_string: The input base64 hash string dtype: The data type of the hash hash_length: The length of the hash vector verify_length: Whether to verify the string length """ return vector_to_string( string_to_vector( hash_string, hash_length=hash_length, hash_format="base64", dtype=dtype, verify_length=verify_length, ), dtype=dtype, hash_format="hex", ) def to_image_array(image: ImageInputType, require_color=True) -> np.ndarray: if isinstance(image, np.ndarray): assert image.flags["C_CONTIGUOUS"], ( "Provided arrays must be contiguous to avoid " "erroneous results when arrays are passed to " "underlying libraries. This can be achieved using " "np.ascontiguousarray(image)" ) assert not require_color or ( len(image.shape) == 3 and image.shape[-1] == 3 ), "Provided images must be RGB images." return image return read(image) def get_common_framerates(id_rates: dict): """Compute an optimal set of framerates for a list of framerates. Optimal here means that reading the video at each of the framerates will allow one to collect all of the frames required with the smallest possible number of frames decoded. For example, consider if we need to read a video at 3 fps, 5 fps, 1 fps and 0.5 fps. We could read the video 4 times (once per framerate). But a more optimal approach is to read the video only twice, once at 3 frames per second and another time at 5 frames per second. For the 1 fps hasher, we simply pass every 3rd frame of the 3 fps pass. For the 0.5 fps hasher, we pass every 6th frame of the 3 fps pass. So if you pass this function {A: 3, B: 5, C: 1, D: 0.5}, you will get back {3: (A, C, D), 5: (B,)}. Args: id_rates: A dictionary with IDs as keys and frame rates as values. Returns: rate_ids: A dictionary with framerates as keys and tuples of IDs as values. """ def partition(collection): """This function is taken from https://stackoverflow.com/questions/19368375/set-partitions-in-python/30134039#30134039 """ if len(collection) == 1: yield [collection] return first = collection[0] for smaller in partition(collection[1:]): # insert `first` in each of the subpartition's subsets for n, subset in enumerate(smaller): yield smaller[:n] + [[first] + subset] + smaller[n + 1 :] # put `first` in its own subset yield [[first]] + smaller framerates = list(id_rates.values()) factor = 2 * 3 * 5 * 7 * 11 * 60 * 60 assert ( min(framerates) >= 1 / factor ), "Framerates must be at least 1 frame per hour." best_frame_count = np.inf best_grouping: list | None = None best_frame_rates: list | None = None # We try every possible grouping of framerates to minimize the number # of frames we decode. There is likely a better way to do this, # but this seems to do the job for now.
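# For example (illustrative): grouping the 3, 1, and 0.5 fps requirements
# together costs lcm(3, 1, 0.5) = 3 fps for that pass, so the grouping
# {3: (A, C, D), 5: (B,)} decodes 3 + 5 = 8 frames per second of video
# versus 3 + 5 + 1 + 0.5 = 9.5 for four separate passes.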
for grouping in partition(list(set(framerates))): current_frame_rates = [ functools.reduce(np.lcm, (np.array(group) * factor).round().astype(int)) / factor for group in grouping ] current_frame_count = sum(current_frame_rates) if current_frame_count < best_frame_count: best_frame_count = current_frame_count best_frame_rates = current_frame_rates best_grouping = grouping assert best_frame_rates is not None assert best_grouping is not None return { framerate: tuple(name for name, rate in id_rates.items() if rate in group) for framerate, group in zip(best_frame_rates, best_grouping) } def get_isometric_transforms(image: ImageInputType, require_color=True) -> dict: image_array = to_image_array(image, require_color=require_color) return { "r0": image_array, "fv": np.ascontiguousarray(image_array[::-1, :]), "fh": np.ascontiguousarray(image_array[:, ::-1]), "r180": np.ascontiguousarray(image_array[::-1, ::-1]), "r90": np.ascontiguousarray(image_array.transpose(1, 0, 2)[::-1, :, :]), "r90fv": np.ascontiguousarray(image_array.transpose(1, 0, 2)), "r90fh": np.ascontiguousarray(image_array.transpose(1, 0, 2)[::-1, ::-1]), "r270": np.ascontiguousarray(image_array.transpose(1, 0, 2)[:, ::-1]), } def get_isometric_dct_transforms(dct: np.ndarray): T1 = np.empty_like(dct) T1[::2] = 1 T1[1::2] = -1 T2 = np.empty_like(dct) T2[::2, ::2] = 1 T2[1::2, 1::2] = 1 T2[::2, 1::2] = -1 T2[1::2, ::2] = -1 return { "r0": dct, "fv": dct * T1, "fh": dct * T1.T, "r180": dct * T2, "r90": dct.T * T1, "r90fv": dct.T, "r90fh": dct.T * T2, "r270": dct.T * T1.T, } def read(filepath_or_buffer: ImageInputType, timeout=None) -> np.ndarray: """Read a file into an image object Args: filepath_or_buffer: The path to the file or any object with a `read` method (such as `io.BytesIO`) timeout: If filepath_or_buffer is a URL, the timeout to use for making the HTTP request. """ if isinstance(filepath_or_buffer, PIL.Image.Image): return np.array(filepath_or_buffer.convert("RGB")) if isinstance( filepath_or_buffer, (io.BytesIO, client.HTTPResponse, tempfile.SpooledTemporaryFile), ): image = np.asarray(bytearray(filepath_or_buffer.read()), dtype=np.uint8) decoded_image = cv2.imdecode(image, cv2.IMREAD_UNCHANGED) elif isinstance(filepath_or_buffer, str): if validators.url(filepath_or_buffer): with request.urlopen(filepath_or_buffer, timeout=timeout) as response: return read(response) if not os.path.isfile(filepath_or_buffer): raise FileNotFoundError( "Could not find image at path: " + filepath_or_buffer ) decoded_image = cv2.imread(filepath_or_buffer) else: raise RuntimeError( "Unhandled filepath_or_buffer type: " + str(type(filepath_or_buffer)) ) if decoded_image is None: raise ValueError(f"An error occurred reading {filepath_or_buffer}.") # We use cvtColor here instead of just ret[..., ::-1] # in order to ensure that we provide a contiguous # array for later processing. Some hashers use ctypes # to pass the array and non-contiguous arrays can lead # to erroneous results. return cv2.cvtColor(decoded_image, cv2.COLOR_BGR2RGB) def _get_keyframes(filepath): """Get the keyframes for a video. Args: filepath: Path to the target file Returns: A list of frame indexes. 
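Example (illustrative; requires ffprobe and a real video file):

    >>> _get_keyframes("perception/testing/videos/v1.m4v")  # doctest: +SKIP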
""" args = [ get_ffprobe(), "-select_streams", "v", "-i", f"'{filepath}'", "-print_format", "json", "-show_entries", "frame=pict_type,coded_picture_number", ] with subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as p: out, err = p.communicate() if p.returncode != 0: raise ValueError(f"{str(out)}: {str(err)}") data = json.loads(out.decode("utf-8"))["frames"] frames = [f["coded_picture_number"] for f in data if f["pict_type"] == "I"] # ffprobe will return frames repeated and out of order at times. This # last step deduplicates and sorts them. frames = list(set(frames)) frames.sort() return frames def get_video_properties(filepath): cmd = f""" {get_ffprobe()} -select_streams v:0 -i '{filepath}' -print_format json -show_entries stream=width,height,avg_frame_rate,codec_name,start_time """ with subprocess.Popen( shlex.split(cmd), stdout=subprocess.PIPE, stderr=subprocess.PIPE ) as p: out, err = p.communicate() if p.returncode != 0: raise ValueError(f"{str(out)}: {str(err)}") data = json.loads(out.decode("utf-8"))["streams"][0] numerator, denominator = tuple(map(int, data["avg_frame_rate"].split("/")[:2])) avg_frame_rate: fractions.Fraction | None if numerator > 0 and denominator > 0: avg_frame_rate = fractions.Fraction( numerator=numerator, denominator=denominator ) else: avg_frame_rate = None return ( data["width"], data["height"], avg_frame_rate, data["codec_name"], float(data.get("start_time", "0")), ) def read_video_to_generator_ffmpeg( filepath, frames_per_second: str | float | None = None, errors="raise", max_duration: float | None = None, max_size: int | None = None, interp: str | None = None, frame_rounding: str = "up", draw_timestamps=False, use_cuda=False, ) -> FramesWithIndexesAndTimestamps: """This is used by :code:`read_video` when :code:`use_ffmpeg` is True. It differs from :code:`read_video_to_generator` in that it uses FFMPEG instead of OpenCV and, optionally, allows for CUDA acceleration. CUDA acceleration can be faster for larger videos (>1080p) where downsampling is desired. For other videos, CUDA may be slower, but the decoding load will still be taken off the CPU, which may still be advantageous. You can specify which FFMPEG binary to use by setting PERCEPTION_FFMPEG_BINARY. Args: filepath: See read_video frames_per_second: See read_video errors: See read_video max_duration: See read_video max_size: See read_video interp: The interpolation method to use. When not using CUDA, you must choose one of the `interpolation options `_ (default: area). When using CUDA, you must choose from the `interp_algo options `_ (default: super). frame_rounding: The frame rounding method. draw_timestamps: Draw original timestamps onto the frames (for debugging only) use_cuda: Whether to enable CUDA acceleration. Requires a CUDA-accelerated version of ffmpeg. To build FFMPEG with CUDA, do the following in a Docker container based on nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04. The FFMPEG binary will be ffmpeg/ffmpeg. .. code-block:: bash git clone https://git.videolan.org/git/ffmpeg/nv-codec-headers.git cd nv-codec-headers make sudo make install cd .. 
git clone https://git.ffmpeg.org/ffmpeg.git cd ffmpeg sudo apt-get update && sudo apt-get -y install yasm export PATH=$PATH:/usr/local/cuda/bin # Note: Scroll far right to see full configure command: ./configure --enable-cuda-nvcc --enable-cuvid --enable-nvenc --enable-nvdec \ --enable-libnpp --enable-nonfree --extra-cflags=-I/usr/local/cuda/include \ --extra-ldflags=-L/usr/local/cuda/lib64 make -j 10 sudo make install Returns: See :code:`read_video` """ if interp is None: interp = "super" if use_cuda else "area" try: ( raw_width, raw_height, avg_frame_rate, codec_name, start_time, ) = get_video_properties(filepath) start_time_offset = ( 0.0 if avg_frame_rate is None else float(1 / (2 * avg_frame_rate)) ) LOGGER.debug( "raw_width: %s, raw_height: %s, avg_frame_rate: %s, codec_name: %s, start_time: %s", raw_width, raw_height, avg_frame_rate, codec_name, start_time, ) channels = 3 scale = ( min(max_size / raw_width, max_size / raw_height, 1) if max_size is not None else 1 ) width, height = map(lambda d: int(round(scale * d)), [raw_width, raw_height]) # If there is no average frame rate, the offset tends to be unreliable. offset = max(start_time, start_time_offset) if avg_frame_rate is not None else 0 cmd = ( f"{get_ffmpeg()} -hide_banner -an -vsync 0 -loglevel fatal " f"-itsoffset -{offset}" ) filters = [] if draw_timestamps: pattern = "%{pts}-%{frame_num}" filters.append( f"drawtext=fontsize={int(raw_height * 0.1)}:" f"fontcolor=yellow:text={pattern}" ":x=(w-text_w):y=(h-text_h)" ) # Add frame rate filters. if frames_per_second is None: seconds_per_frame = ( float(1 / avg_frame_rate) if avg_frame_rate is not None else None ) elif frames_per_second == "keyframes": seconds_per_frame = None filters.append(r"select=eq(pict_type\,I)") else: assert isinstance( frames_per_second, (float, int) ), f"Invalid framerate: {frames_per_second}" seconds_per_frame = 1 / frames_per_second filters.append( f"fps={frames_per_second}:round={frame_rounding}:start_time={offset}" ) # Add resizing filters. if use_cuda and codec_name in CUDA_CODECS: cuda_codec = CUDA_CODECS[codec_name] cmd += f" -hwaccel cuda -c:v {cuda_codec}" filters.append("hwupload_cuda") if scale != 1: filters.append(f"scale_npp={width}:{height}:interp_algo={interp}") filters.extend( [ "hwdownload", "format=nv12", ] ) elif scale != 1: filters.append(f"scale={width}:{height}:flags={interp}") cmd += f" -i '{filepath}'" if filters: cmd += f" -vf '{','.join(filters)}'" cmd += " -pix_fmt rgb24 -f image2pipe -vcodec rawvideo -" LOGGER.debug("running ffmpeg with: %s", cmd) framebytes = width * height * channels bufsize = framebytes * int(os.environ.get("PERCEPTION_FFMPEG_BUFSIZE", "5")) with subprocess.Popen( shlex.split(cmd), stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=bufsize, ) as p: assert p.stdout is not None, "Could not launch subprocess pipe." timestamp: float | None = 0 frame_index: int | None = 0 while True: batch = p.stdout.read(bufsize) if not batch: break for image in np.frombuffer(batch, dtype="uint8").reshape( ( -1, height, width, channels, ) ): if frames_per_second != "keyframes": yield (image, frame_index, timestamp) if seconds_per_frame is not None: assert timestamp is not None timestamp += seconds_per_frame frame_index = ( math.ceil(avg_frame_rate * timestamp) if avg_frame_rate is not None else None ) else: timestamp = None frame_index = None else: # Obtaining the keyframe indexes with ffprobe is very slow (slower # than reading the video sometimes). We don't *have* to do it # when using ffmpeg, so we don't. 
The OpenCV approach *does* # get the keyframe indexes, but only because they're required # in order to select them. yield (image, None, None) if ( max_duration is not None and timestamp is not None and timestamp > max_duration ): break stdout, stderr = p.communicate() if p.returncode != 0: raise ValueError( f"Error parsing video: {stdout.decode('utf-8')} {stderr.decode('utf-8')}" ) except Exception as e: if errors not in ["warn", "ignore"]: raise e if errors == "warn": warnings.warn( message=f"An error occurred while reading {filepath}. Processing may be truncated." ) def read_video_to_generator( filepath, frames_per_second: str | float | None = None, errors="raise", max_duration: float | None = None, max_size: int | None = None, ) -> FramesWithIndexesAndTimestamps: """This is used by :code:`read_video` when :code:`use_ffmpeg` is False (default). Args: filepath: See :code:`read_video`. frames_per_second: See :code:`read_video`. errors: See :code:`read_video`. max_duration: See :code:`read_video`. max_size: See :code:`read_video`. Returns: See :code:`read_video`. """ if cv2.__version__ < "4.1.1" and filepath.lower().endswith("gif"): message = "Versions of OpenCV < 4.1.1 may read GIF files improperly. Upgrade recommended." if errors == "raise": raise ValueError(message) warnings.warn(message=message) if not os.path.isfile(filepath): raise FileNotFoundError(f"Could not find {filepath}.") if not os.access(filepath, os.R_OK): raise OSError(f"{filepath} is not readable") cap = cv2.VideoCapture(filename=filepath, apiPreference=cv2.CAP_FFMPEG) try: # The purpose of the following block is largely to create a # frame_indexes (iterator or list) that indicates which # frames we should be returning to the user and then # yielding those frames as we come across them. file_frames_per_second = cap.get(cv2.CAP_PROP_FPS) if file_frames_per_second == 0: if errors == "raise": raise ValueError("Video file has framerate of 0fps.") # The known case where this occurs is for GIFs, where # 0 fps is typically inferred as 10 fps. file_frames_per_second = 10 if errors == "warn": warnings.warn( message="Video file has framerate of 0 fps. Guessing framerate of 10fps." ) if frames_per_second is None: frames_per_second = file_frames_per_second seconds_between_desired_frames = ( None if (frames_per_second is not None and isinstance(frames_per_second, str)) else 1 / frames_per_second # type: ignore ) seconds_between_grabbed_frames = 1 / file_frames_per_second grabbed_frame_count = 0 if frames_per_second == "keyframes": frame_indexes: range | list[int] | typing.Iterator[int] = _get_keyframes( filepath ) # The repeat flag is used to handle the case where the # desired sampling rate is higher than the file's frame # rate. In this case, we will need to repeat frames in # order to provide the least-surprising behavior that # we can. repeat = False else: num_frames_per_second = float(frames_per_second) frame_indexes = itertools.count( 0, max(1, file_frames_per_second / num_frames_per_second) ) repeat = file_frames_per_second < num_frames_per_second input_width = cap.get(cv2.CAP_PROP_FRAME_WIDTH) input_height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT) if max_size is not None: scale = min(max_size / max(input_width, input_height), 1) else: scale = 1 target_size: tuple[int, int] | None if scale < 1: target_size = (int(scale * input_width), int(scale * input_height)) else: target_size = None for frame_index in frame_indexes: while grabbed_frame_count < frame_index: # We need to skip this frame. 
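# For example (illustrative): sampling a 30 fps file at 5 fps makes
# frame_indexes yield 0, 6, 12, ..., so five of every six frames are
# grab()-ed (decoded but not retrieved) and only the sixth is read().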
success = cap.grab() if not success: break grabbed_frame_count += 1 success, frame = cap.read() grabbed_frame_count += 1 if not success: # The video is over or an error has occurred. break frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) if target_size is not None: frame = cv2.resize(frame, target_size, interpolation=cv2.INTER_NEAREST) current_timestamp = frame_index / file_frames_per_second yield frame, grabbed_frame_count - 1, current_timestamp if max_duration is not None and current_timestamp > max_duration: break if repeat and isinstance(seconds_between_desired_frames, Number): next_desired_timestamp = ( current_timestamp + seconds_between_desired_frames ) next_timestamp = current_timestamp + seconds_between_grabbed_frames while next_desired_timestamp < next_timestamp: yield (frame, grabbed_frame_count - 1, next_desired_timestamp) next_desired_timestamp += seconds_between_desired_frames except Exception as e: if errors not in ["warn", "ignore"]: raise e if errors == "warn": warnings.warn( message=f"An error occurred while reading {filepath}. Processing may be truncated." ) finally: cap.release() def read_video_into_queue(*args, video_queue, terminate, func, **kwargs): # We're inside a thread now and the queue is being read elsewhere. try: for frame, frame_index, timestamp in func(*args, **kwargs): if not terminate.is_set(): video_queue.put((frame, frame_index, timestamp)) else: break finally: video_queue.put((None, None, None)) def read_video( filepath, frames_per_second: str | float | None = None, max_queue_size=128, use_queue=True, errors="raise", use_ffmpeg=False, **kwargs, ) -> FramesWithIndexesAndTimestamps: """Provides a generator of RGB frames, frame indexes, and timestamps from a video. This function requires you to have installed ffmpeg. All other arguments passed to read_video_to_generator. Args: filepath: Path to the video file frames_per_second: How many frames to provide for each second of video. If None, all frames are provided. If frames_per_second is "keyframes", we use ffmpeg to select I frames from the video. max_queue_size: The maximum number of frames to load in the queue use_queue: Whether to use a queue of frames during processing max_duration: The maximum length of the video to hash. max_size: The maximum size of frames to queue errors: Whether to 'raise', 'warn', or 'ignore' errors use_ffmpeg: Whether to use the FFMPEG CLI to read videos. If True, other kwargs (e.g., :code:`use_cuda`) are passed to :code:`read_video_to_generator_ffmpeg`. Yields: (frame, frame_index, timestamp) tuples """ for ffmpeg_kwarg in ["interp", "frame_rounding", "draw_timestamps", "use_cuda"]: if not use_ffmpeg and ffmpeg_kwarg in kwargs: if kwargs[ffmpeg_kwarg] is not None: # Only log a warning if the value is something other than None. 
warnings.warn( f"{ffmpeg_kwarg} is ignored when use_ffmpeg is False.", UserWarning ) del kwargs[ffmpeg_kwarg] generator: typing.Callable[..., FramesWithIndexesAndTimestamps] if use_ffmpeg: generator = read_video_to_generator_ffmpeg else: generator = read_video_to_generator frame_index: int | None timestamp: float | None if use_queue: video_queue: queue.Queue[tuple[np.ndarray, int, float]] = queue.Queue( maxsize=max_queue_size ) terminate = threading.Event() thread = threading.Thread( target=read_video_into_queue, kwargs={ "frames_per_second": frames_per_second, "func": generator, "video_queue": video_queue, "filepath": filepath, "errors": errors, "terminate": terminate, **kwargs, }, ) thread.start() try: while True: frame, frame_index, timestamp = video_queue.get() video_queue.task_done() if frame is None: break yield (frame, frame_index, timestamp) finally: # Set the termination flag for the # background thread. terminate.set() try: # Unblock the thread, in the event # that it is waiting. video_queue.get_nowait() # Do it twice for the edge case # where the queue is completely # full and the end sentinel is # blocking. video_queue.get_nowait() except queue.Empty: # It doesn't matter if it's empty. pass # Wait for the background thread to terminate. thread.join() else: for frame, frame_index, timestamp in generator( filepath=filepath, frames_per_second=frames_per_second, errors=errors, **kwargs, ): yield (frame, frame_index, timestamp) def compute_synchronized_video_hashes( filepath: str, hashers: dict, framerates=None, hash_format="base64", use_queue=True ): """Compute the video hashes for a group of hashers with synchronized frame processing wherever possible. Args: filepath: Path to video file. hashers: A dictionary mapping hasher names to video hasher objects hash_format: The format in which to return the hashes use_queue: Whether to use queued video frames """ if framerates is None: framerates = get_common_framerates( { k: h.frames_per_second for k, h in hashers.items() if h.frames_per_second is not None } ) else: assert all( any(hasher_name in hasher_names for hasher_names in framerates.values()) for hasher_name, hasher in hashers.items() if hasher.frames_per_second is not None ), "Provided framerates do not have an entry for all required hashers." 
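# For example (illustrative): framerates={3.0: ("a", "c"), 5.0: ("b",)}
# means the file is read twice (once at 3 fps, once at 5 fps);
# relative_framerate below determines how many frames of a pass each
# hasher actually consumes (e.g., a 1 fps hasher in the 3 fps pass
# processes every 3rd frame).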
results = { hasher_name: { "state": None, "hash": None, "relative_framerate": next( framerate / hasher.frames_per_second for framerate, hasher_names in framerates.items() if hasher_name in hasher_names ), } for hasher_name, hasher in hashers.items() if hasher.frames_per_second is not None } for current_framerate, current_hasher_names in framerates.items(): for frame_index, (frame, grabbed_frame_index, frame_timestamp) in enumerate( read_video( filepath=filepath, frames_per_second=current_framerate, use_queue=use_queue, ) ): for hasher_name in current_hasher_names: config = results[hasher_name] hasher = hashers[hasher_name] assert config["relative_framerate"] is not None if frame_index % config["relative_framerate"] == 0: config["state"] = hasher.process_frame( frame=frame, frame_index=grabbed_frame_index, frame_timestamp=frame_timestamp, state=config["state"], ) for hasher_name in current_hasher_names: config = results[hasher_name] hasher = hashers[hasher_name] current_hash = hasher.hash_from_final_state(state=config["state"]) if hash_format == "vector": config["hash"] = current_hash else: if not hasher.returns_multiple: config["hash"] = hasher.vector_to_string( current_hash, hash_format=hash_format ) else: config["hash"] = [ hasher.vector_to_string(h, hash_format=hash_format) for h in current_hash ] config["state"] = None hashes = {hasher_name: config["hash"] for hasher_name, config in results.items()} for hasher_name, hasher in hashers.items(): if hasher.frames_per_second is None: # This is a custom hasher that we just pass a video path to. hashes[hasher_name] = hasher.compute(filepath) return hashes def unletterbox( image: np.ndarray, only_remove_black: bool = False, min_fraction_meaningful_pixels: float = 0.1, color_threshold: float = 2, min_side_length: int = 50, min_reduction: float = 0.02, ) -> tuple[tuple[int, int], tuple[int, int]] | None: """Return bounds of the non-trivial (content) region of an image, or None. Letterboxing refers to uniform-color borders added around an image (e.g., black bars on a video frame). This function detects such borders by identifying the background color from the image corners and finding the bounding box of pixels that differ from that background. The function returns bounds as ``(x1, x2), (y1, y2)`` suitable for slicing: ``image[y1:y2, x1:x2]``. The bounds are exclusive on the right/bottom (i.e., x2 and y2 point one past the last content pixel). **Algorithm overview:** 1. Sample the four corner pixels and find the most common value as the candidate background color. If all four corners differ, the full-image bounds are returned (no consistent letterbox detected). 2. Build a binary content mask where each pixel whose grayscale intensity differs from the background by more than ``color_threshold`` is marked as content. 3. Project the mask onto rows and columns and find the first/last row and column where the fraction of content pixels exceeds ``min_fraction_meaningful_pixels``. 4. Validate that the resulting crop is meaningfully smaller than the original (controlled by ``min_reduction``) and that both sides exceed ``min_side_length``. Returns the full-image bounds ``((0, w), (0, h))`` (kept for backwards compatibility) when: - No two corners share the same color (no clear background). - Every pixel differs from the detected background (no border). - The crop would not reduce either dimension by at least the ``min_reduction`` fraction (the border is negligibly thin). Returns ``None`` when: - No row or column meets the content-pixel threshold. - Either cropped dimension would be smaller than ``min_side_length``. Args: image: Input image as an ``np.ndarray``.
May be grayscale (H×W) or RGB (H×W×3); RGB images are converted to grayscale internally for background detection. only_remove_black: If ``True``, treat black (intensity 0) as the background regardless of corner colors. If ``False`` (default), infer the background color from the most common corner value. min_fraction_meaningful_pixels: The minimum fraction (0–1) of pixels in a row or column that must differ from the background for that row/column to be considered part of the content region. Defaults to 0.1 (10%). color_threshold: The minimum absolute difference in grayscale intensity between a pixel and the background color for that pixel to be classified as content. Defaults to 2. min_side_length: The minimum width or height (in pixels) of the cropped region. If the crop would be smaller, ``None`` is returned. Defaults to 50. min_reduction: The minimum fraction (0–1) of the original width or height that must be removed for the crop to be worthwhile. If the crop removes less than this from both dimensions, ``None`` is returned. Defaults to 0.02 (2%). Returns: A tuple ``((x1, x2), (y1, y2))`` giving the left, right, top, and bottom bounds of the content region (right/bottom exclusive), or ``None`` if no meaningful letterbox was detected. """ if not 0 <= min_fraction_meaningful_pixels <= 1: raise ValueError("min_fraction_meaningful_pixels must be between 0 and 1") if not 0 <= min_reduction <= 1: raise ValueError("min_reduction must be between 0 and 1") image = image.astype(np.uint8) shape = image.shape h, w = shape[0:2] if len(shape) == 3: image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) # Determine background color and build binary content mask. if only_remove_black: bg_gray = 0 else: # Sample the four corner pixels. If all four are unique there is no # consistent background color, so we bail out early (O(1) rejection). corners = ( image[0, 0], image[0, w - 1], image[h - 1, 0], image[h - 1, w - 1], ) if len(set(corners)) == 4: LOGGER.debug("No common corner color detected, skipping content detection.") return ( (0, w), (0, h), ) # Return full image bounds instead of None to maintain backwards compatibility # Use the most common corner value as the background intensity. counts = Counter(corners) bg_gray = counts.most_common(1)[0][0] # Mark pixels whose grayscale intensity differs from the background # by more than color_threshold as content (True). content_mask = np.abs(image.astype(np.int16) - bg_gray) > color_threshold # If every pixel is classified as content, there is no border to remove. if content_mask.all(): LOGGER.debug("All pixels differ from background; no letterbox detected.") return ( (0, w), (0, h), ) # Return full image bounds instead of None to maintain backwards compatibility # Find the content bounding box by projecting the mask onto rows and # columns. cv2.reduce is used instead of np.sum for performance. 
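# For example (illustrative): a 100x200 frame with 20-pixel background
# bars at the top and bottom produces near-zero row sums for rows 0-19
# and 80-99, so the bounds computed below become ((0, 200), (20, 80)).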
mask_u8 = content_mask.astype(np.uint8) row_content = cv2.reduce(mask_u8, 1, cv2.REDUCE_SUM, dtype=cv2.CV_32S).ravel() col_content = cv2.reduce(mask_u8, 0, cv2.REDUCE_SUM, dtype=cv2.CV_32S).ravel() # Thresholds for minimum content per row/column row_threshold = min_fraction_meaningful_pixels * w col_threshold = min_fraction_meaningful_pixels * h # Find first/last rows and columns with sufficient content content_rows = np.where(row_content > row_threshold)[0] content_cols = np.where(col_content > col_threshold)[0] if len(content_rows) == 0 or len(content_cols) == 0: LOGGER.debug("No rows or columns with sufficient content detected.") return None top = int(content_rows[0]) bottom = int(content_rows[-1]) + 1 left = int(content_cols[0]) right = int(content_cols[-1]) + 1 height = bottom - top width = right - left # Reject if the crop does not remove at least min_reduction from # at least one dimension (i.e., the border is negligibly thin). if width >= w * (1 - min_reduction) and height >= h * (1 - min_reduction): LOGGER.debug( "Crop would not reduce either dimension by %.0f%%; skipping.", min_reduction * 100, ) return ( (0, w), (0, h), ) # Return full image bounds instead of None to maintain backwards compatibility # Reject if the remaining content region is too small to be useful. if width < min_side_length or height < min_side_length: LOGGER.debug( "Cropped region (%dx%d) smaller than min_side_length=%d; skipping.", width, height, min_side_length, ) return None return ((left, right), (top, bottom)) def unletterbox_crop( image: np.ndarray, min_fraction_meaningful_pixels: float = 0.1, color_threshold: float = 2, min_side_length: int = 50, min_reduction: float = 0.02, ) -> np.ndarray | None: """Detect and crop the letterboxed regions from an image. Args: image: The image from which to remove letterboxing. min_fraction_meaningful_pixels: The minimum fraction (0–1) of pixels in a row or column that must differ from the background for that row/column to be counted as content. Defaults to 0.1 (10%). color_threshold: The minimum absolute difference in grayscale intensity between a pixel and the background color for that pixel to be classified as content. Defaults to 2. min_side_length: The minimum width or height (in pixels) of the cropped region. If the crop would be smaller, ``None`` is returned. Defaults to 50. min_reduction: The minimum fraction (0–1) of the original width or height that must be removed for the crop to be worthwhile. If the crop removes less than this from both dimensions, the original image is returned. Defaults to 0.02 (2%). Returns: The cropped image or None if the image is mostly blank space. """ if not isinstance(image, np.ndarray): raise TypeError(f"Expected np.ndarray, got {type(image).__name__}") bounds = unletterbox( image, min_fraction_meaningful_pixels=min_fraction_meaningful_pixels, color_threshold=color_threshold, min_side_length=min_side_length, min_reduction=min_reduction, ) if bounds is None: return None (x1, x2), (y1, y2) = bounds cropped = np.ascontiguousarray(image[y1:y2, x1:x2]) assert cropped.data.contiguous return cropped ================================================ FILE: perception/hashers/video/__init__.py ================================================ from .framewise import FramewiseHasher from .tmk import TMKL1, TMKL2 __all__ = ["FramewiseHasher", "TMKL1", "TMKL2"] ================================================ FILE: perception/hashers/video/framewise.py ================================================ import numpy as np from ..
import tools from ..hasher import ImageHasher, VideoHasher class FramewiseHasher(VideoHasher): """A hasher that simply returns frame-wise hashes at some regular interval with some minimum inter-frame distance threshold.""" returns_multiple = True def __init__( self, frame_hasher: ImageHasher, interframe_threshold: float, frames_per_second: int = 15, quality_threshold: float | None = None, ): self.hash_length = frame_hasher.hash_length self.frames_per_second = frames_per_second self.frame_hasher = frame_hasher self.distance_metric = frame_hasher.distance_metric if self.distance_metric == "hamming" and interframe_threshold > 1: raise ValueError( "Hamming distance is always between 0 and 1 but " f"`interframe_threshold` was set to {interframe_threshold}." ) self.dtype = frame_hasher.dtype self.interframe_threshold = interframe_threshold self.quality_threshold = quality_threshold def process_frame(self, frame, frame_index, frame_timestamp, state=None): if self.quality_threshold is None: current = self.frame_hasher.compute(frame, hash_format="vector") else: current, quality = self.frame_hasher.compute_with_quality( frame, hash_format="vector" ) if quality < self.quality_threshold: return state or {"previous": None, "hashes": []} assert isinstance(current, np.ndarray) # help type checking below if state is None or state["previous"] is None: # We keep a separate reference to the previous hash instead of using # the last entry in the hashes list because `compute_batches` may # clear the hashes list but we still want to be able to compare # the final entry. state = { "previous": current, "hashes": [current], } else: if ( self.frame_hasher.compute_distance(current, state["previous"]) > self.interframe_threshold ): state["hashes"].append(current) return state def compute_batches( self, filepath: str, batch_size: int, errors="raise", hash_format="base64" ): """Compute hashes for a video in batches. Args: filepath: Path to video file batch_size: The batch size to use for returning hashes errors: One of "raise", "ignore", or "warn". Passed to perception.hashers.tools.read_video. 
hash_format: The format in which to return hashes """ def format_batch(hashes): return [ ( self.vector_to_string(vector, hash_format=hash_format) if hash_format != "vector" else vector ) for vector in hashes ] state = None for frame, frame_index, frame_timestamp in tools.read_video( filepath=filepath, frames_per_second=self.frames_per_second, errors=errors ): state = self.process_frame( frame=frame, frame_index=frame_index, frame_timestamp=frame_timestamp, state=state, ) if state is not None and len(state["hashes"]) > batch_size: yield format_batch(state["hashes"]) state["hashes"] = [] if state is not None and state["hashes"]: yield format_batch(state["hashes"]) def hash_from_final_state(self, state): if state is None: return [] return state["hashes"] ================================================ FILE: perception/hashers/video/tmk.py ================================================ import platform import warnings import numpy as np import scipy.special from ..hasher import ImageHasher, VideoHasher from ..image.phash import PHashF class TMKL2(VideoHasher): """The TMK L2 video hashing algorithm.""" dtype = "float32" distance_metric = "custom" def __init__( self, frame_hasher: ImageHasher | None = None, frames_per_second: int = 15, normalization: str = "matrix", ): if platform.machine() == "arm64": warnings.warn("TMK is not supported on ARM64") T = np.array([2731, 4391, 9767, 14653]).astype("float32") m = 32 if frame_hasher is None: frame_hasher = PHashF(hash_size=16, exclude_first_term=True, freq_shift=1) self.frames_per_second = frames_per_second assert frame_hasher.dtype != "bool", "This hasher requires real valued hashes." # Beta parameter of the modified Bessel function of the first kind self.beta = 32 # Number of Fourier coefficients per period self.m = m # The periods with shape (T, ) self.T = T # (T) # The Fourier coefficients with shape (T, m, 1) self.ms = 2 * np.pi * np.arange(0, self.m).astype("float32") # (m) self.ms_normed = (self.ms[np.newaxis,] / self.T.reshape(-1, 1)).reshape( len(self.T), self.m, 1 ) # (T, m, 1) # The weights with shape (T, 2m, 1) a = np.array( [ (scipy.special.iv(0, self.beta) - np.exp(-self.beta)) / (2 * np.sinh(self.beta)) ] + [ scipy.special.iv(i, self.beta) / np.sinh(self.beta) for i in range(1, self.m) ] ) a = a.reshape(1, -1).repeat(repeats=len(self.T), axis=0) # type: ignore a = np.sqrt(a) self.a = a[..., np.newaxis] # The frame-wise hasher self.frame_hasher = frame_hasher self.hash_length = self.T.shape[0] * 2 * self.m * self.frame_hasher.hash_length self.normalization = normalization def process_frame(self, frame, frame_index, frame_timestamp, state=None): if state is None: state = {"features": [], "timestamps": []} state["features"].append(self.frame_hasher.compute(frame, hash_format="vector")) state["timestamps"].append(frame_timestamp) return state def hash_from_final_state(self, state): timestamps = np.array(state["timestamps"]) features = np.array(state["features"]).reshape( ( 1, 1, timestamps.shape[0], self.frame_hasher.hash_length, ) ) x = self.ms_normed * timestamps yw1 = np.sin(x) * self.a yw2 = np.cos(x) * self.a yw = np.concatenate([yw1, yw2], axis=1)[..., np.newaxis] # (T, 2m, t, 1) y = (yw * features).sum(axis=2) # (T, 2m, d) return y.flatten() def _compute_distance(self, vector1, vector2): shape = (len(self.T), 2 * self.m, self.frame_hasher.hash_length) return 1 - self._score_pair( fv_a=vector1.reshape(shape), fv_b=vector2.reshape(shape), offsets=None, normalization=self.normalization, ) def _score_pair(self, fv_a, fv_b, 
offsets=None, normalization="matrix"): eps = 1e-8 if offsets is None: offsets = np.array([0]) assert normalization in [ "feat", "freq", "feat_freq", "matrix", ], "Invalid normalization" if "feat" in normalization: a_xp = np.concatenate([self.a, self.a], axis=1) # (T, 2m, 1) fv_a_0 = fv_a / a_xp fv_b_0 = fv_b / a_xp norm_a = np.sqrt(np.sum(fv_a_0**2, axis=2, keepdims=True) + eps) + eps norm_b = np.sqrt(np.sum(fv_b_0**2, axis=2, keepdims=True) + eps) + eps fv_a = fv_a / norm_a fv_b = fv_b / norm_b if "freq" in normalization: norm_a, norm_b = ( np.sqrt((fv**2).sum(axis=1, keepdims=True) / self.m + eps) + eps for fv in [fv_a, fv_b] ) fv_a = fv_a / norm_a fv_b = fv_b / norm_b if normalization == "matrix": norm_a, norm_b = ( np.sqrt(np.sum(fv**2, axis=(1, 2)) + eps)[..., np.newaxis] + eps for fv in [fv_a, fv_b] ) # (T, 1) fv_a_sin, fv_b_sin = (fv[:, : self.m] for fv in [fv_a, fv_b]) # (T, m, d) fv_a_cos, fv_b_cos = (fv[:, self.m :] for fv in [fv_a, fv_b]) # (T, m, d) ms = self.ms.reshape(-1, 1) # (m, 1) dot_sin_sin, dot_sin_cos, dot_cos_cos, dot_cos_sin = ( np.sum(p, axis=2, keepdims=True) for p in [ fv_a_sin * fv_b_sin, fv_a_sin * fv_b_cos, fv_a_cos * fv_b_cos, fv_a_cos * fv_b_sin, ] ) # (T, m, 1) delta = ( ms.reshape(1, -1, 1) * offsets.reshape(1, -1) / self.T.reshape((-1, 1, 1)) ) cos_delta = np.cos(delta) # (T, m, delta) sin_delta = np.sin(delta) # (T, m, delta) dots = ( dot_sin_sin * cos_delta + dot_sin_cos * sin_delta + dot_cos_cos * cos_delta - dot_cos_sin * sin_delta ).sum(axis=1) if normalization == "matrix": dots = dots / (norm_a * norm_b) if normalization == "freq": dots = dots / self.m # (T, m, delta) elif normalization in ["feat", "feat_freq"]: dots = dots / 512 return dots.mean(axis=0) class TMKL1(VideoHasher): """The TMK L1 video hashing algorithm.""" def __init__( self, frame_hasher: ImageHasher | None = None, frames_per_second: int = 15, dtype="float32", distance_metric="cosine", norm=2, quality_threshold=None, ): if frame_hasher is None: frame_hasher = PHashF(hash_size=16, exclude_first_term=True, freq_shift=1) self.hash_length = frame_hasher.hash_length self.frames_per_second = frames_per_second assert frame_hasher.dtype != "bool", "This hasher requires real valued hashes." 
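# Illustrative usage (assumes ffmpeg is available; the test video ships
# with the repository):
#     >>> from perception.hashers.video import TMKL1
#     >>> hasher = TMKL1(frames_per_second=15)
#     >>> h = hasher.compute("perception/testing/videos/v1.m4v")  # doctest: +SKIP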
self.frame_hasher = frame_hasher self.norm = norm self.dtype = dtype or self.frame_hasher.dtype self.distance_metric = distance_metric or self.frame_hasher.distance_metric self.quality_threshold = quality_threshold def process_frame(self, frame, frame_index, frame_timestamp, state=None): if state is None: state = {"sum": np.zeros(self.frame_hasher.hash_length), "frame_count": 0} if self.quality_threshold is None: hash_vector = self.frame_hasher.compute(frame, hash_format="vector") else: hash_vector, quality = self.frame_hasher.compute_with_quality( frame, hash_format="vector" ) if quality < self.quality_threshold: return state assert isinstance(hash_vector, np.ndarray) # help type checking below if hash_vector is not None: state["sum"] += hash_vector.astype(np.float32) state["frame_count"] += 1 return state def hash_from_final_state(self, state): if state["frame_count"] == 0: return None average_vector = state["sum"] / state["frame_count"] if self.norm is not None: return ( average_vector / np.linalg.norm(average_vector, ord=self.norm) ).astype(self.frame_hasher.dtype) return average_vector.astype(self.frame_hasher.dtype) ================================================ FILE: perception/local_descriptor_deduplication.py ================================================ import concurrent.futures import logging import typing from abc import ABC from warnings import warn import cv2 import numpy as np import pandas as pd import tqdm import typing_extensions import perception.approximate_deduplication as ad import perception.hashers.tools as pht LOGGER = logging.getLogger(__name__) DEFAULT_MAX_FEATURES = 256 DEFAULT_OVERLAP = 0.01 DEFAULT_MATCH_PCT = 0.4 DEFAULT_INTERSECTION = 0.6 DEFAULT_INLIERS = 5 DEFAULT_MAX_SIZE = 256 DEFAULT_MIN_FEATURES = 10 DEFAULT_THRESHOLD = 100 DEFAULT_SIFT_THRESHOLD = 100 DEFAULT_AKAZE_THRESHOLD = 250 DEFAULT_RATIO = 0.5 DEFAULT_SIFT_RATIO = 0.5 DEFAULT_AKAZE_RATIO = 0.85 class Descriptors(typing_extensions.TypedDict): keypoints: np.ndarray descriptors: np.ndarray descriptor_count: int dimensions: tuple[int, int] filepath: str hasher: str class MatchStats(typing_extensions.TypedDict): match: float | None min_kpBM: int | None MAB: str | None intersection: float | None inliers: float | None bounds_intersection: float | None final_matched_a_pts: list[np.ndarray] | None final_matched_b_pts: list[np.ndarray] | None class LocalHasher(ABC): grayscale = False name: str hasher: typing.Any ratio: float threshold: int def __init__( self, max_features: int = DEFAULT_MAX_FEATURES, ratio: float = DEFAULT_SIFT_RATIO, threshold: int = DEFAULT_THRESHOLD, overlap: float = DEFAULT_OVERLAP, validation_match: float = DEFAULT_MATCH_PCT, validation_inliers: int = DEFAULT_INLIERS, validation_intersection: float = DEFAULT_INTERSECTION, ): self.ratio = ratio self.threshold = threshold self.max_features = max_features self.overlap = overlap self.validation_match = validation_match self.validation_inliers = validation_inliers self.validation_intersection = validation_intersection def compute(self, image) -> tuple[np.ndarray, np.ndarray]: return self.hasher.detectAndCompute(image, None) def validate_match( self, descriptor1: Descriptors, descriptor2: Descriptors, minimum_match: float = DEFAULT_MATCH_PCT, minimum_intersection: float = DEFAULT_INTERSECTION, minimum_inliers: int = DEFAULT_INLIERS, ) -> tuple[bool, MatchStats]: """Validate the match between two sets of keypoints and descriptors. The validation algorithm is as follows: #. 
Compute the mutual set of matches between the two sets of descriptors and filter them using Lowe's ratio test. #. If the fraction of passing matches (in either direction) is less than "minimum_match", the match fails. This ensures we don't have trivial matches. #. Compute the intersection area of the matched keypoints versus the raw keypoints. If the area overlap is less than minimum_intersection, the match fails. This ensures we don't match on small subsegments of an image, such as logos. #. Compute a transformation matrix using cv2.findHomography. If we cannot obtain a transformation matrix, the match fails. If the sum total of inliers for the transformation matrix is less than minimum_inliers, the match fails. #. Finally, use the transformation matrix on a set of points representing the bounding box of each image. If less than minimum_intersection of the bounding box fits within the bounds of the transformed version, the match fails. This is a second pass safety check for logos and other subsegments of images. Args: descriptor1: The first set of descriptors (keypoints, descriptors, and image dimensions). descriptor2: The second set of descriptors. minimum_match: The minimum fraction of matches passing the ratio test. minimum_intersection: The minimum overlapping area between the keypoints in the filtered set of matches and the original keypoints. minimum_inliers: The minimum number of inliers for the transformation matrix. The Lowe's ratio itself is taken from the hasher's ``ratio`` attribute. Returns: A tuple of (passed, stats), where passed is True if the match passes and stats records intermediate match statistics. """ swap = descriptor1["keypoints"].shape[0] < descriptor2["keypoints"].shape[0] descriptorA = descriptor2 if swap else descriptor1 descriptorB = descriptor1 if swap else descriptor2 stats: MatchStats = { "match": None, "min_kpBM": None, "MAB": None, "intersection": None, "inliers": None, "bounds_intersection": None, "final_matched_a_pts": None, "final_matched_b_pts": None, } indexA = ad.build_index(descriptorA["descriptors"], approximate=False) indexB = ad.build_index(descriptorB["descriptors"], approximate=False) if ( descriptorA["descriptors"] is None or indexA is None or descriptorB["descriptors"] is None or indexB is None ): return False, stats distances_A2B, indexes_A2B = indexB.search( descriptorA["descriptors"].astype("float32"), 2 ) distances_B2A, _ = indexA.search( descriptorB["descriptors"].astype("float32"), 2 ) good_A2B, good_B2A = map( lambda distances: (distances[:, 0] < distances[:, 1] * self.ratio), [distances_A2B, distances_B2A], ) match = min( good_A2B.sum() / good_A2B.shape[0], good_B2A.sum() / good_B2A.shape[0] ) stats["match"] = match if match < minimum_match: # We didn't get enough good matches. return False, stats kpAM = descriptorA["keypoints"][good_A2B] kpBM = descriptorB["keypoints"][indexes_A2B[good_A2B, 0]] # findHomography requires 4 points from each to work. stats["min_kpBM"] = min(len(kpAM), len(kpBM)) if len(kpAM) < 4 or len(kpBM) < 4: return False, stats intersection = compute_minimum_intersection( kp1=descriptorA["keypoints"], kp2=descriptorB["keypoints"], filter_arr1=good_A2B, filter_arr2=indexes_A2B[good_A2B, 0], ) stats["intersection"] = intersection if intersection < minimum_intersection: return False, stats MAB, mask = cv2.findHomography( kpAM.reshape(-1, 1, 2), kpBM.reshape(-1, 1, 2), cv2.RANSAC, 1.0, maxIters=50_000, confidence=0.9999, ) stats["MAB"] = "good" if MAB is None: # We didn't get a transformation matrix.
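# (A None homography from cv2.findHomography typically means RANSAC could
# not find a consistent model at the 1.0-pixel reprojection threshold.)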
stats["MAB"] = "is-None" return False, stats stats["inliers"] = mask.sum() if mask.sum() < minimum_inliers: # The transformation matrix didn't include enough inliers. return False, stats # Check how much of each original bounding box fits onto # the other image. try: MBA = np.linalg.inv(MAB) except np.linalg.LinAlgError: # We couldn't compute the matrix inverse. stats["MAB"] = "inverse-failed" return False, stats ptsA = np.array([[0, 0], descriptorA["dimensions"]]).astype("float32") ptsB = np.array([[0, 0], descriptorB["dimensions"]]).astype("float32") ptsAt = ( cv2.perspectiveTransform(ptsA.reshape((-1, 1, 2)), MAB) .reshape(-1, 2) .clip(0, descriptorB["dimensions"]) ) ptsBt = ( cv2.perspectiveTransform(ptsB.reshape((-1, 1, 2)), MBA) .reshape(-1, 2) .clip(0, descriptorA["dimensions"]) ) bounds_intersection = min( abs(np.prod(ptsBt[1] - ptsBt[0]) / np.prod(descriptorA["dimensions"])), abs(np.prod(ptsAt[1] - ptsAt[0]) / np.prod(descriptorB["dimensions"])), ) stats["bounds_intersection"] = bounds_intersection # Apply mask index to kpAM, kpBM for list of matcihing points. mask ==1 for keep matched_a_pts = [] matched_b_pts = [] for i in range(mask.shape[0]): if mask[i][0] == 1: matched_a_pts.append(kpAM[i]) matched_b_pts.append(kpBM[i]) # Unswap points before final return. if swap: stats["final_matched_a_pts"] = matched_b_pts stats["final_matched_b_pts"] = matched_a_pts else: stats["final_matched_a_pts"] = matched_a_pts stats["final_matched_b_pts"] = matched_b_pts return (bounds_intersection >= minimum_intersection, stats) class SIFT(LocalHasher): name = "SIFT" def __init__( self, max_features: int = DEFAULT_MAX_FEATURES, ratio: float = DEFAULT_SIFT_RATIO, threshold: int = DEFAULT_SIFT_THRESHOLD, **kwargs, ): super().__init__(max_features, ratio, threshold, **kwargs) self.hasher = cv2.SIFT_create(nfeatures=self.max_features) # type: ignore[attr-defined] class AKAZE(LocalHasher): name = "AKAZE" def __init__( self, max_features: int = DEFAULT_MAX_FEATURES, ratio: float = DEFAULT_AKAZE_RATIO, threshold: int = DEFAULT_AKAZE_THRESHOLD, **kwargs, ): super().__init__(max_features, ratio, threshold, **kwargs) LOGGER.warning("The default AKAZE tuning has issues with some cropped images.") self.hasher = cv2.AKAZE_create() # type: ignore[attr-defined] def load_and_preprocess(filepath, max_size=DEFAULT_MAX_SIZE, grayscale=True): """Read, unletterbox, and resize an image. Args: filepath: The path to the file max_size: The maximum size for a dimension of the image grayscale: Set to false to get RGB """ image = pht.read(filepath) if image is None: LOGGER.warning("Failed to load image %s", filepath) return None res = pht.unletterbox(image) if res is None: return None (x1, x2), (y1, y2) = res image = np.ascontiguousarray(image[y1:y2, x1:x2]) if grayscale: image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) max_dimension = max(image.shape[:2]) if max_dimension > max_size: scale = max_size / max_dimension image = cv2.resize( image, (int(image.shape[1] * scale), int(image.shape[0] * scale)) ) return image def generate_image_descriptors( filepath: str, hasher: LocalHasher | None = None, min_features=DEFAULT_MIN_FEATURES, max_size=DEFAULT_MAX_SIZE, ) -> Descriptors | None: """Generate local descriptors for a file. Args: filepath: Path to image file. max_features: The maximum number of features to extract. min_features: The minimum number of features to extract. max_size: The maximum side length for an image. Returns: If successful, returns a tuple of keypoints, descriptors, and a (width, height) tuple. 
""" if hasher is None: hasher = SIFT( max_features=DEFAULT_MAX_FEATURES, ) try: image = load_and_preprocess( filepath, max_size=max_size, grayscale=hasher.grayscale ) if image is None: return None keypoints, descriptors = hasher.compute(image) except FileNotFoundError: LOGGER.warning("Image file %s not found.", filepath) return None except ValueError as e: LOGGER.error("Processing image file %s failed.", filepath, exc_info=e) return None if descriptors is None: return None if descriptors.shape[0] < min_features: return None keypoints = np.array([kp.pt for kp in keypoints], dtype=np.float32) return { "keypoints": keypoints, "descriptors": descriptors, "descriptor_count": descriptors.shape[0], "filepath": filepath, "dimensions": (image.shape[1], image.shape[0]), "hasher": hasher.name, } def build_reference_df( filepaths: typing.Iterable[str], hasher: LocalHasher | None = None, min_features=DEFAULT_MIN_FEATURES, max_size=DEFAULT_MAX_SIZE, show_progress=False, ) -> pd.DataFrame: """Build descriptors for a list of files. Args: filepaths: A list of filepaths for which descriptors are desired. hasher: The local descriptor hasher to use to extract features. min_features: The minimum number of features to extract. max_size: The maximum side length for an image. Returns: A dataframe, indexed by filepath with columns for descriptors and descriptor counts. """ LOGGER.debug("Generating descriptors") if hasher is None: hasher = SIFT() features = [] for filepath in tqdm.tqdm(filepaths, disable=not show_progress, desc="Filepaths"): features.append( generate_image_descriptors( filepath, hasher=hasher, min_features=min_features, max_size=max_size, ) ) LOGGER.debug("Finished computing descriptors.") return pd.DataFrame( { "descriptors": [ f["descriptors"] if f is not None else None for f in features ], "keypoints": [f["keypoints"] if f is not None else None for f in features], "descriptor_count": [ f["descriptor_count"] if f is not None else None for f in features ], # type: ignore "dimensions": [ f["dimensions"] if f is not None else None for f in features ], "hasher": hasher.name, "filepath": filepaths, } ).set_index("filepath") def hasher_name(df: pd.DataFrame) -> str: return df.iloc[0].get("hasher", "SIFT") def check_hasher(df1: pd.DataFrame, df2: pd.DataFrame): assert hasher_name(df1) == hasher_name( df2 ), "The hashers must mach for deduplication to work." def compute_pairs( match_df, query_df=None, hasher: LocalHasher | None = None, pct_probe=0.1, use_gpu: bool = True, faiss_cache_path: str | None = None, show_progress: bool = False, ): """Compute pairs of matching images from a reference dataframe. Args: match_df: A dataframe, as computed by build_reference_df, will compute pairs against self, unless query_df is provided. query_df: optional, if provided will be used to query against match_df for matches. threshold: The match threshold between two vectors. minimum_overlap: The minimum overlap between a pair of files. pct_probe: The percentage of the dataset to search for approximate search. faiss_cache_path: If provided load any existing faiss index from this path, and if it does not exist then save the generated faiss index to the path. 
show_progress: Whether or not to show a progress bar while computing pairs """ match_df = match_df.dropna(subset=["descriptors"]) counts = match_df["descriptor_count"].values.astype("uint32") descriptors = np.vstack(match_df["descriptors"].values) if hasher is None: hasher = SIFT() if query_df is None: assert ( hasher_name(match_df) == hasher.name ), "The hasher must match the original hash format." y_counts = None y_descriptors = None else: check_hasher(match_df, query_df) query_df = query_df.dropna(subset=["descriptors"]) y_counts = query_df["descriptor_count"].values.astype("uint32") y_descriptors = np.vstack(query_df["descriptors"].values).astype("float32") LOGGER.debug("Computing approximate euclidean pairs") pairs = ad.compute_euclidean_pairwise_duplicates_approx( X=descriptors.astype("float32"), counts=counts, threshold=hasher.threshold, minimum_overlap=hasher.overlap, pct_probe=pct_probe, Y=y_descriptors, y_counts=y_counts, use_gpu=use_gpu, faiss_cache_path=faiss_cache_path, show_progress=show_progress, ) if query_df is None: query_df = match_df # Assign query_df to be able to lookup matches. return [(query_df.iloc[p1].name, match_df.iloc[p2].name) for p1, p2 in pairs] def compute_area(box): """Compute the area of a box given a set of points x1, y1, x2, y2. Args: box: A list of coordinates. """ return (box[3] - box[1]) * (box[2] - box[0]) def compute_intersection(kps, filter_arr): """Compute the percentage of area covered by a set of filtered keypoints versus raw keypoints. Args: kps: A list of points filter_arr: A filter array of same length as kps indicating whether to keep that keypoint. """ kps_filtered = kps[filter_arr] box_after = np.hstack([kps_filtered.min(axis=0), kps_filtered.max(axis=0)]) box_before = np.hstack([kps.min(axis=0), kps.max(axis=0)]) area_before = compute_area(box_before) area_after = compute_area(box_after) return area_after / area_before def compute_minimum_intersection(kp1, kp2, filter_arr1, filter_arr2): """Compute the minimum intersection between two pairs of keypoints (filtered and unfiltered). Args: kp1: A list of the first set of keypoints kp2: A list of the second set of keypoints filter_arr1: A filter array for the first set of keypoints filter_arr2: A filter array for the second set of keypoints """ return min( compute_intersection(kp1, filter_arr1), compute_intersection(kp2, filter_arr2) ) def deduplicate_sift_dfs(*args, **kwargs): "DEPRECATED please use deduplicate_dfs." warn("deduplicate_sift_dfs is deprecated.", DeprecationWarning, stacklevel=2) return deduplicate_dfs(*args, **kwargs) def deduplicate_dfs( match_df: pd.DataFrame, query_df: pd.DataFrame | None = None, coarse_pct_probe: float = ad.DEFAULT_PCT_PROBE, max_workers: int | None = None, use_gpu: bool = True, faiss_cache_path: str | None = None, verbose: bool = False, hasher: LocalHasher | None = None, show_progress: bool = False, ) -> ( list[tuple[typing.Any, typing.Any]] | list[tuple[typing.Any, typing.Any, MatchStats]] ): """Deduplicate images within one set of images or between two sets of images: #. Given a dataframe (or two) of descriptors and keypoints for images. #. Perform a coarse, approximate search for images with common features. #. For each candidate pair, validate it pairwise by checking the features and keypoints with the traditional approach using the ratio test. See validate_match for more information. Args: match_df: Dataframe of features to dedup within. query_df: If provided will search for matches between this and match_df, if None will just search match_df against itself.
coarse_pct_probe: The minimum fraction of nearest lists to search. If the product of pct_probe and the number of lists is less than 1, one list will be searched. hasher: The local descriptor hasher to use; the coarse match threshold (a euclidean distance), the minimum coarse overlap between two files, the ratio for Lowe's ratio test, and the validation thresholds (minimum match fraction, minimum keypoint intersection, and minimum inliers) are all configured on the hasher. max_workers: The maximum number of threads to use for doing the final validation step. use_gpu: Whether to use the GPU for the coarse approximate search. faiss_cache_path: If provided load any existing faiss index from this path, and if it does not exist then save the generated faiss index to the path. Most helpful if doing multiple queries against the same match_df. verbose: Return metadata with matches, such as overlap percent, etc. show_progress: Whether or not to show a progress bar while computing duplicate file pairs Returns: A list of pairs of file duplicates. If verbose is True, each tuple will be: (match_id1, match_id2, metadata_dict) """ if hasher is None: hasher = SIFT() LOGGER.debug("Computing candidate pairs") candidates = compute_pairs( match_df, query_df, pct_probe=coarse_pct_probe, hasher=hasher, use_gpu=use_gpu, faiss_cache_path=faiss_cache_path, show_progress=show_progress, ) if query_df is None: query_df = match_df assert ( match_df.index.is_unique ), "Index of match_df must be unique, or it will cause wrong matches." assert ( query_df.index.is_unique ), "Index of query_df must be unique, or it will cause wrong matches." LOGGER.debug("Validating candidate pairs: %d", len(candidates)) keep: ( list[tuple[typing.Any, typing.Any]] | list[tuple[typing.Any, typing.Any, MatchStats]] ) = []  # type: ignore with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: batch_size = 10_000 for start in tqdm.tqdm(range(0, len(candidates), batch_size)): futures = { executor.submit( hasher.validate_match, descriptor1=query_df.loc[c1].to_dict(), descriptor2=match_df.loc[c2].to_dict(), minimum_match=hasher.validation_match, minimum_inliers=hasher.validation_inliers, minimum_intersection=hasher.validation_intersection, ): (c1, c2) for c1, c2 in candidates[start : start + batch_size] } for future in concurrent.futures.as_completed(futures): is_match, metadata = future.result() if is_match: if verbose: keep.append( (futures[future][0], futures[future][1], metadata) # type: ignore ) else: keep.append(futures[future]) # type: ignore LOGGER.debug("Validating complete, keeping: %d", len(keep)) return keep def deduplicate( filepaths_or_reference_df: typing.Iterable[str] | pd.DataFrame, query_filepaths_or_df: None | (typing.Iterable[str] | pd.DataFrame) = None, max_features: int = DEFAULT_MAX_FEATURES, min_features: int = DEFAULT_MIN_FEATURES, max_size: int = DEFAULT_MAX_SIZE, hasher: LocalHasher | None = None, show_progress: bool = False, **kwargs, ) -> ( list[tuple[typing.Any, typing.Any]] | list[tuple[typing.Any, typing.Any, MatchStats]] ): """Deduplicate images by doing the following: #. Unletterbox all images and resize to some maximum size, preserving aspect ratio. #. Compute the descriptors and keypoints for all the resulting images. #. See `deduplicate_dfs` for remaining steps. A usage sketch follows.
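A minimal usage sketch (the image paths here are hypothetical):

.. code-block:: python

    import perception.local_descriptor_deduplication as ldd

    pairs = ldd.deduplicate(["a.jpg", "b.jpg", "c.jpg"])
    for file_a, file_b in pairs:
        print(file_a, "is a likely duplicate of", file_b)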
Args: filepaths_or_reference_df: The list of images to deduplicate, or a precomputed descriptor DataFrame. query_filepaths_or_df: If provided will look for matches between these files and the files in the first param. max_features: The maximum number of features to extract. min_features: The minimum number of features to extract. max_size: The maximum side length for an image. show_progress: Whether or not to show a progress bar while building descriptors and computing pairs of file duplicates Returns: A list of pairs of file duplicates. If verbose is true the tuple will be: (match_id1, match_id2, metadata_dict) """ if hasher is None: hasher = SIFT(max_features=max_features) if isinstance(filepaths_or_reference_df, pd.DataFrame): reference_df = filepaths_or_reference_df else: reference_df = build_reference_df( filepaths=filepaths_or_reference_df, hasher=hasher, min_features=min_features, max_size=max_size, show_progress=show_progress, ) if query_filepaths_or_df is None: query_df = None else: if isinstance(query_filepaths_or_df, pd.DataFrame): query_df = query_filepaths_or_df else: query_df = build_reference_df( filepaths=query_filepaths_or_df, hasher=hasher, min_features=min_features, max_size=max_size, show_progress=show_progress, ) return deduplicate_dfs( reference_df, query_df=query_df, hasher=hasher, show_progress=show_progress, **kwargs, ) ================================================ FILE: perception/py.typed ================================================ ================================================ FILE: perception/testing/__init__.py ================================================ import atexit import math import typing from contextlib import ExitStack from importlib import resources import cv2 import numpy as np import pandas as pd import pytest from PIL import Image from .. 
import hashers, tools SIZES = {"float32": 32, "uint8": 8, "bool": 1} def get_low_detail_image(): v = np.arange(0, 50, 1) v = np.concatenate([v, v[::-1]])[np.newaxis,] image = np.matmul(v.T, v) image = (image * 255 / image.max()).astype("uint8") image = image[..., np.newaxis].repeat(repeats=3, axis=2) image[:, 50:] = 0 image[50:] = 0 return image LOW_DETAIL_IMAGE = get_low_detail_image() file_manager = ExitStack() atexit.register(file_manager.close) DEFAULT_TEST_IMAGES = [ str( file_manager.enter_context( resources.as_file( resources.files("perception") / "testing" / "images" / f"image{n}.jpg" ) ) ) for n in range(1, 11) ] DEFAULT_TEST_LOGOS = [ str( file_manager.enter_context( resources.as_file( resources.files("perception") / "testing" / "logos" / "logoipsum.png" ) ) ) ] DEFAULT_TEST_VIDEOS = [ str( file_manager.enter_context( resources.as_file( resources.files("perception") / "testing" / "videos" / f"v{n}.m4v" ) ) ) for n in range(1, 3) ] + [ str( file_manager.enter_context( resources.as_file( resources.files("perception") / "testing" / "videos" / "v2s.mov" ) ) ) ] @typing.no_type_check def test_opencv_hasher(hasher: hashers.ImageHasher, image1: str, image2: str): # For OpenCV hashers we make sure the distance we compute # is the same as inside OpenCV f1 = image1 f2 = image2 opencv_distance = hasher.hasher.compare( hasher.hasher.compute(hashers.tools.read(f1)), hasher.hasher.compute(hashers.tools.read(f2)), ) if hasher.distance_metric == "hamming": opencv_distance /= hasher.hash_length np.testing.assert_approx_equal( opencv_distance, hasher.compute_distance(hasher.compute(f1), hasher.compute(f2)), significant=4, ) def hash_dicts_to_df(hash_dicts, returns_multiple): assert all( h["error"] is None for h in hash_dicts ), "An error was found in the hash dictionaries" if returns_multiple: return pd.DataFrame( { "filepath": tools.flatten( [[h["filepath"]] * len(h["hash"]) for h in hash_dicts] ), "hash": tools.flatten([h["hash"] for h in hash_dicts]), } ).assign(error=np.nan) return pd.DataFrame.from_records(hash_dicts).assign(error=np.nan) def test_hasher_parallelization(hasher, test_filepaths): filepaths_10x = test_filepaths * 10 if not hasher.allow_parallel: with pytest.warns(UserWarning, match="cannot be used in parallel"): hashes_parallel_dicts = hasher.compute_parallel(filepaths=filepaths_10x) else: hashes_parallel_dicts = hasher.compute_parallel(filepaths=filepaths_10x) hashes_sequential_dicts = [ {"filepath": filepath, "hash": hasher.compute(filepath), "error": None} for filepath in filepaths_10x ] hashes_parallel = hash_dicts_to_df( hashes_parallel_dicts, returns_multiple=hasher.returns_multiple ).sort_values(["filepath", "hash"]) hashes_sequential = hash_dicts_to_df( hashes_sequential_dicts, returns_multiple=hasher.returns_multiple ).sort_values(["filepath", "hash"]) assert (hashes_sequential.hash.values == hashes_parallel.hash.values).all() assert (hashes_sequential.filepath.values == hashes_parallel.filepath.values).all() def test_video_hasher_integrity( hasher: hashers.VideoHasher, test_videos: list[str] = DEFAULT_TEST_VIDEOS ): test_hasher_parallelization(hasher, test_videos) def test_image_hasher_integrity( hasher: hashers.ImageHasher, pil_opencv_threshold: float, transform_threshold: float, test_images: list[str] = DEFAULT_TEST_IMAGES, opencv_hasher: bool = False, ): """Test to ensure a hasher works correctly. Args: hasher: The hasher to test. test_images: A list of filepaths to images to use for testing. 
pil_opencv_threshold: The hash distance permitted for an image when loaded with OpenCV vs. PIL. transform_threshold: The permitted error in isometric transform hashes. opencv_hasher: Whether the hasher is an OpenCV hasher. Used to determine whether to check for consistent distances. """ assert len(test_images) >= 2, "You must provide at least two test images." image1 = test_images[0] image2 = test_images[1] hash1_1 = str(hasher.compute(image1)) # str() games for mypy, not proud hash1_2 = str(hasher.compute(Image.open(image1))) image_cv = cv2.imread(image1) assert image_cv is not None, f"Failed to load image: {image1}" hash1_3 = str(hasher.compute(cv2.cvtColor(image_cv, cv2.COLOR_BGR2RGB))) hash2_1 = str(hasher.compute(image2)) # There is a small distance because PIL and OpenCV read # JPEG images a little differently (e.g., libjpeg-turbo vs. libjpeg) assert hasher.compute_distance(hash1_1, hash1_2) < pil_opencv_threshold assert hasher.compute_distance(hash1_1, hash2_1) > pil_opencv_threshold assert hasher.compute_distance(hash1_1, hash1_3) == 0 # Ensure the conversion to and from vectors works for both base64 and hex. assert hasher.vector_to_string(hasher.string_to_vector(hash2_1)) == hash2_1 assert ( hasher.vector_to_string( hasher.string_to_vector( str( hasher.vector_to_string( hasher.string_to_vector(hash2_1), hash_format="hex" ) ), hash_format="hex", ) ) == hash2_1 ) # Ensure parallelization works properly. test_hasher_parallelization(hasher=hasher, test_filepaths=test_images) # Ensure the isometric hashes computation work properly for image in test_images: transforms = hashers.tools.get_isometric_transforms(image) hashes_exp = { key: str(hasher.compute(value)) for key, value in transforms.items() } hashes_act = hasher.compute_isometric(transforms["r0"]) for transform_name in hashes_exp.keys(): assert ( hasher.compute_distance( hashes_exp[transform_name], hashes_act[transform_name] ) < transform_threshold ) # Verify that hashes are the correct length. hash_bits = hasher.hash_length * SIZES[hasher.dtype] words_base64 = math.ceil(hash_bits / 6) # Base64 uses 8 bits for every 6 bits words_base64 += ( 0 if words_base64 % 4 == 0 else 4 - (words_base64 % 4) ) # Base64 always uses multiples of four assert len(hash2_1) == words_base64 words_hex = 2 * math.ceil(hash_bits / 8) # Hex uses 16 bits for every 8 bits words_hex += ( 0 if words_hex % 2 == 0 else 1 ) # Two characters for every one character. assert ( len( str( hasher.vector_to_string( hasher.string_to_vector(hash2_1), hash_format="hex" ) ) ) == words_hex ) # Verify that low quality images yield zero quality image = np.zeros((100, 100, 3)).astype("uint8") # type: ignore _, quality = hasher.compute_with_quality(image) assert quality == 0 # Verify that high quality images yield high quality # scores. assert ( min(hasher.compute_with_quality(filepath)[1] for filepath in test_images) == 100 ) # Verify that medium quality images yield medium quality _, quality = hasher.compute_with_quality(LOW_DETAIL_IMAGE) assert 0 < quality < 100 if opencv_hasher: test_opencv_hasher(hasher, image1, image2) ================================================ FILE: perception/testing/images/README.md ================================================ # Sample images These images were obtained from Wikimedia Commons. 
- [Image 1](https://commons.wikimedia.org/wiki/Commons:Picture_of_the_day#/media/File:ADAC-Zentrale,_Munich,_March_2017-05.jpg) - [Image 2](https://commons.wikimedia.org/wiki/Commons:Picture_of_the_day#/media/File:Two-tailed_pasha_(Charaxes_jasius_jasius)_Greece.jpg) - [Image 3](https://commons.wikimedia.org/wiki/Main_Page#/media/File:Escolta_presidencial,_Plaza_de_Armas,_Lima,_Per%C3%BA,_2015-07-28,_DD_40.JPG) - [Image 4](https://commons.wikimedia.org/wiki/Commons:Picture_of_the_day#/media/File:Iglesia_de_Ntra._Sra._de_la_Junquera,_Luesma,_Zaragoza,_Espa%C3%B1a,_2017-01-04,_DD_60.jpg) - [Image 5](https://commons.wikimedia.org/wiki/Commons:Picture_of_the_day#/media/File:Bahrain_Fort_March_2015.JPG) - [Image 6](https://commons.wikimedia.org/wiki/Commons:Picture_of_the_day#/media/File:ET_Gondar_asv2018-02_img18_Fasil_Ghebbi.jpg) - [Image 7](https://commons.wikimedia.org/wiki/Commons:Picture_of_the_day#/media/File:M%C3%BCnster,_Beresa,_Mercedes-Benz_C-Klasse_Cabrio_--_2018_--_1757.jpg) - [Image 8](https://commons.wikimedia.org/wiki/Commons:Picture_of_the_day#/media/File:Panoramic_sunset_in_Conques_02.jpg) - [Image 9](https://commons.wikimedia.org/wiki/Commons:Picture_of_the_day#/media/File:Catedral_de_San_Basilio,_Mosc%C3%BA,_Rusia,_2016-10-03,_DD_05-06_HDR.jpg) - [Image 10](https://commons.wikimedia.org/wiki/Commons:Picture_of_the_day#/media/File:Tupolev_Tu-160_overflying_Moscow_fix.jpg) ================================================ FILE: perception/testing/logos/README.md ================================================ # Sample Logos These logos were obtained from free sources. - [LogoIpsum](https://logoipsum.com/) ================================================ FILE: perception/testing/videos/README.md ================================================ Video from https://www.youtube.com/watch?v=84Er4LnWXtI under Creative Commons Attribution License. Notes - v1 is a fairly short, slow moving video - v2 is a longer but faster-paced video - v2s is the same as v2 but with a snippet removed in the middle (simulates a scene or cut) ================================================ FILE: perception/tools.py ================================================ import base64 import json import os import urllib.parse import urllib.request import warnings import numpy as np from scipy import spatial from tqdm import tqdm from . import hashers as perception_hashers from .utils import flatten try: from . import extensions # type: ignore except ImportError: warnings.warn( "C extensions were not built. Some metrics will be computed more slowly. " "Please install from wheels or set up a compiler prior to installation " "from source to use extensions." ) extensions = None def _multiple_hashes_for_ids(hashes: list[tuple[str, str | np.ndarray]]): """Check if a list of (hash_id, hash) tuples has more than one hash for a hash_id. Args: hashes: A list of (hash_id, hash) tuples. """ hash_ids = [hash_id for hash_id, _ in hashes] return len(hash_ids) != len(set(hash_ids)) def deduplicate_hashes( hashes: list[tuple[str, str | np.ndarray]], threshold: float, hash_format: str = "base64", hasher: perception_hashers.ImageHasher | None = None, hash_length: int | None = None, hash_dtype: str | None = None, distance_metric: str | None = None, progress: tqdm | None = None, ) -> list[tuple[str, str]]: """Find duplicates using a list of precomputed hashes. 
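A minimal sketch (the file paths and threshold below are hypothetical):

.. code-block:: python

    from perception import hashers, tools

    hasher = hashers.PHash()
    hashes = [(fp, hasher.compute(fp)) for fp in ["a.jpg", "b.jpg"]]
    pairs = tools.deduplicate_hashes(hashes=hashes, threshold=0.2, hasher=hasher)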
Args: hashes: A list of (id, hash) tuples threshold: A distance threshold hasher: A hasher to use for computing distances progress: A tqdm object for reporting progress Returns: A list of duplicated id pairs. To use, you can just remove the first entry of each pair from your dataset. The pairs are provided in the event that you wish to apply further analysis. """ assert ( hash_length is not None and hash_dtype is not None and distance_metric is not None ) or (hasher is not None), ( "You must provide either `hasher` or all of " "`hash_length`, `hash_dtype`, and `distance_metric`." ) if hasher is not None: assert all( k is None for k in [hash_length, hash_dtype, distance_metric] ), "If hasher is provided, hash_length, hash_dtype, and distance_metric must all be None." hash_length = hasher.hash_length hash_dtype = hasher.dtype distance_metric = hasher.distance_metric assert hash_length is not None assert isinstance(hash_dtype, str) assert isinstance(distance_metric, str) # If there is more than one hash for an id, we want them # to be sequential in case we are able to use the more # efficient distance calculation (compute_euclidean_pairwise_duplicates) # that skips computation of distance between two hashes for the same file. multiple_hashes_per_id = _multiple_hashes_for_ids(hashes) if multiple_hashes_per_id: hashes = sorted(hashes) vectors = np.array( [ ( perception_hashers.tools.string_to_vector( hash_string=hash_string_or_vector, hash_format=hash_format, hash_length=hash_length, dtype=hash_dtype, ) if isinstance(hash_string_or_vector, str) else hash_string_or_vector ) for _, hash_string_or_vector in hashes ] ) files = np.array([identifier for identifier, _ in hashes]) pairs: list[tuple[str, str]] = [] n_hashes = len(vectors) start_idx = 0 end_idx = None if distance_metric != "euclidean" or "int" not in hash_dtype or extensions is None: iterator = range(n_hashes) if progress is not None: iterator = progress(iterator, total=n_hashes, desc="Deduplicating.") # type: ignore[operator] distances = spatial.distance.pdist(vectors, metric=distance_metric) for hash_index in iterator: if end_idx is not None: start_idx = end_idx end_idx = start_idx + (n_hashes - hash_index - 1) current_distances = distances[start_idx:end_idx] duplicated_files = files[hash_index + 1 :][current_distances < threshold] current_file = files[hash_index] # We have to make sure the two files are not the same file # because it can happen for highly symmetric images when # we are including isometric hashes. pairs.extend( [ (current_file, duplicated_file) for duplicated_file in duplicated_files if duplicated_file != current_file ] ) else: # We want to count the number of hashes for each unique hash ID. There # may be more than one -- for example in the case of video. We need # this so we can pass it to the compute_euclidean_pairwise_duplicates # function. if multiple_hashes_per_id: counts = np.zeros(shape=len({hash_id for hash_id, _ in hashes})).astype( "uint32" ) previous_hash_id = None counts_idx = 0 files_ = ( [] # make type check happy ) # We're going to re-build the IDs with deduplicated files. 
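# Because the hashes were sorted above, identical hash IDs are adjacent, so a single pass can rebuild the unique ID list and count the hashes belonging to each ID.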
for hash_id, _ in hashes: if hash_id != previous_hash_id: files_.append(hash_id) if previous_hash_id is not None and hash_id != previous_hash_id: counts_idx += 1 counts[counts_idx] += 1 previous_hash_id = hash_id files = np.array(files_) else: counts = None # type: ignore pairs = [ (files[idx1], files[idx2]) for idx1, idx2 in extensions.compute_euclidean_pairwise_duplicates_simple( vectors.astype("int32"), threshold=threshold, counts=counts ) ] return list(set(pairs)) def deduplicate( files: list[str], hashers: list[tuple[perception_hashers.ImageHasher, float]], isometric: bool = False, progress: tqdm | None = None, ) -> list[tuple[str, str]]: """Find duplicates in a list of files. Args: files: A list of filepaths. hashers: A list of tuples of the form (hasher, threshold) isometric: Whether to compare the rotated versions of the images progress: A tqdm progress indicator Returns: A list of duplicated file pairs. To use, you can just remove the first entry of each pair from your dataset. The pairs are provided in the event that you wish to apply further analysis. """ files_dedup = set(files) if len(files_dedup) != len(files): warnings.warn( message="Duplicate file paths were provided. These will be automatically removed.", category=UserWarning, ) files = list(files_dedup) pairs: list[tuple[str, str]] = [] for hasher_idx, (hasher, threshold) in enumerate(hashers): hash_dicts = hasher.compute_parallel( filepaths=files, progress=progress, progress_desc=f"Computing hashes for hash {hasher_idx+1} of {len(hashers)}.", isometric=isometric, ) hash_list = sorted(hash_dicts, key=lambda h: h["filepath"]) if isometric: hash_list = flatten( [ list(row["hash"].values()) for row in hash_dicts if row["error"] is None ] ) files_for_hashes = flatten( [[row["filepath"]] * 8 for row in hash_dicts if row["error"] is None] ) elif hasher.returns_multiple: hash_list = flatten( [row["hash"] for row in hash_dicts if row["error"] is None] ) files_for_hashes = flatten( [ [row["filepath"]] * len(row["hash"]) for row in hash_dicts if row["error"] is None ] ) else: hash_list = [row["hash"] for row in hash_dicts if row["error"] is None] files_for_hashes = [ row["filepath"] for row in hash_dicts if row["error"] is None ] pairs.extend( deduplicate_hashes( hashes=list(zip(files_for_hashes, hash_list)), hasher=hasher, threshold=threshold, progress=progress, ) ) return list(set(pairs)) class SaferMatcher: """An object for matching hashes with the known CSAM hashes in the Safer matching service. Please contact `info@getsafer.io `_ for details on obtaining credentials and information on how match responses are provided. Here's a minimalist example: .. code-block:: python from perception import hashers, tools hasher = hashers.PHash(hash_size=16) matcher = tools.SaferMatcher( api_key='YOUR_API_KEY', username='YOUR_USERNAME', # You only need to provide password='YOUR_PASSWORD', # an API key OR username/password. url='MATCHING_SERVICE_URL' ) For authentication, you must provide the API key OR username and password pair. If neither is provided, the function will attempt to find them as environment variables with names :code:`SAFER_MATCHING_SERVICE_API_KEY`, :code:`SAFER_MATCHING_SERVICE_USERNAME`, and :code:`SAFER_MATCHING_SERVICE_PASSWORD`, respectively. You must also provide the URL endpoint for the matching service, either as a keyword argument or as a :code:`SAFER_MATCHING_SERVICE_URL` environment variable.
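Once constructed, matching is a single call (a sketch; the filepath is hypothetical and the response format is defined by the matching service):

.. code-block:: python

    matches = matcher.match(['example.jpg'])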
Args: api_key: A base64 encoded set of matching service credentials username: Matching service username password: Matching service password url: Safer matching service URL hasher: A hasher to use for matching hasher_api_id: The hasher ID for finding matches. quality_threshold: The quality threshold filter to use """ def __init__( self, api_key: str | None = None, username: str | None = None, password: str | None = None, url: str | None = None, hasher: perception_hashers.ImageHasher | None = None, hasher_api_id: str | None = None, quality_threshold: int = 90, ): if ( username is None and password is None and api_key is None and os.environ.get("SAFER_MATCHING_SERVICE_USERNAME") is not None and os.environ.get("SAFER_MATCHING_SERVICE_PASSWORD") is not None ): username = os.environ["SAFER_MATCHING_SERVICE_USERNAME"] password = os.environ["SAFER_MATCHING_SERVICE_PASSWORD"] if username is not None and password is not None: credentials = f"{username}:{password}" api_key = base64.b64encode(credentials.encode("ascii")).decode("ascii") if api_key is None: api_key = os.environ.get("SAFER_MATCHING_SERVICE_API_KEY") if api_key is None: raise ValueError( "You must provide one of (1) API key, (2) API key provided as " "`SAFER_MATCHING_SERVICE_API_KEY` env var, (3) username and password or " "(4) username and password as `SAFER_MATCHING_SERVICE_USERNAME` and " "`SAFER_MATCHING_SERVICE_PASSWORD` env vars." ) if url is None: url = os.environ.get("SAFER_MATCHING_SERVICE_URL") if url is None: raise ValueError( "You must provide either the url or the SAFER_MATCHING_SERVICE_URL env var." ) if urllib.parse.urlparse(url).scheme != "https" and not os.environ.get( "SAFER_MATCHING_SERVICE_DEV_ALLOW_HTTP" ): raise ValueError("You must provide an url that begins with `https://`.") self.api_key = api_key self.url = url if hasher is None: hasher = perception_hashers.PHash(hash_size=16, highfreq_factor=4) if hasher_api_id is None: hasher_api_id = "phash" self.hasher = hasher self.hasher_api_id = hasher_api_id self.quality_threshold = quality_threshold def match( self, images: list[(str | tuple[perception_hashers.tools.ImageInputType, str])], ) -> dict: """Match hashes with the Safer matching service. Args: images: A list of image filepaths or (image_like, image_id) tuples. Returns: A dictionary of matches. See Safer matching service documentation ( contact Thorn for a copy). """ raw_hashes = [ self.hasher.compute_with_quality( image if isinstance(image, str) else image[0] ) for image in images ] hashes = [ { "id": image if isinstance(image, str) else image[1], self.hasher_api_id: hash_string, "md5": ( perception_hashers.tools.compute_md5(image) if isinstance(image, str) else ( perception_hashers.tools.compute_md5(image[0]) if isinstance(image[0], str) else None ) ), } for image, (hash_string, quality) in zip(images, raw_hashes) if quality > self.quality_threshold ] for hash_dict in hashes: # We cannot include an md5 key if we don't # have the md5. 
if hash_dict["md5"] is None: del hash_dict["md5"] if not hashes: warnings.warn( message="No images of sufficient quality were found.", category=UserWarning, ) return {} body = {"hashes": hashes, "version": "v2"} headers = { "Authorization": f"Basic {self.api_key}", "Content-Type": "application/json", } req = urllib.request.Request( url=self.url, data=str(json.dumps(body)).encode("utf-8"), headers=headers, method="POST", ) with urllib.request.urlopen(req) as res: ret = json.loads(res.read().decode("utf-8")) return ret ================================================ FILE: perception/utils.py ================================================ def flatten(list_of_lists): return [item for sublist in list_of_lists for item in sublist] ================================================ FILE: poetry.toml ================================================ [virtualenvs] create = true in-project = true ================================================ FILE: pyproject.toml ================================================ [project] name = "Perception" dynamic = ["version"] description = "Perception provides flexible, well-documented, and comprehensively tested tooling for perceptual hashing research, development, and production use." authors = [{ name = "Thorn", email = "info@wearethorn.org" }] license = "Apache-2.0" readme = "README.md" requires-python = ">=3.10,<4.0" dependencies = [ "Cython>=3.0.0,<4.0.0", "numpy>=1.26.4,<3.0.0", "opencv-contrib-python-headless>=4.10.0,<5.0.0", "faiss-cpu>=1.8.0,<2.0.0", "networkit>=11.1,<12.0.0; sys_platform != 'darwin'", "networkx>=3.0,<4.0; sys_platform == 'darwin'", "pandas", "Pillow", "pywavelets>=1.5.0,<2.0.0", "validators>=0.22.0,<1.0.0", "rich>=13.7.0,<14.0.0", "scipy", "tqdm>=4.67.1,<5.0.0", ] [project.optional-dependencies] benchmarking = [ "matplotlib", "albumentations>=2.0.8,<3.0.0", "tabulate", "scikit-learn", "ffmpeg-python", ] matching = ["aiohttp", "python-json-logger"] pdq = ["pdqhash>=0.2.7,<0.3.0"] [tool.poetry] version = "0.0.0" [tool.poetry.group.dev.dependencies] black = "^26" coverage = "*" ipython = "*" mypy = "*" pandas-stubs = "*" pre-commit = "*" pytest = "*" pytest-cov = "*" ruff = "*" types-pillow = "*" types-tqdm = "*" twine = "*" albumentations = "^2.0.8" [tool.poetry.build] script = "build.py" generate-setup-file = true [tool.mypy] exclude = ["/tests/"] check_untyped_defs = true ignore_missing_imports = true [tool.poetry-dynamic-versioning] enable = true vcs = "git" [build-system] requires = [ "poetry-core", "poetry-dynamic-versioning", "numpy", "Cython", "setuptools", "wheel", ] build-backend = "poetry_dynamic_versioning.backend" ================================================ FILE: setup.py ================================================ # -*- coding: utf-8 -*- from setuptools import setup packages = [ "perception", "perception.approximate_deduplication", "perception.benchmarking", "perception.hashers", "perception.hashers.image", "perception.hashers.video", "perception.testing", ] package_data = {"": ["*"], "perception.testing": ["images/*", "logos/*", "videos/*"]} extras_require = { "benchmarking": [ "matplotlib", "scipy", "albumentations", "tabulate", "scikit-learn", "ffmpeg-python", ], "experimental": ["networkit", "faiss-cpu"], "matching": ["aiohttp", "python-json-logger"], } setup_kwargs = { "name": "Perception", "version": "0.0.0", "description": "Perception provides flexible, well-documented, and comprehensively tested tooling for perceptual hashing research, development, and production use.", "long_description": "# perception 
![ci](https://github.com/thorn-oss/perception/workflows/ci/badge.svg)\n\n`perception` provides flexible, well-documented, and comprehensively tested tooling for perceptual hashing research, development, and production use. See [the documentation](https://perception.thorn.engineering/en/latest/) for details.\n\n## Background\n\n`perception` was initially developed at [Thorn](https://www.thorn.org) as part of our work to eliminate child sexual abuse material from the internet. For more information on the issue, check out [our CEO's TED talk](https://www.thorn.org/blog/time-is-now-eliminate-csam/).\n\n## Getting Started\n\n### Installation\n\n`pip install perception`\n\n### Hashing\n\nHashing with different functions is simple with `perception`.\n\n```python\nfrom perception import hashers\n\nfile1, file2 = 'test1.jpg', 'test2.jpg'\nhasher = hashers.PHash()\nhash1, hash2 = hasher.compute(file1), hasher.compute(file2)\ndistance = hasher.compute_distance(hash1, hash2)\n```\n\n### Examples\n\nSee below for end-to-end examples for common use cases for perceptual hashes.\n\n- [Detecting child sexual abuse material](https://perception.thorn.engineering/en/latest/examples/detecting_csam.html)\n- [Deduplicating media](https://perception.thorn.engineering/en/latest/examples/deduplication.html)\n- [Benchmarking perceptual hashes](https://perception.thorn.engineering/en/latest/examples/benchmarking.html)\n\n## Supported Hashing Algorithms\n\n`perception` currently ships with:\n\n- pHash (DCT hash) (`perception.hashers.PHash`)\n- Facebook's PDQ Hash (`perception.hashers.PDQ`)\n- dHash (difference hash) (`perception.hashers.DHash`)\n- aHash (average hash) (`perception.hashers.AverageHash`)\n- Marr-Hildreth (`perception.hashers.MarrHildreth`)\n- Color Moment (`perception.hashers.ColorMoment`)\n- Block Mean (`perception.hashers.BlockMean`)\n- wHash (wavelet hash) (`perception.hashers.WaveletHash`)\n\n## Contributing\n\nTo work on the project, start by doing the following.\n\n```bash\n# Install local dependencies for\n# code completion, etc.\nmake init\n```\n\nTo do a (close to) comprehensive check before committing code, you can use `make precommit`.\n\nTo implement new features, please first file an issue proposing your change for discussion.\n\nTo report problems, please file an issue with sample code, expected results, actual results, and a complete traceback.\n\n## Alternatives\n\nThere are other packages worth checking out to see if they meet your needs for perceptual hashing.
Here are some\nexamples.\n\n- [dedupe](https://github.com/dedupeio/dedupe)\n- [imagededup](https://idealo.github.io/imagededup/)\n- [ImageHash](https://github.com/JohannesBuchner/imagehash)\n- [PhotoHash](https://github.com/bunchesofdonald/photohash)\n", "author": "Thorn", "author_email": "info@wearethorn.org", "maintainer": "None", "maintainer_email": "None", "url": "None", "packages": packages, "package_data": package_data, "extras_require": extras_require, "python_requires": ">=3.10,<4.0", } from build import * build(setup_kwargs) setup(**setup_kwargs) ================================================ FILE: tests/test_approximate_deduplication.py ================================================ import perception.approximate_deduplication as ad def get_cluster_members(assignments): clusters: dict[int, list[str]] = {} for assignment in assignments: clusters.setdefault(assignment["cluster"], []).append(assignment["id"]) return sorted(sorted(members) for members in clusters.values()) def test_pairs_to_clusters_component_strictness(): assignments = ad.pairs_to_clusters( ids=["a", "b", "c", "d"], pairs=[("a", "b"), ("b", "c")], strictness="component", ) assert get_cluster_members(assignments) == [["a", "b", "c"], ["d"]] def test_pairs_to_clusters_community_strictness(): assignments = ad.pairs_to_clusters( ids=["a", "b", "c"], pairs=[("a", "b"), ("b", "c")], strictness="community", ) assert get_cluster_members(assignments) == [["a", "b", "c"]] def test_pairs_to_clusters_clique_strictness(): assignments = ad.pairs_to_clusters( ids=["a", "b", "c", "d"], pairs=[("a", "b"), ("a", "c"), ("b", "c"), ("c", "d")], strictness="clique", ) assert get_cluster_members(assignments) == [["a", "b", "c"], ["d"]] ================================================ FILE: tests/test_benchmarking.py ================================================ import base64 import os import shutil import tempfile import numpy as np import pytest import albumentations from scipy import spatial from perception import benchmarking, hashers, testing from perception.benchmarking import video_transforms from perception.benchmarking.image import BenchmarkImageDataset from perception.benchmarking.video import BenchmarkVideoDataset files = testing.DEFAULT_TEST_IMAGES dataset = BenchmarkImageDataset.from_tuples([(fn, i % 2) for i, fn in enumerate(files)]) def test_deduplicate(): tempdir = tempfile.TemporaryDirectory() new_file = os.path.join(tempdir.name, "dup_file.jpg") shutil.copy(files[0], new_file) duplicated_files = files + [new_file] deduplicated, duplicates = BenchmarkImageDataset.from_tuples( [(fn, i % 2) for i, fn in enumerate(duplicated_files)] ).deduplicate(hasher=hashers.AverageHash(), threshold=1e-2) assert len(duplicates) == 1 assert len(deduplicated._df) == len(files) def test_bad_dataset(): bad_files = files + ["tests/images/nonexistent.jpg"] bad_dataset = BenchmarkImageDataset.from_tuples( [(fn, i % 2) for i, fn in enumerate(bad_files)] ) transforms = { "blur0.05": albumentations.GaussianBlur(sigma_limit=0.05, p=1), "noop": albumentations.Resize(height=256, width=256, p=1), } with pytest.raises(Exception): transformed = bad_dataset.transform( transforms=transforms, storage_dir="/tmp/transforms", errors="raise" ) with pytest.warns(UserWarning, match="occurred reading"): transformed = bad_dataset.transform( transforms=transforms, storage_dir="/tmp/transforms", errors="warn" ) assert len(transformed._df) == len(files) * 2 def test_benchmark_dataset(): assert len(dataset._df) == len(files) assert
len(dataset.filter(category=[0])._df) == len(files) / 2 with pytest.warns(UserWarning, match="Did not find"): assert len(dataset.filter(category=[3])._df) == 0 dataset.save("/tmp/dataset.zip") dataset.save("/tmp/dataset_folder") o1 = BenchmarkImageDataset.load("/tmp/dataset.zip") o2 = BenchmarkImageDataset.load("/tmp/dataset_folder") o3 = BenchmarkImageDataset.load("/tmp/dataset.zip") for opened in [o1, o2, o3]: assert ( opened._df["filepath"].apply(os.path.basename) == dataset._df["filepath"].apply(os.path.basename) ).all() def test_benchmark_transforms(): transformed = dataset.transform( transforms={ "blur0.05": albumentations.GaussianBlur(sigma_limit=0.05, p=1), "noop": albumentations.Resize(height=256, width=256, p=1), }, storage_dir="/tmp/transforms", ) assert len(transformed._df) == len(files) * 2 hashes = transformed.compute_hashes(hashers={"pdna": hashers.PHash()}) tr = hashes.compute_threshold_recall().reset_index() hashes._metrics = None hashes._df.at[0, "hash"] = None with pytest.warns(UserWarning, match="invalid / empty hashes"): hashes.compute_threshold_recall() assert (tr[tr["transform_name"] == "noop"]["recall"] == 100.0).all() # This is a charting function but we execute it just to make sure # it runs without error. hashes.show_histograms() def convert_hash_string_to_vector(hash_string): buff = base64.decodebytes(hash_string.encode("utf-8")) return np.frombuffer(buff, dtype=np.uint8) def test_video_benchmark_dataset(): video_dataset = BenchmarkVideoDataset.from_tuples( files=[ ("perception/testing/videos/v1.m4v", "category1"), ("perception/testing/videos/v2.m4v", "category1"), ("perception/testing/videos/v1.m4v", "category2"), ("perception/testing/videos/v2.m4v", "category2"), ] ) transforms = { "noop": video_transforms.get_simple_transform(width=128, sar="1/1"), "gif": video_transforms.get_simple_transform(codec="gif", output_ext=".gif"), "clip1s": video_transforms.get_simple_transform(clip_s=(1, None)), "blackpad": video_transforms.get_black_frame_padding_transform(duration_s=1), "slideshow": video_transforms.get_slideshow_transform( frame_input_rate=1, frame_output_rate=1 ), } transformed = video_dataset.transform( storage_dir=tempfile.TemporaryDirectory().name, transforms=transforms ) assert len(transformed._df) == len(transforms) * len(video_dataset._df) assert transformed._df["filepath"].isnull().sum() == 0 # We will compute hashes for each of the transformed # videos and check the results for correctness. phash_framewise_hasher = hashers.FramewiseHasher( frame_hasher=hashers.PHash(), interframe_threshold=-1, frames_per_second=2 ) hashes = transformed.compute_hashes( hashers={"phashframewise": phash_framewise_hasher} ) guid = hashes._df.guid.iloc[0] df = hashes._df[hashes._df["guid"] == guid] clip1s = df[(df.transform_name == "clip1s")] noop = df[(df.transform_name == "noop")] blackpad = df[(df.transform_name == "blackpad")] slideshow = df[(df.transform_name == "slideshow")] # We should have dropped two hashes from the beginning # on the clipped video. assert len(clip1s) == len(noop) - 2 # The first hash from the clipped video should be the # same as the third hash from the original np.testing.assert_allclose( convert_hash_string_to_vector(clip1s.hash.iloc[0]), convert_hash_string_to_vector(noop.hash.iloc[2]), rtol=0.2, ) # The black padding adds four hashes (two on either side). 
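# (The padding transform adds one second of black at each end and the hasher runs at two frames per second, hence two extra hashes per side.)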
assert len(blackpad) == len(noop) + 4 # A black frame should yield all zeros for PHash assert phash_framewise_hasher.string_to_vector(blackpad.iloc[0].hash).sum() == 0 # The slideshow hashes should be the same as the noop # hashes for every other hash. # Note: this is a weird test structure now because the original test, which was # assert (noop.hash.values[::2] == slideshow.hash.values[::2]).all() # kept failing because of 1 bit difference in 1 hash. This keeps the same # spirit, but is more complex with a little leniency. We suspect the difference is # due to some versioning. So it might be worthwhile to try replacing the test with the # original one occasionally. noop_hash_vectors = [ convert_hash_string_to_vector(h) for h in noop.hash.values[::2] ] slideshow_hash_vectors = [ convert_hash_string_to_vector(h) for h in slideshow.hash.values[::2] ] total_missed_bits = 0 for noop_vector, slideshow_vector in zip(noop_hash_vectors, slideshow_hash_vectors): for n in range(0, len(noop_vector)): if noop_vector[n] != slideshow_vector[n]: total_missed_bits += 1 assert total_missed_bits <= 4 # Every second hash in the slideshow should be the same as the # previous one. for n in range(0, 10, 2): assert slideshow.hash.values[n] == slideshow.hash.values[n + 1] def test_euclidean_extension(): # This function plainly implements the process of computing # the closest positive and negative examples and their indexes. def compute_euclidean_metrics_py(X_noop, X_transformed, mask): distance_matrix = spatial.distance.cdist( XA=X_transformed, XB=X_noop, metric="euclidean" ) pos = np.ma.masked_array(distance_matrix, np.logical_not(mask)) neg = np.ma.masked_array(distance_matrix, mask) distances = np.concatenate( [neg.min(axis=1).data[np.newaxis], pos.min(axis=1).data[np.newaxis]], axis=0 ).T indexes = np.concatenate( [ neg.argmin(axis=1)[np.newaxis], pos.argmin(axis=1)[np.newaxis], ] ).T return distances, indexes X_noop = np.random.uniform(low=0, high=255, size=(5, 144)).astype("int32") X_trans = np.random.uniform(low=0, high=255, size=(10, 144)).astype("int32") mask = np.array([True, False] * 5 * 5).reshape(10, 5) distances, indexes = benchmarking.common.extensions.compute_euclidean_metrics( X_noop, X_trans, mask ) distances_py, indexes_py = compute_euclidean_metrics_py(X_noop, X_trans, mask) assert (indexes_py == indexes).all() np.testing.assert_allclose(distances, distances_py) ================================================ FILE: tests/test_hashers.py ================================================ import os import string import pytest from perception import hashers, testing from perception.hashers.image.pdq import PDQHash TEST_IMAGES = [os.path.join("tests", "images", f"image{n}.jpg") for n in range(1, 11)] # The PDQ hash isometric computation is inexact. See # https://github.com/faustomorales/pdqhash-python/blob/master/tests/test_compute.py # for details.
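# That inexactness is why PDQHash gets a looser transform_threshold (0.15 instead of 0.1) in the parametrize below.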
@pytest.mark.parametrize( "hasher_class,pil_opencv_threshold,transform_threshold,opencv_hasher", [ (hashers.AverageHash, 0.1, 0.1, False), (hashers.WaveletHash, 0.1, 0.1, False), (hashers.PHash, 0.1, 0.1, False), (PDQHash, 0.1, 0.15, False), (hashers.DHash, 0.1, 0.1, False), (hashers.MarrHildreth, 0.1, 0.1, True), (hashers.BlockMean, 0.1, 0.1, True), (hashers.ColorMoment, 10, 0.1, True), ], ) def test_image_hashing_common( hasher_class, pil_opencv_threshold, transform_threshold, opencv_hasher ): testing.test_image_hasher_integrity( hasher=hasher_class(), pil_opencv_threshold=pil_opencv_threshold, transform_threshold=transform_threshold, opencv_hasher=opencv_hasher, ) def test_video_hashing_common(): testing.test_video_hasher_integrity( hasher=hashers.FramewiseHasher( frame_hasher=hashers.PHash(hash_size=16), interframe_threshold=0.1, frames_per_second=1, ) ) def test_video_reading(): # We should get one red, one green, and one blue frame for frame, _, timestamp in hashers.tools.read_video( filepath="perception/testing/videos/rgb.m4v", frames_per_second=0.5 ): assert timestamp in [0.0, 2.0, 4.0] channel = int(timestamp / 2) assert frame[:, :, channel].min() > 220 for other in [0, 1, 2]: if other == channel: continue assert frame[:, :, other].max() < 20 def test_common_framerate(): assert hashers.tools.get_common_framerates( dict(zip(["a", "b", "c"], [1 / 3, 1 / 2, 1 / 5])) ) == {1.0: ("a", "b", "c")} assert hashers.tools.get_common_framerates( dict(zip(["a", "b", "c"], [1 / 3, 1 / 6, 1 / 9])) ) == {1 / 3: ("a", "b", "c")} assert hashers.tools.get_common_framerates( dict(zip(["a", "b", "c", "d", "e"], [1 / 3, 1 / 2, 1 / 5, 1 / 7, 1 / 11])) ) == {1.0: ("a", "b", "c", "d", "e")} assert hashers.tools.get_common_framerates( dict(zip(string.ascii_lowercase[:6], [10, 5, 3, 1 / 3, 1 / 6, 1 / 9])) ) == {3.0: ("c", "d", "e", "f"), 10.0: ("a", "b")} assert hashers.tools.get_common_framerates(dict(zip(["a", "b"], [100, 1]))) == { 100: ("a", "b") } def test_synchronized_hashing(): video_hashers = { "phashframewise": hashers.FramewiseHasher( frame_hasher=hashers.PHash(hash_size=16), frames_per_second=1, interframe_threshold=0.2, ), "tmkl2": hashers.TMKL2(frames_per_second=15), "tmkl1": hashers.TMKL1(frames_per_second=15), } for filepath in [ "perception/testing/videos/v1.m4v", "perception/testing/videos/v2.m4v", ]: # Ensure synchronized hashing hashes1 = { hasher_name: hasher.compute(filepath) for hasher_name, hasher in video_hashers.items() } hashes2 = hashers.tools.compute_synchronized_video_hashes( filepath=filepath, hashers=video_hashers ) assert hashes1 == hashes2 def test_hex_b64_conversion(): b64_string = """ CFFRABrAaRKCDQigEBIGwAhNBdIISgVZBxQYAgP4fwYNUR0oBgYCPwwIDSqTAmIH FRQhCiT/IT9DpHIeIx4cA2hQcBTwISovFkspMxz/MzdnljeCOEs4LnBYNHHBMC4x EC8mPxLaLkI/dywmNk1lMXoqJyCLSyg7BxwRSgTmIlI/LwsrP04hTCMtBSxaGAFB """.replace("\n", "").replace(" ", "").strip() hex_string = """ 085151001ac06912820d08a0101206c0084d05d2084a05590714180203f87f06 0d511d280606023f0c080d2a930262071514210a24ff213f43a4721e231e1c03 68507014f0212a2f164b29331cff333767963782384b382e70583471c1302e31 102f263f12da2e423f772c26364d65317a2a27208b4b283b071c114a04e62252 3f2f0b2b3f4e214c232d052c5a180141 """.replace("\n", "").replace(" ", "").strip() assert ( hashers.tools.hex_to_b64(hex_string, dtype="uint8", hash_length=144) == b64_string ) assert ( hashers.tools.b64_to_hex(b64_string, dtype="uint8", hash_length=144) == hex_string ) ================================================ FILE: tests/test_local_descriptor_deduplication.py 
================================================ import os import tempfile import albumentations import cv2 import pandas as pd import pytest import perception.benchmarking.image as pb import perception.benchmarking.image_transforms as pbit import perception.approximate_deduplication as ad import perception.local_descriptor_deduplication as ldd import perception.hashers.tools as pht import perception.testing as pt from perception.approximate_deduplication.debug import vizualize_pair # Params for object level matching. OBJECT_MATCH_PARAMS = { "strong_match_threshold": 0.3, # Ideally something close to 95% precision. "ratio": 0.5, "coarse_pct_probe": 0.1, "minimum_coarse_overlap": 0.001, "coarse_threshold": 100.0, "minimum_validation_match": 0.04, "minimum_validation_intersection": 0.04, "minimum_validation_inliers": 6, } @pytest.mark.parametrize("hasher", [ldd.SIFT(), ldd.AKAZE()]) def test_deduplication(hasher): tdir = tempfile.TemporaryDirectory() watermark = cv2.cvtColor( cv2.imread(pt.DEFAULT_TEST_LOGOS[0], cv2.IMREAD_UNCHANGED), cv2.COLOR_BGRA2RGBA ) transformed = pb.BenchmarkImageDataset.from_tuples( files=[(filepath, "test") for filepath in pt.DEFAULT_TEST_IMAGES] ).transform( transforms={ "noop": albumentations.NoOp(p=1), "pad": albumentations.CropAndPad(percent=0.1, p=1), "crop": albumentations.CropAndPad(percent=-0.1, p=1), "watermark": pbit.apply_watermark(watermark, alpha=1, size=0.8), # type: ignore }, storage_dir=tdir.name, ) df = transformed._df.set_index("filepath") pairs = ldd.deduplicate( filepaths_or_reference_df=df.index, max_workers=2, hasher=hasher ) # Test throws errors if unset. clustered = ( pd.DataFrame( ad.pairs_to_clusters(ids=df.index, pairs=pairs, strictness="component") ) .set_index("id") .merge(df, left_index=True, right_index=True) .reset_index() ) n_clusters = clustered["cluster"].nunique() n_transforms = clustered["transform_name"].nunique() perfect = ( clustered.groupby("cluster") .apply( lambda g: g["guid"].nunique() == 1 and g["transform_name"].nunique() == n_transforms ) .sum() ) tainted = clustered.groupby("cluster")["guid"].nunique().gt(1).sum() pct_perfect = perfect / n_clusters pct_tainted = tainted / n_clusters assert pct_tainted == 0 assert pct_perfect > 0.1 @pytest.mark.parametrize("hasher", [ldd.SIFT(), ldd.AKAZE()]) def test_deduplication_across_sets(hasher): tdir = tempfile.TemporaryDirectory() watermark = cv2.cvtColor( cv2.imread(pt.DEFAULT_TEST_LOGOS[0], cv2.IMREAD_UNCHANGED), cv2.COLOR_BGRA2RGBA ) transformed = pb.BenchmarkImageDataset.from_tuples( files=[(filepath, "test") for filepath in pt.DEFAULT_TEST_IMAGES] ).transform( transforms={ "noop": albumentations.NoOp(p=1), "pad": albumentations.CropAndPad(percent=0.1, p=1), "crop": albumentations.CropAndPad(percent=0.1, p=1), "watermark": pbit.apply_watermark(watermark, alpha=1, size=0.8), # type: ignore }, storage_dir=tdir.name, ) df = transformed._df.set_index("filepath") query_images = list(df[df.transform_name == "noop"].index.values) images_to_match_to = list(df[~(df.transform_name == "noop")].index.values) pairs = ldd.deduplicate( filepaths_or_reference_df=images_to_match_to, query_filepaths_or_df=query_images, max_workers=2, hasher=hasher, ) # Test throws errors if unset. assert len(pairs) >= 20, "Wrong # of pairs."
@pytest.mark.parametrize("hasher", [ldd.SIFT(), ldd.AKAZE()])
def test_deduplication_across_sets(hasher):
    tdir = tempfile.TemporaryDirectory()
    watermark = cv2.cvtColor(
        cv2.imread(pt.DEFAULT_TEST_LOGOS[0], cv2.IMREAD_UNCHANGED), cv2.COLOR_BGRA2RGBA
    )
    transformed = pb.BenchmarkImageDataset.from_tuples(
        files=[(filepath, "test") for filepath in pt.DEFAULT_TEST_IMAGES]
    ).transform(
        transforms={
            "noop": albumentations.NoOp(p=1),
            "pad": albumentations.CropAndPad(percent=0.1, p=1),
            "crop": albumentations.CropAndPad(percent=-0.1, p=1),
            "watermark": pbit.apply_watermark(watermark, alpha=1, size=0.8),  # type: ignore
        },
        storage_dir=tdir.name,
    )
    df = transformed._df.set_index("filepath")
    query_images = list(df[df.transform_name == "noop"].index.values)
    images_to_match_to = list(df[~(df.transform_name == "noop")].index.values)
    pairs = ldd.deduplicate(
        filepaths_or_reference_df=images_to_match_to,
        query_filepaths_or_df=query_images,
        max_workers=2,
        hasher=hasher,
    )  # Test throws errors if unset.
    assert len(pairs) >= 20, "Wrong # of pairs."
    only_one_noop = [p for p in pairs if (("noop" in p[0]) != ("noop" in p[1]))]
    assert len(only_one_noop) == len(
        pairs
    ), "All pairs must be between a noop and non-noop file"


@pytest.mark.parametrize("hasher", [ldd.SIFT(), ldd.AKAZE()])
def test_validation_for_overlapping_case(hasher):
    tdir = tempfile.TemporaryDirectory()
    # Each image will have the center of the other
    # pasted in the top left corner.
    image1 = pht.read(pt.DEFAULT_TEST_IMAGES[0])
    image2 = pht.read(pt.DEFAULT_TEST_IMAGES[1])
    image1[:100, :100] = image2[100:200, 100:200]
    image2[:100, :100] = image1[100:200, 100:200]
    fp1 = os.path.join(tdir.name, "test1.jpg")
    fp2 = os.path.join(tdir.name, "test2.jpg")
    cv2.imwrite(fp1, image1[..., ::-1])
    cv2.imwrite(fp2, image2[..., ::-1])
    descriptor1 = ldd.generate_image_descriptors(fp1, hasher)
    descriptor2 = ldd.generate_image_descriptors(fp2, hasher)
    assert descriptor1 is not None
    assert descriptor2 is not None
    # These images should not match.
    assert not hasher.validate_match(descriptor1=descriptor1, descriptor2=descriptor2)[
        0
    ]


@pytest.mark.parametrize("hasher", [ldd.SIFT(), ldd.AKAZE()])
def test_handling_bad_file_case(caplog, hasher):
    tdir = tempfile.TemporaryDirectory()
    missing_file = os.path.join(tdir.name, "missing-file")
    bad_file_handle = tempfile.NamedTemporaryFile()
    bad_file = bad_file_handle.name
    transformed = pb.BenchmarkImageDataset.from_tuples(
        files=[(filepath, "test") for filepath in pt.DEFAULT_TEST_IMAGES]
    ).transform(
        transforms={
            "noop": lambda image: image,
        },
        storage_dir=tdir.name,
    )
    df = transformed._df.set_index("filepath")
    df.loc[missing_file] = df.iloc[0]
    df.loc[bad_file] = df.iloc[0]
    pairs = ldd.deduplicate(filepaths_or_reference_df=df.index, hasher=hasher)
    clustered = (
        pd.DataFrame(
            ad.pairs_to_clusters(ids=df.index, pairs=pairs, strictness="component")
        )
        .set_index("id")
        .merge(df, left_index=True, right_index=True)
        .reset_index()
    )
    assert bad_file not in clustered.index
    assert missing_file not in clustered.index
    bad_file_error = next(
        record for record in caplog.records if bad_file in record.message
    )
    assert bad_file_error
    assert bad_file_error.levelname == "ERROR"
    missing_file_warning = next(
        record for record in caplog.records if missing_file in record.message
    )
    assert missing_file_warning
    assert missing_file_warning.levelname == "WARNING"


def test_handling_hasher_mismatch():
    tdir = tempfile.TemporaryDirectory()
    transformed = pb.BenchmarkImageDataset.from_tuples(
        files=[(filepath, "test") for filepath in pt.DEFAULT_TEST_IMAGES]
    ).transform(
        transforms={
            "noop": lambda image: image,
        },
        storage_dir=tdir.name,
    )
    df = transformed._df.set_index("filepath")
    reference_df = ldd.build_reference_df(filepaths=df.index, hasher=ldd.SIFT())
    query_df = ldd.build_reference_df(filepaths=df.index, hasher=ldd.AKAZE())
    with pytest.raises(AssertionError):
        ldd.deduplicate(reference_df, query_df)
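
# Explanatory note (general properties of these descriptor types, not a claim
# about perception internals): SIFT produces 128-dimensional float vectors
# while AKAZE produces compact binary descriptors, so features built with one
# hasher are not comparable to features built with the other. deduplicate
# asserts that the reference and query frames were built with the same hasher
# rather than silently matching across incompatible descriptor spaces.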
def test_viz_pair():
    object_sift = ldd.SIFT(
        max_features=256,
        ratio=OBJECT_MATCH_PARAMS["ratio"],
        threshold=OBJECT_MATCH_PARAMS["coarse_threshold"],
        overlap=OBJECT_MATCH_PARAMS["minimum_coarse_overlap"],
        validation_match=OBJECT_MATCH_PARAMS["minimum_validation_match"],
        validation_inliers=OBJECT_MATCH_PARAMS["minimum_validation_inliers"],
        validation_intersection=OBJECT_MATCH_PARAMS["minimum_validation_intersection"],
    )
    filepaths = [
        "tests/images/chair.png",
        "tests/images/chair3.png",
        "tests/images/chair-square.png",
        "tests/images/chair-tall.png",
    ]
    reference_df = ldd.build_reference_df(
        filepaths=filepaths,
        hasher=object_sift,
        min_features=10,
        max_size=1000,
        show_progress=False,
    )
    pairs = ldd.deduplicate(
        filepaths_or_reference_df=reference_df,
        hasher=object_sift,
        max_size=1000,
        min_features=10,
        verbose=True,
    )
    row = pairs[0]
    viz_img = vizualize_pair(
        reference_df.loc[row[0]],
        reference_df.loc[row[1]],
        0.5,
        match_metadata=row[2],
        sanitized=False,
    )
    viz_img = cv2.cvtColor(viz_img, cv2.COLOR_RGB2BGR)
    cv2.imwrite("tests/images/debug-image.png", viz_img)


def test_viz_pair_symmetry():
    # This test catches a regression where, if the smaller image was the
    # query, LDD would swap points during distance calculation but not
    # unswap them before returning.
    object_sift = ldd.SIFT(
        max_features=256,
        ratio=OBJECT_MATCH_PARAMS["ratio"],
        threshold=OBJECT_MATCH_PARAMS["coarse_threshold"],
        overlap=OBJECT_MATCH_PARAMS["minimum_coarse_overlap"],
        validation_match=OBJECT_MATCH_PARAMS["minimum_validation_match"],
        validation_inliers=OBJECT_MATCH_PARAMS["minimum_validation_inliers"],
        validation_intersection=OBJECT_MATCH_PARAMS["minimum_validation_intersection"],
    )
    filepaths = [
        "tests/images/chair.png",
        "tests/images/chair3.png",
    ]
    reference_df = ldd.build_reference_df(
        filepaths=filepaths,
        hasher=object_sift,
        min_features=10,
        max_size=1000,
        show_progress=False,
    )
    pairs = ldd.deduplicate(
        filepaths_or_reference_df=filepaths[:1],
        query_filepaths_or_df=filepaths[1:],
        hasher=object_sift,
        max_size=1000,
        min_features=10,
        verbose=True,
    )
    row = pairs[0]
    viz_img = vizualize_pair(
        reference_df.loc[row[0]],
        reference_df.loc[row[1]],
        0.5,
        match_metadata=row[2],
        sanitized=False,
    )
    viz_img = cv2.cvtColor(viz_img, cv2.COLOR_RGB2BGR)
    cv2.imwrite("tests/images/debug-image-symmetry-1.png", viz_img)

    # Swap the order of the reference and query files.
    pairs = ldd.deduplicate(
        filepaths_or_reference_df=filepaths[1:],
        query_filepaths_or_df=filepaths[:1],
        hasher=object_sift,
        max_size=1000,
        min_features=10,
        verbose=True,
    )
    row = pairs[0]
    viz_img = vizualize_pair(
        reference_df.loc[row[0]],
        reference_df.loc[row[1]],
        0.5,
        match_metadata=row[2],
        sanitized=False,
    )
    viz_img = cv2.cvtColor(viz_img, cv2.COLOR_RGB2BGR)
    cv2.imwrite("tests/images/debug-image-symmetry-2.png", viz_img)


================================================
FILE: tests/test_tmk.py
================================================
import gzip
import json
import platform
from pathlib import Path
from typing import cast

import numpy as np
import pytest

from perception.hashers.video import tmk

TEST_FILES = Path("perception") / "testing" / "videos"


def test_tmk_parity():
    if platform.machine() == "arm64":
        pytest.xfail("TMK is not supported on ARM64")
    hasher = tmk.TMKL2()
    with gzip.open(TEST_FILES / "expected_tmk.json.gz", "rt", encoding="utf8") as f:
        expected_output = json.load(f)
    expected_output = {k: np.array(v) for k, v in expected_output.items()}
    output = []
    for filepath in [
        "perception/testing/videos/v1.m4v",
        "perception/testing/videos/v2.m4v",
    ]:
        hash_value: np.ndarray = cast(
            np.ndarray, hasher.compute(filepath=filepath, hash_format="vector")
        )
        output.append(hash_value.reshape((4, 64, -1)))

    # Verify the hashes are the same.
    for o, t in zip(output, expected_output["hashes"]):
        np.testing.assert_allclose(o.reshape(*t.shape), t)

    # Verify the pair-wise scores are the same.
    offsets = np.arange(-5, 5)
    for normalization in ["feat", "feat_freq", "matrix"]:
        score = hasher._score_pair(
            output[0], output[1], offsets=offsets, normalization=normalization
        )
        np.testing.assert_allclose(score, expected_output[normalization])
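
# Conceptual sketch only: this is not hasher._score_pair or the TMK
# normalizations exercised above, just an illustration of the general idea of
# scoring two temporal feature arrays under a set of circular time offsets
# and keeping the best-aligned cosine similarity.
def _best_offset_similarity(a: np.ndarray, b: np.ndarray, offsets) -> float:
    scores = []
    for k in offsets:
        shifted = np.roll(b, int(k), axis=-1)  # shift along the time axis
        denom = float(np.linalg.norm(a) * np.linalg.norm(shifted))
        scores.append(float((a * shifted).sum()) / denom if denom else 0.0)
    return max(scores)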

================================================
FILE: tests/test_tools.py
================================================
import io
import os
import shutil
import tempfile

import numpy as np
import pytest

from perception import hashers, testing, tools


def test_deduplicate():
    directory = tempfile.TemporaryDirectory()
    original = testing.DEFAULT_TEST_IMAGES[0]
    duplicate = os.path.join(directory.name, "image1.jpg")
    shutil.copy(original, duplicate)
    pairs = tools.deduplicate(
        files=[
            testing.DEFAULT_TEST_IMAGES[0],
            testing.DEFAULT_TEST_IMAGES[1],
            duplicate,
        ],
        hashers=[(hashers.PHash(hash_size=16), 0.25)],
    )
    assert len(pairs) == 1
    file1, file2 = pairs[0]
    assert ((file1 == duplicate) and (file2 == original)) or (
        (file1 == original) and (file2 == duplicate)
    )


def test_deduplicate_u8():
    # This test verifies that extensions.compute_euclidean_pairwise_duplicates
    # works properly.
    directory = tempfile.TemporaryDirectory()
    original = testing.DEFAULT_TEST_IMAGES[0]
    duplicate = os.path.join(directory.name, "image1.jpg")
    shutil.copy(original, duplicate)
    pairs = tools.deduplicate(
        files=[
            testing.DEFAULT_TEST_IMAGES[0],
            testing.DEFAULT_TEST_IMAGES[1],
            duplicate,
        ],
        hashers=[(hashers.PHashU8(hash_size=16), 10)],
    )
    assert len(pairs) == 1
    file1, file2 = pairs[0]
    assert ((file1 == duplicate) and (file2 == original)) or (
        (file1 == original) and (file2 == duplicate)
    )


def test_deduplicate_hashes_multiple():
    # This test verifies that deduplicate_hashes functions properly
    # when there is more than one hash for a file.
    directory = tempfile.TemporaryDirectory()
    original = testing.DEFAULT_TEST_IMAGES[0]
    duplicate = os.path.join(directory.name, "image1.jpg")
    hasher = hashers.PHashU8(hash_size=16)
    shutil.copy(original, duplicate)
    hashes = [
        (0, hasher.compute(original)),
        (1, hasher.compute(duplicate)),
        (1, hasher.compute(duplicate)),
        (1, hasher.compute(duplicate)),
        (2, hasher.compute(testing.DEFAULT_TEST_IMAGES[1])),
    ]
    pairs = tools.deduplicate_hashes(
        hashes=hashes,  # type: ignore[arg-type]
        threshold=10,
        hash_format="base64",
        hash_length=hasher.hash_length,
        distance_metric="euclidean",
        hash_dtype="uint8",
    )
    assert len(pairs) == 1
    file1, file2 = pairs[0]
    assert ((file1 == 0) and (file2 == 1)) or ((file1 == 1) and (file2 == 0))
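
# Toy dense sketch (not the Cython extension exercised below) of the pairwise
# euclidean duplicate check: two hash vectors count as duplicates when the
# euclidean distance between them is at most `threshold`.
def _pairwise_duplicates_sketch(X: np.ndarray, threshold: float):
    distances = np.linalg.norm(
        X[:, None, :].astype("float64") - X[None, :, :].astype("float64"), axis=-1
    )
    i, j = np.where(np.triu(distances <= threshold, k=1))
    return list(zip(i.tolist(), j.tolist()))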
def test_compute_euclidean_pairwise_duplicates():
    # The purpose of this test is to verify that deduplication with files
    # that have multiple hashes works properly. This is particularly
    # important for video, where we are likely to have many hashes per file.
    X = np.array(
        [
            # File 1
            [0, 0, 0],
            [1, 1, 1],
            [2, 2, 2],
            # File 2
            [1, 1, 1],
            [2, 2, 2],
            [3, 3, 3],
            # File 3
            [3, 3, 3],
            [4, 4, 4],
            # File 4
            [5, 5, 5],
            [6, 6, 6],
        ]
    )
    # Use grouped files. With threshold 1, files 1 and 2 share two of their
    # three hashes ([1, 1, 1] and [2, 2, 2]), giving an overlap of 2/3 on
    # each side; files 2 and 3 share only [3, 3, 3], giving 1/3 and 1/2.
    counts = np.array([3, 3, 2, 2])
    expected = np.array(
        [[2 / 3, 2 / 3], [0, 0], [0, 0], [1 / 3, 1 / 2], [0, 0], [0, 0]]
    )
    actual = tools.extensions.compute_euclidean_pairwise_duplicates(
        X=X.astype("int32"),
        threshold=1,
        counts=counts.astype("uint32"),
        compute_overlap=True,
    )
    assert (expected == actual).all()

    # Use without computing overlap: raw match counts instead of fractions.
    expected = np.array([[2, 2], [0, 0], [0, 0], [1, 1], [0, 0], [0, 0]])
    actual = tools.extensions.compute_euclidean_pairwise_duplicates(
        X=X.astype("int32"),
        threshold=1,
        counts=counts.astype("uint32"),
        compute_overlap=False,
    )
    assert (expected == actual).all()

    # Use ungrouped files.
    X = np.array(
        [
            # Each row is treated as its own file.
            [0, 0, 0],
            [1, 1, 1],
            [2, 2, 2],
            [1, 1, 1],
        ]
    )
    expected = np.array([[0, 0], [0, 0], [0, 0], [0, 0], [1, 1], [0, 0]])
    actual = tools.extensions.compute_euclidean_pairwise_duplicates(
        X=X.astype("int32"), threshold=1, compute_overlap=True
    )
    assert (expected == actual).all()


def test_api_is_over_https():
    matcher_https = tools.SaferMatcher(api_key="foo", url="https://www.example.com/")
    assert matcher_https

    if "SAFER_MATCHING_SERVICE_DEV_ALLOW_HTTP" in os.environ:
        del os.environ["SAFER_MATCHING_SERVICE_DEV_ALLOW_HTTP"]
    with pytest.raises(ValueError):
        tools.SaferMatcher(api_key="foo", url="http://www.example.com/")

    os.environ["SAFER_MATCHING_SERVICE_DEV_ALLOW_HTTP"] = "1"
    matcher_http_with_escape_hatch = tools.SaferMatcher(
        api_key="foo", url="http://www.example.com/"
    )
    assert matcher_http_with_escape_hatch


def test_unletterbox():
    image = hashers.tools.read(testing.DEFAULT_TEST_IMAGES[0])
    padded = np.zeros((image.shape[0] + 100, image.shape[1] + 50, 3), dtype="uint8")
    padded[50 : 50 + image.shape[0], 25 : 25 + image.shape[1]] = image
    result = hashers.tools.unletterbox(padded)
    assert result is not None
    (x1, x2), (y1, y2) = result
    assert y1 == 50
    assert y2 == 50 + image.shape[0]
    assert x1 == 25
    assert x2 == 25 + image.shape[1]


def test_unletterbox_crop():
    image = hashers.tools.read(testing.DEFAULT_TEST_IMAGES[0])
    padded = np.zeros((image.shape[0] + 100, image.shape[1] + 50, 3), dtype="uint8")
    padded[50 : 50 + image.shape[0], 25 : 25 + image.shape[1]] = image
    cropped_image = hashers.tools.unletterbox_crop(padded)
    assert cropped_image is not None
    assert image.shape[0] == cropped_image.shape[0]
    assert image.shape[1] == cropped_image.shape[1]


def test_unletterbox_crop_meaningful_pixels():
    """Test the value of .5 min_fraction_meaningful_pixels in unletterbox_crop()."""
    image = hashers.tools.read(testing.DEFAULT_TEST_IMAGES[0])
    h, w, _ = image.shape
    # Make a tall, skinny image with so much padding around the content
    # that it falls below the min_fraction_meaningful_pixels threshold.
    padding_size = int(5 * h)
    padded = np.r_[
        np.zeros((padding_size, w, 3)), image, np.zeros((padding_size, w, 3))
    ]
    assert None is hashers.tools.unletterbox_crop(
        padded, min_fraction_meaningful_pixels=0.5
    )
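
# Toy illustration (not hashers.tools.unletterbox itself) of the core idea
# the tests above and below rely on: find the bounding box of rows and
# columns that contain any non-black pixels.
def _nonblack_bounds_sketch(image: np.ndarray, tol: int = 2):
    mask = (image > tol).any(axis=-1)  # True wherever any channel is non-black
    ys = np.where(mask.any(axis=1))[0]
    xs = np.where(mask.any(axis=0))[0]
    if len(xs) == 0 or len(ys) == 0:
        return None
    return (int(xs[0]), int(xs[-1]) + 1), (int(ys[0]), int(ys[-1]) + 1)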
def test_unletterbox_color():
    image = hashers.tools.read(testing.DEFAULT_TEST_IMAGES[0])
    padded = np.zeros((image.shape[0] + 100, image.shape[1] + 50, 3), dtype="uint8")
    padded[:, :] = (200, 0, 200)
    padded[50 : 50 + image.shape[0], 25 : 25 + image.shape[1]] = image

    # Should not unletterbox, since the padding is not black.
    results = hashers.tools.unletterbox(padded, only_remove_black=True)
    assert results is not None
    (x1, x2), (y1, y2) = results
    assert y1 == 0
    assert y2 == padded.shape[0]
    assert x1 == 0
    assert x2 == padded.shape[1]

    # Should unletterbox the colored padding:
    results = hashers.tools.unletterbox(padded, only_remove_black=False)
    assert results is not None
    (x1, x2), (y1, y2) = results
    assert y1 == 50
    assert y2 == 50 + image.shape[0]
    assert x1 == 25
    assert x2 == 25 + image.shape[1]


def test_unletterbox_aspect_ratio():
    """Test the value of .1 in unletterbox()."""
    image = hashers.tools.read(testing.DEFAULT_TEST_IMAGES[0])
    h, w, _ = image.shape
    # Make tall, skinny images with non-trivial content just below and
    # just above the 10% threshold.
    base = int(4.5 * h)  # 2 * base + h = 100%
    h_fail, h_pass = base + 10, base - 10

    padded = np.r_[np.zeros((h_fail, w, 3)), image, np.zeros((h_fail, w, 3))]
    assert None is hashers.tools.unletterbox(padded)

    padded = np.r_[np.zeros((h_pass, w, 3)), image, np.zeros((h_pass, w, 3))]
    results = hashers.tools.unletterbox(padded)
    assert results is not None
    (x1, x2), (y1, y2) = results
    assert y1 == h_pass
    assert y2 == h_pass + image.shape[0]
    assert x1 == 0
    assert x2 == image.shape[1]


def test_unletterbox_noblackbars():
    image = hashers.tools.read(testing.DEFAULT_TEST_IMAGES[0])
    results = hashers.tools.unletterbox(image)
    assert results is not None
    (x1, x2), (y1, y2) = results
    assert x1 == 0
    assert y1 == 0
    assert x2 == image.shape[1]
    assert y2 == image.shape[0]


def test_ffmpeg_video():
    """Check that the FFMPEG video parsing code provides substantially similar
    results to the OpenCV approach (which also uses FFMPEG under the hood but
    has different frame selection logic)."""
    frames_per_second = 2.3
    for filepath in testing.DEFAULT_TEST_VIDEOS:
        filename = os.path.basename(filepath)
        for (frame1, index1, timestamp1), (frame2, index2, timestamp2) in zip(
            hashers.tools.read_video_to_generator_ffmpeg(
                filepath, frames_per_second=frames_per_second
            ),
            hashers.tools.read_video_to_generator(
                filepath, frames_per_second=frames_per_second
            ),
        ):
            # Cast to int32 before subtracting; uint8 arithmetic would wrap
            # around and inflate the difference.
            diff = np.abs(frame1.astype("int32") - frame2.astype("int32")).flatten()
            assert index1 == index2, f"Index mismatch for {filename}"
            np.testing.assert_allclose(
                timestamp1, timestamp2, err_msg=f"Timestamp mismatch for {filename}"
            )
            assert np.percentile(diff, 75) < 25, f"Frame mismatch for {filename}"


def test_videos_with_extra_channels():
    frames_per_second = 1
    test_videos = [
        "perception/testing/videos/extra_channel_attached_pic.mp4",
        "perception/testing/videos/extra_channel_attached_pic_audio.mp4",
    ]
    expected_frames = 10
    for filepath in test_videos:
        filename = os.path.basename(filepath)
        frame_count = 0
        for frame1, index1, timestamp1 in hashers.tools.read_video_to_generator_ffmpeg(
            filepath, frames_per_second=frames_per_second
        ):
            frame_count += 1
        assert frame_count == expected_frames, f"Frame count mismatch for {filename}"


def test_image_input_types():
    image_expected = hashers.tools.read(testing.DEFAULT_TEST_IMAGES[0])
    with open(testing.DEFAULT_TEST_IMAGES[0], "rb") as f:
        image_data = f.read()

    image_bytes_io = hashers.tools.read(io.BytesIO(image_data))
    assert (image_expected == image_bytes_io).all()

    with tempfile.SpooledTemporaryFile() as f:
        f.write(image_data)
        f.seek(0)
        image_tempfile = hashers.tools.read(f)
    assert (image_expected == image_tempfile).all()
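
# Sketch of the input-handling pattern test_image_input_types exercises;
# illustrative only (hashers.tools.read is the real implementation and its
# internals may differ): decode an image from a path, raw bytes, or a
# file-like object, returning RGB to match the convention the tests rely on.
def _read_sketch(handle):
    import cv2

    if isinstance(handle, str):
        with open(handle, "rb") as f:
            data = f.read()
    elif hasattr(handle, "read"):
        data = handle.read()
    else:
        data = handle
    image = cv2.imdecode(np.frombuffer(data, dtype="uint8"), cv2.IMREAD_COLOR)
    return image[..., ::-1]  # OpenCV decodes to BGR; flip to RGB.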